In [None]:
import pandas as pd
import random
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, accuracy_score, log_loss
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from scipy.spatial.distance import euclidean
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy.stats import chi2_contingency, f_oneway
from sklearn.feature_selection import RFE

# Importing Data

In [None]:
# Read the train and test splits of the mushroom dataset from the local data folder.
train_csv = 'Mushroom_datasets/mushroom_train.csv'
test_csv = 'Mushroom_datasets/mushroom_test.csv'
data_train, data_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

# Show the first rows of the training split as the cell output.
data_train.head()

# Trivial System

In [None]:
# Trivial system: ignore every feature and guess each test label at random,
# predicting 'p' with probability equal to the share of 'p' rows in the
# training split (and 'e' otherwise).
N = data_train.shape[0]
N1 = data_train[data_train['class']=='p'].shape[0]
N2 = data_train[data_train['class']=='e'].shape[0]  # count of 'e' rows, kept for reference
N_test = data_test['class'].shape[0]

# A dedicated, seeded generator makes this baseline identical on every
# Restart & Run All without touching the global `random` state.
rng = random.Random(42)
p_prior = N1 / N
y_pred = ['p' if rng.random() < p_prior else 'e' for _ in range(N_test)]

# Compare positionally: a label-based lookup (data_test['class'][i]) only
# works when the frame carries a default 0..N-1 index.
y_true = data_test['class'].to_numpy()
acc = int(sum(pred == actual for pred, actual in zip(y_pred, y_true)))

print("Accuracy of test dataset is", acc*100/N_test, "%")

In [None]:
# F1 of the trivial system. 'p' is the positive class so this number is
# comparable with the later F1 cells: those score the label-encoded target
# with the default pos_label=1, and LabelEncoder maps 'e' -> 0, 'p' -> 1.
f_score = f1_score(data_test['class'], y_pred, pos_label='p')
print("F1 score:", f_score)

In [None]:
# Confusion matrix of the trivial system.
# The class order is passed explicitly so the matrix rows/columns are
# guaranteed to match the axis names below (without `labels`, scikit-learn
# sorts the labels, i.e. 'e' before 'p').
class_order = ['e', 'p']
cm = confusion_matrix(data_test['class'], y_pred, labels=class_order)
cm_df = pd.DataFrame(cm, index=['Actual E', 'Actual P'], columns=['Predicted E', 'Predicted P'])
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='g')
plt.title("Confussion Matrix")
plt.show()

# One-Hot Encoding

In [None]:
# Categorical columns that get expanded into one indicator column per level.
classes = ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season']
for i in classes:
    print("Unique values of", i, ':', data_train[i].unique())

temp_train = pd.get_dummies(data_train, columns=classes)
temp_test = pd.get_dummies(data_test, columns=classes)
# Encoding the two splits independently can produce different column sets when
# a level is missing from one of them. Force the test frame onto the training
# layout: levels absent from test become all-zero columns, levels unseen in
# training are dropped.
temp_test = temp_test.reindex(columns=temp_train.columns, fill_value=0)

# Learn the target mapping on the training split only and reuse it for test,
# so both splits are guaranteed to share the same integer codes.
le = LabelEncoder()
temp_train['class'] = le.fit_transform(temp_train['class'])
temp_test['class'] = le.transform(temp_test['class'])

In [None]:
# Preview the first rows of the one-hot encoded training frame.
temp_train.head()

# 1) Standardizing Dataset

In [None]:
# Standardize the three continuous measurements; the remaining columns are
# left untouched.
numeric_cols = ['cap-diameter', 'stem-height', 'stem-width']

scaler = StandardScaler()
X_train = temp_train.drop('class', axis=1)
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
y_train = temp_train['class']

X_test = temp_test.drop('class', axis=1)
# Reuse the mean/std learned on the training split. Re-fitting on the test
# split would put the two splits on different scales and leak test statistics.
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
y_test = temp_test['class']

# 2) Baseline System

In [None]:
# Baseline: nearest-class-mean classifier. Each test row is assigned the class
# whose training mean vector is closest in Euclidean distance.
class_means = X_train.groupby(y_train).mean()

test_matrix = X_test.to_numpy(dtype=float)
mean_matrix = class_means.to_numpy(dtype=float)

# One distance column per class, computed for all test rows at once instead of
# a Python loop over rows.
distances = np.column_stack(
    [np.linalg.norm(test_matrix - mean_row, axis=1) for mean_row in mean_matrix]
)

# Map the closest column back through the group index so the prediction is the
# actual class label, not merely its position in `class_means`.
y_pred = class_means.index[distances.argmin(axis=1)].tolist()

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of test dataset:", accuracy*100, "%")

In [None]:
# F1 of the baseline predictions (scikit-learn scores label 1 as the positive
# class by default).
f_score = f1_score(y_true=y_test, y_pred=y_pred)
print("F1 score:", f_score)

In [None]:
# Confusion matrix of the baseline predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
row_names = ['Actual E', 'Actual P']
col_names = ['Predicted E', 'Predicted P']
cm_df = pd.DataFrame(data=cm, index=row_names, columns=col_names)
sns.heatmap(data=cm_df, annot=True, fmt='g', cmap='Blues')
plt.title("Confussion Matrix")
plt.show()

# 3) Feature Engineering

# a) PCA feature engineering

In [None]:
# Sweep the number of principal components (1, 11, 21, ...) and fit a logistic
# regression on each projection. The grid is derived from the actual number of
# columns so it can never request more components than exist.
component_counts = list(range(1, X_train.shape[1] + 1, 10))

accuracy_train = []
accuracy_test = []
log_loss_train = []
log_loss_test = []
for i in component_counts:
    pca = PCA(n_components=i)
    X_pca_train = pca.fit_transform(X_train)
    # Project the test split with the components learned on the training
    # split; re-fitting PCA on test would yield an unrelated basis.
    X_pca_test = pca.transform(X_test)

    lr = LogisticRegression(max_iter=10000)
    lr.fit(X_pca_train, y_train)
    y_pred_train = lr.predict(X_pca_train)
    y_pred = lr.predict(X_pca_test)

    # Log loss is defined on predicted probabilities, not on hard labels.
    log_loss_train.append(log_loss(y_train, lr.predict_proba(X_pca_train), labels=lr.classes_))
    log_loss_test.append(log_loss(y_test, lr.predict_proba(X_pca_test), labels=lr.classes_))
    accuracy_train.append(accuracy_score(y_train, y_pred_train))
    accuracy_test.append(accuracy_score(y_test, y_pred))

# NOTE(review): the "best" setting is picked on test accuracy, so the reported
# number is optimistic; a validation split would be cleaner.
best_idx = int(np.argmax(accuracy_test))
print("Best decomposition value:", component_counts[best_idx])
print("Best accuracy of test dataset:", accuracy_test[best_idx]*100, "%")
plt.plot(component_counts, np.array(accuracy_train)*100, label='Train dataset')
plt.plot(component_counts, np.array(accuracy_test)*100, label='Test dataset')
plt.xlabel("Number of feature reductions")
plt.ylabel("Accuracy(%)")
plt.legend()
plt.show()

plt.plot(component_counts, log_loss_train, label='Train dataset')
plt.plot(component_counts, log_loss_test, label='Test dataset')
plt.xlabel("Number of feature reductions")
plt.ylabel("Log Loss")
plt.legend()
plt.show()

# b) FDA feature engineering

In [None]:
# Project onto a single linear-discriminant axis learned from the training
# split, then classify the 1-D projection with logistic regression.
fld = LinearDiscriminantAnalysis(n_components=1)
X_train_fld = fld.fit(X_train, y_train).transform(X_train)
X_test_fld = fld.transform(X_test)

lr = LogisticRegression().fit(X_train_fld, y_train)
y_pred = lr.predict(X_test_fld)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of test dataset:", accuracy*100, "%")

# c) UFS feature engineering

### Using chi-squared statistical test

In [None]:
# Univariate feature selection: rank every column by the p-value of a
# chi-squared independence test against the target (smaller p = ranked first).
scores = []
for i in range(X_train.shape[1]):
    _, p, _, _ = chi2_contingency(pd.crosstab(X_train.iloc[:, i], y_train))
    scores.append(p)

feature_scores = sorted(zip(X_train.columns, scores), key=lambda x: x[1])

# Evaluate the top-k features for k = 1, 11, 21, ...; the same list is used as
# the plot x-axis so the curves always match the sweep length.
feature_counts = list(range(1, len(feature_scores), 10))

accuracy_train = []
accuracy_test = []
features_list = []
log_loss_train = []
log_loss_test = []
for i in feature_counts:
    selected_features = [j[0] for j in feature_scores[:i]]
    new_train = X_train[selected_features]
    new_test = X_test[selected_features]
    features_list.append(selected_features)

    lr = LogisticRegression(max_iter=10000)
    lr.fit(new_train, y_train)
    y_pred_train = lr.predict(new_train)
    y_pred = lr.predict(new_test)

    # Log loss is defined on predicted probabilities, not on hard labels.
    log_loss_train.append(log_loss(y_train, lr.predict_proba(new_train), labels=lr.classes_))
    log_loss_test.append(log_loss(y_test, lr.predict_proba(new_test), labels=lr.classes_))
    accuracy_train.append(accuracy_score(y_train, y_pred_train))
    accuracy_test.append(accuracy_score(y_test, y_pred))

best_idx = int(np.argmax(accuracy_test))
print("Best number of features to select:", len(features_list[best_idx]))
print("-"*50)
print("Best feature set:", features_list[best_idx])
print("-"*50)
print("Best accuracy of test dataset:", accuracy_test[best_idx]*100, "%")
plt.plot(feature_counts, np.array(accuracy_train)*100, label='Train dataset')
plt.plot(feature_counts, np.array(accuracy_test)*100, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Accuracy(%)")
plt.legend()
plt.show()

plt.plot(feature_counts, log_loss_train, label='Train dataset')
plt.plot(feature_counts, log_loss_test, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Log Loss")
plt.legend()
plt.show()

### Using ANOVA F-test statistical test

In [None]:
# Univariate feature selection with a one-way ANOVA F-test: for every column,
# compare its values across the target classes (smaller p = ranked first).
# Grouping the feature by class works for indicator and continuous columns
# alike; matching feature values against 0..k-1 produced empty groups (and NaN
# p-values) for the standardized numeric columns.
class_labels = np.unique(y_train)
class_masks = [(y_train == label).to_numpy() for label in class_labels]

scores = []
for i in range(X_train.shape[1]):
    column = X_train.iloc[:, i].to_numpy(dtype=float)
    class_data = [column[mask] for mask in class_masks]
    _, p = f_oneway(*class_data)
    # A constant column has no between-class variance and yields NaN; rank it
    # last instead of letting NaN corrupt the sort.
    scores.append(1.0 if np.isnan(p) else p)

feature_scores = sorted(zip(X_train.columns, scores), key=lambda x: x[1])

# Evaluate the top-k features for k = 1, 11, 21, ...; the same list is used as
# the plot x-axis so the curves always match the sweep length.
feature_counts = list(range(1, len(feature_scores), 10))

accuracy_train = []
accuracy_test = []
features_list = []
log_loss_train = []
log_loss_test = []
for i in feature_counts:
    selected_features = [j[0] for j in feature_scores[:i]]
    new_train = X_train[selected_features]
    new_test = X_test[selected_features]
    features_list.append(selected_features)

    lr = LogisticRegression(max_iter=10000)
    lr.fit(new_train, y_train)
    y_pred_train = lr.predict(new_train)
    y_pred = lr.predict(new_test)

    # Log loss is defined on predicted probabilities, not on hard labels.
    log_loss_train.append(log_loss(y_train, lr.predict_proba(new_train), labels=lr.classes_))
    log_loss_test.append(log_loss(y_test, lr.predict_proba(new_test), labels=lr.classes_))
    accuracy_train.append(accuracy_score(y_train, y_pred_train))
    accuracy_test.append(accuracy_score(y_test, y_pred))

best_idx = int(np.argmax(accuracy_test))
print("Best number of features to select:", len(features_list[best_idx]))
print("-"*50)
print("Best feature set:", features_list[best_idx])
print("-"*50)
print("Best accuracy of test dataset:", accuracy_test[best_idx]*100, "%")
plt.plot(feature_counts, np.array(accuracy_train)*100, label='Train dataset')
plt.plot(feature_counts, np.array(accuracy_test)*100, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Accuracy(%)")
plt.legend()
plt.show()

plt.plot(feature_counts, log_loss_train, label='Train dataset')
plt.plot(feature_counts, log_loss_test, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Log Loss")
plt.legend()
plt.show()

# d) RFE feature engineering

In [None]:
# Recursive feature elimination with a logistic-regression estimator (RFE is
# imported in the notebook's import cell). For each target size the surviving
# columns are used to train a fresh model that is then scored on both splits.
# The grid is derived from the real column count instead of a hard-coded bound.
feature_counts = list(range(1, X_train.shape[1] + 1, 10))

accuracy_train = []
accuracy_test = []
features_list = []
log_loss_train = []
log_loss_test = []
for i in feature_counts:
    lr = LogisticRegression(max_iter=10000)
    rfe = RFE(lr, n_features_to_select=i)
    fit = rfe.fit(X_train, y_train)
    best_features = X_train.columns[fit.support_]
    features_list.append(best_features)
    final_model = LogisticRegression(max_iter=10000).fit(X_train[best_features], y_train)
    y_pred_train = final_model.predict(X_train[best_features])
    y_pred = final_model.predict(X_test[best_features])

    # Log loss is defined on predicted probabilities, not on hard labels.
    log_loss_train.append(log_loss(y_train, final_model.predict_proba(X_train[best_features]), labels=final_model.classes_))
    log_loss_test.append(log_loss(y_test, final_model.predict_proba(X_test[best_features]), labels=final_model.classes_))
    accuracy_train.append(accuracy_score(y_train, y_pred_train))
    accuracy_test.append(accuracy_score(y_test, y_pred))

best_idx = int(np.argmax(accuracy_test))
print("Best number of features to select:", len(features_list[best_idx]))
print("Best accuracy of test dataset:", accuracy_test[best_idx]*100, "%")
plt.plot(feature_counts, np.array(accuracy_train)*100, label='Train dataset')
plt.plot(feature_counts, np.array(accuracy_test)*100, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Accuracy(%)")
plt.legend()
plt.show()

plt.plot(feature_counts, log_loss_train, label='Train dataset')
plt.plot(feature_counts, log_loss_test, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Log Loss")
plt.legend()
plt.show()

In [None]:
# Greedy forward selection: starting from an empty set, repeatedly add the
# single column that gives the highest training accuracy when joined with the
# columns chosen so far, and evaluate at 1, 11, 21, ... selected features.
#
# The greedy path is nested (the first k picks do not depend on the target
# size), so the selection is grown once and evaluated at each checkpoint
# instead of being restarted from scratch for every size.
# NOTE: each step still fits one model per remaining candidate, so this cell
# remains expensive on wide frames.
feature_counts = list(range(1, X_train.shape[1], 10))

accuracy_train = []
accuracy_test = []
features_list = []
log_loss_train = []
log_loss_test = []

selected_features = []
lr = LogisticRegression(max_iter=10000)
for num_features in feature_counts:
    while len(selected_features) < num_features:
        # Scores are index-aligned with `candidates`, so the winner is looked
        # up in that list rather than in the full column index.
        candidates = [col for col in X_train.columns if col not in selected_features]
        score = []
        for col in candidates:
            features = selected_features + [col]
            lr.fit(X_train[features], y_train)
            score.append(lr.score(X_train[features], y_train))

        # Forward selection keeps the candidate with the HIGHEST score.
        selected_feature = candidates[int(np.argmax(score))]
        selected_features.append(selected_feature)

    # Store a snapshot: `selected_features` keeps growing in later iterations.
    features_list.append(list(selected_features))
    new_train = X_train[selected_features]
    new_test = X_test[selected_features]
    lr.fit(new_train, y_train)
    y_pred_train = lr.predict(new_train)
    y_pred = lr.predict(new_test)

    # Log loss is defined on predicted probabilities, not on hard labels.
    log_loss_train.append(log_loss(y_train, lr.predict_proba(new_train), labels=lr.classes_))
    log_loss_test.append(log_loss(y_test, lr.predict_proba(new_test), labels=lr.classes_))
    accuracy_train.append(accuracy_score(y_train, y_pred_train))
    accuracy_test.append(accuracy_score(y_test, y_pred))

best_idx = int(np.argmax(accuracy_test))
print("Best number of features to select:", len(features_list[best_idx]))
print("-"*50)
print("Best feature set:", features_list[best_idx])
print("-"*50)
print("Best accuracy of test dataset:", accuracy_test[best_idx]*100, "%")
plt.plot(feature_counts, np.array(accuracy_train)*100, label='Train dataset')
plt.plot(feature_counts, np.array(accuracy_test)*100, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Accuracy(%)")
plt.legend()
plt.show()

plt.plot(feature_counts, log_loss_train, label='Train dataset')
plt.plot(feature_counts, log_loss_test, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Log Loss")
plt.legend()
plt.show()

# e) SFE feature engineering

This took too much time to compute. Mention a bit about this in the report.

# Label Encoding

In [None]:
# Categorical columns (including the target) that get replaced by integer codes.
classes = ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season', 'class']
for i in classes:
    print("Unique values of", i, ':', data_train[i].unique())

# LabelEncoder works on one 1-D column at a time, so encode column by column.
# Start from full copies so the numeric columns stay in the frames for the
# following cells.
temp_train = data_train.copy()
temp_test = data_test.copy()
for col in classes:
    le = LabelEncoder()
    # Values are cast to str so a column mixing strings with missing values
    # still sorts cleanly (a missing value becomes its own 'nan' level).
    temp_train[col] = le.fit_transform(data_train[col].astype(str))
    # Reuse the training mapping so both splits share the same codes; this
    # raises if the test split contains a level never seen in training.
    temp_test[col] = le.transform(data_test[col].astype(str))
# 'class' is the last entry of `classes`, so `le` ends up as the target encoder.

In [None]:
# Preview the first rows of the label-encoded training data.
temp_train.head()

# 1) Standardizing Dataset

In [None]:
# Standardize the three continuous measurements; the remaining columns are
# left untouched.
numeric_cols = ['cap-diameter', 'stem-height', 'stem-width']

scaler = StandardScaler()
X_train = temp_train.drop('class', axis=1)
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
y_train = temp_train['class']

X_test = temp_test.drop('class', axis=1)
# Reuse the mean/std learned on the training split. Re-fitting on the test
# split would put the two splits on different scales and leak test statistics.
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
y_test = temp_test['class']

# 2) Baseline System

In [None]:
# Baseline: nearest-class-mean classifier. Each test row is assigned the class
# whose training mean vector is closest in Euclidean distance.
class_means = X_train.groupby(y_train).mean()

test_matrix = X_test.to_numpy(dtype=float)
mean_matrix = class_means.to_numpy(dtype=float)

# One distance column per class, computed for all test rows at once instead of
# a Python loop over rows.
distances = np.column_stack(
    [np.linalg.norm(test_matrix - mean_row, axis=1) for mean_row in mean_matrix]
)

# Map the closest column back through the group index so the prediction is the
# actual class label, not merely its position in `class_means`.
y_pred = class_means.index[distances.argmin(axis=1)].tolist()

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of test dataset:", accuracy*100, "%")

In [None]:
# F1 of the baseline predictions (scikit-learn scores label 1 as the positive
# class by default).
f_score = f1_score(y_true=y_test, y_pred=y_pred)
print("F1 score:", f_score)

In [None]:
# Confusion matrix of the baseline predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
row_names = ['Actual E', 'Actual P']
col_names = ['Predicted E', 'Predicted P']
cm_df = pd.DataFrame(data=cm, index=row_names, columns=col_names)
sns.heatmap(data=cm_df, annot=True, fmt='g', cmap='Blues')
plt.title("Confussion Matrix")
plt.show()

# 3) Feature Engineering

# a) PCA feature engineering

In [None]:
# Sweep the number of principal components (1, 11, ...) and fit a logistic
# regression on each projection. The grid is derived from the actual number of
# columns: a label-encoded frame keeps one column per original feature, so a
# fixed grid up to 91 would ask PCA for more components than there are columns.
component_counts = list(range(1, X_train.shape[1] + 1, 10))

accuracy_train = []
accuracy_test = []
log_loss_train = []
log_loss_test = []
for i in component_counts:
    pca = PCA(n_components=i)
    X_pca_train = pca.fit_transform(X_train)
    # Project the test split with the components learned on the training
    # split; re-fitting PCA on test would yield an unrelated basis.
    X_pca_test = pca.transform(X_test)

    lr = LogisticRegression(max_iter=10000)
    lr.fit(X_pca_train, y_train)
    y_pred_train = lr.predict(X_pca_train)
    y_pred = lr.predict(X_pca_test)

    # Log loss is defined on predicted probabilities, not on hard labels.
    log_loss_train.append(log_loss(y_train, lr.predict_proba(X_pca_train), labels=lr.classes_))
    log_loss_test.append(log_loss(y_test, lr.predict_proba(X_pca_test), labels=lr.classes_))
    accuracy_train.append(accuracy_score(y_train, y_pred_train))
    accuracy_test.append(accuracy_score(y_test, y_pred))

best_idx = int(np.argmax(accuracy_test))
print("Best decomposition value:", component_counts[best_idx])
print("Best accuracy of test dataset:", accuracy_test[best_idx]*100, "%")
plt.plot(component_counts, np.array(accuracy_train)*100, label='Train dataset')
plt.plot(component_counts, np.array(accuracy_test)*100, label='Test dataset')
plt.xlabel("Number of feature reductions")
plt.ylabel("Accuracy(%)")
plt.legend()
plt.show()

plt.plot(component_counts, log_loss_train, label='Train dataset')
plt.plot(component_counts, log_loss_test, label='Test dataset')
plt.xlabel("Number of feature reductions")
plt.ylabel("Log Loss")
plt.legend()
plt.show()

# b) FDA feature engineering

In [None]:
# Project onto a single linear-discriminant axis learned from the training
# split, then classify the 1-D projection with logistic regression.
fld = LinearDiscriminantAnalysis(n_components=1)
X_train_fld = fld.fit(X_train, y_train).transform(X_train)
X_test_fld = fld.transform(X_test)

lr = LogisticRegression().fit(X_train_fld, y_train)
y_pred = lr.predict(X_test_fld)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of test dataset:", accuracy*100, "%")

# c) UFS feature engineering

### Using chi-squared statistical test

In [None]:
# Univariate feature selection: rank every column by the p-value of a
# chi-squared independence test against the target (smaller p = ranked first).
scores = []
for i in range(X_train.shape[1]):
    _, p, _, _ = chi2_contingency(pd.crosstab(X_train.iloc[:, i], y_train))
    scores.append(p)

feature_scores = sorted(zip(X_train.columns, scores), key=lambda x: x[1])

# Evaluate the top-k features for k = 1, 11, ...; the same list is used as the
# plot x-axis, because a fixed 10-point axis does not match a sweep over a
# frame with few columns.
feature_counts = list(range(1, len(feature_scores), 10))

accuracy_train = []
accuracy_test = []
features_list = []
log_loss_train = []
log_loss_test = []
for i in feature_counts:
    selected_features = [j[0] for j in feature_scores[:i]]
    new_train = X_train[selected_features]
    new_test = X_test[selected_features]
    features_list.append(selected_features)

    lr = LogisticRegression(max_iter=10000)
    lr.fit(new_train, y_train)
    y_pred_train = lr.predict(new_train)
    y_pred = lr.predict(new_test)

    # Log loss is defined on predicted probabilities, not on hard labels.
    log_loss_train.append(log_loss(y_train, lr.predict_proba(new_train), labels=lr.classes_))
    log_loss_test.append(log_loss(y_test, lr.predict_proba(new_test), labels=lr.classes_))
    accuracy_train.append(accuracy_score(y_train, y_pred_train))
    accuracy_test.append(accuracy_score(y_test, y_pred))

best_idx = int(np.argmax(accuracy_test))
print("Best number of features to select:", len(features_list[best_idx]))
print("-"*50)
print("Best feature set:", features_list[best_idx])
print("-"*50)
print("Best accuracy of test dataset:", accuracy_test[best_idx]*100, "%")
plt.plot(feature_counts, np.array(accuracy_train)*100, label='Train dataset')
plt.plot(feature_counts, np.array(accuracy_test)*100, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Accuracy(%)")
plt.legend()
plt.show()

plt.plot(feature_counts, log_loss_train, label='Train dataset')
plt.plot(feature_counts, log_loss_test, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Log Loss")
plt.legend()
plt.show()

### Using ANOVA F-test statistical test

In [None]:
# Univariate feature selection with a one-way ANOVA F-test: for every column,
# compare its values across the target classes (smaller p = ranked first).
# Grouping the feature by class works for integer-coded and continuous columns
# alike; matching feature values against 0..k-1 produced empty groups (and NaN
# p-values) for the standardized numeric columns.
class_labels = np.unique(y_train)
class_masks = [(y_train == label).to_numpy() for label in class_labels]

scores = []
for i in range(X_train.shape[1]):
    column = X_train.iloc[:, i].to_numpy(dtype=float)
    class_data = [column[mask] for mask in class_masks]
    _, p = f_oneway(*class_data)
    # A constant column has no between-class variance and yields NaN; rank it
    # last instead of letting NaN corrupt the sort.
    scores.append(1.0 if np.isnan(p) else p)

feature_scores = sorted(zip(X_train.columns, scores), key=lambda x: x[1])

# Evaluate the top-k features for k = 1, 11, ...; the same list is used as the
# plot x-axis, because a fixed 10-point axis does not match a sweep over a
# frame with few columns.
feature_counts = list(range(1, len(feature_scores), 10))

accuracy_train = []
accuracy_test = []
features_list = []
log_loss_train = []
log_loss_test = []
for i in feature_counts:
    selected_features = [j[0] for j in feature_scores[:i]]
    new_train = X_train[selected_features]
    new_test = X_test[selected_features]
    features_list.append(selected_features)

    lr = LogisticRegression(max_iter=10000)
    lr.fit(new_train, y_train)
    y_pred_train = lr.predict(new_train)
    y_pred = lr.predict(new_test)

    # Log loss is defined on predicted probabilities, not on hard labels.
    log_loss_train.append(log_loss(y_train, lr.predict_proba(new_train), labels=lr.classes_))
    log_loss_test.append(log_loss(y_test, lr.predict_proba(new_test), labels=lr.classes_))
    accuracy_train.append(accuracy_score(y_train, y_pred_train))
    accuracy_test.append(accuracy_score(y_test, y_pred))

best_idx = int(np.argmax(accuracy_test))
print("Best number of features to select:", len(features_list[best_idx]))
print("-"*50)
print("Best feature set:", features_list[best_idx])
print("-"*50)
print("Best accuracy of test dataset:", accuracy_test[best_idx]*100, "%")
plt.plot(feature_counts, np.array(accuracy_train)*100, label='Train dataset')
plt.plot(feature_counts, np.array(accuracy_test)*100, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Accuracy(%)")
plt.legend()
plt.show()

plt.plot(feature_counts, log_loss_train, label='Train dataset')
plt.plot(feature_counts, log_loss_test, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Log Loss")
plt.legend()
plt.show()

# d) RFE feature engineering

In [None]:
# Greedy forward selection: starting from an empty set, repeatedly add the
# single column that gives the highest training accuracy when joined with the
# columns chosen so far, and evaluate at 1, 11, ... selected features.
#
# The greedy path is nested (the first k picks do not depend on the target
# size), so the selection is grown once and evaluated at each checkpoint
# instead of being restarted from scratch for every size.
feature_counts = list(range(1, X_train.shape[1], 10))

accuracy_train = []
accuracy_test = []
features_list = []
log_loss_train = []
log_loss_test = []

selected_features = []
lr = LogisticRegression(max_iter=10000)
for num_features in feature_counts:
    while len(selected_features) < num_features:
        # Scores are index-aligned with `candidates`, so the winner is looked
        # up in that list rather than in the full column index.
        candidates = [col for col in X_train.columns if col not in selected_features]
        score = []
        for col in candidates:
            features = selected_features + [col]
            lr.fit(X_train[features], y_train)
            score.append(lr.score(X_train[features], y_train))

        # Forward selection keeps the candidate with the HIGHEST score.
        selected_feature = candidates[int(np.argmax(score))]
        selected_features.append(selected_feature)

    # Store a snapshot: `selected_features` keeps growing in later iterations.
    features_list.append(list(selected_features))
    new_train = X_train[selected_features]
    new_test = X_test[selected_features]
    lr.fit(new_train, y_train)
    y_pred_train = lr.predict(new_train)
    y_pred = lr.predict(new_test)

    # Log loss is defined on predicted probabilities, not on hard labels.
    log_loss_train.append(log_loss(y_train, lr.predict_proba(new_train), labels=lr.classes_))
    log_loss_test.append(log_loss(y_test, lr.predict_proba(new_test), labels=lr.classes_))
    accuracy_train.append(accuracy_score(y_train, y_pred_train))
    accuracy_test.append(accuracy_score(y_test, y_pred))

best_idx = int(np.argmax(accuracy_test))
print("Best number of features to select:", len(features_list[best_idx]))
print("-"*50)
print("Best feature set:", features_list[best_idx])
print("-"*50)
print("Best accuracy of test dataset:", accuracy_test[best_idx]*100, "%")
plt.plot(feature_counts, np.array(accuracy_train)*100, label='Train dataset')
plt.plot(feature_counts, np.array(accuracy_test)*100, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Accuracy(%)")
plt.legend()
plt.show()

plt.plot(feature_counts, log_loss_train, label='Train dataset')
plt.plot(feature_counts, log_loss_test, label='Test dataset')
plt.xlabel("Number of features to select")
plt.ylabel("Log Loss")
plt.legend()
plt.show()