In [None]:
import pandas as pd
import random
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import euclidean
from sklearn.metrics import accuracy_score

# Importing Data

In [None]:
# Read the train and test splits of the mushroom dataset from their CSV files,
# then preview the first rows of the training frame as the cell output.
train_csv = 'Mushroom_datasets/mushroom_train.csv'
test_csv = 'Mushroom_datasets/mushroom_test.csv'
data_train, data_test = pd.read_csv(train_csv), pd.read_csv(test_csv)
data_train.head()

# Trivial System

In [None]:
# Trivial system: ignore the features and guess each test label at random,
# choosing 'p' with probability N1/N (the share of 'p' in the training set)
# and 'e' otherwise.
N = data_train.shape[0]
N1 = data_train[data_train['class']=='p'].shape[0]
N2 = data_train[data_train['class']=='e'].shape[0]
N_test = data_test['class'].shape[0]

# Dedicated seeded generator so Restart & Run All reproduces the same guesses
# without touching the global `random` state.
SEED = 42
rng = random.Random(SEED)
p_prior = N1 / N
y_pred = ['p' if rng.random() < p_prior else 'e' for _ in range(N_test)]

# Compare positionally (not via index labels) so the count is correct even if
# data_test does not carry a default 0..N_test-1 index.
y_true = data_test['class'].tolist()
acc = sum(pred == actual for pred, actual in zip(y_pred, y_true))

# Accuracy is the fraction of *test* samples that were guessed correctly, so
# the denominator must be N_test rather than the training-set size N.
print("Accuracy of test dataset is", acc*100/N_test, "%")

In [None]:
# F1 score of the current predictions, treating 'e' as the positive class.
f_score = f1_score(y_true=data_test['class'], y_pred=y_pred, pos_label='e')
print("F1 score:", f_score)

In [None]:
# Confusion matrix of the current predictions against the test labels.
# The label order is pinned to ['p', 'e'] so the rows/columns of `cm` line up
# with the "P" then "E" axis names below; without `labels`, confusion_matrix
# uses sorted label order ('e' first) and the heatmap would be mislabeled.
cm = confusion_matrix(data_test['class'], y_pred, labels=['p', 'e'])
cm_df = pd.DataFrame(cm, index=['Actual P', 'Actual E'], columns=['Predicted P', 'Predicted E'])
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='g')
plt.title("Confussion Matrix")
plt.show()

# One-Hot Encoding

In [None]:
# Categorical columns to expand into one-hot indicator columns.
classes = ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season']
for col in classes:
    print("Unique values of", col, ':', data_train[col].unique())

temp_train = pd.get_dummies(data_train, columns=classes)
# Encoding the two splits independently can yield different column sets (a
# category present in only one split) or a different column order. Align the
# test frame to the training columns: indicators missing from test are filled
# with 0, and indicators unseen in training are dropped.
temp_test = pd.get_dummies(data_test, columns=classes).reindex(
    columns=temp_train.columns, fill_value=0
)

# Baseline System

In [None]:
# Baseline system: nearest-means classifier. Each test sample is assigned the
# label of the class whose training mean vector is closest in Euclidean
# distance.
X_train = temp_train.drop('class', axis=1)
y_train = temp_train['class']
X_test = temp_test.drop('class', axis=1)
y_test = temp_test['class']
class_means = X_train.groupby(y_train).mean()

class_labels = class_means.index.values
centroids = class_means.to_numpy(dtype=float)
# Put the test features in the same column order as the centroids so the
# distance compares like with like (indicators absent from test count as 0).
test_matrix = X_test.reindex(columns=class_means.columns, fill_value=0).to_numpy(dtype=float)

# One distance column per class, computed for all test rows at once instead of
# a Python-level loop over every (sample, class) pair.
distances = np.stack(
    [np.linalg.norm(test_matrix - centroid, axis=1) for centroid in centroids],
    axis=1,
)
# argmin returns the first minimum, matching list.index(min(...)) on ties.
y_pred = class_labels[distances.argmin(axis=1)].tolist()

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of test dataset:", accuracy*100, "%")

In [None]:
# F1 score of the current predictions, treating 'e' as the positive class.
f_score = f1_score(y_true=y_test, y_pred=y_pred, pos_label='e')
print("F1 score:", f_score)

In [None]:
# Confusion matrix of the current predictions against the test labels.
# The label order is pinned to ['p', 'e'] so the rows/columns of `cm` line up
# with the "P" then "E" axis names below; without `labels`, confusion_matrix
# uses sorted label order ('e' first) and the heatmap would be mislabeled.
cm = confusion_matrix(y_test, y_pred, labels=['p', 'e'])
cm_df = pd.DataFrame(cm, index=['Actual P', 'Actual E'], columns=['Predicted P', 'Predicted E'])
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='g')
plt.title("Confussion Matrix")
plt.show()

# Using PCA feature engineering

In [None]:
from sklearn.decomposition import PCA

# Learn the 34-component projection from the training features only, then
# apply that same fitted projection to the test features. Refitting on the
# test set would project it onto different components than the ones the
# downstream model is trained on.
pca = PCA(n_components=34)
X_pca_train = pca.fit_transform(X_train)
X_pca_test = pca.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a default logistic regression on the PCA-projected training data and
# report its accuracy on the PCA-projected test data.
lr = LogisticRegression().fit(X_pca_train, y_train)
y_pred = lr.predict(X_pca_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of test dataset:", accuracy*100, "%")

# Using FDA feature engineering

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit a one-component linear discriminant on the training data and project
# both splits onto it.
fld = LinearDiscriminantAnalysis(n_components=1).fit(X_train, y_train)
X_train_fld, X_test_fld = fld.transform(X_train), fld.transform(X_test)

# Logistic regression on the projected data, scored on the test split.
lr = LogisticRegression().fit(X_train_fld, y_train)
y_pred = lr.predict(X_test_fld)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of test dataset:", accuracy*100, "%")

# Using UFS feature engineering

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import LabelEncoder

# Encode the string class labels as integers. The encoder is fitted on the
# training labels only and then reused for the test labels, so both splits
# are guaranteed to share the same label -> integer mapping.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_train)
y_t = label_encoder.transform(y_test)

# Score every feature against the encoded training labels and keep the 60
# highest-scoring ones.
best_select = SelectKBest(score_func=f_regression, k=60)
best_select.fit(X_train, y)

best_features = best_select.get_support(indices=True)
features = X_train.columns[best_features]
new_train = X_train[features]
new_test = X_test[features]

# Logistic regression on the selected features, scored on the test split.
lr = LogisticRegression(max_iter=10000)
lr.fit(new_train, y)
y_pred = lr.predict(new_test)

accuracy = accuracy_score(y_t, y_pred)
print("Accuracy of test dataset:", accuracy*100, "%")

# Using RFE feature engineering

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination with a logistic-regression estimator, keeping
# 50 features.
lr = LogisticRegression(max_iter=10000)
rfe = RFE(estimator=lr, n_features_to_select=50)
fit = rfe.fit(X_train, y_train)
selected_features = X_train.columns[fit.support_]

# Train a fresh logistic regression on just the retained columns and score it
# on the same columns of the test split.
final_model = LogisticRegression(max_iter=10000)
final_model.fit(X_train[selected_features], y_train)
y_pred = final_model.predict(X_test[selected_features])

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of test dataset:", accuracy*100, "%")

# Using SFE feature engineering

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

# Sequential feature selection with a logistic-regression estimator,
# configured to keep a single feature.
lr = LogisticRegression(max_iter=10000)
sfe = SequentialFeatureSelector(estimator=lr, n_features_to_select=1)
sfe.fit(X_train, y_train)
X_train_trans, X_test_trans = sfe.transform(X_train), sfe.transform(X_test)

# Fit the logistic regression on the reduced training data and predict the
# reduced test data.
y_pred = lr.fit(X_train_trans, y_train).predict(X_test_trans)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of test dataset:", accuracy*100, "%")