In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('../../datasets/mushrooms.csv')

In [3]:
df['class'].unique()

array(['p', 'e'], dtype=object)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [5]:
df['class'].value_counts()

class
e    4208
p    3916
Name: count, dtype: int64

In [6]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [7]:
from sklearn.calibration import LabelEncoder


labelencoder = LabelEncoder()

for col in df.columns:
    df[col] = labelencoder.fit_transform(df[col])

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [8]:
# Unique values in all columns
for col in df.columns:
    print(f'{col}: {len(df[col].unique())} levels - {df[col].unique()}')

class: 2 levels - [1 0]
cap-shape: 6 levels - [5 0 4 2 3 1]
cap-surface: 4 levels - [2 3 0 1]
cap-color: 10 levels - [4 9 8 3 2 5 0 7 1 6]
bruises: 2 levels - [1 0]
odor: 9 levels - [6 0 3 5 2 1 8 7 4]
gill-attachment: 2 levels - [1 0]
gill-spacing: 2 levels - [0 1]
gill-size: 2 levels - [1 0]
gill-color: 12 levels - [ 4  5  2  7 10  3  9  1  0  8 11  6]
stalk-shape: 2 levels - [0 1]
stalk-root: 5 levels - [3 2 1 4 0]
stalk-surface-above-ring: 4 levels - [2 0 1 3]
stalk-surface-below-ring: 4 levels - [2 0 3 1]
stalk-color-above-ring: 9 levels - [7 3 6 4 0 2 5 1 8]
stalk-color-below-ring: 9 levels - [7 6 3 0 4 2 8 5 1]
veil-type: 1 levels - [0]
veil-color: 4 levels - [2 0 1 3]
ring-number: 3 levels - [1 2 0]
ring-type: 5 levels - [4 0 2 1 3]
spore-print-color: 9 levels - [2 3 6 1 7 5 4 8 0]
population: 6 levels - [3 2 0 4 5 1]
habitat: 7 levels - [5 1 3 0 4 6 2]


In [9]:
df.describe().T.to_latex(
    buf='./mushrooms_describe.tex',
    float_format="{:0.2f}".format,
    caption='Mushrooms dataset description',
    label='tab:mushrooms_describe'
)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   class                     8124 non-null   int64
 1   cap-shape                 8124 non-null   int64
 2   cap-surface               8124 non-null   int64
 3   cap-color                 8124 non-null   int64
 4   bruises                   8124 non-null   int64
 5   odor                      8124 non-null   int64
 6   gill-attachment           8124 non-null   int64
 7   gill-spacing              8124 non-null   int64
 8   gill-size                 8124 non-null   int64
 9   gill-color                8124 non-null   int64
 10  stalk-shape               8124 non-null   int64
 11  stalk-root                8124 non-null   int64
 12  stalk-surface-above-ring  8124 non-null   int64
 13  stalk-surface-below-ring  8124 non-null   int64
 14  stalk-color-above-ring    8124 non-null 

In [11]:
# Scaler
scaler = RobustScaler()

features = pd.DataFrame(scaler.fit_transform(df.drop('class', axis=1)), columns=df.columns[1:])
target = df['class']

In [12]:
len(target)

8124

In [12]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=.2, random_state=2023)

In [13]:
# # Hiperparameters with GridSearchCV for Models: GaussianNB, Decision Tree, KNN and Random Forest
# from sklearn.model_selection import GridSearchCV


# # GaussianNB
# params_gnb = {'var_smoothing': np.logspace(0, -9, num=100)}
# grid_gnb = GridSearchCV(estimator=GaussianNB(), param_grid=params_gnb, cv=5, verbose=1, n_jobs=-1)
# grid_gnb.fit(X_train, y_train)
# print(f"Model: GaussianNB | Best params: {grid_gnb.best_params_}\n")

# # Decision Tree
# params_dt = {'criterion': ['gini', 'entropy'],
#                 'max_depth': [3, 5, 7, 9, 11, 13, 15],
#                 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9],
#                 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9]}
# grid_dt = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=params_dt, cv=5, verbose=1, n_jobs=-1)
# grid_dt.fit(X_train, y_train)
# print(f"Model: DT | Best params: {grid_dt.best_params_}\n")

# # KNN
# params_knn = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
#                 'weights': ['uniform', 'distance'],
#                 'metric': ['euclidean', 'manhattan', 'minkowski']}
# grid_knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=params_knn, cv=5, verbose=1, n_jobs=-1)
# grid_knn.fit(X_train, y_train)
# print(f"Model: KNN | Best params: {grid_knn.best_params_}\n")

# # Random Forest
# params_rf = {'n_estimators': [100, 200, 300, 400, 500],
#              'criterion': ['gini', 'entropy'],
#                 'max_depth': [3, 5, 7, 9, 11, 13, 15],
#                 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9],
#                 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9]}
# grid_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=params_rf, cv=5, verbose=1, n_jobs=-1)
# grid_rf.fit(X_train, y_train)
# print(f"Model: RF | Best params: {grid_rf.best_params_}\n")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Model: GaussianNB | Best params: {'var_smoothing': 0.001}

Fitting 5 folds for each of 1008 candidates, totalling 5040 fits
Model: DT | Best params: {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2}

Fitting 5 folds for each of 42 candidates, totalling 210 fits
Model: KNN | Best params: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}

Fitting 5 folds for each of 5040 candidates, totalling 25200 fits
Model: RF | Best params: {'criterion': 'gini', 'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}



In [13]:
models_with_best_params = {
    'GaussianNB': GaussianNB(var_smoothing=0.001),
    'DT': DecisionTreeClassifier(criterion='gini', max_depth=7, min_samples_leaf=1, min_samples_split=2),
    'KNN': KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='uniform'),
    'RF': RandomForestClassifier(criterion='gini', max_depth=9, min_samples_leaf=1, min_samples_split=2, n_estimators=100),
}

In [14]:
# Metrics: Accuracy, Precision, Recall, F1
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score


def get_metrics():
    metrics = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
        'roc': roc_auc_score
    }
    return metrics


def get_metrics_df(y_true, y_pred, name):
    metrics = get_metrics()
    df = pd.DataFrame()
    for metric_name, metric in metrics.items():
        df[metric_name] = [metric(y_true, y_pred)]
    return df

In [15]:
SEEDS = [24, 42, 206, 602, 412, 214, 754, 457, 2023, 3202]

In [16]:
run_counter = 0
results = dict()

# Splitting dataset
X = df.drop(columns='class')
y = df['class']

for seed in SEEDS:
    run_counter += 1
    print(f'%{run_counter} - Running for seed: {seed}')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=seed)

    models_with_best_params = {
        'GaussianNB': GaussianNB(var_smoothing=0.001),
        'DT': DecisionTreeClassifier(criterion='gini', max_depth=7, min_samples_leaf=1, min_samples_split=2),
        'KNN': KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='uniform'),
        'RF': RandomForestClassifier(criterion='gini', max_depth=9, min_samples_leaf=1, min_samples_split=2, n_estimators=100),
    }

    for name, model in models_with_best_params.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results[f"{model.__class__.__name__}_{seed}"] = get_metrics_df(y_test, y_pred, name)

%1 - Running for seed: 24
%2 - Running for seed: 42
%3 - Running for seed: 206
%4 - Running for seed: 602
%5 - Running for seed: 412
%6 - Running for seed: 214
%7 - Running for seed: 754
%8 - Running for seed: 457
%9 - Running for seed: 2023
%10 - Running for seed: 3202


In [17]:
results.keys()

dict_keys(['GaussianNB_24', 'DecisionTreeClassifier_24', 'KNeighborsClassifier_24', 'RandomForestClassifier_24', 'GaussianNB_42', 'DecisionTreeClassifier_42', 'KNeighborsClassifier_42', 'RandomForestClassifier_42', 'GaussianNB_206', 'DecisionTreeClassifier_206', 'KNeighborsClassifier_206', 'RandomForestClassifier_206', 'GaussianNB_602', 'DecisionTreeClassifier_602', 'KNeighborsClassifier_602', 'RandomForestClassifier_602', 'GaussianNB_412', 'DecisionTreeClassifier_412', 'KNeighborsClassifier_412', 'RandomForestClassifier_412', 'GaussianNB_214', 'DecisionTreeClassifier_214', 'KNeighborsClassifier_214', 'RandomForestClassifier_214', 'GaussianNB_754', 'DecisionTreeClassifier_754', 'KNeighborsClassifier_754', 'RandomForestClassifier_754', 'GaussianNB_457', 'DecisionTreeClassifier_457', 'KNeighborsClassifier_457', 'RandomForestClassifier_457', 'GaussianNB_2023', 'DecisionTreeClassifier_2023', 'KNeighborsClassifier_2023', 'RandomForestClassifier_2023', 'GaussianNB_3202', 'DecisionTreeClassif

In [18]:
gaussian_keys = [key for key in results.keys() if 'Gaussian' in key]
dt_keys = [key for key in results.keys() if 'DecisionTreeClassifier' in key]
knn_keys = [key for key in results.keys() if 'KNeighborsClassifier' in key]
rf_keys = [key for key in results.keys() if 'RandomForestClassifier' in key]

In [19]:
print(len(gaussian_keys))
assert len(gaussian_keys) == len(dt_keys) == len(knn_keys) == len(rf_keys)

10


In [20]:
# Gaussian accuracy mean
from tabulate import tabulate


# Gaussian accuracy mean
gaussian_accuracy_mean = np.mean([results[key]['accuracy'].values[0] for key in gaussian_keys])

# Gaussian accuracy std
gaussian_accuracy_std = np.std([results[key]['accuracy'].values[0] for key in gaussian_keys])

# Gaussian precision mean
gaussian_precision_mean = np.mean([results[key]['precision'].values[0] for key in gaussian_keys])

# Gaussian precision std
gaussian_precision_std = np.std([results[key]['precision'].values[0] for key in gaussian_keys])

# Gaussian recall mean
gaussian_recall_mean = np.mean([results[key]['recall'].values[0] for key in gaussian_keys])

# Gaussian recall std
gaussian_recall_std = np.std([results[key]['recall'].values[0] for key in gaussian_keys])

# Gaussian f1 mean
gaussian_f1_mean = np.mean([results[key]['f1'].values[0] for key in gaussian_keys])

# Gaussian f1 std
gaussian_f1_std = np.std([results[key]['f1'].values[0] for key in gaussian_keys])

# Gaussian roc mean
gaussian_roc_mean = np.mean([results[key]['roc'].values[0] for key in gaussian_keys])

# Gaussian roc std
gaussian_roc_std = np.std([results[key]['roc'].values[0] for key in gaussian_keys])


# Print tabulate
print(tabulate(
    [
        ['GaussianNB', 
         gaussian_accuracy_mean, gaussian_accuracy_std, 
         gaussian_precision_mean, gaussian_precision_std, 
         gaussian_recall_mean, gaussian_recall_std, 
         gaussian_f1_mean, gaussian_f1_std, 
         gaussian_roc_mean, gaussian_roc_std],
    ],
    headers=['Model', 'Accuracy Mean', 'Accuracy Std', 
             'Precision Mean', 'Precision Std', 
             'Recall Mean', 'Recall Std', 
             'F1 Mean', 'F1 Std', 
             'ROC Mean', 'ROC Std'],
    tablefmt='psql',
    floatfmt=(".4"),
))


# DT accuracy mean
dt_accuracy_mean = np.mean([results[key]['accuracy'].values[0] for key in dt_keys])

# DT accuracy std
dt_accuracy_std = np.std([results[key]['accuracy'].values[0] for key in dt_keys])

# DT precision mean
dt_precision_mean = np.mean([results[key]['precision'].values[0] for key in dt_keys])

# DT precision std
dt_precision_std = np.std([results[key]['precision'].values[0] for key in dt_keys])

# DT recall mean
dt_recall_mean = np.mean([results[key]['recall'].values[0] for key in dt_keys])

# DT recall std
dt_recall_std = np.std([results[key]['recall'].values[0] for key in dt_keys])

# DT f1 mean
dt_f1_mean = np.mean([results[key]['f1'].values[0] for key in dt_keys])

# DT f1 std
dt_f1_std = np.std([results[key]['f1'].values[0] for key in dt_keys])

# DT roc mean
dt_roc_mean = np.mean([results[key]['roc'].values[0] for key in dt_keys])

# DT roc std
dt_roc_std = np.std([results[key]['roc'].values[0] for key in dt_keys])


# Print tabulate
print(tabulate(
    [
        ['DecisionTreeClassifier', dt_accuracy_mean, dt_accuracy_std, 
         dt_precision_mean, dt_precision_std, 
         dt_recall_mean, dt_recall_std, 
         dt_f1_mean, dt_f1_std, 
         dt_roc_mean, dt_roc_std],
    ],
    headers=['Model', 'Accuracy Mean', 'Accuracy Std', 
             'Precision Mean', 'Precision Std', 
             'Recall Mean', 'Recall Std', 
             'F1 Mean', 'F1 Std', 
             'ROC Mean', 'ROC Std'],
    tablefmt='psql',
    floatfmt=(".4"),
))


# KNN accuracy mean
knn_accuracy_mean = np.mean([results[key]['accuracy'].values[0] for key in knn_keys])

# KNN accuracy std
knn_accuracy_std = np.std([results[key]['accuracy'].values[0] for key in knn_keys])

# KNN precision mean
knn_precision_mean = np.mean([results[key]['precision'].values[0] for key in knn_keys])

# KNN precision std
knn_precision_std = np.std([results[key]['precision'].values[0] for key in knn_keys])

# KNN recall mean
knn_recall_mean = np.mean([results[key]['recall'].values[0] for key in knn_keys])

# KNN recall std
knn_recall_std = np.std([results[key]['recall'].values[0] for key in knn_keys])

# KNN f1 mean
knn_f1_mean = np.mean([results[key]['f1'].values[0] for key in knn_keys])

# KNN f1 std
knn_f1_std = np.std([results[key]['f1'].values[0] for key in knn_keys])

# KNN roc mean
knn_roc_mean = np.mean([results[key]['roc'].values[0] for key in knn_keys])

# KNN roc std
knn_roc_std = np.std([results[key]['roc'].values[0] for key in knn_keys])


# Print tabulate
print(tabulate(
    [
        ['KNeighborsClassifier', knn_accuracy_mean, knn_accuracy_std, 
         knn_precision_mean, knn_precision_std, 
         knn_recall_mean, knn_recall_std, 
         knn_f1_mean, knn_f1_std, 
         knn_roc_mean, knn_roc_std],
    ],
    headers=['Model', 'Accuracy Mean', 'Accuracy Std', 
             'Precision Mean', 'Precision Std', 
             'Recall Mean', 'Recall Std', 
             'F1 Mean', 'F1 Std', 
             'ROC Mean', 'ROC Std'],
    tablefmt='psql',
    floatfmt=(".4"),
))


# RF accuracy mean
rf_accuracy_mean = np.mean([results[key]['accuracy'].values[0] for key in rf_keys])

# RF accuracy std
rf_accuracy_std = np.std([results[key]['accuracy'].values[0] for key in rf_keys])

# RF precision mean
rf_precision_mean = np.mean([results[key]['precision'].values[0] for key in rf_keys])

# RF precision std
rf_precision_std = np.std([results[key]['precision'].values[0] for key in rf_keys])

# RF recall mean
rf_recall_mean = np.mean([results[key]['recall'].values[0] for key in rf_keys])

# RF recall std
rf_recall_std = np.std([results[key]['recall'].values[0] for key in rf_keys])

# RF f1 mean
rf_f1_mean = np.mean([results[key]['f1'].values[0] for key in rf_keys])

# RF f1 std
rf_f1_std = np.std([results[key]['f1'].values[0] for key in rf_keys])

# RF roc mean
rf_roc_mean = np.mean([results[key]['roc'].values[0] for key in rf_keys])

# RF roc std
rf_roc_std = np.std([results[key]['roc'].values[0] for key in rf_keys])


# Print tabulate
print(tabulate(
    [
        ['RandomForestClassifier', rf_accuracy_mean, rf_accuracy_std, 
         rf_precision_mean, rf_precision_std, 
         rf_recall_mean, rf_recall_std, 
         rf_f1_mean, rf_f1_std, 
         rf_roc_mean, rf_roc_std],
    ],
    headers=['Model', 'Accuracy Mean', 'Accuracy Std', 
             'Precision Mean', 'Precision Std', 
             'Recall Mean', 'Recall Std', 
             'F1 Mean', 'F1 Std', 
             'ROC Mean', 'ROC Std'],
    tablefmt='psql',
    floatfmt=(".4"),
))

+------------+-----------------+----------------+------------------+-----------------+---------------+--------------+-----------+----------+------------+-----------+
| Model      |   Accuracy Mean |   Accuracy Std |   Precision Mean |   Precision Std |   Recall Mean |   Recall Std |   F1 Mean |   F1 Std |   ROC Mean |   ROC Std |
|------------+-----------------+----------------+------------------+-----------------+---------------+--------------+-----------+----------+------------+-----------|
| GaussianNB |           0.916 |       0.006397 |           0.9455 |        0.008339 |        0.8771 |     0.009716 |      0.91 | 0.007109 |     0.9148 |  0.006442 |
+------------+-----------------+----------------+------------------+-----------------+---------------+--------------+-----------+----------+------------+-----------+
+------------------------+-----------------+----------------+------------------+-----------------+---------------+--------------+-----------+----------+------------+-----