# Mushroom Prediction: A Preliminary Notebook

In [7]:
## This is a Juptyer notebook for the Kaggle Project: Mushroom Classification
# %pip install ydata-profiling
# %pip install numpy
# %pip install --upgrade pandas
# %pip install --upgrade matplotlib
# %pip install --upgrade seaborn
# %pip install --upgrade scikit-learn
# %pip install --upgrade scipy
# %pip install --upgrade catboost
# %pip install --upgrade xgboost
# %pip install --upgrade lightgbm
# %pip install ipywidgets


In [8]:
## Import libaries
import os

## Data analysis and wrangling
import numpy as np
import pandas as pd
import random as rnd

## Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config
from ydata_profiling import ProfileReport
%matplotlib inline 
from scipy.stats import boxcox

# Metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import matthews_corrcoef


# Machine learning_ Classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier

# # Model selection
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


#Palette
palette = ['#328ca9', '#0e6ea9', '#2c4ea3', '#193882', '#102446']

# Set the style of the visualization
sns.set(style="whitegrid")

# Set the configuration of sklearn
SEED = 42 # for reproducibility

## 5. Feature Preprocessing

Next, we preprocess the data by further imputing the missing values, one hot encoding the options in every features and label encoding the target

In [9]:
# Read the data



# Specify the data types for columns with mixed types
dtype_spec = {
    'cap-diameter': 'float16',
    'stem-height': 'float16',
    'stem-width': 'float16',
    'does-bruise-or-bleed':'category',
    'has-ring':'category'
}

train_df = pd.read_csv(r'Output\\Cleaned_Data\\train_cleaned.csv',dtype=dtype_spec)
test_df = pd.read_csv(r'Output\\Cleaned_Data\\test_cleaned.csv',dtype=dtype_spec)
y = pd.read_csv(r'Output\\Cleaned_Data\\target.csv',dtype='category')



In [10]:
# visualizing pipeline
set_config(display='diagram')

from utils import PreprocessData
from sklearn.preprocessing import LabelEncoder

# Preprocess the data
X, preprocessor = PreprocessData(train_df)
X1, preprocessor = PreprocessData(test_df)
# Encode the target variable
le = LabelEncoder()
y = le.fit_transform(y).ravel()  

  y = column_or_1d(y, warn=True)


## 6. 1st Level models

Then, we proceed to construct the 1st level models, which begins by defining the models (#6.1) and their parameters (#6.2). In this project, we will tune the hyperparameters by RandomizedSearchCV, and thus a parameter grid is defined in Section 6.2.

### 6.1. Model list

In [11]:
from utils import sort_dict

# Define a list of models for prediction
classifiers = {
    "Logistic Regression": LogisticRegression(random_state=SEED),
    "Random Forest Classifier": RandomForestClassifier(random_state=SEED),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=SEED),
    "XGBClassifier": XGBClassifier(random_state=SEED),
    "MLP Classifier": MLPClassifier(random_state=SEED),
    "Extra Trees Classifier": ExtraTreesClassifier(random_state=SEED),
    "AdaBoost Classifier": AdaBoostClassifier(estimator = DecisionTreeClassifier(max_depth=1), algorithm='SAMME',random_state=SEED),
    "Dummy Classifier": DummyClassifier(strategy='most_frequent',random_state=SEED)  # DummyClassifier for sanity check
}

# Sort the models
classifiers=sort_dict(classifiers)

### 6.2. Model parameters

In [12]:
# Define the parameters for the hyperparamter tuning of models by RandomizedSearchCV
import scipy.stats as stats

params_classifiers = {

    "Logistic Regression": {
        'solver': ['newton-cg', 'sag', 'lbfgs'],  
        'penalty': ['l2'],  
        'C': [0.1],
        'max_iter': [100, 200, 300]
    },

    "Random Forest Classifier": {
        'n_estimators': [64, 128, 256],
        'max_depth': [8, 16, 32, 64],
        'criterion': ['entropy'],
        'warm_start': [False]
    },

    "Gradient Boosting Classifier": {
        'learning_rate': stats.loguniform(1e-2, 1e-1),
        'n_estimators': [8, 16, 32, 64, 128, 256],
        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
    },


    "XGBClassifier": {
            'objective':['binary:logistic'],
            'max_depth': [3, 5, 7, 9],
            'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
            'n_estimators': [16, 32, 64, 128, 256]
        },


    "MLP Classifier": {
        'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': stats.loguniform(1e-5, 1e-2),
        'learning_rate': ['constant', 'adaptive']
    },
    
    "Extra Trees Classifier": {
        'n_estimators': [128, 256,524],
        'criterion': ['entropy'],
        'max_features': [10, 20, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
        'warm_start': [False]
    },

    "AdaBoost Classifier": {
        'n_estimators': [50, 100, 200, 300],
        'learning_rate': stats.loguniform(1e-4, 1e-1),
    },

    "Dummy Classifier": {}
}

# Sort the parameters
params_classifiers = sort_dict(params_classifiers)

### 6.3. Model training and hyperparameters tuning

#### a. Setup scoring method for the model optimization

In [13]:
from sklearn.model_selection import StratifiedKFold

# Setup the KFold
NFOLDS = 3 # set folds for out-of-fold prediction
kf = StratifiedKFold(n_splits= NFOLDS,shuffle=True, random_state=SEED)

### 6.3.b. Hyperparameter tuning using a smaller set of data

<div style="border: 2px solid #999999; padding: 10px; border-radius: 5px; background-color: #282828; max-width: 97.5%; overflow-x: auto;">
<p>
<br>- Because we have a huge dataset, I decided to use a subset of which for tuning the hyperparameters and select models. The selected models will then be retrained in the whole dataset.
<br>- We made a subset of sample size= 100000, and
<br>- We selected models which show MCC score > 0.8 for further training.
</p>
</div>

In [14]:
from utils import model_evaluation

# Sampling data for hyperparameter tuning
sample_size = 100000  # sample size for tuning
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=sample_size, shuffle=True, random_state=SEED, stratify=y)

# Split the data into training and validation sets
X_train_sample, X_val_sample, y_train_sample, y_val_sample = train_test_split(X_sample, y_sample, test_size=0.2, shuffle=True, random_state=SEED,stratify=y_sample)

# MCC Scores
model_list_tuning, MCC_train_list_tuning, MCC_val_list_tuning,y_train_pred_list_tuning, model_params_tuning = model_evaluation(classifiers, X_train_sample, y_train_sample, X_val_sample, y_val_sample, kf, params= params_classifiers, mode='tuning')

# Display the scores
pd.DataFrame(list(zip(model_list_tuning, MCC_train_list_tuning, MCC_val_list_tuning)), columns=['Model Name', 'MCC_Score_Train_sample', 'MCC_Score_val_sample']).sort_values(by=["MCC_Score_val_sample"],ascending=True)

Running model: AdaBoost Classifier
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Model-tuning success: AdaBoost Classifier Best Parameters: {'learning_rate': 0.006251373574521752, 'n_estimators': 200}
Predicting
Model-prediction success: AdaBoost Classifier MCC_train: 0.27887373091080625  , MCC_val: 0.26983651962177213


Running model: Dummy Classifier
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Model-tuning success: Dummy Classifier Best Parameters: {}




Predicting
Model-prediction success: Dummy Classifier MCC_train: 0.0  , MCC_val: 0.0


Running model: Extra Trees Classifier
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Model-tuning success: Extra Trees Classifier Best Parameters: {'warm_start': False, 'n_estimators': 256, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 10, 'criterion': 'entropy', 'bootstrap': False}
Predicting
Model-prediction success: Extra Trees Classifier MCC_train: 0.9845917266473769  , MCC_val: 0.9755824261402724


Running model: Gradient Boosting Classifier
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Model-tuning success: Gradient Boosting Classifier Best Parameters: {'learning_rate': 0.06021310185147604, 'n_estimators': 128, 'subsample': 0.7}
Predicting
Model-prediction success: Gradient Boosting Classifier MCC_train: 0.8426947952989458  , MCC_val: 0.8474524381488361


Running model: Logistic Regression
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Mo

Unnamed: 0,Model Name,MCC_Score_Train_sample,MCC_Score_val_sample
1,Dummy Classifier,0.0,0.0
0,AdaBoost Classifier,0.278874,0.269837
4,Logistic Regression,0.464186,0.457967
3,Gradient Boosting Classifier,0.842695,0.847452
5,MLP Classifier,0.988299,0.975381
2,Extra Trees Classifier,0.984592,0.975582
7,XGBClassifier,0.993417,0.978914
6,Random Forest Classifier,1.0,0.980628


In [15]:
import copy
# Obtain a subset of models that has MCC_Score_val > 0.8
models_selected = []
parameters_selected =[]
for i in range(len(list(classifiers))):
    if MCC_val_list_tuning[i] > 0.8:
        print(list(classifiers.keys())[i])
        models_selected.append(list(classifiers.keys())[i])
        parameters_selected.append(model_params_tuning[i])

# Display the selected models
models_selected

# Select the models from the classifiers dictionary
classifiers_selected = {key: copy.deepcopy(classifiers[key]) for key in models_selected}

# add the parameters to the selected models
params_classifiers_selected = {key: parameters_selected[i] for i,key in enumerate(models_selected)}

Extra Trees Classifier
Gradient Boosting Classifier
MLP Classifier
Random Forest Classifier
XGBClassifier


In [16]:
classifiers_selected 

{'Extra Trees Classifier': ExtraTreesClassifier(random_state=42),
 'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42),
 'MLP Classifier': MLPClassifier(random_state=42),
 'Random Forest Classifier': RandomForestClassifier(random_state=42),
 'XGBClassifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
            

In [17]:
params_classifiers_selected

{'Extra Trees Classifier': {'warm_start': False,
  'n_estimators': 256,
  'min_samples_split': 5,
  'min_samples_leaf': 2,
  'max_features': 10,
  'criterion': 'entropy',
  'bootstrap': False},
 'Gradient Boosting Classifier': {'learning_rate': 0.06021310185147604,
  'n_estimators': 128,
  'subsample': 0.7},
 'MLP Classifier': {'activation': 'tanh',
  'alpha': 0.0024526126311336773,
  'hidden_layer_sizes': (100,),
  'learning_rate': 'constant',
  'solver': 'adam'},
 'Random Forest Classifier': {'warm_start': False,
  'n_estimators': 256,
  'max_depth': 32,
  'criterion': 'entropy'},
 'XGBClassifier': {'objective': 'binary:logistic',
  'n_estimators': 128,
  'max_depth': 9,
  'colsample_bytree': 0.6}}

In [18]:
# Split the entire data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train the selected models with the entire training data
model_list, MCC_train_list, MCC_val_list, y_train_pred_list, oof_predictions_df, val_predictions_df = model_evaluation(classifiers_selected , X_train_sample, y_train_sample, X_val_sample, y_val_sample, kf, params= params_classifiers_selected, mode='training')


Running model: Extra Trees Classifier
training fold: # 1
training fold: # 2
training fold: # 3
Model-training success: Extra Trees Classifier
Model: Extra Trees Classifier oof predictions and val predictions saved successfully
Predicting
Model-prediction success: Extra Trees Classifier MCC_train: 0.9908697793725013  , MCC_val: 0.9736660791126864


Running model: Gradient Boosting Classifier
training fold: # 1
training fold: # 2
training fold: # 3
Model-training success: Gradient Boosting Classifier
Model: Gradient Boosting Classifier oof predictions and val predictions saved successfully
Predicting
Model-prediction success: Gradient Boosting Classifier MCC_train: 0.8569308286661608  , MCC_val: 0.8624816628440494


Running model: MLP Classifier
training fold: # 1
training fold: # 2




training fold: # 3
Model-training success: MLP Classifier
Model: MLP Classifier oof predictions and val predictions saved successfully
Predicting
Model-prediction success: MLP Classifier MCC_train: 0.9821675281604441  , MCC_val: 0.9721505620936095


Running model: Random Forest Classifier
training fold: # 1
training fold: # 2
training fold: # 3
Model-training success: Random Forest Classifier
Model: Random Forest Classifier oof predictions and val predictions saved successfully
Predicting
Model-prediction success: Random Forest Classifier MCC_train: 0.9931658002058593  , MCC_val: 0.9801253521320538


Running model: XGBClassifier
training fold: # 1
training fold: # 2
training fold: # 3
Model-training success: XGBClassifier
Model: XGBClassifier oof predictions and val predictions saved successfully
Predicting
Model-prediction success: XGBClassifier MCC_train: 0.9825517259791553  , MCC_val: 0.9759874531974972


Model saved: XGBClassifier
