In [1]:
##############################################################################
## EE559 Final Project ===> Mushroom Classification.
## Created by Sudesh Kumar Santhosh Kumar and Thejesh Chandar Rao.
## Date: 7th May, 2023
## Tested in Python 3.10.9 using conda environment version 22.9.0.
##############################################################################

# Final Project -> Classification of Mushrooms (EE - 559)

## TRAINING MACHINE LEARNING MODELS

# Importing all necessary libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from utils.models import Model
from utils.Encoding import StatisticalEncoding, OneHotEncoding

from utils.Transform import standardizeData
from utils.Transform import transformTrainData_PCA

## Relative path of the models directory.
models_path = "../models/"

## Metrics collected per system: results[metric][model_name], with metric "acc" or "cer".
results = {"acc": {}, "cer": {}}

## Loading the best dataset found during Model Selection: the 130 most correlated features, transformed with PCA down to 60 features.

In [3]:
# Load the final training set; the preview below shows 60 feature columns plus 'class'.
train_data = pd.read_csv("./dataset/Train/mushroom_train_final.csv")


In [4]:
# Preview the first rows as a sanity check on the loaded columns.
train_data.head()

Unnamed: 0,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,feature 8,feature 9,feature 10,...,feature 52,feature 53,feature 54,feature 55,feature 56,feature 57,feature 58,feature 59,feature 60,class
0,-2.044244,-1.075445,-2.996931,-1.327966,-1.915506,1.712827,-1.914133,-2.025665,-0.033132,-0.732019,...,-0.209448,-0.136648,-0.176574,0.009994,0.612039,-0.316286,-0.421195,0.074456,0.296745,1.0
1,5.08306,-5.806476,0.618635,-2.898718,-0.206868,-2.450926,4.243412,-0.068073,1.028012,-1.690258,...,0.111929,0.099122,-0.343758,-0.08237,0.4688,-0.062343,0.542883,0.355637,-0.139385,1.0
2,4.992989,-4.590646,2.178452,1.408402,-2.567983,1.140362,-1.632072,1.84652,4.827835,-0.966505,...,0.18165,-0.110957,-0.008778,0.055724,0.051953,-0.111583,0.015937,-0.018558,-0.045008,0.0
3,-0.664443,7.782707,1.17503,-5.433635,-2.679231,1.323999,0.099112,-0.280119,-0.702643,1.433643,...,0.566764,0.116995,0.839153,0.333643,0.1859,0.148845,-0.095078,0.27123,0.190818,1.0
4,1.893219,9.76226,0.935071,2.232627,1.676629,-2.394506,0.775407,-2.175666,-1.397459,-0.874789,...,-0.231412,0.024973,-0.217571,-0.039586,-0.119344,0.114267,-0.009614,0.126086,0.052722,0.0


## Separating input features and class labels from the dataset. The class labels in this file are already encoded as "p" -> 1 and "e" -> 0, so no encoding is performed here.

In [5]:
## NOTE: the training set loaded above is already standardized and shuffled.

# Feature matrix: every column apart from the label.
X_train = train_data.drop(columns='class')

# Label vector: the 'class' column by itself.
y_train = train_data.loc[:, 'class']

## Converting the Dataframe to numpy array to train the ML Models.

In [6]:
# NumPy array forms of the feature frame and the label series
# (y_train_np feeds the trivial system further down).
X_train_np, y_train_np = X_train.to_numpy(), y_train.to_numpy()

## Generating the Final Test Dataset by following all the pre-processing, feature selection and feature transformation steps we followed for the training set.

### Loading the Raw Test Data with Categorical Features and Duplicates.

In [7]:
# Read the raw test split (categorical columns are not yet encoded at this point).
test_data_raw = pd.read_csv("./dataset/mushroom_test.csv")

# Label vector: the 'class' column by itself.
y_test_raw = test_data_raw.loc[:, 'class']

# Feature matrix: every column apart from the label.
X_test_raw = test_data_raw.drop(columns='class')

### Performing Statistical Encoding to convert the Categorical to Numerical Features.

In [8]:
# Split the raw test columns by dtype: "object" columns are treated as categorical,
# "float64" columns as numerical.
categoricalFeatures = [name for name, kind in X_test_raw.dtypes.items() if kind == "object"]
numericalFeatures = [name for name, kind in X_test_raw.dtypes.items() if kind == "float64"]

# Distinct labels, the position of each one's first occurrence, and how often each occurs.
# These three names are not read again in the cells shown here.
classes, class_index, class_count = np.unique(
    y_test_raw,
    return_index=True,
    return_counts=True,
    axis=None,
)

# NOTE(review): the encoder receives the full test frame ('class' column included) and
# nothing from the training set. If it computes its per-category statistics from the
# frame it is given, they come from test data -- TODO confirm this matches how the
# training features were encoded.
test_data_enc = StatisticalEncoding(
    dataFrame=test_data_raw,
    categoricalFeatures=categoricalFeatures,
    numericalFeatures=numericalFeatures,
)

# Encoded feature matrix: every column apart from the label.
X_test_enc = test_data_enc.drop(columns='class')

### Checking the Test Dataset for Categorical Features

In [9]:
# Re-derive both lists from the encoded frame; an empty categorical list means no
# object-dtype column is left after encoding.
categoricalFeatures = [name for name, kind in X_test_enc.dtypes.items() if kind == "object"]
numericalFeatures = [name for name, kind in X_test_enc.dtypes.items() if kind == "float64"]

In [10]:
# Report both column lists and their sizes, one item per line.
print(
    f"Categorical Features: {categoricalFeatures}",
    f"Numerical Features: {numericalFeatures}",
    f"Total number of Categorical Features: {len(categoricalFeatures)}",
    f"Total number of Numerical Features: {len(numericalFeatures)}",
    sep="\n",
)

Categorical Features: []
Numerical Features: ['cap-diameter', 'stem-height', 'stem-width', 'cap-shape_cap-diameter_mean', 'cap-shape_cap-diameter_min', 'cap-shape_cap-diameter_max', 'cap-shape_cap-diameter_median', 'cap-shape_stem-height_mean', 'cap-shape_stem-height_min', 'cap-shape_stem-height_max', 'cap-shape_stem-height_median', 'cap-shape_stem-width_mean', 'cap-shape_stem-width_min', 'cap-shape_stem-width_max', 'cap-shape_stem-width_median', 'cap-surface_cap-diameter_mean', 'cap-surface_cap-diameter_min', 'cap-surface_cap-diameter_max', 'cap-surface_cap-diameter_median', 'cap-surface_stem-height_mean', 'cap-surface_stem-height_min', 'cap-surface_stem-height_max', 'cap-surface_stem-height_median', 'cap-surface_stem-width_mean', 'cap-surface_stem-width_min', 'cap-surface_stem-width_max', 'cap-surface_stem-width_median', 'cap-color_cap-diameter_mean', 'cap-color_cap-diameter_min', 'cap-color_cap-diameter_max', 'cap-color_cap-diameter_median', 'cap-color_stem-height_mean', 'cap-color_

### Generating the DataFrame with the top 130 features we obtained while generating the train dataset, and standardizing them.

In [11]:
top_features_130 = ['gill-attachment_cap-diameter_mean', 'gill-attachment_stem-width_min', 'gill-attachment_stem-width_median', 'gill-attachment_cap-diameter_median', 'stem-width', 'stem-color_stem-height_mean', 'gill-attachment_cap-diameter_min', 'gill-attachment_stem-width_mean', 'gill-attachment_stem-height_median', 'cap-diameter', 'cap-shape_cap-diameter_median', 'gill-attachment_stem-height_mean', 'stem-color_stem-width_median', 'stem-color_stem-height_median', 'stem-color_cap-diameter_median', 'ring-type_stem-width_mean', 'gill-attachment_cap-diameter_max', 'gill-attachment_stem-height_min', 'cap-surface_stem-height_max', 'ring-type_cap-diameter_mean', 'stem-color_stem-width_mean', 'cap-shape_stem-height_median', 'gill-color_cap-diameter_median', 'stem-height', 'stem-color_stem-height_max', 'season_stem-width_max', 'season_cap-diameter_mean', 'season_stem-height_max', 'gill-attachment_stem-width_max', 'cap-shape_stem-width_max', 'gill-spacing_stem-width_min', 'gill-spacing_stem-height_min', 'cap-color_stem-height_max', 'cap-shape_stem-height_mean', 'ring-type_cap-diameter_median', 'cap-shape_stem-height_max', 'gill-spacing_stem-width_max', 'gill-spacing_stem-width_median', 'season_cap-diameter_min', 'season_stem-width_mean', 'gill-color_stem-height_max', 'gill-color_stem-width_median', 'season_stem-height_mean', 'ring-type_stem-width_median', 'cap-shape_stem-width_mean', 'habitat_cap-diameter_mean', 'habitat_cap-diameter_min', 'cap-surface_stem-height_median', 'habitat_cap-diameter_median', 'habitat_stem-width_median', 'cap-shape_stem-width_median', 'cap-surface_stem-height_mean', 'gill-color_stem-width_max', 'habitat_stem-width_mean', 'cap-color_stem-width_mean', 'stem-color_stem-height_min', 'ring-type_stem-height_max', 'cap-color_stem-height_min', 'cap-color_stem-width_max', 'habitat_stem-height_median', 'gill-color_stem-height_mean', 'gill-spacing_stem-height_max', 'gill-spacing_cap-diameter_max', 'gill-spacing_stem-width_mean', 
'cap-shape_cap-diameter_mean', 'cap-surface_stem-width_median', 'ring-type_stem-height_mean', 'cap-color_stem-height_median', 'habitat_stem-width_min', 'habitat_stem-height_mean', 'cap-surface_cap-diameter_max', 'stem-color_cap-diameter_mean', 'gill-spacing_cap-diameter_median', 'cap-shape_cap-diameter_min', 'habitat_stem-width_max', 'gill-color_stem-width_mean', 'cap-surface_cap-diameter_median', 'cap-shape_stem-height_min', 'ring-type_stem-height_median', 'gill-color_cap-diameter_mean', 'gill-color_stem-height_min', 'cap-surface_cap-diameter_mean', 'gill-spacing_cap-diameter_min', 'cap-color_cap-diameter_min', 'stem-color_cap-diameter_min', 'has-ring_stem-width_median', 'has-ring_cap-diameter_min', 'has-ring_stem-height_mean', 'has-ring_cap-diameter_mean', 'has-ring_stem-height_median', 'has-ring_cap-diameter_median', 'has-ring_stem-width_max', 'has-ring_cap-diameter_max', 'has-ring_stem-width_mean', 'has-ring_stem-height_max', 'cap-color_stem-height_mean', 'ring-type_cap-diameter_min', 'gill-spacing_stem-height_mean', 'gill-spacing_cap-diameter_mean', 'habitat_cap-diameter_max', 'cap-surface_stem-width_mean', 'ring-type_cap-diameter_max', 'gill-color_stem-width_min', 'ring-type_stem-height_min', 'cap-surface_cap-diameter_min', 'gill-color_stem-height_median', 'stem-color_stem-width_max', 'cap-color_stem-width_median', 'ring-type_stem-width_max', 'does-bruise-or-bleed_stem-height_max', 'does-bruise-or-bleed_stem-width_median', 'does-bruise-or-bleed_cap-diameter_mean', 'does-bruise-or-bleed_stem-width_max', 'does-bruise-or-bleed_cap-diameter_median', 'does-bruise-or-bleed_stem-height_median', 'does-bruise-or-bleed_stem-height_mean', 'does-bruise-or-bleed_cap-diameter_min', 'does-bruise-or-bleed_stem-height_min', 'does-bruise-or-bleed_stem-width_mean', 'does-bruise-or-bleed_cap-diameter_max', 'does-bruise-or-bleed_stem-width_min', 'cap-color_stem-width_min', 'gill-color_cap-diameter_min', 'stem-color_cap-diameter_max', 'cap-shape_stem-width_min', 
'ring-type_stem-width_min', 'gill-spacing_stem-height_median', 'season_cap-diameter_median', 'gill-attachment_stem-height_max', 'habitat_stem-height_min']

# Keep only the selected columns, in the order given by top_features_130.
X_test_top = X_test_enc[top_features_130]

# NOTE(review): standardizeData is handed the test features alone (its log line reads
# "Standardized the Train Data!"). If it derives the mean/std from its argument, the test
# set is being scaled with its own statistics instead of the training ones -- TODO confirm.
X_test_top_std = standardizeData(X_test_top)

Standardized the Train Data!


## Performing dimensionality reduction using PCA to obtain 60 features, which performed really well in the training phase.

In [12]:
# Reduce the standardized test features to 60 components, the same number of feature
# columns as the training set loaded above.
# NOTE(review): the helper is named transformTrainData_PCA and receives only test data.
# If it fits a new PCA on its input, the resulting axes will not match the ones the
# training features were projected onto -- TODO confirm a train-fitted transform is reused.
X_test_pca = transformTrainData_PCA(X_test_top_std, components=60)

In [13]:
# Test labels as a NumPy array, taken straight from the raw test file.
# NOTE(review): no label encoding is applied on this path; confirm the values use the
# same 0/1 coding as the training 'class' column.
y_test_np = y_test_raw.to_numpy()

In [14]:
# Sanity check: number of test labels.
y_test_np.shape

(18321,)

## Instantiating the Model class with an object.

In [15]:
# One Model helper instance, shared by all the training cells that follow.
model = Model()

# Training the Models!

## Reference systems and analysis.

## Trivial System.

In [16]:
# Score the trivial reference system; only the training labels are passed in.
train_accuracy_trivial = model.trivial_system(Y_true = y_train_np)
results["acc"]["trivial"] = train_accuracy_trivial
# The error rate is the complement of the accuracy on a 0-100 scale.
results["cer"]["trivial"] = 100 - train_accuracy_trivial

## BaseLine System

In [17]:
## Train the baseline model; the two returned values are stored as its "acc" and "cer" entries.
(
    results["acc"]["baseline"],
    results["cer"]["baseline"],
) = model.final_train(X_train, y_train, model_name="baseline")

### Training the Non-probabilistic Models (KNN and Perceptron), Support Vector Machine, Probabilistic Model (Gaussian Naive Bayes), and Neural Network (MLP with 2 hidden layers)

### Non-Probabilistic Models

In [18]:
## Train the Perceptron model; the two returned values are stored as its "acc" and "cer" entries.
(
    results["acc"]["perceptron"],
    results["cer"]["perceptron"],
) = model.final_train(X_train, y_train, model_name="perceptron")

In [19]:
## Train the KNN model; the two returned values are stored as its "acc" and "cer" entries.
(
    results["acc"]["KNN"],
    results["cer"]["KNN"],
) = model.final_train(X_train, y_train, model_name="KNN")

### Support Vector Machine

In [20]:
## Train the SVM model; the two returned values are stored as its "acc" and "cer" entries.
(
    results["acc"]["SVM"],
    results["cer"]["SVM"],
) = model.final_train(X_train, y_train, model_name="SVM")

### Probabilistic Model (Gaussian Naive Bayes)

In [21]:
## Train the Naive Bayes model (registered with final_train as "BayesClf", stored under "NB").
(
    results["acc"]["NB"],
    results["cer"]["NB"],
) = model.final_train(X_train, y_train, model_name="BayesClf")

### MLP

In [22]:
## Train the MLP model; the two returned values are stored as its "acc" and "cer" entries.
(
    results["acc"]["MLP"],
    results["cer"]["MLP"],
) = model.final_train(X_train, y_train, model_name="MLP")

In [23]:
# Accuracy recorded for each system so far, keyed by model name.
results["acc"]

{'trivial': 50.801687763713076,
 'baseline': 66.04781997187061,
 'perceptron': 68.300515705579,
 'KNN': 99.97421472105016,
 'SVM': 99.28035630567275,
 'NB': 74.37646507266761,
 'MLP': 99.92733239568683}