In [None]:
import joblib
import pandas as pd
from sklearn import metrics
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
# Silence every warning for the rest of the session. Both calls install an
# "ignore" filter, so library deprecation notices are hidden as well.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Creating 5 Stratified K Fold cross validation sets

In [None]:
# Path to the raw mushroom CSV.
TRAINING_PATH='../input/mushroom-classification/mushrooms.csv'

# Load the raw data and preview the first rows.
df=pd.read_csv(TRAINING_PATH)
df.head()

In [None]:
# Directory the fold-annotated CSV is written to.
TRAINING_FOLDS_PATH='./'
# Seed for the row shuffle below, so the fold assignment is identical on every run.
FOLD_SEED=42

import pandas as pd

df_train=pd.read_csv(TRAINING_PATH)
df_train.head()

df_train['class'].value_counts()

# 'kfolds' will hold the validation-fold index of each row; -1 means "not assigned yet".
df_train['kfolds']=-1
# Shuffle the rows (seeded) and rebuild a 0..n-1 index so that the positional
# indices returned by split() can be used as .loc labels below.
df_train=df_train.sample(frac=1,random_state=FOLD_SEED).reset_index(drop=True)
df_train.head()

from sklearn import model_selection

strat_kf=model_selection.StratifiedKFold(n_splits=5)

# Each row is a validation row in exactly one split; record that split's number.
for fold,(trn_,val_) in enumerate(strat_kf.split(X=df_train,y=df_train['class'])):
  df_train.loc[val_,'kfolds']=fold
df_train.head()

# The index is written as well; it comes back as an 'Unnamed: 0' column when
# the file is read again.
df_train.to_csv(TRAINING_FOLDS_PATH+'train_folds.csv')

# File Paths

In [None]:
# TRAINING_PATH is re-pointed from the raw CSV to the fold-annotated CSV.
TRAINING_PATH='./train_folds.csv'
# Prefix of the model files that run() writes with joblib.dump.
MODEL_PATH='./'
# Not referenced in the visible notebook — presumably for submission files; TODO confirm.
SUBMISSION_FILES_PATH='./Submissions/'

# Data Exploration
1. Null Values
2. Number of unique values

In [None]:
# Load the fold-annotated data (TRAINING_PATH now points at train_folds.csv).
df=pd.read_csv(TRAINING_PATH)
df.head()

In [None]:
# Summary statistics of the loaded frame.
df.describe()

In [None]:
# Count the null (NaN) values in each column; a non-zero count would call for imputation.
df.isna().sum()

In [None]:
# Number of distinct values in each column (a column with a single value carries no information).
df.nunique()

In [None]:
# Total number of rows.
len(df)

In [None]:
# 'Unnamed: 0' is the old row index that to_csv() stored in train_folds.csv.
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
df=df.drop(columns=['Unnamed: 0'],errors='ignore')
df.head()

In [None]:
# Check for class imbalance by counting the rows of each target label.
df['class'].value_counts()

In [None]:
# Checking for any numerical data. If present, it has to be scaled etc.
# Note: the integer 'kfolds' helper column is numeric, so it is listed here too.

columns = df.columns
# select_dtypes is the public API for what the private
# DataFrame._get_numeric_data() returned (numeric and boolean columns).
numerical_columns = df.select_dtypes(include=['number', 'bool']).columns
numerical_columns

# Feature Selection : Removing some categorical features based on Cramer's V

*References :*
1. https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
2. https://www.kaggle.com/chrisbss1/cramer-s-v-correlation-matrix

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import seaborn as sns


def cramers_v(confusion_matrix):
    """Cramér's V association between two categorical variables.

    The statistic is derived from the chi-squared value of the contingency
    table and uses the small-sample correction terms ``phi2corr``, ``rcorr``
    and ``kcorr``.

    Parameters
    ----------
    confusion_matrix : array-like of shape (r, k)
        Contingency table of counts. A NumPy array, a ``pd.crosstab``
        DataFrame or a nested list are all accepted.

    Returns
    -------
    float
        The association strength, or NaN when it is undefined: fewer than two
        rows or columns, or a non-positive corrected dimension (for example a
        variable whose every value is unique).
    """
    # Normalise the input first: DataFrame.sum() would give per-column totals
    # instead of the grand total, and plain lists have no .sum()/.shape at all.
    table = np.asarray(confusion_matrix)
    r, k = table.shape
    n = table.sum()

    # A single-category variable cannot be associated with anything.
    if min(r, k) < 2 or n <= 1:
        return np.nan

    chi2 = ss.chi2_contingency(table)[0]
    phi2 = chi2 / n
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        # Would otherwise be a 0/0 division; report "undefined" explicitly.
        return np.nan
    return np.sqrt(phi2corr / denominator)

In [None]:
# Pairwise Cramér's V between every pair of columns, rounded to two decimals.
column_names = list(df.columns)

cramers_results = np.array([
    [
        round(cramers_v(pd.crosstab(df[row_name], df[col_name]).values), 2)
        for col_name in column_names
    ]
    for row_name in column_names
])

# Label both axes with the column names so the matrix can be indexed by feature.
df_corr = pd.DataFrame(cramers_results, columns = df.columns, index =df.columns)

df_corr

In [None]:
# 'veil-type' has NaN values in the correlation matrix and 'kfolds' is not
# required for correlation, so both are removed from the rows and the columns.
df_corr = df_corr.drop(
    index=['veil-type', 'kfolds'],
    columns=['veil-type', 'kfolds'],
)
df_corr

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Annotated heatmap of the association matrix on an explicitly created axes.
fig, ax = plt.subplots(figsize=(30, 15))
sns.heatmap(df_corr, annot=True, cmap=plt.cm.CMRmap_r, ax=ax)

plt.show()

### A high correlation between two features indicates some sort of duplication, or that one feature can be expressed in terms of the other, so only one of the two needs to be kept. We set a threshold (here, 0.7): if two features are correlated above it, one of them is dropped.

In [None]:
# Selects highly correlated features: for every pair of columns whose
# correlation exceeds the threshold, the later column of the pair is reported.

def correlation(dataset, threshold):
    """Return the names of columns that correlate strongly with an earlier column.

    ``dataset.corr()`` is computed and its strict lower triangle is scanned.
    Whenever the absolute coefficient between column ``i`` and an earlier
    column ``j`` is greater than ``threshold``, the name of column ``i`` is
    collected.

    Returns a ``set`` of column names (empty when nothing exceeds the threshold).
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    return {
        names[later]
        for later in range(len(names))
        for earlier in range(later)
        if abs(corr_matrix.iloc[later, earlier]) > threshold  # magnitude only, sign ignored
    }

In [None]:
# df_corr already holds pairwise Cramér's V scores, so the threshold is applied
# to those scores directly. Passing df_corr to correlation() would call .corr()
# on it and threshold the Pearson correlation between columns of scores instead.
# The target is left out of the comparison so that a feature is never flagged
# merely for being strongly associated with 'class'.
feature_assoc = df_corr.drop(index='class', columns='class', errors='ignore')

# Scan the strict lower triangle; of each strongly associated pair the later
# column is the one marked as removable.
corr_features = {
    feature_assoc.columns[i]
    for i in range(feature_assoc.shape[1])
    for j in range(i)
    if feature_assoc.iloc[i, j] > 0.7
}
print(f"No of features which can be removed : {len(set(corr_features))}")
print(f"Removable features : {corr_features}")

In [None]:
# Remove every feature that was flagged as redundant by the threshold check.
df = df.drop(list(corr_features), axis=1)
df.head()

# One hot encode categorical features

In [None]:
# Every remaining column is a categorical feature except the target and the
# fold index, which must not be one-hot encoded.
columns_to_one_hot_encode = df.columns.tolist()
for excluded in ('class', 'kfolds'):
    columns_to_one_hot_encode.remove(excluded)
columns_to_one_hot_encode 

In [None]:
# One hot encode the categorical columns - All except the target column "class" and the "kfolds" column
# dtype is pinned to uint8: pandas >= 2.0 produces bool dummies by default, and
# a frame mixing bool dummies with the integer 'kfolds' column becomes an
# object array under .values, which is what the training code passes to the models.

df=pd.get_dummies(data=df,columns=columns_to_one_hot_encode,dtype='uint8')
df.head()

In [None]:
# Encode the target labels: 'e' -> 0, 'p' -> 1.
# The explicit cast guarantees an integer column instead of relying on
# replace() to downcast the object column, and it stays a no-op on re-run.
df['class'] = df['class'].replace({'e':0,'p':1}).astype(int)
df.head()

### Move the class and kfolds columns to the end

In [None]:
# Put the feature columns first and the target / fold-index columns last.

trailing_columns = ['class', 'kfolds']
leading_columns = [name for name in df if name not in trailing_columns]
df = df[leading_columns + trailing_columns]
df.head()

In [None]:
def run(fold,df,models,target_name, save_model, print_details=False):
  """Fit every model on all folds except ``fold`` and evaluate it on ``fold``.

  Parameters
  ----------
  fold : value of the 'kfolds' column that selects the validation rows.
  df : DataFrame holding the features, the target and a 'kfolds' column.
  models : dict mapping a display name to an estimator instance. The
      instances are fitted in place, so they are refitted on every call.
  target_name : name of the target column.
  save_model : when true, each fitted model is dumped with joblib under
      MODEL_PATH, with the validation F1, accuracy and fold in the file name.
  print_details : when true, training/validation accuracy is printed per model.

  Returns
  -------
  (accuracy, confusion_matrices, classification_report) : three lists with one
  entry per model, in the iteration order of ``models``, all computed on the
  validation fold.
  """
  # The fold index is bookkeeping, not a predictor: during validation it holds
  # a value never seen in training, so it is excluded together with the target.
  non_feature_columns=[target_name,'kfolds']

  # Training and validation sets
  df_train=df[df['kfolds']!=fold].reset_index(drop=True)
  df_valid=df[df['kfolds']==fold].reset_index(drop=True)

  # x and y of training dataset
  x_train=df_train.drop(columns=non_feature_columns).values
  y_train=df_train[target_name].values

  # x and y of validation dataset
  x_valid=df_valid.drop(columns=non_feature_columns).values
  y_valid=df_valid[target_name].values

  # Per-model validation results, in the iteration order of `models`
  accuracy=[]
  confusion_matrices=[]
  classification_report=[]

  for model_name,clf in models.items():
    clf.fit(x_train,y_train)

    # Predictions on the data the model was fitted on and on the held-out fold
    preds_train=clf.predict(x_train)
    preds_valid=clf.predict(x_valid)

    acc_train=metrics.accuracy_score(y_train,preds_train)
    acc_valid=metrics.accuracy_score(y_valid,preds_valid)

    # Only the validation F1 is needed (it becomes part of the saved file name)
    f1_valid = metrics.f1_score(y_valid,preds_valid)

    conf_matrix=metrics.confusion_matrix(y_valid,preds_valid)
    class_report=metrics.classification_report(y_valid,preds_valid)

    accuracy.append(acc_valid)
    confusion_matrices.append(conf_matrix)
    classification_report.append(class_report)

    if print_details:
      print(f'Model => {model_name} => Fold = {fold} => Training Accuracy = {acc_train} => Validation Accuracy = {acc_valid}')

    if save_model:
      joblib.dump(clf, f"{MODEL_PATH}{model_name}_F1_{f1_valid}_ACC_{acc_valid}_FOLD_{fold}.bin")

  if print_details:
    print('\n--------------------------------------------------------------------------------------------\n')
    
  return accuracy,confusion_matrices,classification_report

# Hyperparameter Tuning for different models using Optuna
Models :

1. Random Forest Classifier
2. XGB Classifier
3. SVM Classifier
4. Decision Tree Classifier

### 1. Random Forest

In [None]:
import optuna
from functools import partial

def optimize_rfc(trial,df,total_folds,target_name):
    """Optuna objective: mean validation accuracy of a random forest across the folds.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.
    df : DataFrame with the features, the target and a 'kfolds' column.
    total_folds : number of folds; fold ids 0 .. total_folds-1 are evaluated.
    target_name : name of the target column.

    Returns
    -------
    Mean of the per-fold validation accuracies.
    """
    criterion = trial.suggest_categorical("criterion", ['gini','entropy'])
    n_estimators = trial.suggest_int('n_estimators', 100, 1500)
    max_depth = trial.suggest_int("max_depth", 3, 30)
    # suggest_float replaces the deprecated suggest_uniform (same uniform range)
    max_features = trial.suggest_float("max_features", 0.01, 1.0)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 100)
    
    model = RandomForestClassifier(
        n_estimators = n_estimators, 
        max_depth = max_depth, 
        max_features = max_features, 
        min_samples_leaf = min_samples_leaf,
        min_samples_split = min_samples_split,
        criterion = criterion
    )
    
    # The fold index is bookkeeping, not a predictor, so it is removed from
    # the feature matrix together with the target.
    non_feature_columns = [target_name, 'kfolds']

    accuracies = []
    
    for fold in range(total_folds):
        
        df_train=df[df['kfolds']!=fold].reset_index(drop=True)
        df_valid=df[df['kfolds']==fold].reset_index(drop=True)

        # x and y of training dataset
        x_train=df_train.drop(columns=non_feature_columns).values
        y_train=df_train[target_name].values

        # x and y of validation dataset
        x_valid=df_valid.drop(columns=non_feature_columns).values
        y_valid=df_valid[target_name].values
        
        # fit() retrains the same estimator from scratch for each fold
        model.fit(x_train, y_train)
        preds= model.predict(x_valid)
        
        fold_acc = metrics.accuracy_score(y_valid, preds)
        accuracies.append(fold_acc)
        
    return np.mean(accuracies)

# Bind the data arguments so Optuna only has to supply `trial`, then run
# 15 trials that maximise the mean cross-validated accuracy.
optimization_function_rfc = partial(optimize_rfc, df = df, total_folds = 5,target_name = 'class')
study_rfc = optuna.create_study(direction = 'maximize')
study_rfc.optimize(optimization_function_rfc, n_trials=15)

In [None]:
# Hyperparameters of the best random-forest trial.
rfc_best_params = study_rfc.best_trial.params
rfc_best_params

### 2. XGBoost Classifier

In [None]:
def optimize_xgb(trial,df,total_folds,target_name):
    """Optuna objective: mean validation accuracy of an XGBoost classifier across the folds.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.
    df : DataFrame with the features, the target and a 'kfolds' column.
    total_folds : number of folds; fold ids 0 .. total_folds-1 are evaluated.
    target_name : name of the target column.

    Returns
    -------
    Mean of the per-fold validation accuracies.
    """
    # suggest_float replaces the deprecated suggest_uniform (same uniform ranges)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 1.0)
    gamma = trial.suggest_float("gamma", 0.05, 1.0)
    max_depth = trial.suggest_int("max_depth", 3, 30)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
    reg_lambda = trial.suggest_float("reg_lambda", 0.01, 1.0)
    reg_alpha = trial.suggest_float("reg_alpha", 0.01, 1.0)
    
    model = XGBClassifier(
        learning_rate = learning_rate,
        gamma = gamma,
        max_depth = max_depth,
        min_child_weight = min_child_weight,
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        reg_lambda = reg_lambda,
        reg_alpha = reg_alpha
    )
    
    # The fold index is bookkeeping, not a predictor, so it is removed from
    # the feature matrix together with the target.
    non_feature_columns = [target_name, 'kfolds']

    accuracies = []
    
    for fold in range(total_folds):
        
        df_train=df[df['kfolds']!=fold].reset_index(drop=True)
        df_valid=df[df['kfolds']==fold].reset_index(drop=True)

        # x and y of training dataset
        x_train=df_train.drop(columns=non_feature_columns).values
        y_train=df_train[target_name].values

        # x and y of validation dataset
        x_valid=df_valid.drop(columns=non_feature_columns).values
        y_valid=df_valid[target_name].values
        
        model.fit(x_train, y_train)
        preds= model.predict(x_valid)
        
        fold_acc = metrics.accuracy_score(y_valid, preds)
        accuracies.append(fold_acc)
        
    return np.mean(accuracies)

# Bind the data arguments so Optuna only has to supply `trial`, then run
# 15 trials that maximise the mean cross-validated accuracy.
optimization_function_xgb = partial(optimize_xgb, df = df, total_folds = 5,target_name = 'class')
study_xgb = optuna.create_study(direction = 'maximize')
study_xgb.optimize(optimization_function_xgb, n_trials=15)

In [None]:
# Hyperparameters of the best XGBoost trial.
xgb_best_params = study_xgb.best_trial.params
xgb_best_params

### 3. SVM Classifier

In [None]:
def optimize_svc(trial,df,total_folds,target_name):
    """Optuna objective: mean validation accuracy of an SVM classifier across the folds.

    Only ``C`` is actually searched; ``gamma`` and ``class_weight`` are
    single-choice categoricals so that they still appear in the trial params.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.
    df : DataFrame with the features, the target and a 'kfolds' column.
    total_folds : number of folds; fold ids 0 .. total_folds-1 are evaluated.
    target_name : name of the target column.

    Returns
    -------
    Mean of the per-fold validation accuracies.
    """
    # suggest_float replaces the deprecated suggest_uniform (same uniform range)
    C = trial.suggest_float("C", 0.001, 1000)
    gamma = trial.suggest_categorical("gamma", ['auto'])
    class_weight = trial.suggest_categorical("class_weight", ['balanced'])
    
    model = SVC(
        C = C,
        gamma = gamma,
        class_weight = class_weight
    )
    
    # The fold index is bookkeeping, not a predictor, so it is removed from
    # the feature matrix together with the target.
    non_feature_columns = [target_name, 'kfolds']

    accuracies = []
    
    for fold in range(total_folds):
        
        df_train=df[df['kfolds']!=fold].reset_index(drop=True)
        df_valid=df[df['kfolds']==fold].reset_index(drop=True)

        # x and y of training dataset
        x_train=df_train.drop(columns=non_feature_columns).values
        y_train=df_train[target_name].values

        # x and y of validation dataset
        x_valid=df_valid.drop(columns=non_feature_columns).values
        y_valid=df_valid[target_name].values
        
        model.fit(x_train, y_train)
        preds= model.predict(x_valid)
        
        fold_acc = metrics.accuracy_score(y_valid, preds)
        accuracies.append(fold_acc)
        
    return np.mean(accuracies)

# Bind the data arguments so Optuna only has to supply `trial`, then run
# 15 trials that maximise the mean cross-validated accuracy.
optimization_function_svc = partial(optimize_svc, df = df, total_folds = 5,target_name = 'class')
study_svc = optuna.create_study(direction = 'maximize')
study_svc.optimize(optimization_function_svc, n_trials=15)

In [None]:
# Hyperparameters of the best SVM trial.
svc_best_params = study_svc.best_trial.params
svc_best_params

### 4. Decision Tree Classifier

In [None]:
def optimize_dt(trial,df,total_folds,target_name):
    """Optuna objective: mean validation accuracy of a decision tree across the folds.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.
    df : DataFrame with the features, the target and a 'kfolds' column.
    total_folds : number of folds; fold ids 0 .. total_folds-1 are evaluated.
    target_name : name of the target column.

    Returns
    -------
    Mean of the per-fold validation accuracies.
    """
    criterion = trial.suggest_categorical("criterion", ['gini','entropy'])
    max_depth = trial.suggest_int("max_depth", 3, 30)
    # suggest_float replaces the deprecated suggest_uniform (same uniform range)
    max_features = trial.suggest_float("max_features", 0.01, 1.0)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 100)
    
    model = DecisionTreeClassifier(
        max_depth = max_depth, 
        max_features = max_features, 
        min_samples_leaf = min_samples_leaf,
        min_samples_split = min_samples_split,
        criterion = criterion
    )
    
    # The fold index is bookkeeping, not a predictor, so it is removed from
    # the feature matrix together with the target.
    non_feature_columns = [target_name, 'kfolds']

    accuracies = []
    
    for fold in range(total_folds):
        
        df_train=df[df['kfolds']!=fold].reset_index(drop=True)
        df_valid=df[df['kfolds']==fold].reset_index(drop=True)

        # x and y of training dataset
        x_train=df_train.drop(columns=non_feature_columns).values
        y_train=df_train[target_name].values

        # x and y of validation dataset
        x_valid=df_valid.drop(columns=non_feature_columns).values
        y_valid=df_valid[target_name].values
        
        model.fit(x_train, y_train)
        preds= model.predict(x_valid)
        
        fold_acc = metrics.accuracy_score(y_valid, preds)
        accuracies.append(fold_acc)
        
    return np.mean(accuracies)

# Bind the data arguments so Optuna only has to supply `trial`, then run
# 15 trials that maximise the mean cross-validated accuracy.
optimization_function_dt = partial(optimize_dt, df = df, total_folds = 5,target_name = 'class')
study_dt = optuna.create_study(direction = 'maximize')
study_dt.optimize(optimization_function_dt, n_trials=15)

In [None]:
# Hyperparameters of the best decision-tree trial.
dt_best_params = study_dt.best_trial.params
dt_best_params

# Fit and Predict the models

In [None]:
# Rebuild each classifier from the best hyperparameters found by its study.
XGB_model=XGBClassifier(**xgb_best_params)
SVM_model=SVC(**svc_best_params)
RFC_model=RandomForestClassifier(**rfc_best_params)
DT_model=DecisionTreeClassifier(**dt_best_params)

# Display name -> estimator; the insertion order fixes the position of each
# model inside the per-fold result lists.
models=dict([
    ('XGB Classifier', XGB_model),
    ('SVM Classifier', SVM_model),
    ('Random Forest Classifier', RFC_model),
    ('Decision Tree Classifier', DT_model),
])

# One entry per fold; each entry is itself a list with one item per model.
accuracies,confusion_matrices,classification_reports=[],[],[]
collectors=(accuracies,confusion_matrices,classification_reports)
for f in range(5):
  fold_outputs=run(f,df,models=models,target_name='class', save_model= True, print_details=True)
  for collector,fold_output in zip(collectors,fold_outputs):
    collector.append(fold_output)

# Heatmap of the Confusion Matrix

In [None]:
def plot_confusion_matrix(fold_num, models, title):
    """Draw an annotated heatmap of one classifier's confusion matrix for one fold.

    The matrix is read from the module-level ``confusion_matrices`` at
    ``[fold_num][position of title among the keys of models]``. Each cell is
    labelled with its name, its count and its share of all samples; the
    matrix is treated as 2x2.

    Parameters
    ----------
    fold_num : index of the fold in ``confusion_matrices``.
    models : dict whose key order matches the per-fold result lists.
    title : key of ``models`` to plot; also used as the figure title.
    """
    classifier_num = list(models).index(title)
    cf_matrix = confusion_matrices[fold_num][classifier_num]

    # Flattened row by row: TN, FP, FN, TP
    group_names = ['True Neg','False Pos','False Neg','True Pos']
    cell_counts = cf_matrix.flatten()
    cell_shares = cell_counts/np.sum(cf_matrix)

    labels = np.asarray([
        f"{name}\n{count:0.0f}\n{share:.2%}"
        for name, count, share in zip(group_names, cell_counts, cell_shares)
    ]).reshape(2,2)

    plt.figure(figsize=(10,6))
    plt.title(title, fontsize=20)
    # fmt='' because the annotations are ready-made strings
    sns.heatmap(cf_matrix, annot=labels, fmt='', annot_kws={"fontsize" : 20})
    plt.show()

In [None]:
# Fold 0 confusion matrix of the XGBoost classifier.
plot_confusion_matrix(fold_num = 0, models = models, title = "XGB Classifier")

In [None]:
# Fold 0 confusion matrix of the SVM classifier.
plot_confusion_matrix(fold_num = 0, models = models, title = "SVM Classifier")

In [None]:
# Fold 0 confusion matrix of the random forest classifier.
plot_confusion_matrix(fold_num = 0, models = models, title = "Random Forest Classifier")

In [None]:
# Fold 0 confusion matrix of the decision tree classifier.
plot_confusion_matrix(fold_num = 0, models = models, title = "Decision Tree Classifier")

# Conclusion: Since the dataset was already clean and easy to separate, all four models (i.e. XGBoost, Decision Tree, Random Forest and SVM) reached 100% accuracy on both the training data and the held-out validation folds.