# Import Libraries

In [None]:
##### Importing modules #####
import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

import warnings
# Notebook-wide settings.
# Render every column of a DataFrame instead of eliding the middle ones.
pd.options.display.max_columns = None
# Silence all warnings so library notices do not clutter the cell output.
warnings.simplefilter('ignore')

# Phase 1

In [None]:
##### Retrieve and read the data #####
# Location of the mushroom CSV inside the Kaggle input directory
file = '/kaggle/input/mushroom-classification/mushrooms.csv'

# Load the CSV and replace hyphens with underscores in every column name
data = pd.read_csv(file).rename(columns=lambda name: name.replace('-', '_'))

# Preview the first rows
data.head()

In [None]:
##### Drop the veil_type column #####
# NOTE(review): presumably removed because it carries no useful variation
# in this dataset -- confirm against the raw data.
del data["veil_type"]

In [None]:
##### Split the column names into target and predictors #####

# Name of the label column
target = 'class'

# All other column names, in their original order, as a NumPy array
features = np.array(data.columns[data.columns != target].tolist())

In [None]:
##### Stratified train/test split #####

# Hold out 30% of the rows with a fixed seed, keeping the target's class
# proportions the same in both partitions.
train, test = train_test_split(
    data,
    test_size=0.30,
    stratify=data[target],
    random_state=1,
)

# Report the size of each partition
print("No. of data points in training set : ", train.shape[0])
print("No. of data points in testing set : ", test.shape[0])

In [None]:
##### Separate predictors (X) from the label (y) #####

# Each partition is split into the frame without the label column and the
# label column on its own.
X_train, y_train = train.drop(columns="class"), train["class"]
X_test, y_test = test.drop(columns="class"), test["class"]

In [None]:
##### Feature engineering #####

### 1. Ordinal Encoding ###
# Learn the category -> integer mapping from the training predictors only,
# then apply that same mapping to both partitions.
enc = OrdinalEncoder(dtype=int).fit(X_train)

train_codes = enc.transform(X_train)
X_train_encoded = pd.DataFrame(train_codes,
                               columns=X_train.columns,
                               index=X_train.index)

test_codes = enc.transform(X_test)
X_test_encoded = pd.DataFrame(test_codes,
                              columns=X_train.columns,
                              index=X_test.index)

### 2. Label Encoding ###
# The label encoder is likewise fitted on the training labels only.
label_enc = LabelEncoder().fit(y_train)

train_labels = label_enc.transform(y_train)
y_train_encoded = pd.Series(train_labels, name="y_train", index=y_train.index)

test_labels = label_enc.transform(y_test)
y_test_encoded = pd.Series(test_labels, name="y_test", index=y_test.index)

In [None]:
### Show, per feature, the integer the ordinal encoder gives each category ###

for position, (name, categories) in enumerate(zip(features, enc.categories_),
                                              start=1):
    # A category's code is its position in the encoder's category list
    code_of = {category: code for code, category in enumerate(categories)}

    print(f"{position}. {name}\n{code_of}\n")

In [None]:
# Preview the first rows of the ordinal-encoded training predictors
X_train_encoded.head()

In [None]:
### index => class ###

# The position of a label in classes_ is the integer it is encoded as.
label_enc.classes_
# 0 => e or edible
# 1 => p or poisonous

In [None]:
# Preview the first label-encoded training targets
y_train_encoded.head()

## Feature Selection

In [None]:
##### Feature selection #####

# Score every encoded feature against the encoded target with the
# chi-squared statistic and flag the 8 best ones (k values noted: 5, 6, 8).
feature_selector = SelectKBest(score_func=chi2, k=8)
feature_selector.fit(X_train_encoded, y_train_encoded)

# (feature name, chi2 score, kept?) triples, highest score first
keep_mask = feature_selector.get_support()
scores = list(zip(features, feature_selector.scores_, keep_mask))
scores.sort(key=lambda triple: triple[1], reverse=True)

print("FEATURE\t\t\t    SCORE   KEEP")

for name, chi2_score, kept in scores:
    print(f"{name:<25}  {chi2_score:7.2f}  {kept}")

In [None]:
##### Names of the features kept by the selector #####
# Ask the selector for the integer positions of the kept features and use
# them to pick the matching names out of `features`.
kept_positions = feature_selector.get_support(indices=True)
new_features = features[kept_positions]
print(f"Selected features:\n {new_features}")

In [None]:
##### Build and save the reduced dataset #####

# Selected predictors followed by the label column
kept_columns = [*new_features, 'class']
new_data = data.loc[:, kept_columns]

# Write the reduced table to disk without the row index
new_data.to_csv('final_data.csv', index=False)

# Preview the first rows
new_data.head()

# Phase 2
### Final data with selected features

In [None]:
##### Train test split #####

# Split the reduced dataset 70/30 with a fixed seed. The stratification
# labels are taken from `new_data` itself (the frame being split) rather
# than from `data`, so the split no longer relies on the two frames
# happening to share the same row index.
train, test = train_test_split(new_data,
                               random_state=1,
                               test_size=0.30,
                               stratify=new_data[target])

print("No. of data points in training set : ", len(train))
print("No. of data points in testing set : ", len(test))

##### Independent and dependent features #####

# Predictors: every column except the label
X_train = train.drop("class", axis=1)
y_train = train['class']

X_test = test.drop("class", axis=1)
y_test = test['class']

In [None]:
##### Feature engineering #####

### 1. Ordinal Encoding ###
# Fit the category -> integer mapping on the training predictors only.
enc = OrdinalEncoder(dtype=int)
enc.fit(X_train)

# Encode both partitions with the fitted encoder, keeping each partition's
# own row index and the training column names.
X_train_encoded, X_test_encoded = (
    pd.DataFrame(enc.transform(part),
                 index=part.index,
                 columns=X_train.columns)
    for part in (X_train, X_test)
)

### 2. Label Encoding ###
# Fit the label mapping on the training labels only.
label_enc = LabelEncoder()
label_enc.fit(y_train)

# Encode both label series, naming each after the partition it came from.
y_train_encoded, y_test_encoded = (
    pd.Series(label_enc.transform(labels),
              index=labels.index,
              name=series_name)
    for labels, series_name in ((y_train, "y_train"), (y_test, "y_test"))
)

In [None]:
# Print each selected feature with its list of categories. A category's
# position in the list is the integer the encoder assigns to it.
# Ex: for bruises, 'f' => 0 and 't' => 1
for name, categories in zip(new_features, enc.categories_):
    print(f"'{name}' : {categories.tolist()},")

In [None]:
# Preview the first rows of the encoded training predictors (selected features only)
X_train_encoded.head()

## Model Selection

In [None]:
##### Machine Learning Algorithm Selection #####

### Candidate models ###
# Each candidate uses the library defaults; a fixed seed is passed wherever
# the estimator accepts one.

models = [
    # GLM
    LogisticRegression(random_state=1),
    # Nearest Neighbor
    KNeighborsClassifier(),
    # SVM
    SVC(probability=True, random_state=1),
    # Trees
    DecisionTreeClassifier(random_state=1),
    # Ensemble
    RandomForestClassifier(random_state=1),

    AdaBoostClassifier(random_state=1),

    XGBClassifier(random_state=1)
]

##### Cross validation #####
# ShuffleSplit draws 10 independent random 70/30 splits of the training
# data (this is repeated random sub-sampling, not K-fold).
cv_split = ShuffleSplit(n_splits=10,
                        test_size=.3,
                        train_size=.7,
                        random_state=0)

##### Columns of the model comparison table #####

columns = ['Name',
           'Parameters',
           'TrainAccuracyMean',
           'TestAccuracyMean',
           'AvgTrainingTime'
           ]

##### Model selection process #####

# One dict per model; the table is assembled once after the loop so the
# numeric columns get a numeric dtype instead of object.
rows = []

for model in models:

    model_name = model.__class__.__name__

    print(f"Training started for {model_name}")

    # Cross-validated accuracy on the training data; train scores are
    # requested as well, and all available cores are used.
    cv_results = cross_validate(model,
                                X_train_encoded,
                                y_train_encoded,
                                cv=cv_split,
                                scoring='accuracy',
                                return_train_score=True,
                                n_jobs=-1)

    ##### Collect the cv results for the comparison table #####
    rows.append({
        'Name': model_name,
        'Parameters': str(model.get_params()),
        'TrainAccuracyMean': cv_results['train_score'].mean(),
        'TestAccuracyMean': cv_results['test_score'].mean(),
        'AvgTrainingTime': cv_results['fit_time'].mean(),
    })

    print("Training completed!!\n")

##### Comparison table, best mean test accuracy first #####
# Sorting happens once, after every model has been evaluated; the row
# labels still record each model's position in `models`.
compare = pd.DataFrame(rows, columns=columns)
compare.sort_values(by=['TestAccuracyMean'],
                    ascending=False,
                    inplace=True)

In [None]:
# Display the model comparison table
compare

In [None]:
##### Plot Results #####

# The mean accuracies come from 'accuracy' scoring and are fractions, while
# the axis is labelled in percent. Plot a copy scaled to percent so the bars
# agree with the "(%)" label. The float cast also covers the case where the
# column is held as object dtype.
plot_data = compare.assign(
    TestAccuracyMean=compare['TestAccuracyMean'].astype(float) * 100)

plt.figure(figsize=(10, 5))
sns.barplot(x='TestAccuracyMean', y='Name', data=plot_data)
plt.title('Machine Learning Algorithm Accuracy Score \n')
plt.xlabel('Accuracy Score (%)')
plt.ylabel('Model Name')
plt.tight_layout()

### Hyperparameter tuning ###

In [None]:
##### Final Model => Decision Tree Classifier #####

### Hyperparameter tuning ###

# Candidate values sampled by the randomized search.
param_dist = {
    'max_depth': [4, 6, 8, 10, 12, 14, 16, 20],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [5, 10, 20, 30, 40, 50],
    # scikit-learn reads a float as a fraction of the features and an int as
    # an absolute count. The last entry must therefore be 1.0 ("all
    # features"); the int 1 would mean "a single feature".
    'max_features': [0.2, 0.4, 0.6, 0.8, 1.0],
    'max_leaf_nodes': [8, 16, 32, 64, 128, 256],
}


# Base estimator; the seed keeps feature sub-sampling repeatable
tree = DecisionTreeClassifier(random_state=1)

# 30 random parameter combinations, each scored with 3-fold cross validation
search = RandomizedSearchCV(estimator=tree,
                             param_distributions=param_dist,
                             cv=3,
                             n_iter=30,
                             verbose=1,
                             random_state=1)


_ = search.fit(X_train_encoded, y_train_encoded)

### best model and params ###

best_model = search.best_estimator_
best_params = search.best_params_
best_score = search.best_score_

print(f"\nTuned Decision Tree Parameters:\n{best_params}")
print(f"\nBest score: {best_score}")

# Phase 3

## Final Pipeline

In [None]:
##### Final Pipeline #####

# The pipeline takes the raw categorical predictors: it ordinal-encodes them
# and then fits the tuned decision tree.
ordinal_encoder = OrdinalEncoder(dtype=int)

# Use the same seed as the tree that was tuned. Without it the feature
# sub-sampling is random and the final model changes from run to run.
estimator = DecisionTreeClassifier(random_state=1, **best_params)


pipeline = Pipeline([
    ("ordinal_encoder", ordinal_encoder),
    ("estimator", estimator)])


print("Training Started..\n")

# perf_counter is a monotonic clock intended for measuring intervals
tic = time.perf_counter()
_ = pipeline.fit(X_train, y_train_encoded)
tac = time.perf_counter()

print("Training completed!\n")
print(f"Training time :{tac-tic} seconds\n")

y_train_preds = pipeline.predict(X_train)
y_test_preds = pipeline.predict(X_test)

# accuracy_score expects (y_true, y_pred)
print("Train accuracy :",
      accuracy_score(y_train_encoded, y_train_preds))

print("Test accuracy :",
      accuracy_score(y_test_encoded, y_test_preds))

# The context manager closes the file even if pickling fails
with open("pipeline.pkl", "wb") as pipeline_file:
    pickle.dump(pipeline, pipeline_file)
print("\nPipeline saved!")

## Evaluation

In [None]:
##### Evaluation #####

### Classification Report ###

print(classification_report(y_test_encoded, y_test_preds))

### Confusion matrix ###

# Rows are the true labels, columns the predicted labels
cm = confusion_matrix(y_true=y_test_encoded,
                      y_pred=y_test_preds)

# Tick labels in encoded order (0, 1) and the name of each matrix cell
classNames = ['Edible', 'Poisonous']
tick_marks = np.arange(len(classNames))
s = [['TN', 'FP'], ['FN', 'TP']]

plt.figure(figsize=(5, 5))
plt.imshow(cm, interpolation='nearest',
           cmap=plt.cm.Wistia)

plt.title('Edible or Poisonous Mushroom Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')

plt.xticks(tick_marks, classNames, rotation=0)
plt.yticks(tick_marks, classNames)

# Annotate every cell with its name and count. The text is centred on the
# cell; x is the column index and y is the row index.
for (i, j), count in np.ndenumerate(cm):
    plt.text(j, i, f"{s[i][j]} = {count}", ha='center', va='center')

## Upvote my notebook.💛
### Thank you..!!