### Model Training

In [1]:
#Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Importing csv data as pandas dataframe
df = pd.read_csv("dataset/mushrooms.csv")

In [3]:
#Shows first 5 records
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
#Number of rows and columns in dataset
df.shape

(8124, 23)

In [5]:
#Columns of dataset
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [6]:
#Total number of categorical value in each column of dataset
for col in df.columns:
  print(df[col].value_counts())

class
e    4208
p    3916
Name: count, dtype: int64
cap-shape
x    3656
f    3152
k     828
b     452
s      32
c       4
Name: count, dtype: int64
cap-surface
y    3244
s    2556
f    2320
g       4
Name: count, dtype: int64
cap-color
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: count, dtype: int64
bruises
f    4748
t    3376
Name: count, dtype: int64
odor
n    3528
f    2160
y     576
s     576
a     400
l     400
p     256
c     192
m      36
Name: count, dtype: int64
gill-attachment
f    7914
a     210
Name: count, dtype: int64
gill-spacing
c    6812
w    1312
Name: count, dtype: int64
gill-size
b    5612
n    2512
Name: count, dtype: int64
gill-color
b    1728
p    1492
w    1202
n    1048
g     752
h     732
u     492
k     408
e      96
y      86
o      64
r      24
Name: count, dtype: int64
stalk-shape
t    4608
e    3516
Name: count, dtype: int64
stalk-root
b    3776
?    2480
e    1120
c     556
r     192
Name: coun

In [7]:
#Here we drop the constant value of column from dataset
df.drop('veil-type', axis = 1, inplace = True)

In [8]:
df['stalk-root'].value_counts()

stalk-root
b    3776
?    2480
e    1120
c     556
r     192
Name: count, dtype: int64

In [9]:
#Here we replace missing value '?' with most frequent value 
df['stalk-root'] = df['stalk-root'].replace('?', 'b')
df['stalk-root'].value_counts()

stalk-root
b    6256
e    1120
c     556
r     192
Name: count, dtype: int64

In [10]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,s,w,w,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,s,w,w,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,s,w,w,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,s,w,w,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,s,w,w,w,o,e,n,a,g


In [11]:
#split dataset into training data and testing data
X= df.drop('class', axis = 1)
y= df['class']

In [12]:
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,s,w,w,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,s,w,w,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,s,w,w,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,s,w,w,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,s,w,w,w,o,e,n,a,g


In [13]:
X.shape

(8124, 21)

In [14]:
y.shape

(8124,)

In [15]:
#Converting categorical variable into numerical variable using OneHotEncoding technique
ohe = OneHotEncoder(handle_unknown= 'ignore', sparse= False)
#Feature Scaling 
scaler = StandardScaler(with_mean= False)


In [16]:
X = ohe.fit_transform(X)

In [17]:
X

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
X.shape

(8124, 115)

In [19]:
X_scale = scaler.fit_transform(X)

In [20]:
X_scale

array([[0.        , 0.        , 0.        , ..., 0.        , 4.8086936 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [4.3626101 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 2.05216015, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [21]:
y.ndim

1

In [22]:
y = y.replace({'p':1, 'e':0})

In [23]:
y = np.array(y)

In [24]:
type(y)

numpy.ndarray

In [25]:
X.shape, y.shape

((8124, 115), (8124,))

In [26]:
#Splitting dataset into training data and testing data
x_train, x_test, y_train, y_test = train_test_split(X_scale,y, test_size = 0.2, random_state= 0)

### Create an Evaluate Function to give all metrics after model training

In [27]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
def evaluate_model(true, predicted):
    score = accuracy_score(true, predicted)
    cm = confusion_matrix(true, predicted)
    report = classification_report(true, predicted)
    return score, cm, report

In [28]:
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(), 
}
model_list = []
scores =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    
    # Evaluate Train and Test dataset
    model_train_score , model_train_cm, model_train_report = evaluate_model(y_train, y_train_pred)

    model_test_score , model_test_cm, model_test_report = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy Score: {:.4f}".format(model_train_score))
    print("- Confusion Matrix: \n",model_train_cm)
    print("- Classification Report: \n",model_train_report)

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Accuracy Score: {:.4f}".format(model_test_score))
    print("- Confusion Matrix: \n",model_test_cm)
    print("- Classification Report: \n",model_test_report)

    scores.append(model_test_score)
    
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- Accuracy Score: 1.0000
- Confusion Matrix: 
 [[3356    0]
 [   0 3143]]
- Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3356
           1       1.00      1.00      1.00      3143

    accuracy                           1.00      6499
   macro avg       1.00      1.00      1.00      6499
weighted avg       1.00      1.00      1.00      6499

----------------------------------
Model performance for Test set
- Accuracy Score: 1.0000
- Confusion Matrix: 
 [[852   0]
 [  0 773]]
- Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       852
           1       1.00      1.00      1.00       773

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



K-Neighbors Classifier
Mo

### Results

In [29]:
pd.DataFrame(list(zip(model_list, scores)), columns=['Model Name', 'Scores']).sort_values(by=["Scores"],ascending=False)

Unnamed: 0,Model Name,Scores
0,Logistic Regression,1.0
1,K-Neighbors Classifier,1.0
2,Support Vector Machine,1.0
3,Random Forest Classifier,1.0
4,Decision Tree Classifier,1.0


In [30]:
#Here we can see that all above machine learning models gives 100% accuracy for both train data and test data
#and also all models are not giving overfitting problems.

In [31]:
#Random Forest classifier
forest = RandomForestClassifier()
forest.fit(x_train, y_train)

In [32]:
forest_predict = forest.predict(x_test)

In [33]:
print("Accuracy Score: ", accuracy_score(y_test, forest_predict))

Accuracy Score:  1.0


In [34]:
forest.score(x_train, y_train)

1.0

In [35]:
forest.score(x_test, y_test)

1.0

##### Hyperparameter Tunning

In [36]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [100, 222, 344, 466, 588, 711, 833, 955, 1077, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 21, 32, 43, 54, 65, 76, 87, 98, 110, None], 'min_samples_split': [2, 3, 5], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [37]:
rf_random = RandomizedSearchCV(estimator = forest, param_distributions = random_grid, n_iter = 50, cv = 3, 
                               verbose=2, random_state=42, n_jobs = -1)

In [38]:
# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [39]:
rf_random.best_params_

{'n_estimators': 1200,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [40]:
rf_random.best_score_

1.0

In [41]:
rf_predict = rf_random.predict(x_test)

In [42]:
accuracy_score(y_test, rf_predict)

1.0

In [43]:
rf_random.score(x_train, y_train)

1.0

In [44]:
rf_random.score(x_test, y_test)

1.0

In [45]:
#After performing hyperparameter tunning,model also gives 100% accuracy for both train and test data.