In [1]:
!pip install --upgrade scikit-learn==1.0.2
!pip install --upgrade numpy==1.21.5



In [2]:
import time
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# 1.) Download the prepared mushroom data set (referred to as
# "ModifiedEdibleMushroom.csv" in the task text) from the link below.
# wget stores it in the working directory as 'hed2020_dataset.csv'.
# (note: this data set has been preliminarily prepared)
!wget https://raw.githubusercontent.com/kaopanboonyuen/Python-Data-Science/master/Dataset/hed2020_dataset.csv

--2022-08-31 02:04:07--  https://raw.githubusercontent.com/kaopanboonyuen/Python-Data-Science/master/Dataset/hed2020_dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 352588 (344K) [text/plain]
Saving to: 'hed2020_dataset.csv'


2022-08-31 02:04:07 (3.05 MB/s) - 'hed2020_dataset.csv' saved [352588/352588]



In [4]:
# Q1: before any data preparation, how many missing values does "gill-size" have?
# Read the downloaded CSV and show the per-column count of missing entries.
df = pd.read_csv('hed2020_dataset.csv')
df.isna().sum()

id                               0
label                           60
cap-shape                        0
cap-surface                     27
bruises                         99
odor                            99
gill-attachment                 99
gill-spacing                   130
gill-size                      121
stalk-shape                    121
stalk-root                      31
stalk-surface-above-ring        31
stalk-surface-below-ring        31
veil-type                       62
ring-number                     62
ring-type                       62
spore-print-color               56
population                      56
habitat                         31
cap-color-rate                  27
gill-color-rate                121
veil-color-rate                 62
stalk-color-above-ring-rate     31
stalk-color-below-ring-rate     62
dtype: int64

In [5]:
# 2.) Rows without a target value cannot be used for training or evaluation,
# so discard every row whose label is missing.
target_column = "label"
df = df.dropna(subset=[target_column])
df

Unnamed: 0,id,label,cap-shape,cap-surface,bruises,odor,gill-attachment,gill-spacing,gill-size,stalk-shape,...,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate,gill-color-rate,veil-color-rate,stalk-color-above-ring-rate,stalk-color-below-ring-rate
0,1,p,x,s,t,p,f,c,n,e,...,o,p,k,s,u,1.0,3.0,1.0,1.0,1.0
1,2,e,x,s,t,a,f,c,b,e,...,o,p,n,n,g,2.0,3.0,1.0,1.0,1.0
2,3,e,b,s,t,l,f,c,b,e,...,o,p,n,n,m,3.0,1.0,1.0,1.0,1.0
3,4,p,x,y,t,p,f,c,n,e,...,o,p,k,s,u,3.0,1.0,1.0,1.0,1.0
4,5,e,x,s,f,n,f,w,b,t,...,o,e,n,a,g,4.0,3.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819,5820,e,k,s,f,n,a,c,b,e,...,o,p,b,c,l,1.0,10.0,2.0,7.0,8.0
5820,5821,e,x,s,f,n,a,c,b,e,...,o,p,b,v,l,1.0,10.0,1.0,7.0,8.0
5821,5822,e,f,s,f,n,a,c,b,e,...,o,p,b,c,l,1.0,1.0,2.0,7.0,8.0
5822,5823,p,k,y,f,y,f,c,n,t,...,o,e,w,v,l,1.0,9.0,1.0,1.0,1.0


In [6]:
# 3.) Remove the columns named in the list below from the data frame.
drop_varibles = [
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color-rate',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring-rate',
    'stalk-color-below-ring-rate',
    'veil-color-rate',
    'veil-type',
]
df = df.drop(columns=drop_varibles)
df

Unnamed: 0,id,label,cap-shape,cap-surface,bruises,odor,stalk-shape,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate
0,1,p,x,s,t,p,e,o,p,k,s,u,1.0
1,2,e,x,s,t,a,e,o,p,n,n,g,2.0
2,3,e,b,s,t,l,e,o,p,n,n,m,3.0
3,4,p,x,y,t,p,e,o,p,k,s,u,3.0
4,5,e,x,s,f,n,t,o,e,n,a,g,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819,5820,e,k,s,f,n,e,o,p,b,c,l,1.0
5820,5821,e,x,s,f,n,e,o,p,b,v,l,1.0
5821,5822,e,f,s,f,n,e,o,p,b,c,l,1.0
5822,5823,p,k,y,f,y,t,o,e,w,v,l,1.0


In [7]:
# Q2: How many rows and how many variables (columns) are left?
# `shape` is a (rows, columns) tuple.
df.shape

(5764, 13)

In [8]:
# 4.) Inspect the number of rows, each column's dtype, and each column's
# non-null count (a count below the row total means values are missing).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5764 entries, 0 to 5823
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5764 non-null   int64  
 1   label              5764 non-null   object 
 2   cap-shape          5764 non-null   object 
 3   cap-surface        5737 non-null   object 
 4   bruises            5665 non-null   object 
 5   odor               5665 non-null   object 
 6   stalk-shape        5643 non-null   object 
 7   ring-number        5702 non-null   object 
 8   ring-type          5702 non-null   object 
 9   spore-print-color  5708 non-null   object 
 10  population         5708 non-null   object 
 11  habitat            5733 non-null   object 
 12  cap-color-rate     5737 non-null   float64
dtypes: float64(1), int64(1), object(11)
memory usage: 630.4+ KB


In [9]:
# 5.) Missing values are filled with the mean (numeric variables) and the
# mode (nominal variables). Start by listing how many values each column lacks.
df.isna().sum()

id                     0
label                  0
cap-shape              0
cap-surface           27
bruises               99
odor                  99
stalk-shape          121
ring-number           62
ring-type             62
spore-print-color     56
population            56
habitat               31
cap-color-rate        27
dtype: int64

In [10]:
# Fill gaps in the numeric columns with that column's mean.
# numeric_only=True limits the mean to numeric columns; without it pandas warns
# about the object-typed columns (and pandas >= 2.0 raises instead of warning).
# The frame is rebound rather than mutated in place so the cell can be re-run safely.
df = df.fillna(df.mean(numeric_only=True))
df.isnull().sum()

  df.fillna(df.mean(), inplace=True)


id                     0
label                  0
cap-shape              0
cap-surface           27
bruises               99
odor                  99
stalk-shape          121
ring-number           62
ring-type             62
spore-print-color     56
population            56
habitat               31
cap-color-rate         0
dtype: int64

In [11]:
# Fill every remaining gap with the most frequent value of its column.
for col in df.columns:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the df[col] selection: the in-place call operates on an intermediate
    # object and is not guaranteed to write through to df.
    df[col] = df[col].fillna(df[col].mode()[0])
df.isnull().sum()

id                   0
label                0
cap-shape            0
cap-surface          0
bruises              0
odor                 0
stalk-shape          0
ring-number          0
ring-type            0
spore-print-color    0
population           0
habitat              0
cap-color-rate       0
dtype: int64

In [12]:
# 6.) Encode the target: e (edible) -> 1, p (poisonous) -> 0.
# The outer key restricts the replacement to the "label" column, so the same
# letters appearing in other columns are left untouched.
label_codes = {"e": 1, "p": 0}
mapping_dict = {"label": label_codes}

df = df.replace(to_replace=mapping_dict)
df

Unnamed: 0,id,label,cap-shape,cap-surface,bruises,odor,stalk-shape,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate
0,1,0,x,s,t,p,e,o,p,k,s,u,1.0
1,2,1,x,s,t,a,e,o,p,n,n,g,2.0
2,3,1,b,s,t,l,e,o,p,n,n,m,3.0
3,4,0,x,y,t,p,e,o,p,k,s,u,3.0
4,5,1,x,s,f,n,t,o,e,n,a,g,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819,5820,1,k,s,f,n,e,o,p,b,c,l,1.0
5820,5821,1,x,s,f,n,e,o,p,b,v,l,1.0
5821,5822,1,f,s,f,n,e,o,p,b,c,l,1.0
5822,5823,0,k,y,f,y,t,o,e,w,v,l,1.0


In [13]:
# Q3: How many rows fall into class 0 versus class 1?
df["label"].value_counts()

0    3660
1    2104
Name: label, dtype: int64

In [14]:
# 7. One-hot encode the nominal variables (drop_first=True drops the first
# level of each variable). Passing `columns=` lets pandas replace the listed
# columns with their dummy columns in a single step, leaving the others as-is.
nominal_columns = [
    'cap-shape', 'cap-surface', 'bruises', 'odor',
    'stalk-shape', 'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]
dummy_df = pd.get_dummies(df, columns=nominal_columns, drop_first=True)
dummy_df

Unnamed: 0,id,label,cap-color-rate,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,0,1.0,0,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,1,0
1,2,1,2.0,0,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,0,0
2,3,1,3.0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
3,4,0,3.0,0,0,0,1,0,0,1,...,0,1,0,0,0,0,0,0,1,0
4,5,1,4.0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819,5820,1,1.0,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
5820,5821,1,1.0,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,0,0,0
5821,5822,1,1.0,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
5822,5823,0,1.0,0,0,1,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0


In [15]:
# 8. Hold out 20% of the rows for testing; the split is stratified on the
# label and seeded with 2020 so it is reproducible.
from sklearn.model_selection import train_test_split

# NOTE(review): only 'label' is removed here, so the row identifier column
# `id` remains in X and is used as a feature — confirm this is intended.
y = dummy_df['label']
X = dummy_df.drop(columns='label')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2020
)

In [16]:
# Q4: How large are the training and testing sets?
# Prints the (rows, columns) shape of the training features, then the test features.
for split in (X_train, X_test):
    print(split.shape)

(4611, 43)
(1153, 43)


In [17]:
# 9. Tune a Random Forest with a grid search on the training data
# (5-fold cross-validation, random state 2020, weighted F1 as the score).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid={
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 3, 6],
        'min_samples_leaf': [2, 5, 10],
        'n_estimators': [100, 200],
        # Single-value entry: fixes the seed for every candidate model.
        'random_state': [2020],
    },
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,  # use all available cores
)

# Time the whole search with wall-clock timestamps taken around fit().
grid_start_time = time.time()
grid_search.fit(X_train, y_train)
grid_end_time = time.time()
print(f"Searching Time: {datetime.timedelta(seconds=grid_end_time-grid_start_time)}")

Searching Time: 0:00:08.778190


In [18]:
# Q5: Best parameters found by the random forest grid search
# (Hint: the answer must include random_state=2020, the seed used in the grid).
model = grid_search.best_estimator_
model

RandomForestClassifier(criterion='entropy', max_depth=6, min_samples_leaf=2,
                       n_estimators=200, random_state=2020)

In [19]:
# 10.) Predict the testing data set with confusion_matrix and classification_report
# Build the final model from the hyper-parameters selected by the grid search
# instead of a hand-copied list, so it cannot drift if the grid is changed.
# best_params_ includes random_state because it is part of the searched grid.
RF = RandomForestClassifier(**grid_search.best_params_)
RF.fit(X_train,y_train)

RandomForestClassifier(criterion='entropy', max_depth=6, min_samples_leaf=2,
                       n_estimators=200, random_state=2020)

In [20]:
import pickle

# Persist the fitted model to disk. The `with` block closes the file handle
# (flushing buffered bytes) as soon as the dump finishes, even on error.
filename = 'model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(RF, model_file)

In [21]:
# Reload the model from the file written in the previous step; the `with`
# block ensures the file handle is closed after reading.
# NOTE: pickle.load executes code embedded in the file — only load files you created or trust.
with open(filename, 'rb') as model_file:
    RF = pickle.load(model_file)
RF

RandomForestClassifier(criterion='entropy', max_depth=6, min_samples_leaf=2,
                       n_estimators=200, random_state=2020)

In [22]:
# Predict a class label for every row of the held-out test features.
predictions = RF.predict(X_test)

In [23]:
from sklearn.metrics import classification_report,confusion_matrix

# Per-class precision / recall / F1 on the test set, shown with 4 decimal places.
report = classification_report(y_test, predictions, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.9986    0.9986    0.9986       732
           1     0.9976    0.9976    0.9976       421

    accuracy                         0.9983      1153
   macro avg     0.9981    0.9981    0.9981      1153
weighted avg     0.9983    0.9983    0.9983      1153

