In [1]:
# Import machine learning and other dependencies

import sqlite3
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Connect to database
# Open the file read-only through a SQLite URI: a plain connect("mushrooms.db")
# silently creates an empty database when the file is missing, and the mistake
# only surfaces later as "no such table". With mode=ro a missing file raises
# sqlite3.OperationalError right here, and the notebook cannot modify the data.

con = sqlite3.connect("file:mushrooms.db?mode=ro", uri=True)
cur = con.cursor()

In [3]:
# List the tables stored in the database (sqlite_master is SQLite's schema catalogue)

sql_query = """SELECT name FROM sqlite_master  
  WHERE type='table';"""

# execute() hands back the cursor itself, so the rows can be fetched in the same expression
print(cur.execute(sql_query).fetchall())

[('mushroom_features',), ('mushroom_targets',)]


In [4]:
# Build the working dataframe by joining each target row to its feature row on id
# NOTE(review): SELECT * returns the id column from both tables, so it appears twice.

mushroom_df = pd.read_sql(
    sql='SELECT * FROM mushroom_targets JOIN mushroom_features ON mushroom_targets.id = mushroom_features.id',
    con=con,
)

mushroom_df.head(n=5)

Unnamed: 0,id,class,id.1,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,0,p,0,x,s,n,t,p,f,c,...,s,w,w,p,w,o,p,k,s,u
1,1,e,1,x,s,y,t,a,f,c,...,s,w,w,p,w,o,p,n,n,g
2,2,e,2,b,s,w,t,l,f,c,...,s,w,w,p,w,o,p,n,n,m
3,3,p,3,x,y,w,t,p,f,c,...,s,w,w,p,w,o,p,k,s,u
4,4,e,4,x,s,g,f,n,f,w,...,s,w,w,p,w,o,e,n,a,g


In [5]:
# Get count of rows and columns in dataframe to make sure we're on the right track
# (.shape is a (rows, columns) tuple)

mushroom_df.shape

(8124, 25)

In [6]:
# Close database connection since we have what we need
# (the joined data already lives in mushroom_df; no later visible cell uses con or cur)

con.close()

In [7]:
# Label-encode the data: within each column, every distinct value becomes an integer code
# NOTE(review): this overwrites mushroom_df and is applied to every column, ids included.

mushroom_df = mushroom_df.apply(lambda column: LabelEncoder().fit_transform(column))

mushroom_df.head(n=10)

Unnamed: 0,id,class,id.1,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,0,1,0,5,2,4,1,6,1,0,...,2,7,7,0,2,1,4,2,3,5
1,1,0,1,5,2,9,1,0,1,0,...,2,7,7,0,2,1,4,3,2,1
2,2,0,2,0,2,8,1,3,1,0,...,2,7,7,0,2,1,4,3,2,3
3,3,1,3,5,3,8,1,6,1,0,...,2,7,7,0,2,1,4,2,3,5
4,4,0,4,5,2,3,0,5,1,1,...,2,7,7,0,2,1,0,3,0,1
5,5,0,5,5,3,9,1,0,1,0,...,2,7,7,0,2,1,4,2,2,1
6,6,0,6,0,2,8,1,0,1,0,...,2,7,7,0,2,1,4,2,2,3
7,7,0,7,0,3,8,1,3,1,0,...,2,7,7,0,2,1,4,3,3,3
8,8,1,8,5,3,8,1,6,1,0,...,2,7,7,0,2,1,4,2,4,1
9,9,0,9,0,2,9,1,0,1,0,...,2,7,7,0,2,1,4,2,3,3


In [8]:
# Separate the inputs from the label: X is everything except the id and class
# columns, y is the encoded class column

X = mushroom_df.drop(columns=['id', 'class'])

y = mushroom_df['class']

In [9]:
# Sanity check: preview the first five rows of the feature matrix

X.head(n=5)

Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,5,2,4,1,6,1,0,1,4,0,...,2,7,7,0,2,1,4,2,3,5
1,5,2,9,1,0,1,0,0,4,0,...,2,7,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,2,3
3,5,3,8,1,6,1,0,1,5,0,...,2,7,7,0,2,1,4,2,3,5
4,5,2,3,0,5,1,1,0,4,1,...,2,7,7,0,2,1,0,3,0,1


In [10]:
# Sanity check: preview the first five values of the target series

y.head(n=5)

0    1
1    0
2    0
3    1
4    0
Name: class, dtype: int32

In [11]:
# Hold out part of the data for evaluation; the fixed random_state keeps the split reproducible

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [12]:
# Standardise the features using statistics learned from the training split only
# NOTE(review): the model cells below fit and predict on the unscaled X_train / X_test,
# so X_train_scaled and X_test_scaled are not consumed by any visible cell.

scaler = StandardScaler()

# fit() returns the fitted scaler itself; X_scaler is that same object
X_scaler = scaler.fit(X_train)

# Apply the training-set scaling to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [13]:
# Configure a random forest classifier with 128 trees and a fixed seed for reproducibility

rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [14]:
# Train the forest on the training split (the assignment also keeps the cell from echoing the estimator)

rf_model = rf_model.fit(X=X_train, y=y_train)

In [15]:
# Predict the class of every row in the held-out testing split

predictions = rf_model.predict(X=X_test)

In [16]:
# Build the confusion matrix and wrap it in a labelled dataframe
# Rows are the actual classes and columns the predicted ones; the label order follows
# the encoded classes seen earlier in this notebook (0 = edible 'e', 1 = poisonous 'p').

cm = confusion_matrix(y_true=y_test, y_pred=predictions)

cm_df = pd.DataFrame(
    data=cm,
    index=["Actual edible", "Actual poisonous"],
    columns=["Predicted edible", "Predicted poisonous"],
)

print(cm_df)

                  Predicted edible  Predicted poisonous
Actual edible                 1048                    0
Actual poisonous                 0                  983


In [17]:
# Save the labelled confusion matrix to a csv file in the working directory

cm_df.to_csv(path_or_buf='confusion_matrix.csv')

In [18]:
# Accuracy: the fraction of test rows whose predicted class matches the actual class

acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [19]:
# Show all evaluation results together: confusion matrix, accuracy, per-class report

print("Confusion Matrix")
display(cm_df)

print(f"Accuracy Score : {acc_score}")

# Heading and report are printed on separate lines by a single print call
print(
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted edible,Predicted poisonous
Actual edible,1048,0
Actual poisonous,0,983


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1048
           1       1.00      1.00      1.00       983

    accuracy                           1.00      2031
   macro avg       1.00      1.00      1.00      2031
weighted avg       1.00      1.00      1.00      2031



In [20]:
# Turn the classification report into a dataframe and write it to csv
# output_dict=True returns the report as a dict instead of formatted text

report = classification_report(y_true=y_test, y_pred=predictions, output_dict=True)

# Transpose so each class / average becomes a row and each metric a column
cr_df = pd.DataFrame(report).T

cr_df.to_csv(path_or_buf='classification_report.csv')

In [21]:
# Calculate feature importance in the Random Forest model
# feature_importances_ is read from the fitted model (it was trained on X_train, not X_test):
# an array with one score per feature column, whose sum equals 1.0

importances = rf_model.feature_importances_

importances

array([0.00472861, 0.00853296, 0.01168102, 0.04957004, 0.15228304,
       0.00350804, 0.05244693, 0.10548454, 0.10171477, 0.02864228,
       0.05442542, 0.04628943, 0.05123152, 0.02397798, 0.01695259,
       0.        , 0.00201934, 0.01322657, 0.08221886, 0.10070301,
       0.05259085, 0.0377722 ])

In [22]:
# Pair every importance score with its feature name and list them from most to least important

sorted(
    zip(rf_model.feature_importances_, X.columns),
    reverse=True,
)

[(0.1522830399381663, 'odor'),
 (0.10548454435023337, 'gill_size'),
 (0.10171477478872035, 'gill_color'),
 (0.10070300678133833, 'spore_print_color'),
 (0.08221885567582286, 'ring_type'),
 (0.05442542008834844, 'stalk_root'),
 (0.0525908515982156, 'population'),
 (0.052446926438851876, 'gill_spacing'),
 (0.051231521493581726, 'stalk_surface_below_ring'),
 (0.04957003762064024, 'bruises'),
 (0.04628942828831537, 'stalk_surface_above_ring'),
 (0.037772199032309585, 'habitat'),
 (0.02864227869894102, 'stalk_shape'),
 (0.023977982526642364, 'stalk_color_above_ring'),
 (0.016952588970309504, 'stalk_color_below_ring'),
 (0.013226570924077627, 'ring_number'),
 (0.011681018243977173, 'cap_color'),
 (0.008532961786219333, 'cap_surface'),
 (0.004728613964602275, 'cap_shape'),
 (0.0035080424657842582, 'gill_attachment'),
 (0.0020193363249024986, 'veil_color'),
 (0.0, 'veil_type')]

In [23]:
# Put the ranked (score, feature) pairs into a dataframe
# Columns get the default integer labels 0 (score) and 1 (feature name) at this point.

importance_df = pd.DataFrame(
    data=sorted(
        zip(rf_model.feature_importances_, X.columns),
        reverse=True,
    )
)

importance_df

Unnamed: 0,0,1
0,0.152283,odor
1,0.105485,gill_size
2,0.101715,gill_color
3,0.100703,spore_print_color
4,0.082219,ring_type
5,0.054425,stalk_root
6,0.052591,population
7,0.052447,gill_spacing
8,0.051232,stalk_surface_below_ring
9,0.04957,bruises


In [24]:
# Rename importance dataframe columns
# The mapping is deliberately not called `dict`: that name would shadow the builtin
# dict type for every cell executed afterwards in this kernel.

importance_columns = {0: 'Importance score', 1: 'Feature'}

# rename() ignores keys that are absent, so re-running this cell is harmless
importance_df = importance_df.rename(columns=importance_columns)

importance_df

Unnamed: 0,Importance score,Feature
0,0.152283,odor
1,0.105485,gill_size
2,0.101715,gill_color
3,0.100703,spore_print_color
4,0.082219,ring_type
5,0.054425,stalk_root
6,0.052591,population
7,0.052447,gill_spacing
8,0.051232,stalk_surface_below_ring
9,0.04957,bruises


In [25]:
# Save the ranked feature-importance table to a csv file in the working directory

importance_df.to_csv(path_or_buf='feature_importance.csv')