# KNN Classifier for Mushrooms

This notebook builds a KNN model that classifies a mushroom as edible or not based on its characteristics.

In [1]:
#importing libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix
from sklearn.feature_selection import f_classif #To obtain the best feature

In [2]:
#Data Import & Exploration
#Location of the source CSV, kept in one constant so it is easy to swap for a local copy
DATA_URL = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/3qPv1_g8n6KvWjyLOrjXyw/mushroom-cleaned.csv'
df = pd.read_csv(DATA_URL)
#Seeded so the preview shows the same five rows on every Restart & Run All
df.sample(5, random_state=42)

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
36939,375,1,0,5,0.211312,616,11,0.943195,1
52416,434,5,4,5,1.178206,430,6,0.88845,1
5120,554,6,1,10,0.318745,990,6,0.943195,1
50556,487,3,0,10,2.013794,0,2,0.943195,1
20470,641,2,0,10,0.150776,1454,11,0.88845,0


In [3]:
#Summary statistics (count, mean, std, min, quartiles, max) for every column
df.describe()

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
count,54035.0,54035.0,54035.0,54035.0,54035.0,54035.0,54035.0,54035.0,54035.0
mean,567.257204,4.000315,2.142056,7.329509,0.75911,1051.081299,8.418062,0.952163,0.549181
std,359.883763,2.160505,2.228821,3.200266,0.650969,782.056076,3.262078,0.305594,0.49758
min,0.0,0.0,0.0,0.0,0.000426,0.0,0.0,0.027372,0.0
25%,289.0,2.0,0.0,5.0,0.270997,421.0,6.0,0.88845,0.0
50%,525.0,5.0,1.0,8.0,0.593295,923.0,11.0,0.943195,1.0
75%,781.0,6.0,4.0,10.0,1.054858,1523.0,11.0,0.943195,1.0
max,1891.0,6.0,6.0,11.0,3.83532,3569.0,12.0,1.804273,1.0


In [4]:
#Column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54035 entries, 0 to 54034
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   cap-diameter     54035 non-null  int64  
 1   cap-shape        54035 non-null  int64  
 2   gill-attachment  54035 non-null  int64  
 3   gill-color       54035 non-null  int64  
 4   stem-height      54035 non-null  float64
 5   stem-width       54035 non-null  int64  
 6   stem-color       54035 non-null  int64  
 7   season           54035 non-null  float64
 8   class            54035 non-null  int64  
dtypes: float64(2), int64(7)
memory usage: 3.7 MB


In [5]:
#Number of missing values in each column
df.isna().sum()

Unnamed: 0,0
cap-diameter,0
cap-shape,0
gill-attachment,0
gill-color,0
stem-height,0
stem-width,0
stem-color,0
season,0
class,0


In [6]:
#Count fully duplicated rows and report the count before removing them.
#A bare expression is only displayed when it is the last line of a cell, so the
#count has to be printed explicitly here.
n_duplicates = df.duplicated().sum()
print(f"Duplicate rows found: {n_duplicates}")

#drop_duplicates keeps the first occurrence of each row and the original index labels
df = df.drop_duplicates()

In [7]:
#Column labels of the de-duplicated dataframe (eight features plus the 'class' target)
df.columns

Index(['cap-diameter', 'cap-shape', 'gill-attachment', 'gill-color',
       'stem-height', 'stem-width', 'stem-color', 'season', 'class'],
      dtype='object')

In [8]:
#Distribution of mushroom class: share of rows belonging to each class label
freq = df['class'].value_counts()
#Vectorised division of the counts by the total number of rows
prob = freq / len(df)
print(prob)

class
1    0.546639
0    0.453361
Name: count, dtype: float64


In [9]:
#Scaling the features: standardize every column except the target 'class'
#NOTE(review): the scaler is fitted on all rows here, before the train/test split
#further down, so the test rows contribute to the mean/std used for scaling.
scaler = StandardScaler()
preproc = scaler.fit_transform(df.drop(columns='class'))

#fit_transform returns a plain array, so re-attach the feature names
df_standard = pd.DataFrame(data=preproc, columns=df.columns.drop('class'))

In [10]:
#Preview of the standardized feature matrix (the 'class' column is not part of it)
df_standard

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season
0,2.229226,-0.926441,-0.063989,0.832305,4.729807,0.625435,0.786897,2.804390
1,2.476186,-0.926441,-0.063989,0.832305,4.729807,0.640815,0.786897,2.804390
2,2.226451,-0.926441,-0.063989,0.832305,4.427986,0.652349,0.786897,2.804390
3,1.921218,0.920991,-0.063989,0.832305,4.699009,0.652349,0.786897,2.804390
4,2.043311,0.920991,-0.063989,0.832305,4.581976,0.521623,0.786897,-0.030044
...,...,...,...,...,...,...,...,...
53727,-1.375292,0.459133,0.383934,-1.675204,0.209973,-0.625437,1.095970,-0.030044
53728,-1.350318,-0.926441,0.383934,-1.675204,0.671943,-0.726686,1.095970,-0.030044
53729,-1.350318,0.459133,0.383934,-1.675204,0.253090,-0.606212,1.095970,-0.210247
53730,-1.358643,-0.926441,0.383934,-1.675204,0.437878,-0.725404,1.095970,-0.210247


In [11]:
#Rows and columns left after dropping duplicates ('class' included)
df.shape

(53732, 9)

In [12]:
#Sanity check: same number of rows as df, one column fewer ('class' was excluded from scaling)
df_standard.shape

(53732, 8)

In [13]:
#Re-attaching the target column to the standardized features.
#df still carries its original row labels after drop_duplicates, whereas df_standard
#is numbered 0..n-1. Combining them aligns on index labels, so both sides are
#renumbered first; otherwise mismatched labels produce extra rows filled with NaN.
df_knn = df_standard.reset_index(drop=True).join(df['class'].reset_index(drop=True))
df_knn

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
0,2.229226,-0.926441,-0.063989,0.832305,4.729807,0.625435,0.786897,2.804390,1
1,2.476186,-0.926441,-0.063989,0.832305,4.729807,0.640815,0.786897,2.804390,1
2,2.226451,-0.926441,-0.063989,0.832305,4.427986,0.652349,0.786897,2.804390,1
3,1.921218,0.920991,-0.063989,0.832305,4.699009,0.652349,0.786897,2.804390,1
4,2.043311,0.920991,-0.063989,0.832305,4.581976,0.521623,0.786897,-0.030044,1
...,...,...,...,...,...,...,...,...,...
53727,-1.375292,0.459133,0.383934,-1.675204,0.209973,-0.625437,1.095970,-0.030044,1
53728,-1.350318,-0.926441,0.383934,-1.675204,0.671943,-0.726686,1.095970,-0.030044,1
53729,-1.350318,0.459133,0.383934,-1.675204,0.253090,-0.606212,1.095970,-0.210247,1
53730,-1.358643,-0.926441,0.383934,-1.675204,0.437878,-0.725404,1.095970,-0.210247,1


In [14]:
#Train/test split on all features: 75% of the rows for training, 25% held out for testing
x, y = df_knn.drop(columns='class'), df_knn['class']
#random_state pins the shuffle so the split is identical on every run
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=42)

In [15]:
#Baseline: KNN with scikit-learn's default settings, scored on the held-out test set
knn = KNeighborsClassifier().fit(xtrain, ytrain)

ypred = knn.predict(xtest)
acc = accuracy_score(y_true=ytest, y_pred=ypred)
print(f"Accuracy: {acc:.2%}")

Accuracy: 98.77%


In [16]:
#Hyperparameter tuning: 10-fold cross-validated grid search over n_neighbors = 1..10
knn = KNeighborsClassifier()

param_grid = {'n_neighbors': range(1,11)}

grid = GridSearchCV(knn, param_grid, cv = 10)
grid.fit(xtrain, ytrain)

#One line per candidate: mean and standard deviation of the fold scores
results = grid.cv_results_
for mean_score, std_score, params in zip(results['mean_test_score'], results['std_test_score'], results['params']):
    print(f"Mean accuracy: {mean_score:.3f} (std: {std_score:.3f}) with: {params}")

#The table is rounded to three decimals and several candidates tie at that precision,
#so report the selected candidate explicitly with more digits
print(f"Best parameters: {grid.best_params_} (mean CV accuracy: {grid.best_score_:.4f})")

Mean accuracy: 0.987 (std: 0.001) with: {'n_neighbors': 1}
Mean accuracy: 0.986 (std: 0.002) with: {'n_neighbors': 2}
Mean accuracy: 0.987 (std: 0.001) with: {'n_neighbors': 3}
Mean accuracy: 0.987 (std: 0.001) with: {'n_neighbors': 4}
Mean accuracy: 0.987 (std: 0.001) with: {'n_neighbors': 5}
Mean accuracy: 0.987 (std: 0.001) with: {'n_neighbors': 6}
Mean accuracy: 0.987 (std: 0.001) with: {'n_neighbors': 7}
Mean accuracy: 0.986 (std: 0.001) with: {'n_neighbors': 8}
Mean accuracy: 0.986 (std: 0.002) with: {'n_neighbors': 9}
Mean accuracy: 0.986 (std: 0.002) with: {'n_neighbors': 10}


  _data = np.array(data, dtype=dtype, copy=copy,


In [17]:
#Ranking the features by their ANOVA F-score against the class label
#(a higher F-score means the feature separates the two classes more strongly)
f_score, fpvalue = f_classif(x, y)
fscores = (
    pd.DataFrame({'Feature': x.columns, 'F-Score': f_score, 'P-value': fpvalue})
    .sort_values(by='F-Score', ascending=False)
)

fscores

Unnamed: 0,Feature,F-Score,P-value
5,stem-width,1744.047944,0.0
4,stem-height,1714.301739,0.0
0,cap-diameter,1463.380893,6.397719e-316
1,cap-shape,943.784108,1.80644e-205
6,stem-color,784.220882,2.492885e-171
7,season,376.007435,1.778677e-83
3,gill-color,195.206062,2.777402e-44
2,gill-attachment,147.917059,5.48262e-34


In [18]:
#Re-splitting with only the higher-scoring features: the two features with the
#lowest F-scores (gill-attachment, gill-color) are dropped along with the target
#NOTE(review): this overwrites xtrain/xtest/ytrain/ytest from the full-feature split,
#so re-running an earlier modelling cell after this one will use the reduced feature set.
x_simple = df_knn.drop(columns=['gill-attachment', 'gill-color', 'class'])
y = df_knn['class']

#Same test fraction and seed as the full-feature split
xtrain, xtest, ytrain, ytest = train_test_split(x_simple, y, test_size=0.25, random_state=42)

In [19]:
#Hyperparameter tuning with only a subset of features:
#10-fold cross-validated grid search over n_neighbors = 1..10
knn = KNeighborsClassifier()

param_grid = {'n_neighbors': range(1,11)}

grid = GridSearchCV(knn, param_grid, cv = 10)
grid.fit(xtrain, ytrain)

#One line per candidate: mean and standard deviation of the fold scores
results = grid.cv_results_
for mean_score, std_score, params in zip(results['mean_test_score'], results['std_test_score'], results['params']):
    print(f"Mean accuracy: {mean_score:.3f} (std: {std_score:.3f}) with: {params}")

#The table is rounded to three decimals and several candidates tie at that precision,
#so report the selected candidate explicitly with more digits
print(f"Best parameters: {grid.best_params_} (mean CV accuracy: {grid.best_score_:.4f})")

Mean accuracy: 0.914 (std: 0.004) with: {'n_neighbors': 1}
Mean accuracy: 0.913 (std: 0.006) with: {'n_neighbors': 2}
Mean accuracy: 0.921 (std: 0.005) with: {'n_neighbors': 3}
Mean accuracy: 0.921 (std: 0.005) with: {'n_neighbors': 4}
Mean accuracy: 0.922 (std: 0.004) with: {'n_neighbors': 5}
Mean accuracy: 0.922 (std: 0.004) with: {'n_neighbors': 6}
Mean accuracy: 0.923 (std: 0.004) with: {'n_neighbors': 7}
Mean accuracy: 0.923 (std: 0.003) with: {'n_neighbors': 8}
Mean accuracy: 0.922 (std: 0.004) with: {'n_neighbors': 9}
Mean accuracy: 0.922 (std: 0.003) with: {'n_neighbors': 10}


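Conclusion:
If we train the model on only a subset of the features (dropping gill-attachment and gill-color, the two features with the lowest F-scores, as done above), the cross-validated accuracy drops from about 98.7% to about 92.3%.
Hence, we keep all the features and obtain a KNN classifier (default settings) with a test-set accuracy of 98.77%.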