In [None]:
import pandas as pd
import pandas_profiling
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score

In [None]:
# Global plot styling, applied once so every figure below looks consistent.
# Newer matplotlib releases ship the seaborn style sheets under a
# 'seaborn-v0_8-' prefix; fall back to the legacy name on older installs.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')

# Default colormap, set through rcParams so that pyplot does not have to
# look up (and possibly create) a current figure just to change a default.
plt.rcParams['image.cmap'] = 'rainbow'

# The original `plt.context=('talk')` only attached an unused attribute to the
# pyplot module; seaborn's set_context is the call that actually scales fonts
# and line widths to the 'talk' preset.
sns.set_context('talk')

# Exploratory Analysis

### In this section we will simply look at the data and see what actions will be needed to build a good machine learning model.

In [None]:
# Load the mushroom dataset from the Kaggle input directory and preview the first rows
df = pd.read_csv(filepath_or_buffer='/kaggle/input/mushroom-classification/mushrooms.csv')
df.head(n=5)

In [None]:
# Summary statistics for each column of the raw data
df.describe()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# List the distinct values found in every column of the dataset.

for col in df.columns:
    distinct_values = df[col].unique()
    print(col, ':------------- ', distinct_values)

## EDA Takeaways

From the few cells above we already have an idea of what we will have to do in terms of cleaning and preprocessing.
1. The dataset has 23 columns of type object, which means they are all nominal variables.
2. There are no missing values.
3. The target column is named 'class'; we will rename it to 'target' for better understanding later on.
4. There are 5 columns that will need to be transformed into binary variables; all the rest will be one-hot encoded.
5. The 'veil-type' column can probably be dropped because it contains only one value.

# Preprocessing

In [None]:
# Rename the 'class' column to 'target', then display the frame
df.rename(mapper={'class': 'target'}, axis='columns', inplace=True)
df

In [None]:
# Encode the target: poisonous mushrooms ('p') become 1, the others ('e') become 0.
# The explicit cast pins an integer dtype instead of relying on replace()'s
# implicit object->int downcast, which pandas has deprecated. If the column were
# left object-typed, the later get_dummies() call would one-hot encode it and
# the 'target' column would no longer exist.
df['target'] = df['target'].replace({'p': 1, 'e': 0}).astype('int64')
df

In [None]:
# Remove the 'veil-type' column
df.drop(columns=['veil-type'], inplace=True)

## Quick Visualisation

In [None]:
# Share of each target class (poisonous vs. non-poisonous) in the dataset

df['target'].value_counts().div(len(df))

In [None]:
# Bar chart of the number of poisonous and non-poisonous mushrooms
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=df, x='target', ax=ax)
ax.set_title('Nombre de champignon venimeux (1) et non venimeux (0)');

As we can see, the target variable is almost perfectly balanced, with 52% of the mushrooms being non-poisonous and 48% poisonous.

This is a balanced dataset, so we will not have to rebalance it or take any other action in this matter.

This also gives us the **NULL MODEL**, which means that our ML model has to predict at least better than a random guess (about 50%).

In [None]:
# One count plot per object-typed column, with the bars split by target class

for col in df.select_dtypes('object'):
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.countplot(data=df, x=col, hue='target', ax=ax)
    ax.set_title(col)
    # Anchor the legend at the right edge of the axes, vertically centred
    ax.legend(bbox_to_anchor=(1, 0.5))

With these visuals, we can say that the columns _gill-attachment_, _veil-color_ and _ring-number_ could be dropped. I say that because they each have one value that is heavily over-represented; they are almost constant. So, in order to help the model and make it perform better, we will remove those columns.

In [None]:
# Remove the three near-constant columns
extra_drop = ['gill-attachment', 'veil-color', 'ring-number']
df.drop(columns=extra_drop, inplace=True)

In [None]:
# Encode the two-valued columns as 0/1 integers.
#
# The result is assigned back to the frame: calling replace(..., inplace=True) on
# a column selected with df[...] acts on an intermediate object, and pandas warns
# that under Copy-on-Write this never updates df. The explicit cast avoids relying
# on replace()'s deprecated implicit object->int downcast, and raises if a value
# outside the mapping is left in the column.
binary_encodings = {
    'bruises': {'t': 1, 'f': 0},
    'gill-spacing': {'c': 1, 'w': 0},
    'stalk-shape': {'e': 1, 't': 0},
    'gill-size': {'n': 1, 'b': 0},
}
for col, encoding in binary_encodings.items():
    df[col] = df[col].replace(encoding).astype('int64')

In [None]:
# One-hot encode the remaining categorical columns as dummy variables

df = pd.get_dummies(data=df)

In [None]:
# Inspect the encoded DataFrame
df

#### The data is now ready to be fed into our ML models.

# Model 

In [None]:
# Separate the target values (y) from the feature columns (X)
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows as the test set. A fixed random_state makes the
# shuffled split, and every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

Here I chose a test_size of 20% because it is pretty much a standard. Given the large number of observations, I could have used 10%. I also set the shuffle argument to True. This mixes up the dataset so that the algorithm does not take into consideration any pattern in the order of the observations.

## Train Model

In [None]:
#KNEIGHBORS CLASSIFIER

# Fit on the training split, then print the mean recall over 5 cross-validation folds
knn = KNeighborsClassifier()
knn_model = knn.fit(X=X_train, y=y_train)
print(np.mean(cross_val_score(estimator=knn, X=X_train, y=y_train, scoring='recall', cv=5)))

In [None]:
#DECISION TREE CLASSIFIER

# random_state is fixed because the tree randomly permutes the candidate
# features at each split, so an unseeded tree can differ from one run to the next.
tree = DecisionTreeClassifier(random_state=42)
tree_model = tree.fit(X_train, y_train)
# Mean recall over 5 cross-validation folds of the training split
print(cross_val_score(tree, X_train, y_train, cv=5, scoring='recall').mean())

In [None]:
#LOGISTIC REGRESSION

# Fit on the training split, then print the mean recall over 5 cross-validation folds
logic = LogisticRegression()
logic_model = logic.fit(X=X_train, y=y_train)
print(np.mean(cross_val_score(estimator=logic, X=X_train, y=y_train, scoring='recall', cv=5)))

The best model seems to be the KNeighbors classifier, with 100% correct classification. I set the performance measure (scoring) to recall because we want to catch all truly poisonous mushrooms (TP), and we don't really mind if we misclassify some non-poisonous mushrooms as poisonous (FP).

## Test Model

In [None]:
#KNEIGHBORS CLASSIFIER

# Predict on the held-out test split and print the confusion matrix and per-class report
knn_pred = knn_model.predict(X_test)
print(
    'KNEIGHBORS CLASSIFIER',
    confusion_matrix(y_true=y_test, y_pred=knn_pred),
    classification_report(y_true=y_test, y_pred=knn_pred),
    sep='\n',
)

In [None]:
#DECISION TREE CLASSIFIER

# Predict on the held-out test split and print the confusion matrix and per-class report
tree_pred = tree_model.predict(X_test)
print(
    'DECISION TREE CLASSIFIER',
    confusion_matrix(y_true=y_test, y_pred=tree_pred),
    classification_report(y_true=y_test, y_pred=tree_pred),
    sep='\n',
)

In [None]:
#LOGISTIC REGRESSION

# Predict on the held-out test split and print the confusion matrix and per-class report
logic_pred = logic_model.predict(X_test)
print(
    'LOGISTIC REGRESSION',
    confusion_matrix(y_true=y_test, y_pred=logic_pred),
    classification_report(y_true=y_test, y_pred=logic_pred),
    sep='\n',
)

PERFECT!! We can see that all of our models predicted 100% of the mushrooms correctly, whether they are poisonous or not. This kind of makes sense because we saw in the graphs above that the characteristics of poisonous and non-poisonous mushrooms often sit at opposite extremes.


If you liked this quick kernel, please UPVOTE, and feel free to leave a comment if anything is wrong in my code or my process. THANK YOU!