# Problem Statement:
Data Set Information:
This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family (pp. 500-525). Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended. This latter class was combined with the poisonous one. The Guide clearly states that there is no simple rule for determining the edibility of a mushroom; no rule like ``leaflets three, let it be'' for Poisonous Oak and Ivy.

Thus, in this project, you have to predict the "Class" column. 

In [1]:
#Importing some libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [5]:
#Loading the dataset

df=pd.read_csv('mushrooms.csv')

In [6]:
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [8]:
#checking for null values

df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [9]:
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [10]:
#separating Independent and dependent variables

X=df.drop('class', axis=1)
Y=df['class']

In [11]:
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g


In [12]:
Y.head()

0    p
1    e
2    e
3    p
4    e
Name: class, dtype: object

In [13]:
Y.value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [15]:
#applying one hot encoding for independent variables

X=pd.get_dummies(X, drop_first=True)
X.shape

(8124, 95)

In [16]:
#applying standard scaler

from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
df_x=scaler.fit_transform(X)


In [17]:
#creating final data with 55 columns
from sklearn.decomposition import PCA
finalPCA=PCA(n_components=55)
finalData= finalPCA.fit_transform(df_x)

In [18]:
#converting into dataframe

x=pd.DataFrame(finalData)
x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,-3.680146,-1.484409,-0.636762,0.248643,-3.230014,1.053069,-0.58043,-0.187725,-2.996616,5.331224,...,-0.298724,0.23273,1.025658,-0.318986,-0.10909,0.042684,0.081262,0.736295,0.141309,-0.53246
1,-2.864404,-1.767377,1.294714,1.785572,-2.188659,1.001332,2.503931,-2.144354,1.173024,-0.367687,...,-0.726872,-1.84036,0.670044,0.56299,-0.527272,0.098579,0.078636,-0.071815,1.179174,1.059119
2,-4.093188,-1.873127,1.274432,1.837968,-1.928017,1.560156,4.710089,-4.025767,2.069354,-0.613832,...,0.765955,1.534497,-0.840492,-0.550953,0.416632,0.098524,-0.058775,0.070865,-0.494323,-0.441319
3,-3.547954,-2.096685,-0.136878,0.212695,-2.398306,1.227992,0.002909,-0.393715,-2.752352,5.091455,...,0.321811,0.724809,-0.234545,-0.423285,-0.520898,0.199975,0.111603,0.955067,0.261964,-0.265992
4,-2.638086,-1.462163,-0.075674,0.637974,-2.899138,-0.106029,-2.735822,1.649707,0.262649,-0.678335,...,-0.182074,-0.218617,0.488982,0.020226,0.050004,0.115433,-0.09213,-0.306678,0.435675,0.019898


In [19]:
#appying label encoder for dependent variable

from sklearn.preprocessing import LabelEncoder
LE=LabelEncoder()
y=LE.fit_transform(Y)

In [20]:
y=pd.DataFrame(y)
y.head()

Unnamed: 0,0
0,1
1,0
2,0
3,1
4,0


In [21]:
y.shape

(8124, 1)

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [23]:
#finding the best random_state
maxAccu=0
maxRS=0
for i in range(1,200):
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.30,random_state=i)
    LR=LogisticRegression()
    LR.fit(x_train,y_train)
    predrf=LR.predict(x_test)
    acc=accuracy_score(y_test,predrf)
    if acc>maxAccu:
        maxAccu=acc
        maxRS=i
print('Best accuracy is ',maxAccu, 'on Random_state ',maxRS)

Best accuracy is  1.0 on Random_state  4


In [25]:
#got the best random_state as 4,so lets apply train_test_split using random_state as 4

In [26]:
#Logistic Regrssion

LR=LogisticRegression()
LR.fit(x_train,y_train)
predlr=LR.predict(x_test)
print(accuracy_score(y_test,predlr))
print(confusion_matrix(y_test,predlr))
print(classification_report(y_test,predlr))

1.0
[[1257    0]
 [   0 1181]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1257
           1       1.00      1.00      1.00      1181

    accuracy                           1.00      2438
   macro avg       1.00      1.00      1.00      2438
weighted avg       1.00      1.00      1.00      2438



In [27]:
#Decision Tree Classifier()

DTC=DecisionTreeClassifier()
DTC.fit(x_train,y_train)
preddtc=DTC.predict(x_test)
print(accuracy_score(y_test,preddtc))

0.9950779327317474


In [28]:
#Random forest classifier

RFC=RandomForestClassifier()
RFC.fit(x_train,y_train)
predrfc=RFC.predict(x_test)
print(accuracy_score(y_test,predrfc))

1.0


In [29]:
#Support vector classifier

SV=SVC()
SV.fit(x_train,y_train)
predsvc=SV.predict(x_test)
print(accuracy_score(y_test,predsvc))

0.9995898277276456


In [30]:
#applying cross_val_score to see whether it is overfitting or not

from sklearn.model_selection import cross_val_score
print(cross_val_score(LR,x,y,cv=5).mean())
print(cross_val_score(DTC,x,y,cv=5).mean())
print(cross_val_score(RFC,x,y,cv=5).mean())
print(cross_val_score(SV,x,y,cv=5).mean())

0.9130643425539977
0.8931157256536565
0.8867195907540735
0.8741597574838954


In [31]:
#Minimum difference in accuracy and cross_val_score is for Logistic regression,so this our best model

In [32]:
#hyper parameter tuning 

from sklearn.model_selection import GridSearchCV
parameter={'penalty':['l1','l2','elasticnet'],'C':[1,100]}
GCV=GridSearchCV(LogisticRegression(),parameter,cv=5)
GCV.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 100], 'penalty': ['l1', 'l2', 'elasticnet']})

In [33]:
GCV.best_params_

{'C': 100, 'penalty': 'l2'}

In [34]:
Final_model=LogisticRegression(C=100,penalty='l2')
Final_model.fit(x_train,y_train)
pred=Final_model.predict(x_test)
print(accuracy_score(y_test,pred))

1.0


In [36]:
import joblib
joblib.dump(Final_model,'Mushroomlogregmodel.pkl')
final_result=joblib.load('Mushroomlogregmodel.pkl')
final_result.predict(x_test)

array([0, 0, 0, ..., 1, 0, 0])