In [1]:
#libraries
import pandas as pd # data processing
import numpy as np # linear algebra

#ploting libraries
import seaborn as sns
import matplotlib.pyplot as plt 

#feature engineering
from sklearn import preprocessing

# data transformation
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

# for Model Building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn import tree
import pickle as pickle5


In [2]:
# Read the mushroom dataset from the CSV file (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer='mushrooms.csv')

In [3]:
# Display the column labels of the freshly loaded table.
data.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [4]:
# Dropping the columns judged insignificant for this analysis
columns_to_drop = [
    'gill-attachment',
    'ring-number',
    'stalk-surface-below-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
]
data = data.drop(columns=columns_to_drop)

In [5]:
# Rename the hyphenated columns to snake_case so they are valid identifiers.
# The mapping is derived from the list instead of being spelled out twice, and the
# result is reassigned rather than mutated with inplace=True, so the cell is a plain
# "old frame in, new frame out" transformation.
hyphenated_columns = [
    'cap-shape', 'cap-surface', 'cap-color',
    'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-color-above-ring',
    'ring-type', 'spore-print-color',
]
snake_case_names = {name: name.replace('-', '_') for name in hyphenated_columns}
data = data.rename(columns=snake_case_names)
data.head()

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_spacing,gill_size,gill_color,stalk_shape,stalk_root,stalk_surface_above_ring,stalk_color_above_ring,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,c,n,k,e,e,s,w,p,k,s,u
1,e,x,s,y,t,a,c,b,k,e,c,s,w,p,n,n,g
2,e,b,s,w,t,l,c,b,n,e,c,s,w,p,n,n,m
3,p,x,y,w,t,p,c,n,n,e,e,s,w,p,k,s,u
4,e,x,s,g,f,n,w,b,k,t,e,s,w,e,n,a,g


In [6]:
# Keep the cap shapes listed below (title-cased); every other shape becomes 'Other_shape'.
kept_cap_shapes = ['x', 'f', 'k']
cap_shape_codes = data['cap_shape']
data['cap_shape'] = cap_shape_codes.str.title().where(
    cap_shape_codes.isin(kept_cap_shapes), 'Other_shape'
)

In [7]:
# Remove the rows whose cap surface is 'g' (grooves); the author considers their
# contribution negligible. Note that this filters rows, it does not drop a column.
# .copy() hands the following cells an independent frame to assign columns into,
# instead of a filtered slice of the previous frame (that pattern raises pandas'
# SettingWithCopyWarning, which the global warnings filter would silently hide).
data = data.loc[data['cap_surface'] != 'g'].copy()

In [8]:
# Keep the cap colors listed below (title-cased); every other color becomes 'Other_color'.
kept_cap_colors = ['n', 'y', 'w', 'g', 'e']
cap_color_codes = data['cap_color']
data['cap_color'] = cap_color_codes.str.title().where(
    cap_color_codes.isin(kept_cap_colors), 'Other_color'
)

In [9]:
# Keep the odor codes listed below (title-cased); every other odor becomes 'Other'.
kept_odors = ['n', 'f', 'y', 's', 'a', 'l']
odor_codes = data['odor']
data['odor'] = odor_codes.str.title().where(odor_codes.isin(kept_odors), 'Other')

In [10]:
# Keep the gill colors listed below (title-cased); every other color becomes 'Other_color'.
kept_gill_colors = ['k', 'n', 'g', 'p', 'w', 'h', 'u', 'b']
gill_color_codes = data['gill_color']
data['gill_color'] = gill_color_codes.str.title().where(
    gill_color_codes.isin(kept_gill_colors), 'Other_color'
)

In [11]:
# Keep the stalk-root codes listed below (title-cased); everything else becomes 'Other'.
kept_stalk_roots = ['e', 'b']
stalk_root_codes = data['stalk_root']
data['stalk_root'] = stalk_root_codes.str.title().where(
    stalk_root_codes.isin(kept_stalk_roots), 'Other'
)

In [12]:
# Keep the stalk-surface codes listed below (title-cased); everything else becomes 'Other'.
kept_stalk_surfaces = ['s', 'k']
stalk_surface_codes = data['stalk_surface_above_ring']
data['stalk_surface_above_ring'] = stalk_surface_codes.str.title().where(
    stalk_surface_codes.isin(kept_stalk_surfaces), 'Other'
)

In [13]:
# Keep the stalk colors listed below (title-cased); every other color becomes 'Other'.
kept_stalk_colors = ['w', 'g', 'p', 'n', 'b']
stalk_color_codes = data['stalk_color_above_ring']
data['stalk_color_above_ring'] = stalk_color_codes.str.title().where(
    stalk_color_codes.isin(kept_stalk_colors), 'Other'
)

In [14]:
# Keep the ring types listed below (title-cased); every other type becomes 'Other'.
kept_ring_types = ['p', 'e', 'l']
ring_type_codes = data['ring_type']
data['ring_type'] = ring_type_codes.str.title().where(
    ring_type_codes.isin(kept_ring_types), 'Other'
)

In [15]:
# Keep the spore-print colors listed below (title-cased); every other color becomes 'Other'.
kept_spore_colors = ['k', 'n', 'h', 'w']
spore_color_codes = data['spore_print_color']
data['spore_print_color'] = spore_color_codes.str.title().where(
    spore_color_codes.isin(kept_spore_colors), 'Other'
)

In [16]:
# Keep the population codes listed below (title-cased); everything else becomes 'Other'.
kept_populations = ['s', 'v', 'y']
population_codes = data['population']
data['population'] = population_codes.str.title().where(
    population_codes.isin(kept_populations), 'Other'
)

In [17]:
# Keep the habitat codes listed below (title-cased); everything else becomes 'Other'.
kept_habitats = ['g', 'd', 'p', 'l']
habitat_codes = data['habitat']
data['habitat'] = habitat_codes.str.title().where(
    habitat_codes.isin(kept_habitats), 'Other'
)

In [19]:
# Count the distinct values left in each column after the category grouping.
data.nunique()

class                       2
cap_shape                   4
cap_surface                 3
cap_color                   6
bruises                     2
odor                        7
gill_spacing                2
gill_size                   2
gill_color                  9
stalk_shape                 2
stalk_root                  3
stalk_surface_above_ring    3
stalk_color_above_ring      6
ring_type                   4
spore_print_color           5
population                  4
habitat                     5
dtype: int64

In [None]:
# NOTE(review): unexecuted scratch cell. The bare tuple lists the same odor codes that
# the odor grouping cell keeps; it has no effect on `data` and can be deleted.
'n','f','y','s','a','l'

In [26]:
# Translate the remaining category codes into readable labels, one column at a time.
# Values that are not keys of a column's map (for example the 'Other*' buckets created
# by the grouping cells) pass through unchanged.
#
# Fix: the stalk-surface label was misspelled 'smoth'; it is now 'smooth', matching the
# spelling used for cap_surface. The one-hot column derived from it is renamed as a
# consequence, so anything that referenced the misspelled column name must be updated.
CATEGORY_LABELS = {
    'cap_shape': {'X': 'convex', 'F': 'flat', 'K': 'knobbed'},
    'cap_surface': {'s': 'smooth', 'y': 'scaly', 'f': 'fibrous'},
    'cap_color': {'N': 'brown', 'Y': 'yellow', 'W': 'white', 'G': 'grey', 'E': 'red'},
    'bruises': {'t': 'bruises', 'f': 'no_bruises'},
    'odor': {'N': 'none', 'F': 'foul', 'Y': 'fishy', 'S': 'spicy', 'A': 'almond', 'L': 'anise'},
    'gill_spacing': {'c': 'close', 'w': 'crowded'},
    'gill_size': {'n': 'narrow', 'b': 'broad'},
    'gill_color': {'K': 'black', 'N': 'brown', 'G': 'grey', 'P': 'pink',
                   'W': 'white', 'H': 'chocolate', 'U': 'purple', 'B': 'buff'},
    'stalk_shape': {'e': 'enlarging', 't': 'tapering'},
    'stalk_root': {'E': 'equal', 'B': 'bulbous'},
    'stalk_surface_above_ring': {'S': 'smooth', 'K': 'silky'},
    'stalk_color_above_ring': {'W': 'white', 'G': 'grey', 'P': 'pink', 'N': 'brown', 'B': 'buff'},
    'ring_type': {'P': 'pendant', 'E': 'evanescent', 'L': 'large'},
    'spore_print_color': {'K': 'black', 'N': 'brown', 'H': 'chocolate', 'W': 'white'},
    'population': {'S': 'scattered', 'V': 'several', 'Y': 'solitary'},
    'habitat': {'G': 'grasses', 'D': 'woods', 'P': 'paths', 'L': 'leaves'},
}

for column_name, code_to_label in CATEGORY_LABELS.items():
    data[column_name] = data[column_name].replace(code_to_label)
                            

In [48]:
# One-hot encode the features: every column except the first one
feature_frame = data.iloc[:, 1:]
data_F = pd.get_dummies(data=feature_frame)

In [49]:
# Label encoding on the target column
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(data['class'])
data['class'] = label_encoder.transform(data['class'])

In [50]:
# Put the encoded target first, followed by the one-hot encoded feature columns
encoded_parts = [data['class'], data_F]
data = pd.concat(encoded_parts, axis='columns')

In [51]:
# Split the table into features (X) and target (y): X is every column after the first
feature_names = data.columns[1:]
X = data.loc[:, feature_names]

In [52]:
# Target vector: the encoded 'class' column
y = data.loc[:, 'class']

In [53]:
# Train/test split: 70% of the rows for training, 30% held out for testing,
# with a fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [54]:
# SVM classification with a linear kernel (SVC's default kernel is 'rbf');
# C controls how soft the margin is.
svm = SVC(kernel='linear', C=1)
svm1 = svm.fit(X_train, y_train)
# Mean accuracy on the held-out test set
result_svm = svm1.score(X=X_test, y=y_test)

In [55]:
# Test-set accuracy, rounded to four decimal places
rounded_accuracy = np.round(result_svm, 4)
print(rounded_accuracy)

0.9992


In [56]:
# Persist the fitted SVM to disk.
# The imports cell binds the module as `pickle5` (import pickle as pickle5), so the bare
# name `pickle` is undefined on a fresh kernel and raised NameError here.
# The `with` block guarantees the file is flushed and closed after the dump.
filename = 'final_svm_model.pkl'
with open(filename, 'wb') as model_file:
    pickle5.dump(svm, model_file)

In [57]:
# Load the persisted model back from disk.
# Uses the `pickle5` alias bound by the imports cell (the bare name `pickle` is not
# defined) and closes the file handle via `with`.
# NOTE: only unpickle files you trust; loading a pickle can execute arbitrary code.
with open('final_svm_model.pkl', 'rb') as model_file:
    pickled_model = pickle5.load(model_file)

In [58]:
# Predict with the reloaded model exactly as it was saved.
# The model was already fitted before it was pickled; fitting it again here would
# overwrite the loaded state and hide any problem with the saved file, so the
# re-fit has been removed.
pk = pickled_model.predict(X_test)
pk

array([1, 1, 0, ..., 0, 0, 0])