In [21]:
import pandas as pd
import numpy as np

from sklearn import set_config

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [2]:
set_config(display = "text")

In [3]:
df = pd.read_csv("mushrooms.csv")
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


Attribute Information: (classes: edible=e, poisonous=p)

cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

bruises: bruises=t,no=f

odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

gill-attachment: attached=a,descending=d,free=f,notched=n

gill-spacing: close=c,crowded=w,distant=d

gill-size: broad=b,narrow=n

gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

stalk-shape: enlarging=e,tapering=t

stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

veil-type: partial=p,universal=u

veil-color: brown=n,orange=o,white=w,yellow=y

ring-number: none=n,one=o,two=t

ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d


In [4]:
X = df.drop("class", axis = 1)
y = df["class"]

In [5]:
X.isna().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [25]:
df["class"].value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [6]:
categorical_var = X.columns.tolist()
categorical_var

['cap-shape',
 'cap-surface',
 'cap-color',
 'bruises',
 'odor',
 'gill-attachment',
 'gill-spacing',
 'gill-size',
 'gill-color',
 'stalk-shape',
 'stalk-root',
 'stalk-surface-above-ring',
 'stalk-surface-below-ring',
 'stalk-color-above-ring',
 'stalk-color-below-ring',
 'veil-type',
 'veil-color',
 'ring-number',
 'ring-type',
 'spore-print-color',
 'population',
 'habitat']

In [73]:
categorical_trans = make_pipeline(OneHotEncoder(sparse = False, handle_unknown = "ignore"))

In [74]:
preprocess = make_column_transformer((categorical_trans, categorical_var), remainder = "passthrough")

In [75]:
lr = LogisticRegression()

logreg = make_pipeline(preprocess, lr)
logreg_cv = cross_val_score(logreg, X, y, cv = 10)
print("Logistic Regression cross validation: {}".format(logreg_cv))
print("mean: {}".format(logreg_cv.mean()))

Logistic Regression cross validation: [0.79827798 1.         1.         1.         0.99630542 1.
 1.         1.         0.88300493 1.        ]
mean: 0.9677588327607414


In [76]:
param_grid_lr = {}

param_grid_lr["logisticregression__C"] = [1, 3, 5, 10, 100, 1000]
param_grid_lr["logisticregression__solver"] = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
param_grid_lr["logisticregression__max_iter"] = [1000, 3000, 5000]

In [77]:
logreg_grid = RandomizedSearchCV(logreg, param_grid_lr, cv = 10, n_iter= 30, scoring = "accuracy")
logreg_grid.fit(X,y)
print("Best scoring: {}".format(logreg_grid.best_score_))
print("Best parameter: {}".format(logreg_grid.best_params_))

Best scoring: 0.9806845654663443
Best parameter: {'logisticregression__solver': 'lbfgs', 'logisticregression__max_iter': 1000, 'logisticregression__C': 1000}


In [69]:
svc = SVC()

svc = make_pipeline(preprocess, svc)
svc_cv = cross_val_score(svc, X, y, cv = 10)
print("Support Vector Machine Classifier cross validation: {}".format(svc_cv))
print("Mean: {}".format(svc_cv.mean()))

Support Vector Machine Classifier cross validation: [0.68511685 1.         1.         1.         0.99630542 0.99876847
 1.         1.         0.97044335 0.99876847]
Mean: 0.9649402565454226


In [14]:
param_grid_svc = [
    {"svc__kernel": ["rbf"], "svc__gamma":[0.1,0.5,1,2,5,10], "svc__C" : [0.1,1,10,100,1000], "svc__decision_function_shape": ["ovo", "ovr"]},
    {"svc__kernel": ["linear"], "svc__C" : [0.1,1,10,100,1000], "svc__decision_function_shape": ["ovo", "ovr"]},
    {"svc__kernel": ["poly"],  "svc__gamma":[0.1,0.5,1,2,5,10], "svc__degree": [2,3,4,5], "svc__C": [0.1,1,10,100,1000], "svc__decision_function_shape": ["ovo", "ovr"]},
    {"svc__kernel": ["sigmoid"], "svc__gamma":[0.1,0.5,1,2,5,10], "svc__C" : [0.1,1,10,100,1000], "svc__decision_function_shape": ["ovo", "ovr"]},
]

In [88]:
svc_grid = RandomizedSearchCV(svc, param_grid_svc, n_iter= 370, scoring = "accuracy", cv = 10)
svc_grid.fit(X,y)
print("Best score: {}".format(svc_grid.best_score_))
print("Best parameters: {}".format(svc_grid.best_params_))



Best score: 0.9827780100461103
Best parameters: {'svc__kernel': 'linear', 'svc__decision_function_shape': 'ovo', 'svc__C': 1}


In [36]:
svc_best = make_pipeline(preprocess, SVC(kernel = "linear", decision_function_shape = "ovo", C = 1, probability = True))
logreg_best =  make_pipeline(preprocess, LogisticRegression(solver = "lbfgs", max_iter = 1000, C = 1000))

voting = VotingClassifier(estimators = [("logistic", logreg_best), ("svm", svc_best)])
voting_cv = cross_val_score(voting, X, y, cv = 10)

print("Voting classifier cross validation: {}".format(voting_cv))
print("Mean: {}".format(voting_cv.mean()))

Voting classifier cross validation: [0.76383764 1.         1.         1.         1.         1.
 1.         1.         0.98522167 1.        ]
Mean: 0.974905931325323


In [58]:
param_grid_voting = {}

param_grid_voting["voting"] = ["hard", "soft"]
param_grid_voting["weights"] = [(1, 10), (1, 9), (1, 8), (1, 7)]

In [59]:
voting_grid = RandomizedSearchCV(voting, param_grid_voting, n_iter = 8, cv = 10, scoring = "accuracy")
voting_grid.fit(X,y)

print("Best score: {}".format(voting_grid.best_score_))
print("Best parameters: {}".format(voting_grid.best_params_))

Best score: 0.983516926302268
Best parameters: {'weights': (1, 10), 'voting': 'soft'}
