In [4]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

import joblib

In [5]:
# Load the raw mushroom table from the working directory.
df = pd.read_csv(filepath_or_buffer='mushrooms.csv')

In [6]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [7]:
# Decoding specs, one per column: a comma-separated list of ``name=code``
# pairs. 'class' is deliberately the first key; the remaining keys are the
# feature columns.
column_change = {
    'class': 'edible=e, poisonous=p',
    # cap
    'cap-shape': 'bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s',
    'cap-surface': 'fibrous=f,grooves=g,scaly=y,smooth=s',
    'cap-color': 'brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y',
    'bruises': 'bruises=t,no=f',
    'odor': 'almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s',
    # gills
    'gill-attachment': 'attached=a,descending=d,free=f,notched=n',
    'gill-spacing': 'close=c,crowded=w,distant=d',
    'gill-size': 'broad=b,narrow=n',
    'gill-color': 'black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y',
    # stalk
    'stalk-shape': 'enlarging=e,tapering=t',
    'stalk-root': 'bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?',
    'stalk-surface-above-ring': 'fibrous=f,scaly=y,silky=k,smooth=s',
    'stalk-surface-below-ring': 'fibrous=f,scaly=y,silky=k,smooth=s',
    'stalk-color-above-ring': 'brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y',
    'stalk-color-below-ring': 'brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y',
    # veil and ring
    'veil-type': 'partial=p,universal=u',
    'veil-color': 'brown=n,orange=o,white=w,yellow=y',
    'ring-number': 'none=n,one=o,two=t',
    'ring-type': 'cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z',
    # spores and environment
    'spore-print-color': 'black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y',
    'population': 'abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y',
    'habitat': 'grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d',
}
                

In [8]:
# Feature subset that is passed to the model for fitting and prediction.
columns = [
    'cap-shape',
    'cap-surface',
    'cap-color',
    'odor',
    'gill-color',
    'habitat',
]

In [9]:
def decode_field(df, col_code_map):
    """Return a copy of ``df`` with coded values replaced by readable names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the coded columns. It is not modified.
    col_code_map : dict[str, str]
        Maps a column name to a spec string of comma-separated ``name=code``
        pairs, e.g. ``'bruises=t,no=f'``. Whitespace around each pair, name
        and code is ignored.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every column listed in ``col_code_map`` has
        its codes replaced by the matching names. Values that have no entry
        in the column's spec (including missing values) are kept unchanged
        instead of being turned into NaN.

    Raises
    ------
    KeyError
        If a column named in ``col_code_map`` is not present in ``df``.
    ValueError
        If a spec entry is not of the form ``name=code``.
    """
    decoded = df.copy()
    for col, spec in col_code_map.items():
        code_to_name = {}
        for pair in spec.split(','):
            name, sep, code = pair.strip().partition('=')
            if not sep:
                raise ValueError(
                    f"Malformed entry {pair!r} in spec for column {col!r}; "
                    "expected 'name=code'"
                )
            code_to_name[code.strip()] = name.strip()
        # Series.map(dict) would turn every code missing from the spec into
        # NaN; falling back to the original value keeps unexpected codes
        # visible instead of silently erasing them.
        decoded[col] = decoded[col].map(
            lambda value, lookup=code_to_name: lookup.get(value, value)
        )
    return decoded

In [10]:
# Decode every column described in `column_change`, then preview the result.
df_decode = decode_field(df=df, col_code_map=column_change)
df_decode.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,poisonous,convex,smooth,brown,bruises,pungent,free,close,narrow,black,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,bruises,almond,free,close,broad,black,...,smooth,white,white,partial,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,bruises,anise,free,close,broad,brown,...,smooth,white,white,partial,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,bruises,pungent,free,close,narrow,brown,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,white,white,partial,white,one,evanescent,brown,abundant,grasses


In [11]:
# Binary target: poisonous is the positive class (1), edible the negative (0).
label_encoding = {'poisonous': 1, 'edible': 0}
Y = df_decode['class'].map(label_encoding)

# Feature frame: every key of `column_change` except the first one, which is
# the 'class' target column.
feature_names = list(column_change)[1:]
X = df_decode[feature_names]

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Log-loss CatBoost classifier; every column named in `columns` is declared
# as a categorical feature.
model = CatBoostClassifier(
    objective='Logloss',
    verbose=False,
    cat_features=columns,
)
# Train on the `columns` subset of the training features only.
model.fit(X_train[columns], y_train)

<catboost.core.CatBoostClassifier at 0x7fc4f9360d30>

In [14]:
# Class probabilities for the held-out rows, using the same feature subset
# the model was fitted on.
y_pred = model.predict_proba(X_test[columns])
# Second probability column: one score per test row.
positive_proba = y_pred[:, 1]
print(positive_proba.shape)

(1625,)


In [15]:
# ROC AUC is scored on the second probability column of `y_pred`.
# accuracy_score is not applied here: it compares hard class labels, while
# `y_pred` holds predict_proba output.
auc = roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])
print(auc)

0.9999211196160345


In [16]:
# Save the decoded table. With the default index=True the row index is
# written as the first, unnamed column of the CSV.
df_decode.to_csv(path_or_buf='mushroom_decode.csv')

In [17]:
# Persist the fitted classifier with joblib. The dump is pickle-based, so the
# resulting file should only ever be loaded from a trusted source.
with open('mushroom_model_dump.pkl', mode='wb') as model_file:
    joblib.dump(model, model_file)