In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
#Fetch mushroom data from ucirepo
mushroom = fetch_ucirepo(id=73)
mushroom

{'data': {'ids': None,
  'features':      cap-shape cap-surface cap-color bruises odor gill-attachment  \
  0            x           s         n       t    p               f   
  1            x           s         y       t    a               f   
  2            b           s         w       t    l               f   
  3            x           y         w       t    p               f   
  4            x           s         g       f    n               f   
  ...        ...         ...       ...     ...  ...             ...   
  8119         k           s         n       f    n               a   
  8120         x           s         n       f    n               a   
  8121         f           s         n       f    n               a   
  8122         k           y         n       f    y               f   
  8123         x           s         n       f    n               a   
  
       gill-spacing gill-size gill-color stalk-shape  ...  \
  0               c         n          k         

In [3]:
X=mushroom.data.features
Y=mushroom.data.targets

In [4]:
mushroom.metadata

{'uci_id': 73,
 'name': 'Mushroom',
 'repository_url': 'https://archive.ics.uci.edu/dataset/73/mushroom',
 'data_url': 'https://archive.ics.uci.edu/static/public/73/data.csv',
 'abstract': 'From Audobon Society Field Guide; mushrooms described in terms of physical characteristics; classification: poisonous or edible',
 'area': 'Biology',
 'tasks': ['Classification'],
 'characteristics': ['Multivariate'],
 'num_instances': 8124,
 'num_features': 22,
 'feature_types': ['Categorical'],
 'demographics': [],
 'target_col': ['poisonous'],
 'index_col': None,
 'has_missing_values': 'yes',
 'missing_values_symbol': 'NaN',
 'year_of_dataset_creation': 1981,
 'last_updated': 'Thu Aug 10 2023',
 'dataset_doi': '10.24432/C5959T',
 'creators': [],
 'intro_paper': None,
 'additional_info': {'summary': "This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family (pp. 500-525).  Each species is identified as definitely 

In [5]:
mushroom.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,poisonous,Target,Categorical,,,,no
1,cap-shape,Feature,Categorical,,"bell=b,conical=c,convex=x,flat=f, knobbed=k,su...",,no
2,cap-surface,Feature,Categorical,,"fibrous=f,grooves=g,scaly=y,smooth=s",,no
3,cap-color,Feature,Binary,,"brown=n,buff=b,cinnamon=c,gray=g,green=r, pink...",,no
4,bruises,Feature,Categorical,,"bruises=t,no=f",,no
5,odor,Feature,Categorical,,"almond=a,anise=l,creosote=c,fishy=y,foul=f, mu...",,no
6,gill-attachment,Feature,Categorical,,"attached=a,descending=d,free=f,notched=n",,no
7,gill-spacing,Feature,Categorical,,"close=c,crowded=w,distant=d",,no
8,gill-size,Feature,Categorical,,"broad=b,narrow=n",,no
9,gill-color,Feature,Categorical,,"black=k,brown=n,buff=b,chocolate=h,gray=g, gre...",,no


In [6]:
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g


In [7]:
X.describe()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,6,4,10,2,9,2,2,2,12,2,...,4,9,9,1,4,3,5,9,6,7
top,x,y,n,f,n,f,c,b,b,t,...,s,w,w,p,w,o,p,w,v,d
freq,3656,3244,2284,4748,3528,7914,6812,5612,1728,4608,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [8]:
X.shape

(8124, 22)

In [9]:
# Checking for empty values
X.isnull().sum()

cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64

In [10]:
X.isnull().mean()

cap-shape                   0.000000
cap-surface                 0.000000
cap-color                   0.000000
bruises                     0.000000
odor                        0.000000
gill-attachment             0.000000
gill-spacing                0.000000
gill-size                   0.000000
gill-color                  0.000000
stalk-shape                 0.000000
stalk-root                  0.305268
stalk-surface-above-ring    0.000000
stalk-surface-below-ring    0.000000
stalk-color-above-ring      0.000000
stalk-color-below-ring      0.000000
veil-type                   0.000000
veil-color                  0.000000
ring-number                 0.000000
ring-type                   0.000000
spore-print-color           0.000000
population                  0.000000
habitat                     0.000000
dtype: float64

In [11]:
data = pd.concat((X,Y),axis=1)
data

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,poisonous
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,p
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,e
2,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,e
3,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,p
4,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,e
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,a,c,b,y,e,...,o,o,p,o,o,p,b,c,l,e
8120,x,s,n,f,n,a,c,b,y,e,...,o,o,p,n,o,p,b,v,l,e
8121,f,s,n,f,n,a,c,b,n,e,...,o,o,p,o,o,p,b,c,l,e
8122,k,y,n,f,y,f,c,n,b,t,...,w,w,p,w,o,e,w,v,l,p


In [13]:
# Filling the missing values with the most frequent value in 'stalk-root' column
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X[['stalk-root']])
imputer.statistics_

array(['b'], dtype=object)

In [14]:
X[['stalk-root']] = imputer.transform(X[['stalk-root']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['stalk-root']] = imputer.transform(X[['stalk-root']])


In [15]:
X[['stalk-root']]

Unnamed: 0,stalk-root
0,e
1,c
2,c
3,e
4,e
...,...
8119,b
8120,b
8121,b
8122,b


In [17]:
#Encoding data with Target Encoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import TargetEncoder

X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.3, random_state=42)

In [18]:
encoder= TargetEncoder(target_type='binary')
encoder.fit(X_train,Y_train.to_numpy().flatten())

X_train_enc = encoder.transform(X_train[X_train.columns])
X_test_enc = encoder.transform(X_test[X_test.columns])

In [19]:
type(X_train_enc)

numpy.ndarray

In [20]:
X_train_enc = pd.DataFrame(X_train_enc)

In [21]:
X_train_enc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.465786,0.549124,0.692099,0.179432,1.000000,0.491505,0.555308,0.300216,0.709943,0.437128,...,0.246275,0.385214,0.381467,0.481006,0.492239,0.508025,0.202817,0.967547,0.703480,0.749175
1,0.465786,0.328487,0.434887,0.179432,0.032488,0.491505,0.555308,0.300216,0.443275,0.437128,...,0.307602,0.687699,0.381467,0.481006,0.492239,0.508025,0.202817,0.118776,0.380796,0.395974
2,0.465786,0.328487,0.434887,0.695610,1.000000,0.491505,0.555308,0.300216,0.443275,0.539288,...,0.939537,0.964646,1.000000,0.481006,0.492239,0.508025,1.000000,0.967547,0.703480,0.395974
3,0.465786,0.535863,0.641975,0.179432,0.000000,0.491505,0.555308,0.300216,0.161214,0.539288,...,0.307602,0.385214,0.381467,0.481006,0.492239,0.508025,0.202817,0.118776,0.000000,0.349543
4,0.465786,0.535863,0.444675,0.695610,1.000000,0.491505,0.555308,0.889306,1.000000,0.437128,...,0.939537,0.385214,0.694664,0.481006,0.492239,0.508025,0.635732,0.755501,0.703480,0.395974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5681,0.465786,0.535863,0.444675,0.695610,1.000000,0.491505,0.555308,0.889306,1.000000,0.437128,...,0.307602,0.687699,0.381467,0.481006,0.492239,0.508025,0.635732,0.755501,0.703480,0.886013
5682,0.718640,0.535863,0.585945,0.179432,0.032488,0.491505,0.555308,0.300216,0.193271,0.539288,...,0.307602,0.385214,0.000000,0.481006,0.492239,0.115103,0.635732,0.755501,0.162098,0.000000
5683,0.485830,0.535863,0.444675,0.179432,0.000000,0.491505,0.555308,0.300216,0.193271,0.539288,...,0.294006,0.385214,0.381467,0.481006,0.492239,0.508025,0.202817,0.109221,0.380796,0.886013
5684,0.718640,0.549124,0.585945,0.695610,1.000000,0.491505,0.555308,0.889306,1.000000,0.437128,...,0.307602,0.687699,0.694664,0.481006,0.492239,0.508025,0.635732,0.755501,0.703480,0.886013


In [24]:
X_test_enc
X_test_enc = pd.DataFrame(X_test_enc)

array([[0.4858296 , 0.32848721, 0.44467538, ..., 0.10922094, 0.29031786,
        0.34954282],
       [0.4858296 , 0.54912427, 0.58594457, ..., 0.75550116, 0.70348019,
        0.71396935],
       [0.46578614, 0.53586321, 0.44467538, ..., 0.75550116, 0.70348019,
        0.71396935],
       ...,
       [0.46578614, 0.32848721, 0.4348865 , ..., 0.9675472 , 0.70348019,
        0.39597357],
       [0.46578614, 0.53586321, 0.44467538, ..., 0.10922094, 0.29031786,
        0.88601316],
       [0.46578614, 0.32848721, 0.4348865 , ..., 0.11877576, 0.70348019,
        0.39597357]])

In [27]:
#Training the model with Random Forest Algorithm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rnd_clf = RandomForestClassifier(n_estimators = 500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train_enc,Y_train.squeeze())
y_pred_rf = rnd_clf.predict(X_test_enc)

In [28]:
accuracy_score(Y_test, y_pred_rf)

1.0