## Mushroom dataset visualization and processing

In [1]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets.methods import dataset_information, delete_dataset_features, fill_dataset_samples, extract_samples_labels

In [2]:
## Constants
# Every tunable value used by the cells below lives here.

# CSV file read with pd.read_csv; a relative path, so it is resolved against
# the kernel's current working directory.
DATASET_FILENAME = 'mushroom_secondary.csv'
# Second argument of dataset_information.
# NOTE(review): presumably switches its counts to fractions — confirm in datasets.methods.
NORMALIZATION = True
# Second argument of delete_dataset_features.
# NOTE(review): presumably the null-fraction cutoff above which a column is dropped — confirm.
DELETION_THRESHOLD = 0.20
# Second argument of fill_dataset_samples.
# NOTE(review): presumably the placeholder written over missing values; 'u' also
# occurs as a genuine category in some columns (e.g. season) — confirm it cannot
# collide with a real category in the imputed columns.
IMPUTATION_VALUE = 'u'

# Arguments forwarded to sklearn's train_test_split.
TEST_SIZE = 0.2  # fraction of the samples held out for testing
RANDOM_SEED = 1234  # fixed seed so the shuffled split is reproducible
SHUFFLE = True  # shuffle the samples before splitting

### Read dataset

In [3]:
## original dataset
# Load the raw CSV (fields are separated by ';') into a DataFrame and render it.
# DATASET_FILENAME is relative, so the notebook must be run from the directory
# that contains the file.

mushroom_dataset = pd.read_csv(DATASET_FILENAME, sep=';')
display(mushroom_dataset)

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,p,16.60,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.80,...,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,p,1.18,s,s,y,f,f,f,f,3.93,...,,,y,,,f,f,,d,a
61065,p,1.27,f,s,y,f,f,f,f,3.18,...,,,y,,,f,f,,d,a
61066,p,1.27,s,s,y,f,f,f,f,3.86,...,,,y,,,f,f,,d,u
61067,p,1.24,f,s,y,f,f,f,f,3.56,...,,,y,,,f,f,,d,u


In [4]:
# Summarize the raw dataset with the project helper; the helper returns a
# mapping, which is printed one "key: value" entry per line.
information = dataset_information(mushroom_dataset, NORMALIZATION)

for k, v in information.items():
    print(f'{k}: {v}')

# pandas' own overview: per-column non-null counts and dtypes.
mushroom_dataset.info()

samples: 61069
features: 21
features names: ['class' 'cap-diameter' 'cap-shape' 'cap-surface' 'cap-color'
 'does-bruise-or-bleed' 'gill-attachment' 'gill-spacing' 'gill-color'
 'stem-height' 'stem-width' 'stem-root' 'stem-surface' 'stem-color'
 'veil-type' 'veil-color' 'has-ring' 'ring-type' 'spore-print-color'
 'habitat' 'season']
classes: ['e' 'p']
classes samples: class
p    0.554913
e    0.445087
Name: count, dtype: float64
null samples: 1.0
null features: class                   0.000000
cap-diameter            0.000000
cap-shape               0.000000
cap-surface             0.231214
cap-color               0.000000
does-bruise-or-bleed    0.000000
gill-attachment         0.161850
gill-spacing            0.410405
gill-color              0.000000
stem-height             0.000000
stem-width              0.000000
stem-root               0.843931
stem-surface            0.624277
stem-color              0.000000
veil-type               0.947977
veil-color              0.878613
has-rin

### Preprocessing

In [5]:
## dataset after columns deletion
# Run the project helper with DELETION_THRESHOLD and bind the result to a new
# name instead of overwriting mushroom_dataset.
# NOTE(review): whether the helper also mutates its input is not visible here — confirm.

deletion_mushroom_dataset = delete_dataset_features(mushroom_dataset, DELETION_THRESHOLD)
display(deletion_mushroom_dataset)

Unnamed: 0,class,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-attachment,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,p,15.26,x,o,f,e,w,16.95,17.09,w,t,g,d,w
1,p,16.60,x,o,f,e,w,17.99,18.19,w,t,g,d,u
2,p,14.07,x,o,f,e,w,17.80,17.74,w,t,g,d,w
3,p,14.17,f,e,f,e,w,15.77,15.98,w,t,p,d,w
4,p,14.64,x,o,f,e,w,16.53,17.20,w,t,p,d,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,p,1.18,s,y,f,f,f,3.93,6.22,y,f,f,d,a
61065,p,1.27,f,y,f,f,f,3.18,5.43,y,f,f,d,a
61066,p,1.27,s,y,f,f,f,3.86,6.37,y,f,f,d,u
61067,p,1.24,f,y,f,f,f,3.56,5.44,y,f,f,d,u


In [6]:
# Same summary as for the raw dataset, now on the frame produced by the column
# deletion step, to check which columns remain and how many nulls are left.
information = dataset_information(deletion_mushroom_dataset, NORMALIZATION)

for k, v in information.items():
    print(f'{k}: {v}')

# pandas' own overview: per-column non-null counts and dtypes.
deletion_mushroom_dataset.info()

samples: 61069
features: 14
features names: ['class' 'cap-diameter' 'cap-shape' 'cap-color' 'does-bruise-or-bleed'
 'gill-attachment' 'gill-color' 'stem-height' 'stem-width' 'stem-color'
 'has-ring' 'ring-type' 'habitat' 'season']
classes: ['e' 'p']
classes samples: class
p    0.554913
e    0.445087
Name: count, dtype: float64
null samples: 0.19653179190751446
null features: class                   0.000000
cap-diameter            0.000000
cap-shape               0.000000
cap-color               0.000000
does-bruise-or-bleed    0.000000
gill-attachment         0.161850
gill-color              0.000000
stem-height             0.000000
stem-width              0.000000
stem-color              0.000000
has-ring                0.000000
ring-type               0.040462
habitat                 0.000000
season                  0.000000
dtype: float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 14 columns):
 #   Column                Non-Null 

In [7]:
## dataset after rows imputation
# Run the project helper with IMPUTATION_VALUE on the column-reduced frame and
# bind the result to a new name.
# NOTE(review): whether the helper also mutates its input is not visible here — confirm.

imputation_mushroom_dataset = fill_dataset_samples(deletion_mushroom_dataset, IMPUTATION_VALUE)
display(imputation_mushroom_dataset)

Unnamed: 0,class,cap-diameter,cap-shape,cap-color,does-bruise-or-bleed,gill-attachment,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,p,15.26,x,o,f,e,w,16.95,17.09,w,t,g,d,w
1,p,16.60,x,o,f,e,w,17.99,18.19,w,t,g,d,u
2,p,14.07,x,o,f,e,w,17.80,17.74,w,t,g,d,w
3,p,14.17,f,e,f,e,w,15.77,15.98,w,t,p,d,w
4,p,14.64,x,o,f,e,w,16.53,17.20,w,t,p,d,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,p,1.18,s,y,f,f,f,3.93,6.22,y,f,f,d,a
61065,p,1.27,f,y,f,f,f,3.18,5.43,y,f,f,d,a
61066,p,1.27,s,y,f,f,f,3.86,6.37,y,f,f,d,u
61067,p,1.24,f,y,f,f,f,3.56,5.44,y,f,f,d,u


In [8]:
# Summarize the dataset after imputation. NORMALIZATION is passed instead of a
# literal True so this summary follows the same configuration switch as the
# summaries of the raw and column-reduced datasets.
information = dataset_information(imputation_mushroom_dataset, NORMALIZATION)

for k, v in information.items():
    print(f'{k}: {v}')

# pandas' own overview: per-column non-null counts and dtypes.
imputation_mushroom_dataset.info()

samples: 61069
features: 14
features names: ['class' 'cap-diameter' 'cap-shape' 'cap-color' 'does-bruise-or-bleed'
 'gill-attachment' 'gill-color' 'stem-height' 'stem-width' 'stem-color'
 'has-ring' 'ring-type' 'habitat' 'season']
classes: ['e' 'p']
classes samples: class
p    0.554913
e    0.445087
Name: count, dtype: float64
null samples: 0.0
null features: class                   0.0
cap-diameter            0.0
cap-shape               0.0
cap-color               0.0
does-bruise-or-bleed    0.0
gill-attachment         0.0
gill-color              0.0
stem-height             0.0
stem-width              0.0
stem-color              0.0
has-ring                0.0
ring-type               0.0
habitat                 0.0
season                  0.0
dtype: float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class          

### Features and labels

In [9]:
def _label_to_int(label: str) -> int:
    """Encode a single class label as an integer.

    Args:
        label: class label of one sample.

    Returns:
        1 when ``label`` equals 'e', -1 for every other value.
    """
    return 1 if label == 'e' else -1

# Element-wise encoder applied to a whole array of labels. The scalar helper has
# its own name so `labels_map` is only ever bound to the vectorized callable.
# Fixing `otypes` lets the encoder accept an empty array (np.vectorize raises
# ValueError on size-0 input otherwise) and avoids the extra probing call
# np.vectorize makes on the first element to guess the output dtype.
labels_map = np.vectorize(_label_to_int, otypes=[int])

# Separate the imputed dataset into samples and labels with the project helper,
# handing it labels_map as the label encoder.
samples_set, labels_set = extract_samples_labels(imputation_mushroom_dataset, labels_map)

# For the samples first and the labels second: print the size, then the first five entries.
for subset in (samples_set, labels_set):
    print(len(subset))
    print(subset[:5])

61069
[[15.26 'x' 'o' 'f' 'e' 'w' 16.95 17.09 'w' 't' 'g' 'd' 'w']
 [16.6 'x' 'o' 'f' 'e' 'w' 17.99 18.19 'w' 't' 'g' 'd' 'u']
 [14.07 'x' 'o' 'f' 'e' 'w' 17.8 17.74 'w' 't' 'g' 'd' 'w']
 [14.17 'f' 'e' 'f' 'e' 'w' 15.77 15.98 'w' 't' 'p' 'd' 'w']
 [14.64 'x' 'o' 'f' 'e' 'w' 16.53 17.2 'w' 't' 'p' 'd' 'w']]
61069
[-1 -1 -1 -1 -1]


### Train-test split

In [10]:
# Split samples and labels into train/test partitions using the constants from
# the configuration cell; the fixed random_state keeps the shuffle reproducible.
train_sample_set, test_sample_set, train_labels_set, test_labels_set = train_test_split(
    samples_set,
    labels_set,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    shuffle=SHUFFLE,
)

# For the samples first and the labels second: print the train/test sizes,
# then the first five entries of each partition.
for train_part, test_part in (
    (train_sample_set, test_sample_set),
    (train_labels_set, test_labels_set),
):
    print(len(train_part), len(test_part))
    print(train_part[:5], '\n', test_part[:5])

48855 12214
[[2.14 'x' 'e' 'f' 'a' 'p' 5.43 2.13 'w' 'f' 'f' 'd' 'u']
 [21.35 'x' 'n' 'f' 'p' 'y' 19.99 84.12 'n' 'f' 'f' 'd' 'u']
 [10.09 'x' 'n' 't' 'x' 'u' 9.39 21.63 'n' 'f' 'f' 'd' 'a']
 [13.59 'x' 'n' 't' 'e' 'w' 9.46 16.25 'w' 't' 'u' 'd' 'a']
 [6.25 'x' 'g' 'f' 'a' 'n' 5.94 4.71 'g' 't' 'l' 'd' 'a']] 
 [[14.42 'x' 'e' 'f' 'e' 'w' 17.03 16.57 'w' 't' 'g' 'd' 'w']
 [5.27 'x' 'y' 'f' 'x' 'n' 6.79 4.27 'w' 'f' 'f' 'd' 'u']
 [7.71 'x' 'y' 'f' 'a' 'w' 6.53 16.51 'g' 'f' 'f' 'd' 'w']
 [8.77 'x' 'n' 't' 'e' 'p' 8.36 16.41 'w' 't' 'l' 'l' 'a']
 [51.21 'o' 'y' 'f' 'p' 'y' 6.39 47.23 'k' 'f' 'f' 'd' 'u']]
48855 12214
[-1  1 -1  1  1] 
 [-1  1  1  1  1]
