# Alternative to one-hot encoding

In [41]:
import pandas as pd
import random
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import euclidean
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [42]:
data_train = pd.read_csv('Mushroom_datasets/mushroom_train.csv')
data_test = pd.read_csv('Mushroom_datasets/mushroom_test.csv')

In [43]:
data_train.head()

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season,class
0,4.98,c,i,y,f,a,c,n,6.04,6.21,w,f,f,d,a,p
1,2.84,x,y,y,f,a,c,w,5.66,3.55,y,t,r,h,u,p
2,11.44,x,y,y,f,a,c,w,7.03,25.29,n,t,e,d,w,e
3,8.77,s,t,r,t,d,c,g,4.44,13.61,r,f,f,d,a,p
4,7.55,x,d,n,t,p,c,y,8.41,18.44,y,f,f,d,a,e


In [44]:
classes = ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season', 'class']

temp_train = pd.get_dummies(data_train, columns=classes)
temp_test = pd.get_dummies(data_test, columns=classes)

In [45]:
temp_train.corr()

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_o,cap-shape_p,cap-shape_s,cap-shape_x,...,habitat_m,habitat_p,habitat_u,habitat_w,season_a,season_s,season_u,season_w,class_e,class_p
cap-diameter,1.000000,0.423171,0.695804,-0.191519,-0.099040,0.022730,0.133587,0.094008,0.053706,-0.007314,...,-0.040500,-0.050547,0.026136,-0.002765,-0.034119,0.033431,-0.008742,0.051043,0.178498,-0.178498
stem-height,0.423171,1.000000,0.436069,0.015604,-0.008988,0.003152,-0.261647,0.287276,-0.113514,0.070046,...,0.033949,-0.048805,0.160757,-0.048284,0.022103,-0.039386,0.003985,-0.016923,0.122160,-0.122160
stem-width,0.695804,0.436069,1.000000,-0.221537,-0.083040,-0.024715,0.102138,0.143271,0.073258,0.026121,...,-0.056467,0.028929,0.001503,0.002918,-0.004233,-0.007676,-0.020945,0.049504,0.196278,-0.196278
cap-shape_b,-0.191519,0.015604,-0.221537,1.000000,-0.056965,-0.170293,-0.079120,-0.068233,-0.118677,-0.286469,...,0.095214,-0.024538,-0.014341,-0.025234,-0.037119,0.073950,0.023704,-0.029967,-0.143139,0.143139
cap-shape_c,-0.099040,-0.008988,-0.083040,-0.056965,1.000000,-0.092869,-0.043148,-0.037211,-0.064720,-0.156226,...,-0.039467,0.102637,-0.007821,-0.013761,-0.022261,0.070587,0.023723,-0.054006,-0.007727,0.007727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
season_s,0.033431,-0.039386,-0.007676,0.073950,0.070587,-0.058596,0.182091,0.083167,-0.080515,-0.085326,...,0.040049,-0.016648,0.040769,-0.017119,-0.216001,1.000000,-0.170138,-0.067186,0.055174,-0.055174
season_u,-0.008742,0.003985,-0.020945,0.023704,0.023723,-0.031489,-0.002368,-0.011584,0.028701,-0.008872,...,0.012906,0.015309,-0.015888,0.019087,-0.764364,-0.170138,1.000000,-0.237753,-0.040096,0.040096
season_w,0.051043,-0.016923,0.049504,-0.029967,-0.054006,0.051990,0.042867,-0.020062,-0.025790,-0.001895,...,-0.014169,-0.023264,0.037321,-0.023923,-0.301842,-0.067186,-0.237753,1.000000,0.101958,-0.101958
class_e,0.178498,0.122160,0.196278,-0.143139,-0.007727,0.045753,-0.107930,0.066977,0.009505,0.065785,...,0.016886,-0.068404,0.049249,0.086657,-0.041370,0.055174,-0.040096,0.101958,1.000000,-1.000000


In [46]:
cols = ['stem-width', 'stem-height', 'cap-diameter']
corr_matrix = temp_train.corr(method='pearson')
filtered = corr_matrix[(corr_matrix.index.isin(cols))|(corr_matrix.columns.isin(cols))]

corr_sorted = filtered.abs().unstack().sort_values(ascending=False).reset_index()
corr_sorted = corr_sorted[corr_sorted['level_0'] != corr_sorted['level_1']]

corr_sorted.columns = ['Feature 1', 'Feature 2', 'Correlation Value']
new_filtered = corr_sorted[(corr_sorted['Feature 1'].isin(cols))|(corr_sorted['Feature 2'].isin(cols))]

print('Top 15 positive correlations:')
print(new_filtered.head(15))
print('\nTop 15 negative correlations:')
print(new_filtered.tail(15)[::-1])

Top 15 positive correlations:
            Feature 1     Feature 2  Correlation Value
3        cap-diameter    stem-width           0.695804
4          stem-width  cap-diameter           0.695804
5         stem-height    stem-width           0.436069
6          stem-width   stem-height           0.436069
7        cap-diameter   stem-height           0.423171
8         stem-height  cap-diameter           0.423171
9         ring-type_m   stem-height           0.406376
10  gill-attachment_p    stem-width           0.400621
11  gill-attachment_p  cap-diameter           0.353896
12        ring-type_f   stem-height           0.332793
13  gill-attachment_a    stem-width           0.287511
14        cap-shape_p   stem-height           0.287276
15         has-ring_f   stem-height           0.285016
16         has-ring_t   stem-height           0.285016
17        cap-shape_o   stem-height           0.261647

Top 15 negative correlations:
          Feature 1     Feature 2  Correlation Value
281   

In [47]:
cols = ['ring-type_m', 'gill-attachment_p', 'ring-type_f', 'gill-attachment_a']
features = ['cap-diameter', 'stem-height', 'stem-width']
#values = ['min','max','mean']

for i in cols:
    data_train['min_'+i[:-2]] = pd.Series(dtype='float64')
    data_train['max_'+i[:-2]] = pd.Series(dtype='float64')
    data_train['mean_'+i[:-2]] = pd.Series(dtype='float64')
    
for i in cols:
    values = data_train[i[:-2]].unique()
    for j in values:
        if j == i[-1]:
            new = data_train[data_train[i[:-2]] == i[-1]]
            for k in features:
                temp_min = new.describe().loc['min', k]
                temp_max = new.describe().loc['max', k]
                temp_avg = new.describe().loc['mean', k]

                data_train.loc[data_train[i[:-2]] == i[-1], 'min_'+i[:-2]] = temp_min
                data_train.loc[data_train[i[:-2]] == i[-1], 'mean_'+i[:-2]] = temp_avg
                data_train.loc[data_train[i[:-2]] == i[-1], 'max_'+i[:-2]] = temp_max
                
data_train.fillna(0.0, inplace=True)

In [49]:
data_train[]

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,...,ring-type,habitat,season,class,min_ring-type,max_ring-type,mean_ring-type,min_gill-attachment,max_gill-attachment,mean_gill-attachment
0,4.98,c,i,y,f,a,c,n,6.04,6.21,...,f,d,a,p,0.0,103.91,12.07009,0.0,38.37,8.369365
1,2.84,x,y,y,f,a,c,w,5.66,3.55,...,r,h,u,p,0.0,0.0,0.0,0.0,38.37,8.369365
2,11.44,x,y,y,f,a,c,w,7.03,25.29,...,e,d,w,e,0.0,0.0,0.0,0.0,38.37,8.369365
3,8.77,s,t,r,t,d,c,g,4.44,13.61,...,f,d,a,p,0.0,103.91,12.07009,0.0,0.0,0.0
4,7.55,x,d,n,t,p,c,y,8.41,18.44,...,f,d,a,e,0.0,103.91,12.07009,3.36,103.91,24.284523
5,3.0,s,d,w,f,d,d,p,3.33,2.6,...,f,g,a,p,0.0,103.91,12.07009,0.0,0.0,0.0
6,2.78,x,d,w,f,d,d,w,4.43,2.68,...,f,g,a,p,0.0,103.91,12.07009,0.0,0.0,0.0
7,3.12,x,t,w,f,x,c,p,9.07,4.22,...,f,d,a,p,0.0,103.91,12.07009,0.0,0.0,0.0
8,1.48,x,g,y,f,d,d,n,2.07,2.32,...,f,h,u,p,0.0,103.91,12.07009,0.0,0.0,0.0
9,5.94,x,h,n,f,e,c,w,13.04,12.88,...,f,d,a,e,0.0,103.91,12.07009,0.0,0.0,0.0


In [50]:
data_train['cap-diameter'].min()

0.38