# Alternative to one-hot encoding

In [1]:
import pandas as pd
import random
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import euclidean
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
data_train = pd.read_csv('Mushroom_datasets/mushroom_train.csv')
data_test = pd.read_csv('Mushroom_datasets/mushroom_test.csv')

In [3]:
data_train.head()


Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season,class
0,4.98,c,i,y,f,a,c,n,6.04,6.21,w,f,f,d,a,p
1,2.84,x,y,y,f,a,c,w,5.66,3.55,y,t,r,h,u,p
2,11.44,x,y,y,f,a,c,w,7.03,25.29,n,t,e,d,w,e
3,8.77,s,t,r,t,d,c,g,4.44,13.61,r,f,f,d,a,p
4,7.55,x,d,n,t,p,c,y,8.41,18.44,y,f,f,d,a,e


In [4]:
classes = ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season']

temp_train = pd.get_dummies(data_train, columns=classes)
temp_test = pd.get_dummies(data_test, columns=classes)

In [5]:
temp_train.corr()

  temp_train.corr()


Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_o,cap-shape_p,cap-shape_s,cap-shape_x,...,habitat_h,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w,season_a,season_s,season_u,season_w
cap-diameter,1.000000,0.423171,0.695804,-0.191519,-0.099040,0.022730,0.133587,0.094008,0.053706,-0.007314,...,-0.064857,-0.044482,-0.040500,-0.050547,0.026136,-0.002765,-0.034119,0.033431,-0.008742,0.051043
stem-height,0.423171,1.000000,0.436069,0.015604,-0.008988,0.003152,-0.261647,0.287276,-0.113514,0.070046,...,-0.044873,-0.031233,0.033949,-0.048805,0.160757,-0.048284,0.022103,-0.039386,0.003985,-0.016923
stem-width,0.695804,0.436069,1.000000,-0.221537,-0.083040,-0.024715,0.102138,0.143271,0.073258,0.026121,...,-0.097291,0.002066,-0.056467,0.028929,0.001503,0.002918,-0.004233,-0.007676,-0.020945,0.049504
cap-shape_b,-0.191519,0.015604,-0.221537,1.000000,-0.056965,-0.170293,-0.079120,-0.068233,-0.118677,-0.286469,...,-0.018837,-0.058868,0.095214,-0.024538,-0.014341,-0.025234,-0.037119,0.073950,0.023704,-0.029967
cap-shape_c,-0.099040,-0.008988,-0.083040,-0.056965,1.000000,-0.092869,-0.043148,-0.037211,-0.064720,-0.156226,...,-0.001900,0.064052,-0.039467,0.102637,-0.007821,-0.013761,-0.022261,0.070587,0.023723,-0.054006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
habitat_w,-0.002765,-0.048284,0.002918,-0.025234,-0.013761,0.055348,-0.019113,-0.016483,-0.028669,0.007923,...,-0.014598,-0.018377,-0.017482,-0.005928,-0.003464,1.000000,0.002077,-0.017119,0.019087,-0.023923
season_a,-0.034119,0.022103,-0.004233,-0.037119,-0.022261,0.025897,-0.097923,-0.012340,0.020338,0.045363,...,0.033111,-0.004018,-0.021324,0.005169,-0.022579,0.002077,1.000000,-0.216001,-0.764364,-0.301842
season_s,0.033431,-0.039386,-0.007676,0.073950,0.070587,-0.058596,0.182091,0.083167,-0.080515,-0.085326,...,-0.018717,0.014570,0.040049,-0.016648,0.040769,-0.017119,-0.216001,1.000000,-0.170138,-0.067186
season_u,-0.008742,0.003985,-0.020945,0.023704,0.023723,-0.031489,-0.002368,-0.011584,0.028701,-0.008872,...,0.007041,-0.035152,0.012906,0.015309,-0.015888,0.019087,-0.764364,-0.170138,1.000000,-0.237753


In [9]:
# define the columns of interest
cols_of_interest = ['stem-width', 'stem-height', 'cap-diameter']

# calculate the correlation matrix
corr_matrix = temp_train.corr(method='pearson')

# filter the correlation matrix to only include rows or columns with 'stem-width', 'stem-height', or 'cap-diameter'
corr_filtered = corr_matrix[(corr_matrix.index.isin(cols_of_interest)) | (corr_matrix.columns.isin(cols_of_interest))]

# create a copy of the filtered correlation matrix for sorting
corr_sorted = corr_filtered.abs().unstack().sort_values(ascending=False).reset_index()

# remove self-correlations (correlations of features with themselves)
corr_sorted = corr_sorted[corr_sorted['level_0'] != corr_sorted['level_1']]

# rename the columns of the sorted correlation matrix
corr_sorted.columns = ['feature_1', 'feature_2', 'correlation_coefficient']

# filter the sorted correlation matrix to only include rows or columns with 'stem-width', 'stem-height', or 'cap-diameter'
corr_sorted_filtered = corr_sorted[(corr_sorted['feature_1'].isin(cols_of_interest)) | (corr_sorted['feature_2'].isin(cols_of_interest))]

# display the top 5 positive and top 5 negative correlation coefficient values with their corresponding row and column names
print('Top 10 positive correlations:')
print(corr_sorted_filtered.head(15))

print('\nTop 10 negative correlations:')
print(corr_sorted_filtered.tail(15)[::-1])


  corr_matrix = temp_train.corr(method='pearson')


Top 10 positive correlations:
            feature_1     feature_2  correlation_coefficient
3        cap-diameter    stem-width                 0.695804
4          stem-width  cap-diameter                 0.695804
5         stem-height    stem-width                 0.436069
6          stem-width   stem-height                 0.436069
7         stem-height  cap-diameter                 0.423171
8        cap-diameter   stem-height                 0.423171
9         ring-type_m   stem-height                 0.406376
10  gill-attachment_p    stem-width                 0.400621
11  gill-attachment_p  cap-diameter                 0.353896
12        ring-type_f   stem-height                 0.332793
13  gill-attachment_a    stem-width                 0.287511
14        cap-shape_p   stem-height                 0.287276
15         has-ring_f   stem-height                 0.285016
16         has-ring_t   stem-height                 0.285016
17        cap-shape_o   stem-height                 0.2

In [7]:
corr_matrix = temp_train.corr(method='pearson')
corr_values = corr_matrix.unstack()
corr_sorted = corr_values.sort_values(ascending=False)

# Remove duplicates and self-correlations
corr_sorted = corr_sorted[corr_sorted != 1.0]

# Extract top 5 positive and negative features
top_positive = corr_sorted.head(5)
top_negative = corr_sorted.tail(5)
print('Top 5 positive correlations:')
print(top_positive)
print('Top 5 negative correlations:')
print(top_negative)

  corr_matrix = temp_train.corr(method='pearson')


Top 5 positive correlations:
has-ring_f    ring-type_f          0.778522
ring-type_f   has-ring_f           0.778522
cap-diameter  stem-width           0.695804
stem-width    cap-diameter         0.695804
cap-shape_o   gill-attachment_f    0.556959
dtype: float64
Top 5 negative correlations:
gill-spacing_d          gill-spacing_c           -0.799812
has-ring_t              has-ring_f               -1.000000
does-bruise-or-bleed_f  does-bruise-or-bleed_t   -1.000000
has-ring_f              has-ring_t               -1.000000
does-bruise-or-bleed_t  does-bruise-or-bleed_f   -1.000000
dtype: float64


In [8]:
shape = data_train['cap-shape'].unique()
features = ['cap-diameter', 'stem-height', 'stem-width']
values = ['min','max','mean']

data_train['min_cap_shape'] = pd.Series(dtype='float64')
data_train['max_cap_shape'] = pd.Series(dtype='float64')
data_train['mean_cap_shape'] = pd.Series(dtype='float64')
for i in shape:
    cap_shape = data_train[data_train['cap-shape'] == i]
    for j in features:
        temp_min = cap_shape.describe().loc['min', j]
        temp_max = cap_shape.describe().loc['max', j]
        temp_avg = cap_shape.describe().loc['mean', j]
        
        data_train.loc[data_train['cap-shape'] == i, 'mean_cap_shape'] = temp_avg
        data_train.loc[data_train['cap-shape'] == i, 'max_cap_shape'] = temp_max
        data_train.loc[data_train['cap-shape'] == i, 'min_cap_shape'] = temp_min
data_train.head()
    

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season,class,min_cap_shape,max_cap_shape,mean_cap_shape
0,4.98,c,i,y,f,a,c,n,6.04,6.21,w,f,f,d,a,p,0.77,23.57,7.404107
1,2.84,x,y,y,f,a,c,w,5.66,3.55,y,t,r,h,u,p,0.52,103.91,12.412532
2,11.44,x,y,y,f,a,c,w,7.03,25.29,n,t,e,d,w,e,0.52,103.91,12.412532
3,8.77,s,t,r,t,d,c,g,4.44,13.61,r,f,f,d,a,p,2.01,73.28,14.113719
4,7.55,x,d,n,t,p,c,y,8.41,18.44,y,f,f,d,a,e,0.52,103.91,12.412532
