In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import pairwise_distances

from scipy.spatial.distance import hamming, euclidean, pdist, squareform

In [10]:
data_path = '../../soybean_data_use.csv'

soyabean_dataset = pd.read_csv(data_path)

soya_df = soyabean_dataset.loc[:10, ['date', 'hail', 'germination']]

soya_df

Unnamed: 0,date,hail,germination
0,august,no,lt-80%
1,september,yes,lt-80%
2,july,yes,80-89%
3,october,yes,90-100%
4,august,yes,lt-80%
5,september,yes,90-100%
6,july,yes,80-89%
7,july,yes,lt-80%
8,october,yes,80-89%
9,october,yes,lt-80%


In [24]:
soya_df['date'].unique()

array(['august', 'september', 'july', 'october'], dtype=object)

In [25]:
soya_df['hail'].unique()

array(['no', ' yes'], dtype=object)

In [27]:
soya_df['germination'].unique()

array(['lt-80%', '80-89%', ' 90-100%'], dtype=object)

In [11]:
soya_df.loc[:1,:]

Unnamed: 0,date,hail,germination
0,august,no,lt-80%
1,september,yes,lt-80%


In [12]:
hamming(soya_df.loc[0], soya_df.loc[1])

0.6666666666666666

In [13]:
ordencoder = OrdinalEncoder()

soya_df_enc = ordencoder.fit_transform(soya_df)
soya_df_enc

array([[0., 1., 2.],
       [3., 0., 2.],
       [1., 0., 1.],
       [2., 0., 0.],
       [0., 0., 2.],
       [3., 0., 0.],
       [1., 0., 1.],
       [1., 0., 2.],
       [2., 0., 1.],
       [2., 0., 2.],
       [2., 1., 0.]])

In [16]:
dst = pdist(soya_df_enc, metric='hamming')
dst

array([0.66666667, 1.        , 1.        , 0.33333333, 1.        ,
       1.        , 0.66666667, 1.        , 0.66666667, 0.66666667,
       0.66666667, 0.66666667, 0.33333333, 0.33333333, 0.66666667,
       0.33333333, 0.66666667, 0.33333333, 1.        , 0.66666667,
       0.66666667, 0.66666667, 0.        , 0.33333333, 0.33333333,
       0.66666667, 1.        , 0.66666667, 0.33333333, 0.66666667,
       0.66666667, 0.33333333, 0.33333333, 0.33333333, 0.66666667,
       0.66666667, 0.33333333, 0.66666667, 0.33333333, 1.        ,
       0.66666667, 0.66666667, 0.66666667, 0.66666667, 0.66666667,
       0.33333333, 0.33333333, 0.66666667, 1.        , 0.66666667,
       0.33333333, 1.        , 0.33333333, 0.66666667, 0.66666667])

In [18]:
dst_matrix = squareform(dst)
pd.DataFrame(dst_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,0.666667,1.0,1.0,0.333333,1.0,1.0,0.666667,1.0,0.666667,0.666667
1,0.666667,0.0,0.666667,0.666667,0.333333,0.333333,0.666667,0.333333,0.666667,0.333333,1.0
2,1.0,0.666667,0.0,0.666667,0.666667,0.666667,0.0,0.333333,0.333333,0.666667,1.0
3,1.0,0.666667,0.666667,0.0,0.666667,0.333333,0.666667,0.666667,0.333333,0.333333,0.333333
4,0.333333,0.333333,0.666667,0.666667,0.0,0.666667,0.666667,0.333333,0.666667,0.333333,1.0
5,1.0,0.333333,0.666667,0.333333,0.666667,0.0,0.666667,0.666667,0.666667,0.666667,0.666667
6,1.0,0.666667,0.0,0.666667,0.666667,0.666667,0.0,0.333333,0.333333,0.666667,1.0
7,0.666667,0.333333,0.333333,0.666667,0.333333,0.666667,0.333333,0.0,0.666667,0.333333,1.0
8,1.0,0.666667,0.333333,0.333333,0.666667,0.666667,0.333333,0.666667,0.0,0.333333,0.666667
9,0.666667,0.333333,0.666667,0.333333,0.333333,0.666667,0.666667,0.333333,0.333333,0.0,0.666667


In [22]:
dst_matrix_2 = pairwise_distances(soya_df_enc, metric='hamming')
pd.DataFrame(dst_matrix_2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,0.666667,1.0,1.0,0.333333,1.0,1.0,0.666667,1.0,0.666667,0.666667
1,0.666667,0.0,0.666667,0.666667,0.333333,0.333333,0.666667,0.333333,0.666667,0.333333,1.0
2,1.0,0.666667,0.0,0.666667,0.666667,0.666667,0.0,0.333333,0.333333,0.666667,1.0
3,1.0,0.666667,0.666667,0.0,0.666667,0.333333,0.666667,0.666667,0.333333,0.333333,0.333333
4,0.333333,0.333333,0.666667,0.666667,0.0,0.666667,0.666667,0.333333,0.666667,0.333333,1.0
5,1.0,0.333333,0.666667,0.333333,0.666667,0.0,0.666667,0.666667,0.666667,0.666667,0.666667
6,1.0,0.666667,0.0,0.666667,0.666667,0.666667,0.0,0.333333,0.333333,0.666667,1.0
7,0.666667,0.333333,0.333333,0.666667,0.333333,0.666667,0.333333,0.0,0.666667,0.333333,1.0
8,1.0,0.666667,0.333333,0.333333,0.666667,0.666667,0.333333,0.666667,0.0,0.333333,0.666667
9,0.666667,0.333333,0.666667,0.333333,0.333333,0.666667,0.666667,0.333333,0.333333,0.0,0.666667


In [23]:
np.array_equal(dst_matrix, dst_matrix_2)

True

## Euclidean Distance

In [30]:
oh_encoder = OneHotEncoder(sparse_output=False)
soya_df_enc_oh = oh_encoder.fit_transform(soya_df)
soya_df_enc_oh

array([[1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 0., 1., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1., 0., 0., 0., 1.],
       [0., 0., 0., 1., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 1., 0., 0.]])

In [31]:
soya_df.nunique()

date           4
hail           2
germination    3
dtype: int64

In [32]:
soya_df_enc_oh.shape

(11, 9)

In [33]:
dist1 = pdist(soya_df_enc_oh, metric='euclidean')
dist1_mtx = squareform(dist1)
pd.DataFrame(dist1_mtx)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,2.0,2.44949,2.44949,1.414214,2.44949,2.44949,2.0,2.44949,2.0,2.0
1,2.0,0.0,2.0,2.0,1.414214,1.414214,2.0,1.414214,2.0,1.414214,2.44949
2,2.44949,2.0,0.0,2.0,2.0,2.0,0.0,1.414214,1.414214,2.0,2.44949
3,2.44949,2.0,2.0,0.0,2.0,1.414214,2.0,2.0,1.414214,1.414214,1.414214
4,1.414214,1.414214,2.0,2.0,0.0,2.0,2.0,1.414214,2.0,1.414214,2.44949
5,2.44949,1.414214,2.0,1.414214,2.0,0.0,2.0,2.0,2.0,2.0,2.0
6,2.44949,2.0,0.0,2.0,2.0,2.0,0.0,1.414214,1.414214,2.0,2.44949
7,2.0,1.414214,1.414214,2.0,1.414214,2.0,1.414214,0.0,2.0,1.414214,2.44949
8,2.44949,2.0,1.414214,1.414214,2.0,2.0,1.414214,2.0,0.0,1.414214,2.0
9,2.0,1.414214,2.0,1.414214,1.414214,2.0,2.0,1.414214,1.414214,0.0,2.0


In [34]:
dist2 = pairwise_distances(soya_df_enc_oh, metric='euclidean')
pd.DataFrame(dist2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,2.0,2.44949,2.44949,1.414214,2.44949,2.44949,2.0,2.44949,2.0,2.0
1,2.0,0.0,2.0,2.0,1.414214,1.414214,2.0,1.414214,2.0,1.414214,2.44949
2,2.44949,2.0,0.0,2.0,2.0,2.0,0.0,1.414214,1.414214,2.0,2.44949
3,2.44949,2.0,2.0,0.0,2.0,1.414214,2.0,2.0,1.414214,1.414214,1.414214
4,1.414214,1.414214,2.0,2.0,0.0,2.0,2.0,1.414214,2.0,1.414214,2.44949
5,2.44949,1.414214,2.0,1.414214,2.0,0.0,2.0,2.0,2.0,2.0,2.0
6,2.44949,2.0,0.0,2.0,2.0,2.0,0.0,1.414214,1.414214,2.0,2.44949
7,2.0,1.414214,1.414214,2.0,1.414214,2.0,1.414214,0.0,2.0,1.414214,2.44949
8,2.44949,2.0,1.414214,1.414214,2.0,2.0,1.414214,2.0,0.0,1.414214,2.0
9,2.0,1.414214,2.0,1.414214,1.414214,2.0,2.0,1.414214,1.414214,0.0,2.0


In [35]:
np.array_equal(dist1_mtx, dist2)

True