In [19]:
import pandas as pd
import numpy as np
from sklearn.neighbors import DistanceMetric
from scipy.spatial import distance

In [20]:
### Load the whitespace-delimited .dat file. It is read with header=None, so
### pandas assigns integer column labels.
### Then retrieve the rows for Renault, Rover, Toyota.
# The separator is a regular expression, so it is written as a raw string:
# in a normal string literal "\s" is an invalid escape sequence that newer
# Python versions warn about.
df = pd.read_csv('carmean2.dat', sep=r"\s+", header=None)

# Positional rows 16, 17 and 18 (the end bound of iloc is exclusive).
df_car = df.iloc[16:19, :]
df_car

Unnamed: 0,0,1,2,3,4,5,6,7,8
16,Re19,2.7,3.3,3.4,3.0,3.1,3.4,3.0,2.7
17,Rove,3.9,2.8,2.6,4.0,2.6,3.0,3.2,3.0
18,ToCo,2.5,2.9,3.4,3.0,3.2,3.1,3.2,2.8


In [21]:
### Keep only the columns labelled 1 through 8. The slice is label-based,
### so both end labels are included; column 0 (the model label) is left out.
value_columns = slice(1, 8)
df_car = df_car.loc[:, value_columns]
df_car

Unnamed: 0,1,2,3,4,5,6,7,8
16,2.7,3.3,3.4,3.0,3.1,3.4,3.0,2.7
17,3.9,2.8,2.6,4.0,2.6,3.0,3.2,3.0
18,2.5,2.9,3.4,3.0,3.2,3.1,3.2,2.8


In [22]:
### Convert the selected DataFrame into a float NumPy array (one row per car)
car_list = np.array(df_car, dtype=float)

print("Data matrix: \n", car_list)

Data matrix: 
 [[2.7 3.3 3.4 3.  3.1 3.4 3.  2.7]
 [3.9 2.8 2.6 4.  2.6 3.  3.2 3. ]
 [2.5 2.9 3.4 3.  3.2 3.1 3.2 2.8]]


In [23]:
### calculate binary data
# The number of rows is taken from the data instead of being hard-coded, so
# the cell keeps working if a different set of rows/columns is selected.
n_cars = car_list.shape[0]

### all-ones square matrix with one row/column per car
I = np.ones(shape=(n_cars, n_cars))

### x_mu: per-column mean, rounded to 3 decimals
car_mu = np.around(np.mean(car_list, axis=0), decimals=3, out=None)
print("Mean: \n", car_mu)

### binary matrix; if x(i,k)>x_mu(k): 1, else 0
# car_mu broadcasts across the rows of car_list. The threshold is the rounded
# mean, i.e. exactly the values printed above.
y = (car_list > car_mu).astype(float)

print("Binary matrix: \n", y)

Mean: 
 [3.033 3.    3.133 3.333 2.967 3.167 3.133 2.833]
Binary matrix: 
 [[0. 1. 1. 0. 1. 1. 0. 0.]
 [1. 0. 0. 1. 0. 0. 1. 1.]
 [0. 0. 1. 0. 1. 0. 1. 0.]]


In [24]:
### Jaccard measure
# NOTE(review): DistanceMetric is imported from sklearn.neighbors at the top of
# this notebook; newer scikit-learn releases appear to expose it from
# sklearn.metrics instead -- confirm against the installed version.
sim = DistanceMetric.get_metric('jaccard')

# pairwise(y) gives the distance between every pair of rows of y, and the
# similarity is 1 - distance. The scalar 1.0 broadcasts over the whole matrix,
# so the result does not depend on an all-ones matrix of matching shape.
# Despite its name, dist_jaccard holds similarities (diagonal = 1).
dist_jaccard = np.around(1.0 - sim.pairwise(y), decimals=3)

print("Jaccard measure: \n", dist_jaccard)

Jaccard measure: 
 [[1.    0.    0.4  ]
 [0.    1.    0.167]
 [0.4   0.167 1.   ]]


In [25]:
### Simple matching
sim = DistanceMetric.get_metric('matching')

# pairwise(y) gives the distance between every pair of rows of y, and the
# similarity is 1 - distance. The scalar 1.0 broadcasts over the whole matrix,
# so the result does not depend on an all-ones matrix of matching shape.
# Despite its name, dist_simple holds similarities (diagonal = 1).
dist_simple = np.around(1.0 - sim.pairwise(y), decimals=3)

print("Simple matching: \n", dist_simple)

Simple matching: 
 [[1.    0.    0.625]
 [0.    1.    0.375]
 [0.625 0.375 1.   ]]


In [26]:
### Tanimoto measure (computed with the 'rogerstanimoto' metric)
sim = DistanceMetric.get_metric('rogerstanimoto')

# pairwise(y) gives the distance between every pair of rows of y, and the
# similarity is 1 - distance. The scalar 1.0 broadcasts over the whole matrix,
# so the result does not depend on an all-ones matrix of matching shape.
# Despite its name, dist_tanimoto holds similarities (diagonal = 1).
dist_tanimoto = np.around(1.0 - sim.pairwise(y), decimals=3)

print("Tanimoto measure: \n", dist_tanimoto)

Tanimoto measure: 
 [[1.    0.    0.455]
 [0.    1.    0.231]
 [0.455 0.231 1.   ]]
