In [None]:
import pandas as pd
from math import sqrt
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.preprocessing import OneHotEncoder
from google.colab import drive
drive.mount('/content/drive')
import numpy as np

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Implement dot product for vectors and matrices -- from scratch**

In [None]:
def dot_product(m,v):
    '''
    Compute the matrix-vector product of m and v from scratch.

    INPUT:
    m: a matrix in pandas DataFrame format, shape (n_rows, n_cols)
    v: a vector in pandas Series format with n_cols entries

    OUTPUT:
    a pandas Series of length n_rows (default integer index) whose i-th
    entry is the dot product of row i of m with v

    RAISES:
    Exception if the number of columns of m differs from the length of v
    '''
    n_rows, n_cols = m.shape
    # len(v) counts every entry; v.count() skips NaN entries and would
    # report a false dimension mismatch for a vector that contains NaN.
    if n_cols != len(v):
        raise Exception("Demensions do not match!")
    # Entries are paired by position (.iloc) rather than by index label, so
    # a Series whose index is not 0..n-1 is still multiplied correctly.
    result = [
        sum(m.iloc[i, j] * v.iloc[j] for j in range(n_cols))
        for i in range(n_rows)
    ]
    return pd.Series(result)


In [None]:
# test case: multiply a 2x4 matrix by a length-4 vector
df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
s = pd.Series([1, 1, 2, 1])
dot_product(df,s)

0   -4
1    5
dtype: int64

In [None]:
# check: pandas' built-in DataFrame.dot on the same matrix and vector
# should give the same result as dot_product(df, s)
df.dot(s)

0   -4
1    5
dtype: int64

**Implement euclidean distance and cosine distance from scratch**

Euclidean distance

In [None]:
def euc_dist(p,q):
    '''
    Compute the Euclidean distance between two points from scratch.

    INPUT:
    p, q: two points in pandas Series format (e.g. two rows of a DataFrame)
          with the same shape

    OUTPUT:
    the square root of the sum of squared coordinate differences (a float)

    RAISES:
    Exception if p and q do not have the same shape
    '''
    if p.shape != q.shape:
        raise Exception('Shapes do not match!')
    dist = 0
    for i in range(p.shape[0]):
        # .iloc pairs coordinates by position; plain p[i] is a label lookup
        # and fails when the Series index is not 0..n-1.
        dist += (p.iloc[i]-q.iloc[i])**2
    return sqrt(dist)

In [None]:
## test case: use the two rows of df as the two points
p = df.iloc[0]
q = df.iloc[1]
euc_dist(p,q)

3.7416573867739413

In [None]:
## check: sklearn's pairwise distance matrix over the rows of df;
## the off-diagonal entry is the distance between row 0 and row 1
euclidean_distances(df)

array([[0.        , 3.74165739],
       [3.74165739, 0.        ]])

cosine distance

In [None]:
def magnitude(x):
    '''
    Return the magnitude (Euclidean length) of a vector; helper for the
    cosine distance computation.

    x: a vector in pandas Series format
    '''
    squared = x**2
    total = sum(squared)
    return sqrt(total)

In [None]:
def cos_dist(p,q):
    '''
    Compute the cosine distance between two points from scratch.

    INPUT:
    p, q: two points in pandas Series format with the same shape

    OUTPUT:
    1 - cosine similarity, where cosine similarity is the dot product of
    p and q divided by the product of their magnitudes

    RAISES:
    Exception if p and q do not have the same shape
    '''
    if p.shape != q.shape:
        raise Exception('Shapes do not match!')
    # The magnitudes do not depend on the loop index, so compute their
    # product once instead of on every iteration.
    norm_product = magnitude(p) * magnitude(q)
    cos_sim = 0
    for i in range(p.shape[0]):
        # .iloc pairs coordinates by position; plain p[i] is a label lookup
        # and fails when the Series index is not 0..n-1.
        cos_sim += (p.iloc[i]*q.iloc[i])/norm_product
    return 1-cos_sim    # cosine distance = 1 - cosine similarity


In [None]:
## test case: cosine distance between the two rows of df (p and q from above)
cos_dist(p,q)

1.4082482904638631

In [None]:
## check: sklearn's pairwise cosine distance matrix over the rows of df
cosine_distances(df)

array([[0.        , 1.40824829],
       [1.40824829, 0.        ]])

**Implement Manhattan Distance from scratch**

The Manhattan distance between two points x = (x1, x2, …, xn) and y = (y1, y2, …, yn) in n-dimensional space is the sum of the absolute differences of their coordinates in each dimension: $d(x, y) = \sum_{i=1}^{n} |x_i - y_i|$.

In [None]:
def man_dist(p,q):
    '''
    Compute the Manhattan distance between two points from scratch.

    INPUT:
    p, q: two points in pandas Series format with the same shape

    OUTPUT:
    the sum of the absolute coordinate differences

    RAISES:
    Exception if p and q do not have the same shape
    '''
    if p.shape != q.shape:
        raise Exception('Shapes do not match!')
    # Accumulator is named `total` so it does not shadow the function name.
    total = 0
    for i in range(p.shape[0]):
        # .iloc pairs coordinates by position; plain p[i] is a label lookup
        # and fails when the Series index is not 0..n-1.
        total += abs(p.iloc[i]-q.iloc[i])
    return total

In [None]:
## test case: Manhattan distance between the two rows of df (p and q from above)
man_dist(p,q)

6

**See if you can figure out one-hot encoding for categorical variables**

In [None]:
# Load the cleaned cars dataset from the mounted Google Drive folder
# (the path only resolves after drive.mount has run) and preview the first rows.
cars = pd.read_csv('/content/drive/MyDrive/Project_for_EMSE6574/cars_2021_clean.csv')
cars.head()

Unnamed: 0.1,Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,lat,long
0,27,33590,2014,gmc,sierra 1500 crew cab slt,good,8 cylinders,gas,57923.0,clean,other,UNKNOWN,UNKNOWN,pickup,white,al,32.59,-85.48
1,28,22590,2010,chevrolet,silverado 1500,good,8 cylinders,gas,71229.0,clean,other,UNKNOWN,UNKNOWN,pickup,blue,al,32.59,-85.48
2,29,39590,2020,chevrolet,silverado 1500 crew,good,8 cylinders,gas,19160.0,clean,other,UNKNOWN,UNKNOWN,pickup,red,al,32.59,-85.48
3,30,30990,2017,toyota,tundra double cab sr,good,8 cylinders,gas,41124.0,clean,other,UNKNOWN,UNKNOWN,pickup,red,al,32.59,-85.48
4,31,15000,2013,ford,f-150 xlt,excellent,6 cylinders,gas,128000.0,clean,automatic,rwd,full-size,truck,black,al,32.592,-85.5189


In [None]:
# categorical columns of cars to one-hot encode in the cells below
cat_features = ['paint_color', 'size']

In [None]:
# method 1: pandas.get_dummies
# produces one indicator column per (feature, category) pair, named <feature>_<category>
pd.get_dummies(cars[cat_features])

Unnamed: 0,paint_color_UNKNOWN,paint_color_black,paint_color_blue,paint_color_brown,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow,size_UNKNOWN,size_compact,size_full-size,size_mid-size,size_sub-compact
0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260830,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
260831,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
260832,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
260833,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0


In [None]:
# method 2: sklearn.preprocessing.OneHotEncoder

In [None]:
# handle_unknown='ignore' is passed so that categories not seen during fit
# do not raise at transform time
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(cars[cat_features])

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True)

In [None]:
# categories learned during fit: one array per column in cat_features
enc.categories_

[array(['UNKNOWN', 'black', 'blue', 'brown', 'custom', 'green', 'grey',
        'orange', 'purple', 'red', 'silver', 'white', 'yellow'],
       dtype=object),
 array(['UNKNOWN', 'compact', 'full-size', 'mid-size', 'sub-compact'],
       dtype=object)]

In [None]:
# transform's result is converted to a dense array with .toarray() before printing
cars_onehot = enc.transform(cars[cat_features]).toarray()
print(cars_onehot)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [None]:
# fit and transform in a single call on the same columns as above
enc.fit_transform(cars[cat_features]).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

**Invent your own distance metric ;-)  -- for your particular problem**

In [None]:
# measure the distance between two cars by some numerical variables
def rui_dist(p,q):
    '''
    Compute a custom cubic distance between two points: the cube root of
    the sum of cubed absolute coordinate differences.

    INPUT:
    p, q: two points in pandas Series format (e.g. two rows of a DataFrame)
          with the same shape

    OUTPUT:
    (sum_i |p_i - q_i|**3) ** (1/3)

    RAISES:
    Exception if p and q do not have the same shape
    '''
    if p.shape != q.shape:
        raise Exception('Shapes do not match!')
    dist = 0
    for i in range(p.shape[0]):
        # Take the absolute difference before cubing: signed cubes cancel
        # each other and can sum to a negative number, whose fractional
        # power is NaN, which made the result depend on argument order.
        # .iloc pairs coordinates by position, so a Series indexed by
        # column names (a DataFrame row) works as well.
        dist += abs(p.iloc[i]-q.iloc[i])**3
    return np.power(dist, 1/3)

In [None]:
def dist_cars(numeric_columns, car1_index, car2_index):
    '''
    Distance between two cars, measured with rui_dist over the selected columns.

    numeric_columns: list of column names of cars to compare on
    car1_index, car2_index: positional row indices of the two cars in cars
    '''
    first_car = cars[numeric_columns].iloc[car1_index]
    second_car = cars[numeric_columns].iloc[car2_index]
    return rui_dist(first_car, second_car)

In [None]:
# example: distance between the cars at row positions 1 and 5, using odometer only
dist_cars(['odometer'], 1, 5)

2532.999999999999