In [340]:
import numpy as np
from scipy import stats
from collections import Counter
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.neighbors import KNeighborsClassifier

In [465]:
def knn_home_made(X_test, X_train, y_train, k = 3, f = None):
    '''
    Classify every row of X_test by majority vote among its k nearest
    training points.

    1 compute f(test_row, train_row) for every test/train pair
    2 keep the k training points with the smallest values
    3 return the most frequent label among them; when several labels are
      equally frequent the smallest label wins

    Parameters
    ----------
    X_test : sequence of rows to classify (e.g. array of shape m_test * d)
    X_train : sequence of training rows (e.g. array of shape m_train * d)
    y_train : one label per training row; converted with np.asarray
    k : number of nearest neighbours, must be >= 1
    f : callable f(test_row, train_row) -> number, where a SMALLER value
        means "nearer". None (the default) selects the Euclidean distance.
        The default is resolved at call time rather than in the signature,
        so this function can be defined before any distance helper exists.

    Returns
    -------
    1-D numpy array holding one predicted label per row of X_test.

    Raises
    ------
    ValueError if k < 1 or X_train is empty.
    '''
    if k < 1:
        raise ValueError('k must be at least 1')
    if len(X_train) == 0:
        raise ValueError('X_train must contain at least one sample')

    y_train = np.asarray(y_train)
    if len(X_test) == 0:
        # Nothing to classify: empty result with the label dtype.
        return y_train[:0]

    def _euclidean(a, b):
        # Built-in fallback metric used when the caller supplies no f.
        diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
        return np.sqrt(np.sum(np.square(diff)))

    distance = _euclidean if f is None else f

    # dist[i][j] = distance between test row i and training row j.
    dist = [[distance(x_test, x_train) for x_train in X_train]
            for x_test in X_test]

    # Column positions of the k smallest distances in each row.
    index = np.argsort(dist, axis = 1)[:, :k]

    # Majority vote per test row. np.unique returns the labels sorted and
    # argmax picks the first maximum, so ties go to the smallest label.
    label = []
    for neighbour_labels in y_train[index]:
        values, counts = np.unique(neighbour_labels, return_counts = True)
        label.append(values[np.argmax(counts)])

    return np.array(label)
    
#     index = [0 for _ in range(len(X_test))]
#     for i,row in enumerate(dist):
#         row = np.array(row)
#         index[i] = row.argsort(kind='quicksort')[:k]


    
#     print(index)
#     label = [0 for _ in range(len(X_test))]
#     for i,ind in enumerate(index):
#         count_label = Counter(y_train[ind])
#         label[i] = count_label.most_common(1)[0][0]
#     print(y_train[index])



In [466]:
def euclidean_distance(x,y):
    '''
    Euclidean (L2) distance between two numeric vectors of equal length.

    x, y : array-likes (lists or numpy arrays) of numbers
    returns : the square root of the summed squared differences; a numpy
              float for 1-D input
    '''
    # Cast to float before subtracting: with small integer dtypes such as
    # uint8 the difference and its square wrap around and the result is wrong.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # axis=0 reduces over the first axis, as the builtin sum() did here before.
    return np.sqrt(np.sum(np.square(x - y), axis=0))

In [467]:
def cos_similarity(x,y):
    '''
    Cosine of the angle between vectors x and y:
    dot(x, y) / (sqrt(dot(x, x)) * sqrt(dot(y, y))).

    When to use cosine?
    Cosine similarity is generally used when the magnitude of the vectors
    does not matter. The typical example is text represented by word
    counts: a word such as "science" may occur more often in document 1
    than in document 2 simply because document 1 is much longer (think of
    Wikipedia articles of uneven length), not because it is more about
    science. Cosine similarity corrects for this.

    It is also worth considering whenever some property of the instances
    makes the weights larger without meaning anything different, e.g.
    sensor values captured over time spans of different length.

    Notes
    -----
    * This is a similarity: larger means more alike. knn_home_made keeps
      the neighbours with the SMALLEST f values, so this function is not a
      drop-in distance for it.
    * A zero vector makes the denominator 0.
    '''
    dot_xy = np.dot(x, y)
    norm_x = np.sqrt(np.dot(x, x))
    norm_y = np.sqrt(np.dot(y, y))
    return dot_xy / (norm_x * norm_y)

In [468]:
# Sanity check: two vectors pointing in almost the same direction.
cos_similarity([6.6, 6.2], [9.7, 9.9])

0.9991413385403556

In [469]:
# Same pair of vectors, measured with the Euclidean distance.
euclidean_distance([6.6, 6.2], [9.7, 9.9])

4.827007354458868

In [470]:
# vectors      [ 6.6  6.2] [ 9.7  9.9]
# euclidean    4.82700735446
# cosine       0.99914133854

In [443]:
iris = datasets.load_iris()

# Only the first two feature columns are used.
X = iris.data[:, 0:2]
y = iris.target

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [444]:
# Display the training labels produced by the split.
y_train

array([0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 2,
       1, 0, 2, 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2,
       1, 1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1,
       0, 0, 2, 2, 0, 0, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2,
       1, 1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2,
       1, 1, 2, 2, 0, 1, 2, 0, 1, 2])

In [445]:
# Predict the held-out rows with the home-made classifier (3 neighbours,
# Euclidean distance passed explicitly).
y_pred = knn_home_made(X_test, X_train, y_train, k=3, f=euclidean_distance)

In [446]:
# Confusion matrix of the home-made KNN predictions against the held-out labels.
confusion_matrix(y_test, y_pred)

array([[10,  0,  0],
       [ 0,  7,  2],
       [ 0,  4,  7]])

In [447]:
# Accuracy of the home-made classifier, expressed as a percentage.
accuracy = 100 * accuracy_score(y_test, y_pred)
print(f'Accuracy of our model is equal {round(accuracy, 2)} %.')

Accuracy of our model is equal 80.0 %.


In [448]:
# Reference model: scikit-learn's KNN with the same number of neighbours,
# fitted on the same training split. The fit call is left as the last
# expression so the fitted estimator is displayed.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [449]:
# Predictions of the scikit-learn reference model on the held-out rows.
y_pred_sk = neigh.predict(X_test)

In [450]:
# Accuracy of the scikit-learn model, expressed as a percentage.
# Note: this overwrites the `accuracy` value computed for the home-made model.
accuracy = 100 * accuracy_score(y_test, y_pred_sk)
print(f'Accuracy of sklearn model is equal {round(accuracy, 2)} %.')

Accuracy of sklearn model is equal 76.67 %.


In [464]:

# from decimal import Decimal
 
# class Similarity():
 
#     """ Five similarity measures function """
 
#     def euclidean_distance(self,x,y):
 
#         """ return euclidean distance between two lists """
 
#         return sqrt(sum(pow(a-b,2) for a, b in zip(x, y)))
 
#     def manhattan_distance(self,x,y):
 
#         """ return manhattan distance between two lists """
 
#         return sum(abs(a-b) for a,b in zip(x,y))
 
#     def minkowski_distance(self,x,y,p_value):
 
#         """ return minkowski distance between two lists """
 
#         return self.nth_root(sum(pow(abs(a-b),p_value) for a,b in zip(x, y)),
#            p_value)
 
#     def nth_root(self,value, n_root):
 
#         """ returns the n_root of an value """
 
#         root_value = 1/float(n_root)
#         return round (Decimal(value) ** Decimal(root_value),3)
 
#     def cosine_similarity(self,x,y):
 
#         """ return cosine similarity between two lists """
 
#         numerator = sum(a*b for a,b in zip(x,y))
#         denominator = self.square_rooted(x)*self.square_rooted(y)
#         return round(numerator/float(denominator),3)
 
#     def square_rooted(self,x):
 
#         """ return 3 rounded square rooted value """
 
#         return round(sqrt(sum([a*a for a in x])),3)
 
#     def jaccard_similarity(self,x,y):
 
#     """ returns the jaccard similarity between two lists """
 
#         intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
#         union_cardinality = len(set.union(*[set(x), set(y)]))
#         return intersection_cardinality/float(union_cardinality)
 

