# K nearest Neighbour

This is an implementation of K nearest neigbour from scratch

I build a model to address the famous kaggle titanic problem, a binary classification problem. Individuals must be predicted as having survived or not survived the titanic disaster.

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score
from numpy.random import randint
from scipy import stats

%matplotlib inline
pd.set_option('max.rows', None)

### read data

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
feats = data[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]

In [5]:
class_labels = np.array(data['Survived'])

### impute Age NA values with mean  

In [6]:
feats.Age = feats.Age.fillna(data.Age.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [7]:
feats.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,3,35.0,0,0,8.05


In [11]:
feats_array = np.array(feats)
example_unknown = feats_array[0:6,:]

In [51]:
example_unknown

array([[ 3.        , 22.        ,  1.        ,  0.        ,  7.25      ],
       [ 1.        , 38.        ,  1.        ,  0.        , 71.2833    ],
       [ 3.        , 26.        ,  0.        ,  0.        ,  7.925     ],
       [ 1.        , 35.        ,  1.        ,  0.        , 53.1       ],
       [ 3.        , 35.        ,  0.        ,  0.        ,  8.05      ],
       [ 3.        , 29.69911765,  0.        ,  0.        ,  8.4583    ]])

### Approach

* scale variables
* choose distance metric
* identify closest K neighbours
* calculate mode class

class attributes will be:
* classification 
* feature 1 value, feature 2 value ... feature j value 

approach 2: vectorised

* store feature data in a numpy array then calculate distance metrics using vectorised operations

my prediction is that approach 2 will be fastest

distance metrics to be implemented:
* euclidean
* manhattan
* Minkowski
* mahalanobis

The algorithm type for calculating the nearest neighbours will be brute force

### vectorised approach

In [139]:
class knn:
    
    import numpy as np
    
    def __init__(self, train_data, class_labels, n_neighbours):
        
        self.train_data = train_data
        self.class_labels = class_labels
        self.n_neighbours = n_neighbours
        
    def euclidean_distance(self, feats_array, unknown):
        
        self.feats_array = feats_array
        self.unknown = unknown
        
        differences = self.feats_array - self.unknown
        euc_dists = ((differences[:,0])**2+(differences[:,1])**2+(differences[:,2])**2+(differences[:,3])**2+(differences[:,4])**2)**0.5
        return(euc_dists)
    
    def manhattan_distance(self, feats_array, unknown):
        
        self.feats_array = feats_array
        self.unknown = unknown
        
        differences = self.feats_array - self.unknown
        man_dists = (differences[:,0])+(differences[:,1])+(differences[:,2])+(differences[:,3])+(differences[:,4])
        return(man_dists)

    
    def minkowski_distance():
        None
    
    def mahalanobis():
        None

    def test_distance():
        None

    def predict(self, unknown):
        
        self.unknown_data = unknown
        
        self.results = []
        self.closest_neighbours =[]
        
        for row in range(self.unknown_data.shape[0]):
            



            unknown_row = self.unknown_data[row,:]
            


            distances = self.euclidean_distance(self.train_data, unknown_row)



            dist_joined = np.stack((distances, self.class_labels), axis = -1)

            sorter = dist_joined[:,0]
            sorted_indexes = sorter.argsort()
            sorted_distances = dist_joined[sorted_indexes]

            self.closest_neighbours.append(sorted_distances[0:self.n_neighbours, 1])
            
            self.results.append(stats.mode(sorted_distances[0:self.n_neighbours, 1])[0][0])
        




In [141]:
model = knn(train_data=feats_array, class_labels=class_labels, n_neighbours=5)

In [142]:
model.predict(unknown=feats_array)

### I predict and evaluate 

In [143]:
f1Score = f1_score(y_true=class_labels, y_pred = model.results)
accuracyScore = accuracy_score(y_true=class_labels, y_pred = model.results)

This is a rough measure of goodness of fit as I'm using predictions based off of the training data, this isn't a proper assesment of the model otherwise I would use k-fold stratified cross-validation.

In [151]:
print(f'f1_score for model predictions is {f1Score}')
print(f'accuracy_score for model predictions is {accuracyScore}')

f1_score for model predictions is 0.703125
accuracy_score for model predictions is 0.7867564534231201


In [152]:
randomLabel = pd.Series(randint(low = 0, high =2, size = data.shape[0]))

In [153]:
f1ScoreRandom = f1_score(class_labels, y_pred=randomLabel)
accuracyScoreRandom = accuracy_score(y_true=class_labels, y_pred=randomLabel)

In [154]:
print(f'f1_score for random predictions is {f1ScoreRandom}')
print(f'accuracy_score for random predictions is {accuracyScoreRandom}')

f1_score for random predictions is 0.4271356783919598
accuracy_score for random predictions is 0.4882154882154882
