# K Nearest Neighbours

This is an implementation of K nearest neighbours from scratch.

I build a model to address the famous Kaggle Titanic problem, a binary classification problem: each individual must be predicted as having survived or not survived the Titanic disaster.

In [7]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score
from numpy.random import randint

%matplotlib inline
# Show every row when displaying frames. The full option key is spelled out
# because pandas matches option names as regex patterns: the short form
# 'max.rows' also matches 'styler.render.max_rows' on recent pandas versions
# and raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_rows', None)

### read data

In [8]:
# Load the training data; 'train.csv' is resolved relative to the notebook's
# working directory (presumably the Kaggle Titanic train split - confirm).
data = pd.read_csv('train.csv')

In [9]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Numeric feature columns used for the distance calculations.
# .copy() makes `feats` an independent frame, so assigning to its columns later
# neither raises SettingWithCopyWarning nor risks writing through to `data`.
feats = data[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']].copy()

### impute Age NA values with mean  

In [11]:
# Replace missing ages with the mean age of the full data set.
# assign() builds a new frame instead of setting an attribute on a frame that
# was derived from `data`, which avoids SettingWithCopyWarning and makes the
# cell safe to re-run.
feats = feats.assign(Age=feats['Age'].fillna(data['Age'].mean()))

In [12]:
feats.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,3,35.0,0,0,8.05


### defining variables

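each observation's features form an n-dimensional vector:
$$x \in \mathbb{R}^{n}$$
and stacking all m observations gives the feature matrix:
$$X \in \mathbb{R}^{m \times n}$$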
    

    


* scale variables
* choose distance metric
* identify closest K neighbours
* calculate mode class

approach 1: object-oriented (one object per observation)

class attributes will be:
* classification 
* feature 1 value, feature 2 value ... feature j value 

approach 2: vectorised

* store feature data in a numpy array then calculate distance metrics using vectorised operations

my prediction is that approach 2 will be faster

distance metrics to be implemented:
* Euclidean
* Manhattan
* Minkowski
* Mahalanobis

The algorithm type for calculating the nearest neighbours will be brute force

### distance functions

In [None]:
def euclidean_distance(points, query):
    """Return the Euclidean (L2) distance from each row of `points` to `query`.

    Parameters
    ----------
    points : array-like, shape (m, n) or (n,)
        One observation per row.
    query : array-like, shape (n,)
        The point distances are measured to; broadcast against every row.

    Returns
    -------
    numpy.ndarray of shape (m,), or a scalar when `points` is 1-D.
    """
    diffs = np.asarray(points, dtype=float) - np.asarray(query, dtype=float)
    return np.sqrt((diffs ** 2).sum(axis=-1))


def manhattan_distance(points, query):
    """Return the Manhattan (L1) distance from each row of `points` to `query`.

    Parameters and return shape are the same as for `euclidean_distance`.
    """
    diffs = np.asarray(points, dtype=float) - np.asarray(query, dtype=float)
    return np.abs(diffs).sum(axis=-1)


def minkowski_distance(points, query, p=2):
    """Return the Minkowski distance of order `p` from each row to `query`.

    `p=1` gives the Manhattan distance and `p=2` the Euclidean distance.

    Raises
    ------
    ValueError
        If `p` is smaller than 1 (the result would no longer be a metric).
    """
    if p < 1:
        raise ValueError("p must be >= 1 for the Minkowski distance")
    diffs = np.asarray(points, dtype=float) - np.asarray(query, dtype=float)
    return (np.abs(diffs) ** p).sum(axis=-1) ** (1.0 / p)


def mahalanobis(points, query, cov_inv=None):
    """Return the Mahalanobis distance from each row of `points` to `query`.

    Parameters
    ----------
    points : array-like, shape (m, n)
        One observation per row.
    query : array-like, shape (n,)
        The point distances are measured to.
    cov_inv : array-like, shape (n, n), optional
        Inverse covariance matrix of the features. When omitted it is
        estimated from `points`, which then needs at least two rows.

    Returns
    -------
    numpy.ndarray of shape (m,).

    Raises
    ------
    ValueError
        If `cov_inv` is omitted and `points` has fewer than two rows.
    """
    points = np.atleast_2d(np.asarray(points, dtype=float))
    diffs = points - np.asarray(query, dtype=float)
    if cov_inv is None:
        if points.shape[0] < 2:
            raise ValueError(
                "at least two rows are needed to estimate the covariance")
        covariance = np.atleast_2d(np.cov(points, rowvar=False))
        # The pseudo-inverse copes with singular covariance matrices
        # (e.g. a constant or perfectly correlated feature).
        cov_inv = np.linalg.pinv(covariance)
    cov_inv = np.atleast_2d(np.asarray(cov_inv, dtype=float))
    # Row-wise quadratic form d_i^T * cov_inv * d_i.
    squared = np.einsum('ij,jk,ik->i', diffs, cov_inv, diffs)
    # Clamp tiny negative values caused by floating-point round-off.
    return np.sqrt(np.maximum(squared, 0.0))


def test_distance():
    """Check every distance function against a hand-computed 3-4-5 triangle."""
    points = np.array([[0.0, 0.0], [3.0, 4.0]])
    query = np.array([0.0, 0.0])
    assert np.allclose(euclidean_distance(points, query), [0.0, 5.0])
    assert np.allclose(manhattan_distance(points, query), [0.0, 7.0])
    assert np.allclose(minkowski_distance(points, query, p=1), [0.0, 7.0])
    assert np.allclose(minkowski_distance(points, query, p=2), [0.0, 5.0])
    # With an identity covariance the Mahalanobis distance is Euclidean.
    assert np.allclose(mahalanobis(points, query, cov_inv=np.eye(2)), [0.0, 5.0])

In [None]:
def compute_nearest_neighbours(train, test, distance_metric, k=5):
    """Find the `k` nearest training rows for every test row by brute force.

    Parameters
    ----------
    train : array-like, shape (m, n)
        Known observations, one per row.
    test : array-like, shape (q, n) or (n,)
        Observations to find neighbours for; a single 1-D point is accepted.
    distance_metric : callable
        Called as `distance_metric(train, query)` and expected to return the
        distance from each training row to the 1-D `query`, shape (m,).
    k : int, default 5
        Number of neighbours to return per test row.

    Returns
    -------
    numpy.ndarray of int, shape (q, k)
        Row indices into `train`, ordered from nearest to farthest. Ties are
        broken in favour of the lower row index.

    Raises
    ------
    ValueError
        If `k` is not between 1 and the number of training rows.
    """
    train = np.atleast_2d(np.asarray(train, dtype=float))
    test = np.atleast_2d(np.asarray(test, dtype=float))
    if not 1 <= k <= len(train):
        raise ValueError("k must be between 1 and the number of training rows")

    neighbours = np.empty((len(test), k), dtype=int)
    for row, query in enumerate(test):
        distances = np.asarray(distance_metric(train, query))
        # A stable sort keeps tie-breaking deterministic.
        neighbours[row] = np.argsort(distances, kind='stable')[:k]
    return neighbours

In [13]:
feats.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,3,35.0,0,0,8.05


In [14]:
# Convert the feature frame to a plain numpy array so the distance
# calculations can use vectorised operations.
feats_array = np.array(feats)

In [15]:
feats_array

array([[ 3.        , 22.        ,  1.        ,  0.        ,  7.25      ],
       [ 1.        , 38.        ,  1.        ,  0.        , 71.2833    ],
       [ 3.        , 26.        ,  0.        ,  0.        ,  7.925     ],
       ...,
       [ 3.        , 29.69911765,  1.        ,  2.        , 23.45      ],
       [ 1.        , 26.        ,  0.        ,  0.        , 30.        ],
       [ 3.        , 32.        ,  0.        ,  0.        ,  7.75      ]])

In [21]:
# Use the first row of the feature array as a stand-in for an unseen point.
example_unknown = feats_array[0,:]

In [22]:
example_unknown

array([ 3.  , 22.  ,  1.  ,  0.  ,  7.25])

In [25]:
# Broadcasting subtracts the query point from every row at once.
differences = feats_array-example_unknown

In [26]:
differences

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [-2.        , 16.        ,  0.        ,  0.        , 64.0333    ],
       [ 0.        ,  4.        , -1.        ,  0.        ,  0.675     ],
       ...,
       [ 0.        ,  7.69911765,  0.        ,  2.        , 16.2       ],
       [-2.        ,  4.        , -1.        ,  0.        , 22.75      ],
       [ 0.        , 10.        , -1.        ,  0.        ,  0.5       ]])

In [39]:
# Scratch check of the ** (power) operator used in the distance calculation.
2**3

8

In [40]:
# Euclidean distance of every row from the query point: square the per-feature
# differences, sum across the feature axis, take the square root. Reducing over
# axis=1 works for any number of feature columns instead of hard-coding five.
euc_dists = np.sqrt((differences ** 2).sum(axis=1))

In [44]:
# Hand check of the second row of `differences` (-2, 16, 0, 0, 64.0333);
# the result should equal euc_dists[1].
(2**2 + 16**2 + 64.0333**2)**0.5

66.03229141026381

In [45]:
euc_dists

array([  0.        ,  66.03229141,   4.17799294,  47.69929245,
        13.06292463,   7.85725152,  54.94793132,  24.41578639,
         6.71416554,  24.20307651,  20.35442212,  40.90831211,
         2.37486842,  29.85298352,   8.08486596,  34.16961369,
        29.80797922,   9.71282207,  14.02007489,   7.76382879,
        22.85962598,  13.38142369,   7.11387044,  28.96657557,
        19.80228838,  29.38739366,   7.76382879, 255.79105242,
         7.78924291,   7.7906014 ,  27.35056951, 139.49778545,
         7.77987227,  44.14252485,  75.18727467,  49.05672737,
         7.76381641,   1.62480768,  11.51357894,   8.94056312,
        18.13699603,  14.66500938,   7.7906014 ,  39.30005054,
         3.22426622,   7.8048967 ,  11.28445446,   7.77987227,
        16.38530516,  11.28284095,  35.87744983,   1.51739909,
        74.56781633,  20.03902443,  69.64398992,  29.36560765,
         3.68272997,   6.57650611,  26.72545603,  41.38988403,
         1.0002163 ,  74.52222823,  79.64452665,  27.53