In [1]:
# Previously we looked at how to deal with missing values 
# This covered using statistical measures to fill in the values
# eg mean, median , most frequest and constant 
# another way is using KNN imputation 
# This notebook will cover how to use the nearest neighbour imputation 

In [2]:
# we will use the horse colic dataset. This shows medical records for horses which had colic and whether they lived or died
# ( all horse die nin the end so maybe a misnomer but bear with us)

In [4]:
# first lets load the dataset 
import pandas as pd

df_horse_data = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv',header =None, na_values='?')

In [5]:
df_horse_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2


In [8]:
df_horse_data.isnull().sum() / df_horse_data.isnull().count()

0     0.003333
1     0.000000
2     0.000000
3     0.200000
4     0.080000
5     0.193333
6     0.186667
7     0.230000
8     0.156667
9     0.106667
10    0.183333
11    0.146667
12    0.186667
13    0.346667
14    0.353333
15    0.823333
16    0.340000
17    0.393333
18    0.096667
19    0.110000
20    0.550000
21    0.660000
22    0.003333
23    0.000000
24    0.000000
25    0.000000
26    0.000000
27    0.000000
dtype: float64

In [10]:
# to use the Nearest Neighbour KNN we use KNNImputer 
from sklearn.impute import KNNImputer

In [12]:
KN = KNNImputer()
X = df_horse_data

In [13]:
KN.fit(X)

KNNImputer()

In [14]:
KN.transform(X)

array([[2.00000e+00, 1.00000e+00, 5.30101e+05, ..., 0.00000e+00,
        0.00000e+00, 2.00000e+00],
       [1.00000e+00, 1.00000e+00, 5.34817e+05, ..., 0.00000e+00,
        0.00000e+00, 2.00000e+00],
       [2.00000e+00, 1.00000e+00, 5.30334e+05, ..., 0.00000e+00,
        0.00000e+00, 1.00000e+00],
       ...,
       [1.00000e+00, 1.00000e+00, 5.29386e+05, ..., 0.00000e+00,
        0.00000e+00, 2.00000e+00],
       [1.00000e+00, 1.00000e+00, 5.30612e+05, ..., 0.00000e+00,
        0.00000e+00, 1.00000e+00],
       [1.00000e+00, 1.00000e+00, 5.34618e+05, ..., 0.00000e+00,
        0.00000e+00, 2.00000e+00]])

In [15]:
import numpy as np

In [18]:
np.isnan(KN.transform(X)).sum()

0

In [19]:
# lets break KNN down into more detail
# https://www.datacamp.com/community/tutorials/k-nearest-neighbor-classification-scikit-learn
# different distance measures:
# Euclidean distance, Hamming distance, Manhattan distance and Minkowski distance
# Question is how many neighbour you need...ie what is K 
# Lets demonstrate this on the dataset 

In [20]:
KN = KNNImputer(n_neighbors=5, weights='uniform',metric='nan_euclidean')

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline


In [22]:
X = df_horse_data.drop([23], axis =1)
y = df_horse_data[23]


In [25]:
RF = RandomForestClassifier()
PIPE = Pipeline(steps = [('KNN',KN), ('RF',RF)])
RSK = RepeatedStratifiedKFold(n_splits=5, n_repeats=3,random_state=42)
cs = cross_val_score(PIPE, X,y, cv = RSK, scoring= 'accuracy')

In [26]:
print (np.mean(cs), np.std(cs))

0.8611111111111112 0.027666443551086058


In [27]:
# how do we know n_neighbours is best? 
# we dont. So lets test across various neighbours

np.arange(1,11,2)

array([1, 3, 5, 7, 9])

In [30]:
results = []
for nn in np.arange(1,11,2):
    KN = KNNImputer(n_neighbors=nn, weights='uniform',metric='nan_euclidean')
    RF = RandomForestClassifier()
    PIPE = Pipeline(steps = [('KNN',KN), ('RF',RF)])
    RSK = RepeatedStratifiedKFold(n_splits=5, n_repeats=3,random_state=1)
    cs = cross_val_score(PIPE, X,y, cv = RSK, scoring= 'accuracy')
    print(nn, np.mean(cs), np.std(cs))
    results.append(results)

1 0.8588888888888888 0.032126293988446575
3 0.8622222222222222 0.03520662115056636
5 0.8688888888888887 0.04166296279833926
7 0.8588888888888888 0.03744955454745048
9 0.8655555555555555 0.039659040661277206


In [31]:
import matplotlib.pyplot as plt

In [None]:
plt.boxplot(results)