# K-Nearest Neighbour for data imputation

## Import Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [9]:
# Read the Titanic training CSV and discard the PassengerId identifier column
# in a single expression, so re-running the cell always rebuilds all_df from disk.
all_df = (
    pd.read_csv('./titanic_train.csv', index_col=False)
    .drop(columns='PassengerId')
)
# Preview the first rows
all_df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Column dtypes and non-null counts, to see which columns contain missing values
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [11]:
# Summary statistics for the Age column only (count, mean, std, quartiles, min/max)
all_df['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [12]:
# Frequency of each distinct Age value, most common first
# (value_counts leaves out NaN by default, so missing ages are not listed)
all_df['Age'].value_counts()

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: Age, Length: 88, dtype: int64

In [13]:
# Feature matrix: start from the full frame and discard the columns that are
# not used as features (Name, Ticket, Cabin).
# Rows with missing values are kept here; no dropna is applied.
X = all_df.drop(columns=['Name', 'Ticket', 'Cabin'])

# Target labels: copy Survived into y, then remove it from the feature matrix
y = X['Survived']
X = X.drop(columns='Survived')

# Replace the string categories in 'Sex' and 'Embarked' with integer codes.
# NOTE(review): the raw Embarked column has 889 non-null values, but after
# encoding X.info() reports 891 non-null, so the two missing entries appear to be
# encoded as their own class rather than kept as missing. Confirm this is intended.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for column in ('Sex', 'Embarked'):
    X[column] = le.fit_transform(X[column])

X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


In [14]:
# Dtypes and non-null counts of the feature columns after encoding
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    int64  
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 48.9 KB


In [15]:
# Example of calculating Euclidean distance
from math import sqrt
 
# calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two equal-length numeric vectors.

    Every position of the vectors contributes to the distance. The previous
    version stopped one element early (``len(row1) - 1``), which silently
    ignored the last feature; the rows built in this notebook no longer carry
    a label in the last position, so nothing should be skipped.

    Parameters
    ----------
    row1, row2 : sequence of numbers
        Vectors to compare; indexed by position. ``row2`` must have at least
        as many elements as ``row1``.

    Returns
    -------
    float
        Square root of the summed squared differences. The result is NaN if
        any compared coordinate is NaN.
    """
    distance = 0.0
    # Iterate over ALL coordinates of row1 (no trailing label column to skip)
    for i in range(len(row1)):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

In [22]:
# Convert the feature table to a float NumPy array so that rows can be taken by
# position (X[0], X[2], ...) in the cells below; on a DataFrame, X[0] would be a
# column lookup and fail.
# np.asarray also accepts an array that is already converted, so this cell is
# safe to re-run (unlike X = X.to_numpy(), which fails on the second run).
X = np.asarray(X, dtype=float)
X

array([[ 3.    ,  1.    , 22.    , ...,  0.    ,  7.25  ,  2.    ],
       [ 1.    ,  0.    , 38.    , ...,  0.    , 71.2833,  0.    ],
       [ 3.    ,  0.    , 26.    , ...,  0.    ,  7.925 ,  2.    ],
       ...,
       [ 3.    ,  0.    ,     nan, ...,  2.    , 23.45  ,  2.    ],
       [ 1.    ,  1.    , 26.    , ...,  0.    , 30.    ,  0.    ],
       [ 3.    ,  1.    , 32.    , ...,  0.    ,  7.75  ,  1.    ]])

In [19]:
# Sanity check: distance between rows 0 and 2 of the feature array
euclidean_distance(X[0],X[2])

4.296001047485906

In [23]:
# Locate the most similar neighbors
def get_neighbours(data, row, K):
    """Return the K rows of ``data`` that are closest to ``row``.

    Distances are computed with ``euclidean_distance``. Rows are returned in
    order of increasing distance; ties keep their order of appearance in
    ``data``. If ``row`` itself is part of ``data`` it is returned as well
    (at distance 0).

    Parameters
    ----------
    data : iterable of rows
        Candidate rows to search.
    row : sequence of numbers
        Query row.
    K : int
        Maximum number of neighbours to return. If ``data`` holds fewer than
        ``K`` rows, all of them are returned instead of raising IndexError.

    Returns
    -------
    list
        Up to ``K`` rows from ``data``, nearest first.

    Notes
    -----
    A NaN distance (e.g. a row with a missing value) is ranked as infinitely
    far. NaN compares as neither smaller nor larger than any number, so leaving
    it in the sort key would make the resulting order unreliable.
    NOTE(review): if ``row`` itself contains NaN, every distance is NaN and the
    returned rows are simply the first ones in ``data``; confirm how query rows
    with missing values should be handled.
    """
    from math import isnan  # local import: only ``sqrt`` is imported from math elsewhere

    scored = []
    for data_row in data:
        dist = euclidean_distance(row, data_row)
        # Push undefined distances to the end instead of letting NaN break the sort
        if isnan(dist):
            dist = float('inf')
        scored.append((data_row, dist))
    scored.sort(key=lambda pair: pair[1])
    # Slicing tolerates K > len(data); max() keeps a negative K returning []
    return [data_row for data_row, _ in scored[:max(K, 0)]]

In [26]:
# Five nearest rows to the first row of X; the query row is itself part of X,
# so it comes back as the first neighbour (distance 0)
test = get_neighbours(X, X[0], 5)
test

[array([ 3.  ,  1.  , 22.  ,  1.  ,  0.  ,  7.25,  2.  ]),
 array([ 3. ,  1. , 21. ,  0. ,  0. ,  7.8,  2. ]),
 array([ 3.  ,  1.  , 21.  ,  0.  ,  0.  ,  8.05,  2.  ]),
 array([ 3.  ,  1.  , 20.  ,  0.  ,  0.  ,  8.05,  2.  ]),
 array([ 3.    ,  0.    , 19.    ,  0.    ,  0.    ,  7.8792,  1.    ])]