# K Nearest Neighbors

kNN is one of the simplest classification algorithms available for supervised learning. The idea is to find the training samples that lie closest to a test point in feature space and assign the class that is most common among those nearest neighbors.

<img src='images/knn_theory.png' />

In [23]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Data loading & Understanding

In [24]:
# Read the Social Network Ads CSV (path relative to the notebook) into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='data/Social_Network_Ads.csv')

In [25]:
# Preview the first five rows to see the column names and value formats.
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [26]:
# Summary statistics for the numeric columns (the non-numeric Gender column is left out by default).
dataset.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [27]:
# Non-null entries per column; identical counts across columns indicate no gaps.
dataset.count()

User ID            400
Gender             400
Age                400
EstimatedSalary    400
Purchased          400
dtype: int64

In [28]:
# Explicit missing-value tally per column (all zeros means nothing needs imputing).
dataset.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [29]:
# Features are the two numeric columns Age and EstimatedSalary (positions 2 and 3);
# the target is the binary Purchased flag (position 4).
X = dataset.loc[:, ['Age', 'EstimatedSalary']].values
y = dataset.loc[:, 'Purchased'].values

In [30]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [31]:
state_label_encoder = LabelEncoder()

In [32]:
X[:,1]=state_label_encoder.fit_transform(X[:,1])

In [33]:
X[:2, :]

array([[19,  4],
       [35,  5]], dtype=int64)

In [34]:
state_onehot_encoder = OneHotEncoder(categorical_features = [1], dtype = np.int)
X = state_onehot_encoder.fit_transform(X).toarray()

In [35]:
# Inspect the first ten rows of the feature matrix before splitting.
X[:10, :]

array([[ 0.,  0.,  0., ...,  0.,  0., 19.],
       [ 0.,  0.,  0., ...,  0.,  0., 35.],
       [ 0.,  0.,  0., ...,  0.,  0., 26.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  1., 32.],
       [ 0.,  0.,  0., ...,  0.,  0., 25.],
       [ 0.,  0.,  0., ...,  0.,  0., 35.]])

## Splitting the dataset into training and test sets

In [36]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

In [37]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [38]:
# First five training rows after the split (features are not yet scaled at this point).
X_train[:5, :]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        58.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,

In [39]:
# Targets for the first five training rows.
y_train[:5]

array([1, 0, 0, 0, 1], dtype=int64)

## Feature Scaling

In [40]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

In [41]:
# k-NN is distance-based, so features on very different scales must be standardised first.
sc = StandardScaler()

In [42]:
# Learn the scaling statistics from the training rows only, then apply that same
# transformation to both splits so no information from the test set leaks in.
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [43]:
# First five training rows after standardisation.
X_train[:5, :]

array([[-0.09728167, -0.07930516, -0.09728167, -0.09728167, -0.07930516,
        -0.07930516, -0.07930516, -0.09728167, -0.14954671, -0.09728167,
        -0.09728167, -0.09728167, -0.11250879, -0.07930516, -0.09728167,
        -0.07930516, -0.11250879, -0.1382327 , -0.07930516, -0.07930516,
        -0.07930516, -0.05598925, -0.09728167, -0.11250879, -0.09728167,
        -0.11250879, -0.12598816, -0.11250879, -0.07930516, -0.05598925,
        -0.09728167, -0.05598925, -0.07930516, -0.11250879, -0.09728167,
        -0.11250879, -0.07930516, -0.11250879, -0.12598816, -0.12598816,
        -0.07930516, -0.16012815, -0.12598816, -0.1382327 , -0.05598925,
        -0.09728167, -0.05598925, -0.1382327 , -0.07930516, -0.07930516,
        -0.05598925, -0.09728167, -0.17011439, -0.19738551, -0.09728167,
        -0.14954671, -0.16012815, -0.07930516,  0.        , -0.07930516,
        -0.14954671, -0.16012815, -0.05598925, -0.12598816, -0.07930516,
        -0.09728167, -0.05598925, -0.07930516, -0.0

In [44]:
# Targets are not touched by feature scaling; shown again alongside the scaled features.
y_train[:5]

array([1, 0, 0, 0, 1], dtype=int64)

# K Nearest Neighbors

In [45]:
# Fitting K-NN to the Training set
from sklearn.neighbors import KNeighborsClassifier

In [46]:
# Classify each point by majority vote among its 3 nearest training samples.
knn = KNeighborsClassifier(n_neighbors=3)

In [47]:
# Fit the classifier on the scaled training features and their labels.
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [42]:
# Predict class labels for the held-out test rows; reused below for the confusion matrix.
y_test_pred = knn.predict(X_test)

## Accuracy

In [43]:
# Training accuracy: share of training rows the fitted model classifies correctly.
# NOTE(review): the recorded value below (0.7638...) is not a multiple of 1/320,
# the training-set size implied by test_size=0.20 on 400 rows; it equals 275/360,
# so it appears to come from an earlier run with a different split. Re-run the
# notebook top to bottom (Restart & Run All) to refresh it.
knn.score(X_train, y_train)

0.7638888888888888

In [44]:
# Testing accuracy: share of held-out rows the fitted model classifies correctly.
# NOTE(review): the recorded confusion matrix further down sums to 40 samples,
# whereas test_size=0.20 on 400 rows yields 80 test rows, so this recorded score
# is presumably from the same earlier run — re-run to refresh it.
knn.score(X_test, y_test)

0.75

## Confusion Matrix
<img src='images/confusion_matrix_simple2.png'/>

In [45]:
# Making the Confusion Matrix
# Arguments are (true labels, predicted labels); in scikit-learn's layout each row
# is a true class and each column a predicted class.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_test_pred)

In [46]:
# NOTE(review): the recorded matrix below sums to 40 samples, but test_size=0.20 on
# 400 rows gives an 80-row test set — this output appears stale from an earlier
# split; re-run the notebook to refresh it.
cm

array([[29,  3],
       [ 7,  1]], dtype=int64)