# K Nearest Neighbours

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the dataset from the notebook's working directory and preview the first rows.
DATA_PATH = 'teleCust1000t.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Predictor columns used as model inputs; 'custcat' is left out because it is the target.
feature_columns = [
    'region', 'tenure', 'age', 'marital', 'address', 'income',
    'ed', 'employ', 'retire', 'gender', 'reside',
]
# Plain NumPy array of the selected columns (no dtype conversion at this point).
X = df[feature_columns].values
X[:5]

In [None]:
# Target vector: the 'custcat' column as a NumPy array.
target_column = 'custcat'
y = df[target_column].values
y[:5]

### Normalizing the features

Data standardization gives the data zero mean and unit variance. <br/>
It is always a good idea to normalize the features before applying algorithms such as KNN, which are based on the distance between cases.

In [None]:
from sklearn import preprocessing

In [None]:
# Standardise each feature column to zero mean and unit variance, working on a
# float copy of the feature matrix.
# NOTE(review): the scaler is fitted on every row, before the train/test split,
# so test-set statistics leak into the scaling -- consider fitting on the
# training rows only.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X.astype(float))
X[:5]

### Splitting the data into train & test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed random_state pins the shuffle,
# so the split -- and every accuracy reported from it -- is reproducible after
# a kernel restart instead of changing on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Training the model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Number of neighbours used by this first, hand-picked model.
k = 4
model = KNeighborsClassifier(n_neighbors=k)

In [None]:
# Fit the classifier on the training split; as the cell's last expression,
# the value returned by fit() is displayed as the cell output.
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted label for every row of the held-out test split.
y_hat = model.predict(X=X_test)

In [None]:
from sklearn import metrics
# Score the fitted model on the data it was trained on and on the held-out
# split; a large gap between the two numbers points to over-fitting.
train_accuracy = metrics.accuracy_score(y_train, model.predict(X_train))
test_accuracy = metrics.accuracy_score(y_test, y_hat)
print("Train set Accuracy: ", train_accuracy)
print("Test set Accuracy: %.9f" % test_accuracy)

## Brute forcing to find the best k

To find the best value of **k**, we try every value from 1 up to (but not including) the *number of training samples*. <br />
**Note:** We cannot set the value of k to 0 or to any value that is greater than or equal to the training sample size.<br/>
That is why we obtain the number of samples from the `shape` attribute of *X_train*, which is of type <code>np.ndarray</code>.

In [None]:
# Exhaustive search: fit one KNN model per candidate k in [1, X_train_rows) and
# keep the k whose predictions score the highest accuracy on the test split.
#
# NOTE(review): k is selected on the same test split that is then reported, so
# "Maximum Test accuracy" is an optimistic estimate -- consider a separate
# validation split or cross-validation on the training data.
# NOTE: the loop reuses the names `k`, `model` and `y_hat`; once it finishes
# they refer to the LAST candidate (k = X_train_rows - 1), not to the best one.
best_k = -1  # stays -1 only if the loop body never runs
X_train_rows, X_train_cols = X_train.shape
max_test_accuracy = float("-inf")  # sentinel: the first computed accuracy always replaces it
for k in range(1, X_train_rows):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    # Score each candidate once and reuse the value for both the comparison
    # and the update (previously the metric was computed twice).
    test_accuracy = metrics.accuracy_score(y_test, y_hat)
    # Strict '>' keeps the smallest k when several candidates tie.
    if test_accuracy > max_test_accuracy:
        best_k = k
        max_test_accuracy = test_accuracy

print("Best value for k is", best_k)
print("Maximum Test accuracy is %.9f" % max_test_accuracy)