# KNN Tips and Tricks

### Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline

Explore the dataset

In [None]:
# Load scikit-learn's bundled wine dataset; the returned object exposes
# .data, .target and .feature_names, which are used below.
wine = load_wine()

In [None]:
# List the fields available on the loaded dataset object.
wine.keys()

Create a temp df to eyeball data

In [None]:
# Number of labels (one per sample).
wine.target.shape

In [None]:
# Eyeball the raw class labels.
wine.target

In [None]:
# (n_samples, n_features) of the feature matrix.
wine.data.shape

In [None]:
# Feature matrix and label vector used for every model below.
X, y = wine.data, wine.target

# Labelled view of the features, only for eyeballing the data.
df = pd.DataFrame(data=X, columns=wine.feature_names)

In [None]:
# Preview the first few rows of the feature table.
df.head()

In [None]:
# Summary statistics per feature; useful for spotting features on very different scales.
df.describe()

Create test/train sets

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

Start with 5 neighbors (default)

In [None]:
# Baseline model: 5 neighbors, fit on the training split (fit returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predicted class for every held-out sample; used by the plots below.
y_pred = knn.predict(X_test)

In [None]:
# Mean accuracy of the 5-neighbor model on the held-out test split.
knn.score(X_test, y_test)

Create plots to look at distributions

In [None]:
# (feature column, marker color, panel title) for the first three features.
panels = [(0, 'r', 'Alcohol%'), (1, 'b', 'Malic_acid'), (2, 'g', 'Ash')]

# Figure 1 puts the predicted labels on the x-axis, figure 2 the true labels,
# so the two rows of panels can be compared side by side.
for fig_num, (labels, heading) in enumerate([(y_pred, 'Predicted'), (y_test, 'Actual')], start=1):
    plt.figure(fig_num, figsize=(9, 3))
    # Subplot codes 131, 132, 133: one row, three columns.
    for position, (col, color, name) in enumerate(panels, start=131):
        plt.subplot(position)
        plt.scatter(labels, X_test[:, col], c=color)
        plt.title(name)
    plt.suptitle(heading)

plt.show()

Increase to 20 neighbors

In [None]:
# Same setup as before, but with a much larger neighborhood (k=20).
knn = KNeighborsClassifier(n_neighbors=20).fit(X_train, y_train)

# Refresh the test-set predictions for the new model.
y_pred = knn.predict(X_test)

In [None]:
# Mean accuracy of the 20-neighbor model on the held-out test split.
knn.score(X_test, y_test)

Whoops, back it up and use an odd number of neighbors, which reduces the chance of tied votes.

In [None]:
# Odd neighborhood size (k=11), fit on the unscaled training split.
knn = KNeighborsClassifier(n_neighbors=11).fit(X_train, y_train)

# Refresh the test-set predictions for the plots that follow.
y_pred = knn.predict(X_test)

In [None]:
# Mean accuracy of the 11-neighbor model on the held-out test split.
knn.score(X_test, y_test)

In [None]:
# (feature column, marker color, panel title) for the first three features.
panels = [(0, 'r', 'Alcohol%'), (1, 'b', 'Malic_acid'), (2, 'g', 'Ash')]

# Figure 1 shows the k=11 predictions, figure 2 the true labels.
for fig_num, (labels, heading) in enumerate([(y_pred, 'Predicted'), (y_test, 'Actual')], start=1):
    plt.figure(fig_num, figsize=(9, 3))
    # Subplot codes 131, 132, 133: one row, three columns.
    for position, (col, color, name) in enumerate(panels, start=131):
        plt.subplot(position)
        plt.scatter(labels, X_test[:, col], c=color)
        plt.title(name)
    plt.suptitle(heading)

plt.show()

Scores and plots appear to reflect the model predicting classifications more accurately!

## Let's try rescaling (blatantly stolen from Michael)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so the test split does not
# influence the statistics used for scaling.
scaler = StandardScaler()
scaler.fit(X_train)

In [None]:
# Apply the fitted scaler to the training features, then wrap the result in a
# labelled DataFrame so it can be inspected with head()/describe().
scaled_train_values = scaler.transform(X_train)
X_scaled = pd.DataFrame(scaled_train_values, columns=wine.feature_names)

In [None]:
# Preview the scaled training features.
X_scaled.head()

In [None]:
# Sanity check: after standardization each column should have mean ~0 and std ~1.
X_scaled.describe()

In [None]:
# Transform the held-out features with the SAME scaler that was fit on the
# training split, keeping the column names the model is fit with.
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=wine.feature_names)

# Fit on the standardized training features.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_scaled, y_train)

# Predict on the scaled TEST features. Predicting on X_scaled (the training
# data) would only measure how well the model memorized its own inputs.
y_pred_scaled = knn.predict(X_test_scaled)

In [None]:
# The model was fit on standardized features, so the test features must pass
# through the same fitted scaler before scoring. Scoring the raw X_test would
# compare points from two different feature spaces and understate accuracy.
knn.score(pd.DataFrame(scaler.transform(X_test), columns=wine.feature_names), y_test)

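Hmm... I'm likely not using this correctly, but the score is discouraging, so let's move on. (Note to self: a model fit on scaled features needs its test features transformed by the same scaler before predicting or scoring.)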

## Adding weights

In [None]:
# Small neighborhood (k=3) where closer neighbors get a larger say in the vote.
knn = KNeighborsClassifier(n_neighbors=3, weights='distance').fit(X_train, y_train)

# Kept under its own name so the earlier unweighted predictions are not overwritten.
y_pred_weighted = knn.predict(X_test)

In [None]:
# Mean accuracy of the distance-weighted 3-neighbor model on the held-out test split.
knn.score(X_test, y_test)

In [None]:
# (feature column, marker color, panel title) for the first three features.
panels = [(0, 'r', 'Alcohol%'), (1, 'b', 'Malic_acid'), (2, 'g', 'Ash')]

# Figure 1 shows the distance-weighted predictions, figure 2 the true labels.
for fig_num, (labels, heading) in enumerate([(y_pred_weighted, 'Predicted'), (y_test, 'Actual')], start=1):
    plt.figure(fig_num, figsize=(9, 3))
    # Subplot codes 131, 132, 133: one row, three columns.
    for position, (col, color, name) in enumerate(panels, start=131):
        plt.subplot(position)
        plt.scatter(labels, X_test[:, col], c=color)
        plt.title(name)
    plt.suptitle(heading)

plt.show()

With a neighborhood this small we are likely overfitting, but we can explore that later.

In [None]:
#### Play with the `algorithm` / `leaf_size` arguments to KNN here
# NOTE(review): per the scikit-learn docs, `algorithm` selects how neighbors are
# searched (e.g. ball_tree, brute), not how distance is measured; the distance
# measure is controlled by `metric` / `p`. Confirm which one was intended.

In [None]:
# Ball-tree neighbor search with leaf_size=30.
knn = KNeighborsClassifier(
    n_neighbors=3,
    algorithm='ball_tree',
    leaf_size=30,
).fit(X_train, y_train)

y_pred = knn.predict(X_test)

In [None]:
# Mean test accuracy with ball_tree, leaf_size=30.
knn.score(X_test, y_test)

In [None]:
# Same ball-tree search, but with smaller leaves (leaf_size=10).
knn = KNeighborsClassifier(
    n_neighbors=3,
    algorithm='ball_tree',
    leaf_size=10,
).fit(X_train, y_train)

y_pred = knn.predict(X_test)

In [None]:
# Mean test accuracy with ball_tree, leaf_size=10.
knn.score(X_test, y_test)

In [None]:
# Brute-force neighbor search, for comparison with the ball-tree runs.
knn = KNeighborsClassifier(n_neighbors=3, algorithm='brute').fit(X_train, y_train)

y_pred = knn.predict(X_test)

In [None]:
# Mean test accuracy with brute-force neighbor search.
knn.score(X_test, y_test)