In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale

## Read data

In [2]:
# Column labels are supplied explicitly: the first one is the class label,
# the remaining thirteen are the measured attributes.
wine_columns = [
    'wine_type',
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity_of_ash',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoid_phenols',
    'proanthocyanins',
    'color_intensity',
    'hue',
    'od_of_diluted_wines',
    'proline',
]

# index_col=False stops pandas from using the first column as the row index.
data = pd.read_csv('data/wine.data', names=wine_columns, index_col=False)
data.head()

Unnamed: 0,wine_type,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od_of_diluted_wines,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
# Target vector: the wine class label of every sample.
y = data['wine_type']
y.describe()

count    178.000000
mean       1.938202
std        0.775035
min        1.000000
25%        1.000000
50%        2.000000
75%        3.000000
max        3.000000
Name: wine_type, dtype: float64

In [4]:
# Feature matrix: every column except the class label.
X = data.drop(['wine_type'], axis='columns')
X.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


## Run kNN without normalization

In [5]:
# Shuffled 5-fold splitter over all samples; the fixed seed keeps the split
# reproducible between runs.
validator = KFold(
    len(y),
    n_folds=5,
    shuffle=True,
    random_state=42,
)

In [6]:
# Mean cross-validated score of kNN on the raw (unscaled) features for
# k = 1..50, collected as (k, mean_score) pairs in increasing order of k.
# cross_val_score fits a clone of the estimator on each training fold, so the
# classifier does not need to be fitted on the full data set beforehand.
scores = []
for n in range(1, 51):
    classifier = KNeighborsClassifier(n_neighbors = n)
    scores.append((n, cross_val_score(classifier, X, y, cv = validator).mean()))

In [7]:
# Best (k, mean CV score) pair. max() returns the first maximal item and
# `scores` is ordered by increasing k, so ties resolve to the smallest k.
max(scores, key=lambda score: score[1])

(1, 0.7304761904761905)

## Run kNN with normalization

In [8]:
# Standardise each feature column to zero mean and unit variance.
# NOTE(review): the statistics are computed over all rows, including those that
# later act as validation folds, so the CV score may be slightly optimistic.
X2 = scale(X, axis=0, with_mean=True, with_std=True)

In [9]:
# Mean cross-validated score of kNN on the standardised features for
# k = 1..50, collected as (k, mean_score) pairs in increasing order of k.
# cross_val_score fits a clone of the estimator on each training fold, so the
# classifier does not need to be fitted on the full data set beforehand.
scores = []
for n in range(1, 51):
    classifier = KNeighborsClassifier(n_neighbors = n)
    scores.append((n, cross_val_score(classifier, X2, y = y, cv = validator).mean()))

In [10]:
# Best (k, mean CV score) pair. max() returns the first maximal item and
# `scores` is ordered by increasing k, so ties resolve to the smallest k.
max(scores, key=lambda score: score[1])

(29, 0.9776190476190475)

## Predict selected samples

In [11]:
# Take k from the best (k, mean CV score) pair of the standardised-feature
# search so this cell stays in sync with the cross-validation results.
best_k = max(scores, key=lambda score: score[1])[0]

# Fit on all standardised samples; X2 already holds scale(X).
classifier = KNeighborsClassifier(n_neighbors = best_k)
classifier.fit(X2, y)

# Predict the class of rows 0 and 155 of the (training) data.
classifier.predict(X2[[0, 155]])

array([1, 3])