# KNN --- K nearest neighbours


In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.neighbors import KNeighborsClassifier

In [3]:
#create features column
features = ['Classes',
            'Alcohol',
            'Malic acid',
            'Ash',
            'Alcalinity of ash',
            'Magnesium',
            'Total phenols',
            'Flavanoids',
            'Nonflavanoid phenols',
            'Proanthocyanins',
            'Color intensity',
            'Hue',
            'OD280/OD315 of diluted wines',
            'Proline']

In [4]:
#load data
df = pd.read_csv('wine.data', header=None)
#add columns
df.columns = features
df.head(2)

Unnamed: 0,Classes,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050


In [5]:
# Classes column is our target. Only for this exercise. In real data KNN has no target
y = df['Classes']
# drop Classes column
X = df.drop(['Classes'], axis=1)

In [6]:
# we'll use kfold method to cross validate our model
from sklearn.model_selection import KFold

In [7]:
#import evaluation libraries
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [8]:
results = list()
names = list()
highest = 0
highest_k = 0

k_list = list(range(1,51))

kfold = KFold(n_splits=5, random_state=42, shuffle=True)
kfold.get_n_splits(X)

for k in k_list: 
    kmeans = KNeighborsClassifier(n_neighbors=k)
    cv_results = cross_val_score(kmeans, X, y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(k)
    if cv_results.mean() > highest:
        highest = cv_results.mean()
        highest_k  = k
    print('%s: %f (%f)' % (k, cv_results.mean(), cv_results.std()))

1: 0.730476 (0.040783)
2: 0.662540 (0.052873)
3: 0.708254 (0.090338)
4: 0.657778 (0.083453)
5: 0.674603 (0.054977)
6: 0.674286 (0.031828)
7: 0.680000 (0.050088)
8: 0.680000 (0.047286)
9: 0.702381 (0.036701)
10: 0.680159 (0.049401)
11: 0.702540 (0.043253)
12: 0.696667 (0.019942)
13: 0.690952 (0.025563)
14: 0.679365 (0.045108)
15: 0.701905 (0.040774)
16: 0.679524 (0.050656)
17: 0.701587 (0.058700)
18: 0.679524 (0.066467)
19: 0.679365 (0.059816)
20: 0.690952 (0.064048)
21: 0.701905 (0.059282)
22: 0.696508 (0.057780)
23: 0.702063 (0.071216)
24: 0.707619 (0.061535)
25: 0.701905 (0.059282)
26: 0.696508 (0.057314)
27: 0.696349 (0.055709)
28: 0.707937 (0.056869)
29: 0.713492 (0.047767)
30: 0.707937 (0.056869)
31: 0.690952 (0.073053)
32: 0.713492 (0.064293)
33: 0.713492 (0.059298)
34: 0.724603 (0.073747)
35: 0.724603 (0.048500)
36: 0.713492 (0.073267)
37: 0.713492 (0.064293)
38: 0.713492 (0.064293)
39: 0.707937 (0.056869)
40: 0.707778 (0.062678)
41: 0.713492 (0.064293)
42: 0.707778 (0.062678)
4

In [9]:
print('%s: %f' % (highest_k, highest))

1: 0.730476


In [10]:
#подключаем библиотеку для нормировки
from sklearn.preprocessing import scale

In [11]:
#нормируем наши данные
proc_X = scale(X)

In [12]:
results = list()
names = list()
highest = 0
highest_k = 0

k_list = list(range(1,51))

kfold = KFold(n_splits=5, random_state=42, shuffle=True)
kfold.get_n_splits(proc_X)

for k in k_list: 
    kmeans = KNeighborsClassifier(n_neighbors=k)
    cv_results = cross_val_score(kmeans, proc_X, y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(k)
    if cv_results.mean() > highest:
        highest = cv_results.mean()
        highest_k  = k
    print('%s: %f (%f)' % (k, cv_results.mean(), cv_results.std()))

1: 0.943968 (0.017327)
2: 0.932857 (0.027991)
3: 0.955079 (0.013688)
4: 0.938254 (0.010817)
5: 0.949365 (0.021393)
6: 0.949524 (0.020712)
7: 0.949524 (0.020712)
8: 0.955238 (0.021986)
9: 0.960794 (0.022468)
10: 0.960794 (0.013352)
11: 0.960952 (0.022145)
12: 0.955238 (0.021986)
13: 0.949524 (0.027810)
14: 0.966349 (0.010958)
15: 0.972063 (0.017571)
16: 0.972063 (0.017571)
17: 0.966508 (0.027186)
18: 0.966508 (0.027186)
19: 0.955397 (0.028236)
20: 0.966508 (0.027186)
21: 0.960952 (0.028267)
22: 0.966508 (0.020747)
23: 0.960952 (0.022145)
24: 0.955238 (0.013219)
25: 0.955238 (0.013219)
26: 0.960794 (0.013352)
27: 0.955238 (0.013219)
28: 0.966349 (0.010958)
29: 0.977619 (0.020832)
30: 0.960794 (0.022066)
31: 0.955238 (0.021986)
32: 0.960794 (0.022066)
33: 0.966349 (0.027154)
34: 0.966349 (0.027154)
35: 0.960794 (0.037588)
36: 0.960794 (0.022066)
37: 0.955238 (0.033177)
38: 0.960794 (0.022066)
39: 0.960794 (0.022066)
40: 0.960794 (0.022066)
41: 0.966349 (0.027154)
42: 0.960794 (0.013352)
4

In [13]:
print('%s: %f' % (highest_k, highest))

29: 0.977619


Нормировка данных помогла. Как видно, после нормировки точность поднялась больше чем на 20%. 

Задача упражнения состояла в том, чтобы при помощи метода ближайших соседей kNN (k-nearest neighbors) правильно классифицировать типы вин. 