In [None]:
# In this notebook, KNN is used with StandardScaler() and n_neighbors is set to sqrt(len(Y_test)) - 1, rounded down to 11.
# 1. KNN is a supervised learning model.
# 2. KNN is a non-linear model: it looks at several nearest neighbors and assigns the sample the class most common among them.

In [1]:
import math

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
# Load the diabetes data from the CSV file into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [3]:
# Preview the first rows, then show the total number of rows.
print(dataset.head(), len(dataset), sep='\n')

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
768


In [4]:
# Data cleaning: columns in which a value of 0 is not accepted as a real
# measurement and will be replaced in the next step.
zero_not_accepted = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'BMI',
    'Insulin',
]

In [5]:
# Replace the zeros in the listed columns with the column mean.
# Zeros are first turned into NaN so that they are left out of the mean.
# NOTE(review): the mean is computed on the full dataset, before the
# train/test split done in a later cell, so test rows influence the imputed
# value; consider imputing with training-set statistics only.
for column in zero_not_accepted:
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    dataset[column] = dataset[column].replace(0, np.nan)
    # Mean of the remaining (non-missing) values. int() truncates the mean
    # toward zero; this is kept so the imputed values match the original analysis.
    mean = int(dataset[column].mean(skipna=True))
    # Fill the former zeros with that mean, which disturbs the column's
    # distribution the least.
    dataset[column] = dataset[column].fillna(mean)

In [6]:
# Separate the inputs (the first eight columns) from the output (the ninth column).
X = dataset.iloc[:, :8]
Y = dataset.iloc[:, 8]
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the split reproducible.
split = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = split

In [7]:
# Feature scaling.
# StandardScaler standardizes every feature to zero mean and unit variance
# (MinMaxScaler, by contrast, rescales to a fixed range such as [0, 1]).
sc_X = StandardScaler()
# fit() learns the per-feature mean and variance from the training data;
# transform() then applies the standardization using those statistics.
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
# The test data must reuse the statistics learned from the training data,
# so it is only transformed, never fitted.
X_test = sc_X.transform(X_test)

In [8]:
# The fitted scaler object keeps the statistics it learned as attributes:
# the per-feature means (mean_) and variances (var_).
scaler_stats = (sc_X.mean_, sc_X.var_)
print(*scaler_stats)

[  3.87459283 121.68403909  72.62703583  29.21986971 156.35504886
  32.62361564   0.46647068  33.59446254] [1.18393219e+01 9.14372481e+02 1.44067738e+02 7.80900938e+01
 7.58517036e+03 4.88455498e+01 1.10843562e-01 1.44153129e+02]


In [9]:
# Rule-of-thumb starting point for n_neighbors: the square root of the number
# of test samples. Subtracting 1 from it and rounding down gives the first
# value to try; other values still need to be tried and evaluated.
# NOTE(review): the heuristic here is based on the test-set size — confirm
# this is intended rather than the training-set size.
math.sqrt(len(Y_test))

12.409673645990857

In [10]:
# Define the model: a K-nearest-neighbors classifier with 11 neighbors.
# p is the power parameter of the Minkowski distance (p=2 corresponds to the
# Euclidean distance); it is unrelated to the number of output classes.
knn_settings = {'n_neighbors': 11, 'p': 2, 'metric': 'euclidean'}
classifier = KNeighborsClassifier(**knn_settings)

In [11]:
# Train the classifier on the training features and their labels.
classifier.fit(X=X_train, y=Y_train)

In [12]:
# Predict the class of every test sample and show the predictions.
Y_pred = classifier.predict(X=X_test)
print(Y_pred)

[1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0
 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 0 1 1 1 1 0 0 0 0 0 0 1
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 0
 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 0 0 0 0]


In [13]:
# Evaluate the model: in the confusion matrix, rows are the true classes
# and columns are the predicted classes.
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print(cm)

[[94 13]
 [15 32]]


In [14]:
# The F1 score combines the model's precision and recall into a single number.
f1 = f1_score(y_true=Y_test, y_pred=Y_pred)
print(f1)

0.6956521739130436


In [15]:
# The accuracy score is the fraction of predictions that match the true labels.
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(accuracy)

0.8181818181818182


In [None]:
# Given this result, other n_neighbors values could be tried to see whether the performance improves.