# K-Nearest Neighbor Algorithm
Practical Implementation

### Problem statement
Predict whether a person will have diabetes or not using KNN classifier

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score

In [15]:
df = pd.read_csv(r"G:\Udemy\DATA SCIENCE ineuron\Resources\Dataset\diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


### Replace Zeros with nan

In [16]:
zero_columns = ['Glucose','BloodPressure','SkinThickness','BMI','Insulin']
for col in zero_columns:
    df[col] = df[col].replace(0,np.nan)
    mean = int(df[col].mean(skipna=True))
    df[col] = df[col].replace(np.nan,mean)

### Now we will not find any null value

In [17]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

### Spilt the dataset

In [31]:
X = df.iloc[:,0:8]
y = df.iloc[:,8]
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size= 0.2, random_state=0)

### Standard Scaler

Rule of thumb:: Any algorithm that computes distance or assumes normality, `scale your features!`

In [32]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

### Define the model : KNN Classifier

In [36]:
import math
math.sqrt(len(y_test))

12.409673645990857

- 12 is the even number 
- In knn we predict based on voting method .
- so is is important to have odd neighbours so that we get 1 majority output 
- Therefore to define the model we take n_neighbors = 11

In [33]:
classifier = KNeighborsClassifier(n_neighbors = 11, p =2,metric='euclidean')

In [34]:
# Fit model
classifier.fit(x_train,y_train)

In [37]:
# Predict the model
y_pred = classifier.predict(x_test)

### Evaluate the model

In [39]:
# Evaluate Model
cm = confusion_matrix(y_test,y_pred)
print(cm)
print("F1_Score: ",f1_score(y_test,y_pred))

[[94 13]
 [15 32]]
F1_Score:  0.6956521739130436


In [40]:
print("Accuracy_score: ",accuracy_score(y_test,y_pred))

Accuracy_score:  0.8181818181818182


In [68]:
df[df['Outcome'] == 0]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31,0
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
5,5,116.0,74.0,29.0,155.0,25.6,0.201,30,0
7,10,115.0,72.0,29.0,155.0,35.3,0.134,29,0
10,4,110.0,92.0,29.0,155.0,37.6,0.191,30,0
...,...,...,...,...,...,...,...,...,...
762,9,89.0,62.0,29.0,155.0,22.5,0.142,33,0
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0
764,2,122.0,70.0,27.0,155.0,36.8,0.340,27,0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0


In [58]:
prediction = classifier.predict(np.array([5,121.0,72.0,23.0,112.0,26.2,0.245,30]).reshape(1,8))
if prediction == 1:
    print("The person will be diabetic")
else:
    print("The person will not be diabetic")

The person will be diabetic


In [72]:
classifier.predict(pd.DataFrame([[
    4
    ,110.0,
    92.0,
    29.0,
    155.0,
    37.6,
    0.191,
    30
    ]],columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction', 'Age']))



array([1], dtype=int64)

In [66]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')