## K-NN Algorithm for Classification Problem

In [251]:
# importing required library
import numpy as np  
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [252]:
# Reading Data
iris = pd.read_csv('iris.csv')

In [253]:
# checking Few Observation of Dataset
iris.head(5)

Unnamed: 0.1,Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,1,5.1,3.5,1.4,0.2,setosa
1,2,4.9,3.0,1.4,0.2,setosa
2,3,4.7,3.2,1.3,0.2,setosa
3,4,4.6,3.1,1.5,0.2,setosa
4,5,5.0,3.6,1.4,0.2,setosa


In [254]:
# Assigning Predictor/Features & Target/Label 
x = iris[['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']]
y = iris['Species']

In [255]:
# Spliting Data for test-train using train_test_splitabs
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, stratify = y, random_state = 100) 

In [256]:
## Transforming to Standardization as it will reduce domination of a variable with large Range.
## Using Scipy we can get standard normal distribution but only for varaible which follows Normal dist.
## if A varaible distribution is unknown then we can standardize it. as follow x_std = (Xi - X_bar)/std(X)
## we will use scipy lib only to transform variable to standardized data.

import scipy.stats as st
x_train = pd.DataFrame(st.zscore(x_train))
x_test = pd.DataFrame(st.zscore(x_test))

In [257]:
## Checking assigned Varaible
print(f'x_train data: \n{ x_train.head(2)}')
print(f'x_test data: \n{x_test.head(2)}')
print(f'y_train data: \n{y_train.head(2)}')
print(f'y_test data: \n{y_test.head(2)}')

x_train data: 
          0         1         2         3
0  0.284751 -0.671977  0.129450  0.111664
1 -0.441537 -1.160264  0.354115 -0.018611
x_test data: 
          0         1         2         3
0 -1.087814  0.174162 -1.305478 -1.303803
1 -0.357193 -1.063303  0.157334  0.183253
y_train data: 
71    versicolor
90    versicolor
Name: Species, dtype: object
y_test data: 
34        setosa
89    versicolor
Name: Species, dtype: object


### Function 1
### * To calculate Euclidean Distance between two Vector

In [258]:
# Euclidean Distance function using "for loop" method 
def DISTANCE(x,y):
    distance = np.sqrt(np.sum((x-y)**2))
    return(distance)

In [259]:
DISTANCE(x_train.iloc[87,],x_test.iloc[0,])

0.20525803196776615

In [260]:
# Euclidean Distance function using Vectorized Notation 
def DIST(x,y):
    distance = np.sqrt(np.dot((x-y), np.transpose(x-y)))
    return(distance)

In [261]:
DIST(x_train.iloc[87,],x_test.iloc[0,])

0.20525803196776615

### Function 2
### * To calculate KNN for one test observation where K = 3

In [262]:
# Importing Math Libaray
# extracting 1 observation from test dataset
import math
x_star = x_test.iloc[0,]
x_star

0   -1.087814
1    0.174162
2   -1.305478
3   -1.303803
Name: 0, dtype: float64

In [263]:
# Function to Calaulate K-NN with one obs in Test data set, where K = 3

def NN(x_train, y_train, x_test, k):
    dist = []
    for i in range(len(x_train)):
        distance = DISTANCE(x_train.iloc[i,], x_test)
        dist.append((distance, i))  
     
    # sorting dist list A/c to distance
    dist.sort()
    
    # creating Array to store neighbour A/c to sorted distance
    # Here we will get index of features training data(x_train) of nearest neighbour
    neighbours =[]
    for j in range(k):
        neighbours.append(dist[j][1])
     
    # fetching values in label traning data(y_train) with help of index in neighbours list 
    target = []
    for l in range(k):
        target.append(np.array(y_train)[neighbours[l]])
        
    
    #Create a function for counting frequency of values in target dictionary
    count = {}
    for t in target:
        count[t] = count.get(t,0) + 1
        
    #Create a list with (value, key) pair 
    pair = []
    for key in count:
        pair.append((count[key], key))
    
    #Sort the pair and get the most occurring word
    pair.sort(reverse=True)
    pred = pair[0][1]
    #print(pred)
    
    
    #print(f'Minimum Distance are : {dist}')
    #print(f"index is {index}")
    #return(np.array(y_train)[index])
    #print(count)
    return(pred)

In [264]:
NN(x_train, y_train, x_star, k=3)

'setosa'

In [265]:
y_train.iloc[87]

'setosa'

In [266]:
y_test.iloc[0]

'setosa'

In [267]:
### Function 3
### * To Find 1NN for all the Observation in Test Data.

In [268]:
# Function to Calaulate K-NN with All the obs in Test data set, where K = 3

def K_NN(x_train, y_train, x_test, k):
    y_pred = []
    for i in range(len(x_test)):
        y_pred.append(NN(x_train, y_train, x_test.iloc[i,], k))
        
    return(y_pred)

In [269]:
# Predicted Values are as follow:
y_pred = K_NN(x_train, y_train, x_test, k=3)
y_pred

['setosa',
 'versicolor',
 'virginica',
 'versicolor',
 'virginica',
 'setosa',
 'versicolor',
 'setosa',
 'setosa',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 'setosa',
 'versicolor',
 'virginica',
 'virginica',
 'virginica',
 'versicolor',
 'versicolor',
 'virginica',
 'versicolor',
 'versicolor',
 'virginica',
 'versicolor',
 'setosa',
 'versicolor',
 'versicolor',
 'setosa',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'setosa',
 'virginica',
 'virginica',
 'setosa',
 'setosa',
 'setosa',
 'versicolor',
 'setosa',
 'setosa',
 'setosa',
 'versicolor',
 'virginica']

In [270]:
# Printing predicted & Actual Data
print(f'y_pred: {list(y_pred)}')
print(f'y_test: {list(y_test)}')

y_pred: ['setosa', 'versicolor', 'virginica', 'versicolor', 'virginica', 'setosa', 'versicolor', 'setosa', 'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor', 'setosa', 'versicolor', 'virginica', 'virginica', 'virginica', 'versicolor', 'versicolor', 'virginica', 'versicolor', 'versicolor', 'virginica', 'versicolor', 'setosa', 'versicolor', 'versicolor', 'setosa', 'virginica', 'virginica', 'virginica', 'virginica', 'setosa', 'virginica', 'virginica', 'setosa', 'setosa', 'setosa', 'versicolor', 'setosa', 'setosa', 'setosa', 'versicolor', 'virginica']
y_test: ['setosa', 'versicolor', 'virginica', 'versicolor', 'virginica', 'setosa', 'versicolor', 'setosa', 'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor', 'setosa', 'versicolor', 'virginica', 'virginica', 'virginica', 'versicolor', 'versicolor', 'virginica', 'versicolor', 'versicolor', 'virginica', 'versicolor', 'setosa', 'virginica', 'versicolor', 'setosa', 'virginica', 'virginica', 'virginica', 'virginica', 'setosa'

In [271]:
# Converting to List
y_pred = list(y_pred)
y_test = list(y_test)

In [276]:
# To check Accuracy 
result = []
for i in range(len(y_pred)):
    for j in range(len(y_test)):
        if i == j:
            if y_pred[i] == y_test[j]:
                result.append(1)
            else:
                result.append(0)
        else:
            pass


In [277]:
print(f'Total No of Records: {len(result)}')
print(f'No of Records Predict correctly: {sum(result)}')

Total No of Records: 45
No of Records Predict correctly: 44


In [278]:
Accuracy = (sum(result)/len(result))*100
print(f'Accuracy is: {round(Accuracy, 2)}')

Accuracy is: 97.78


## Conclusion

In [275]:
# The model have predicted 44 values correctly out of 45, i.e with 97.78% Accuracy
# The Accuracy is sufficient to consider above model Good.