### KNN Classifier on the Iris Dataset

#### Data Collection

In [14]:
#Import necessary Libraries

import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

In [15]:
# Training data set
#
# The CSV location defaults to the original local copy. Set the IRIS_CSV
# environment variable to point at the file on any other machine instead of
# editing this cell.
IRIS_CSV_PATH = os.environ.get(
    'IRIS_CSV',
    r'C:\Users\Tuhin\OneDrive\Desktop\BIA PROJECT\iris.csv',
)

data = pd.read_csv(IRIS_CSV_PATH)
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [16]:
# Work on an independent (deep) copy so the loaded frame `data` stays untouched
df = data.copy(deep=True)
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [17]:
# Summarise the frame: row count, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [18]:
# Check for duplicates: flag every row that repeats an earlier row, then show them
duplicate_mask = df.duplicated()
df.loc[duplicate_mask]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
34,4.9,3.1,1.5,0.1,setosa
37,4.9,3.1,1.5,0.1,setosa
142,5.8,2.7,5.1,1.9,virginica


In [19]:
# Remove duplicates: keep the first occurrence of each repeated row
df = df.drop_duplicates()
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [20]:
# Descriptive statistics for the numeric columns
summary_stats = df.describe()
summary_stats

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,147.0,147.0,147.0,147.0
mean,5.856463,3.055782,3.780272,1.208844
std,0.8291,0.437009,1.759111,0.757874
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [21]:
# Null check: number of missing values in each column
missing_per_column = df.isna().sum()
missing_per_column

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [22]:
# Split the frame into the target column and the feature matrix
y = df['species']
X = df.drop(columns=['species'])

# Integer-encode the species labels.
# NOTE: the split below still uses the original string labels in `y`;
# `y_numeric` is not consumed anywhere in this cell.
y_numeric = pd.factorize(y)[0]

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features: fit the scaler on the training rows only,
# then apply that same transformation to both the training and test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [13]:
# Instantiate the base KNN classifier with default settings. No training
# happens in this cell; the object is passed as the estimator to the
# hyperparameter search.
model = KNeighborsClassifier()

In [26]:
# Hyperparameter search space
#
# `p` (the Minkowski power) only affects the 'minkowski' metric, so it is
# searched for that metric alone. Crossing it with 'chebyshev' would only
# evaluate each identical chebyshev candidate three times.
neighbor_counts = np.arange(1, 31)
weight_schemes = ['uniform', 'distance']

param_grid = [
    {
        'metric': ['minkowski'],
        'n_neighbors': neighbor_counts,
        'p': [1, 2, 3],
        'weights': weight_schemes,
    },
    {
        'metric': ['chebyshev'],
        'n_neighbors': neighbor_counts,
        'weights': weight_schemes,
    },
]

In [27]:
# Hyperparameter tuning with 5-fold cross-validation, scored on accuracy,
# using all available CPU cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Run the search on the scaled training data
grid_search.fit(X_train_scaled, y_train)

In [28]:
# Hyperparameter combination with the best cross-validated score
best_params = grid_search.best_params_
best_params

{'metric': 'minkowski',
 'n_neighbors': np.int64(9),
 'p': 2,
 'weights': 'uniform'}

In [29]:
# Take the best model found by the grid search.
# GridSearchCV (refit=True by default) has already refit `best_estimator_`
# on the whole training set with the winning hyperparameters, so calling
# fit() on it again would only repeat that work.
best_model = grid_search.best_estimator_

# Predict on the held-out, scaled test set
y_predict = best_model.predict(X_test_scaled)

# Accuracy on the test set
accuracy = metrics.accuracy_score(y_test, y_predict)

print(accuracy)


0.9333333333333333
