In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [56]:
#provide path
path = 'C:/Users/shisk/Desktop/Projects/Data/iris.csv'

In [57]:
data = pd.read_csv(path)
data.tail()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [58]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [59]:
data.isnull().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [60]:
# drop null values and reset index
df = data.dropna(axis = 'rows')
df.reset_index(drop = True, inplace = True)

In [61]:
#recheck presense of null entries
df.isnull().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [62]:
# drop col 'Id' as it is the same as index
df.drop(labels = 'Id', axis = 1, inplace = True)
df.tail()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [63]:
# convert 'Species' to numeric
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

In [64]:
# check the number of classes
df.groupby('Species').mean()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,5.006,3.418,1.464,0.244
Iris-versicolor,5.936,2.77,4.26,1.326
Iris-virginica,6.588,2.974,5.552,2.026


In [65]:
# check if the data is balanced
df.groupby('Species').count()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,50,50,50,50
Iris-versicolor,50,50,50,50
Iris-virginica,50,50,50,50


In [66]:
species_name = le.fit_transform(df['Species'])
df['Species'] = species_name

In [67]:
le.inverse_transform([0, 1, 2])

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [68]:
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [69]:
# as KNN claculated distance for each attribute to classify test datapoint scale of attributes matter
# we will use standard scaler to standarize scale of the data
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

## Classification Model to Predict Species

In [70]:
# fit calculates major moments of data distribution like mean, variance, standard deviation, etc.
sc.fit(df.drop('Species', axis = 1))

StandardScaler()

In [71]:
# transform uses the moments calculated by fit() ti standardize the data
sc_features = sc.transform(df.drop('Species', axis = 1))

In [72]:
# create new df with scaled features
df_feat = pd.DataFrame(sc_features, columns = df.columns[:-1])

In [73]:
from sklearn.model_selection import train_test_split

In [74]:
X = df_feat
y = df['Species']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

In [75]:
from sklearn.neighbors import KNeighborsClassifier

knn_c = KNeighborsClassifier()

In [76]:
# using grid search CV for cross validation
from sklearn.model_selection import RandomizedSearchCV

random_grid = {
    'n_neighbors': [3, 4, 5, 6, 7],
    'weights':['uniform','distance'],
    'leaf_size': [15, 26],
    'p': [1, 2]
}

In [77]:
knn_cv = RandomizedSearchCV(estimator = knn_c, param_distributions = random_grid, n_iter = 100,
                               cv = 5, verbose = 2)
knn_cv.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] weights=uniform, p=1, n_neighbors=3, leaf_size=15 ...............
[CV]  weights=uniform, p=1, n_neighbors=3, leaf_size=15, total=   0.0s
[CV] weights=uniform, p=1, n_neighbors=3, leaf_size=15 ...............
[CV]  weights=uniform, p=1, n_neighbors=3, leaf_size=15, total=   0.0s
[CV] weights=uniform, p=1, n_neighbors=3, leaf_size=15 ...............
[CV]  weights=uniform, p=1, n_neighbors=3, leaf_size=15, total=   0.0s
[CV] weights=uniform, p=1, n_neighbors=3, leaf_size=15 ...............
[CV]  weights=uniform, p=1, n_neighbors=3, leaf_size=15, total=   0.0s
[CV] weights=uniform, p=1, n_neighbors=3, leaf_size=15 ...............
[CV]  weights=uniform, p=1, n_neighbors=3, leaf_size=15, total=   0.0s
[CV] weights=distance, p=1, n_neighbors=3, leaf_size=15 ..............
[CV]  weights=distance, p=1, n_neighbors=3, leaf_size=15, total=   0.0s
[CV] weights=distance, p=1, n_neighbors=3, leaf_size=15 ..............
[CV]  weights=

[CV]  weights=distance, p=2, n_neighbors=5, leaf_size=15, total=   0.0s
[CV] weights=uniform, p=1, n_neighbors=6, leaf_size=15 ...............
[CV]  weights=uniform, p=1, n_neighbors=6, leaf_size=15, total=   0.0s
[CV] weights=uniform, p=1, n_neighbors=6, leaf_size=15 ...............
[CV]  weights=uniform, p=1, n_neighbors=6, leaf_size=15, total=   0.0s
[CV] weights=uniform, p=1, n_neighbors=6, leaf_size=15 ...............
[CV]  weights=uniform, p=1, n_neighbors=6, leaf_size=15, total=   0.0s
[CV] weights=uniform, p=1, n_neighbors=6, leaf_size=15 ...............
[CV]  weights=uniform, p=1, n_neighbors=6, leaf_size=15, total=   0.0s
[CV] weights=uniform, p=1, n_neighbors=6, leaf_size=15 ...............
[CV]  weights=uniform, p=1, n_neighbors=6, leaf_size=15, total=   0.0s
[CV] weights=distance, p=1, n_neighbors=6, leaf_size=15 ..............
[CV]  weights=distance, p=1, n_neighbors=6, leaf_size=15, total=   0.0s
[CV] weights=distance, p=1, n_neighbors=6, leaf_size=15 ..............
[CV]

[CV]  weights=uniform, p=1, n_neighbors=4, leaf_size=26, total=   0.0s
[CV] weights=uniform, p=1, n_neighbors=4, leaf_size=26 ...............
[CV]  weights=uniform, p=1, n_neighbors=4, leaf_size=26, total=   0.0s
[CV] weights=distance, p=1, n_neighbors=4, leaf_size=26 ..............
[CV]  weights=distance, p=1, n_neighbors=4, leaf_size=26, total=   0.0s
[CV] weights=distance, p=1, n_neighbors=4, leaf_size=26 ..............
[CV]  weights=distance, p=1, n_neighbors=4, leaf_size=26, total=   0.0s
[CV] weights=distance, p=1, n_neighbors=4, leaf_size=26 ..............
[CV]  weights=distance, p=1, n_neighbors=4, leaf_size=26, total=   0.0s
[CV] weights=distance, p=1, n_neighbors=4, leaf_size=26 ..............
[CV]  weights=distance, p=1, n_neighbors=4, leaf_size=26, total=   0.0s
[CV] weights=distance, p=1, n_neighbors=4, leaf_size=26 ..............
[CV]  weights=distance, p=1, n_neighbors=4, leaf_size=26, total=   0.0s
[CV] weights=uniform, p=2, n_neighbors=4, leaf_size=26 ...............
[

[CV]  weights=distance, p=1, n_neighbors=7, leaf_size=26, total=   0.0s
[CV] weights=distance, p=1, n_neighbors=7, leaf_size=26 ..............
[CV]  weights=distance, p=1, n_neighbors=7, leaf_size=26, total=   0.0s
[CV] weights=distance, p=1, n_neighbors=7, leaf_size=26 ..............
[CV]  weights=distance, p=1, n_neighbors=7, leaf_size=26, total=   0.0s
[CV] weights=distance, p=1, n_neighbors=7, leaf_size=26 ..............
[CV]  weights=distance, p=1, n_neighbors=7, leaf_size=26, total=   0.0s
[CV] weights=distance, p=1, n_neighbors=7, leaf_size=26 ..............
[CV]  weights=distance, p=1, n_neighbors=7, leaf_size=26, total=   0.0s
[CV] weights=uniform, p=2, n_neighbors=7, leaf_size=26 ...............
[CV]  weights=uniform, p=2, n_neighbors=7, leaf_size=26, total=   0.0s
[CV] weights=uniform, p=2, n_neighbors=7, leaf_size=26 ...............
[CV]  weights=uniform, p=2, n_neighbors=7, leaf_size=26, total=   0.0s
[CV] weights=uniform, p=2, n_neighbors=7, leaf_size=26 ...............
[

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    3.0s finished


RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(), n_iter=100,
                   param_distributions={'leaf_size': [15, 26],
                                        'n_neighbors': [3, 4, 5, 6, 7],
                                        'p': [1, 2],
                                        'weights': ['uniform', 'distance']},
                   verbose=2)

In [78]:
knn_cv.best_params_

{'weights': 'distance', 'p': 2, 'n_neighbors': 5, 'leaf_size': 15}

In [79]:
params = {'weights': 'uniform', 'p': 2, 'n_neighbors': 6, 'leaf_size': 15}
knn_final = knn_c = KNeighborsClassifier(**params)

In [80]:
knn_final.fit(X_train, y_train)
predictions = knn_final.predict(X_test)

In [82]:
from sklearn.metrics import classification_report, confusion_matrix

In [84]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[11  0  0]
 [ 0  6  0]
 [ 0  1 12]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.86      1.00      0.92         6
           2       1.00      0.92      0.96        13

    accuracy                           0.97        30
   macro avg       0.95      0.97      0.96        30
weighted avg       0.97      0.97      0.97        30

