In [1]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install scikit-learn



In [96]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

<br><br>
## Read datase

In [97]:
data = pd.read_csv("./dataset/Iris_dataset.csv")

<br><br>
## Check dataset details

In [98]:
# Data set first 5 rows
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [99]:
# Get data info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [100]:
# Get more details
data.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [101]:
# Dataset shape
data.shape

(150, 6)

In [102]:
# Check null values
data.isnull()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
145,False,False,False,False,False,False
146,False,False,False,False,False,False
147,False,False,False,False,False,False
148,False,False,False,False,False,False


In [103]:
data.isnull().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [104]:
# All data types in Species column
data["Species"].value_counts()

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

<br><br>
## Data reshape

In [105]:
# Divide x axis and y axis
x = data.iloc[:, 1:5]
x.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [106]:
y = data.iloc[:, -1]
y.head()

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: Species, dtype: object

<br><br>
## Feature Scaling - Standardization

In [107]:
scaler_std = StandardScaler()

In [108]:
x_std = scaler_std.fit_transform(x)
x_std[0:5]

array([[-0.90068117,  1.03205722, -1.3412724 , -1.31297673],
       [-1.14301691, -0.1249576 , -1.3412724 , -1.31297673],
       [-1.38535265,  0.33784833, -1.39813811, -1.31297673],
       [-1.50652052,  0.10644536, -1.2844067 , -1.31297673],
       [-1.02184904,  1.26346019, -1.3412724 , -1.31297673]])

<br><br>
## Feature Scaling - Normalization

In [109]:
scaler_nol = MinMaxScaler()

In [110]:
x_nol = scaler_nol.fit_transform(x)
x_nol[0:5]

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667]])

<br><br>
## Split train and test data

In [111]:
x_std_train, x_std_test, y_std_train, y_std_test = train_test_split(x_std, y, test_size=0.2)
x_nol_train, x_nol_test, y_nol_train, y_nol_test = train_test_split(x_nol, y, test_size=0.2)

In [112]:
print(f"x_std_train: { x_std_train.shape } and x_std_test: { x_std_test.shape }")

x_std_train: (120, 4) and x_std_test: (30, 4)


In [113]:
print(f"x_std_train: { x_nol_train.shape } and x_std_test: { x_nol_test.shape }")

x_std_train: (120, 4) and x_std_test: (30, 4)


<br><br>
## Function to get best K neighbor's count

In [114]:
def get_best_kn_count(x_train, y_train, x_test, y_test, max_kn_count):
    kn_count = []  # Ensure we are appending to this list
    for i in range(1, max_kn_count + 1):
        model = KNeighborsClassifier(n_neighbors=i)
        model.fit(x_train, y_train)
        predict = model.predict(x_test)
        correct_count = np.sum(predict == y_test)
        incorrect_count = np.sum(predict != y_test)
        accuracy = accuracy_score(y_test, predict)
        kn_count.append({
            "neighbor's count": i,
            "correct predict count": correct_count,
            "incorrect predict count": incorrect_count,
            "accuracy": accuracy
        })
    return kn_count

<br><br>
## Train model

#### (Standardization dataset)

In [115]:
# In this model, we have to specify the neighbor's count
model_std = KNeighborsClassifier(n_neighbors = 1)
model_std.fit(x_std_train, y_std_train)

In [116]:
predict_std = model_std.predict(x_std_test)
predict_std[0:5]

array(['Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-versicolor',
       'Iris-versicolor'], dtype=object)

In [117]:
y_std_test[0:5]

3          Iris-setosa
147     Iris-virginica
32         Iris-setosa
69     Iris-versicolor
51     Iris-versicolor
Name: Species, dtype: object

In [118]:
accuracy_std = accuracy_score(y_std_test, predict_std)
accuracy_std

0.9666666666666667

In [119]:
cm_std = confusion_matrix(y_std_test, predict_std)
cm_std

array([[ 8,  0,  0],
       [ 0, 15,  1],
       [ 0,  0,  6]])

In [120]:
result_std = pd.DataFrame(data=[y_std_test.values, predict_std], index = ["y_std_test", "predict"])
# result_std.transpose()

In [121]:
best_kn_count_std = get_best_kn_count(x_std_train, y_std_train, x_std_test, y_std_test, 25)
# best_kn_count_std
max_accuracy_std = max(best_kn_count_std, key=lambda x: x['accuracy'])
max_accuracy_std

{"neighbor's count": 1,
 'correct predict count': np.int64(29),
 'incorrect predict count': np.int64(1),
 'accuracy': 0.9666666666666667}

<br><br>
#### (Normalization dataset)

In [122]:
model_nol = KNeighborsClassifier(n_neighbors = 1)
model_nol.fit(x_nol_train, y_nol_train)

In [123]:
predict_nol = model_nol.predict(x_nol_test)
predict_nol[0:5]

array(['Iris-setosa', 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor'], dtype=object)

In [124]:
y_nol_test[0:5]

24        Iris-setosa
78    Iris-versicolor
89    Iris-versicolor
38        Iris-setosa
88    Iris-versicolor
Name: Species, dtype: object

In [125]:
accuracy_nol = accuracy_score(y_nol_test, predict_nol)
accuracy_nol

0.9

In [126]:
cm_nol = confusion_matrix(y_nol_test, predict_nol)
cm_nol

array([[10,  0,  0],
       [ 0, 13,  1],
       [ 0,  2,  4]])

In [127]:
result_nol = pd.DataFrame(data=[y_nol_test.values, predict_nol], index = ["y_nol_test", "predict"])
# result_nol.transpose()

In [132]:
best_kn_count_nol = get_best_kn_count(x_nol_train, y_nol_train, x_nol_test, y_nol_test, 25)
# best_kn_count_nol
max_accuracy_nol = max(best_kn_count_nol, key=lambda x: x['accuracy'])
max_accuracy_nol

{"neighbor's count": 10,
 'correct predict count': np.int64(29),
 'incorrect predict count': np.int64(1),
 'accuracy': 0.9666666666666667}