# import libraries

In [24]:
import pandas as pd
import numpy as np

# read dataset

In [3]:
df = pd.read_csv('iris_dataset.csv')
df.shape

(150, 5)

# select sepal width and sepal length

In [4]:
df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

In [13]:
selected_df = df[['sepal length (cm)', 'sepal width (cm)', 'target']]
selected_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   target             150 non-null    int64  
dtypes: float64(2), int64(1)
memory usage: 3.6 KB


# select an item to be used as an test data

In [12]:
test_item = df.iloc[-1][['sepal length (cm)', 'sepal width (cm)', 'target']]
test_item

sepal length (cm)    5.9
sepal width (cm)     3.0
target               2.0
Name: 149, dtype: float64

# find k=1 nearest neighbour
* create numpy array to ease our calculations
* calculate norm2
* find index of min distance
* extract class of k=1 nearest neighbour

In [28]:
selected_array = selected_df[['sepal length (cm)', 'sepal width (cm)']].to_numpy()
test_array = test_item[['sepal length (cm)', 'sepal width (cm)']].to_numpy()

print(f"selected_array.shape: {selected_array.shape}")
print(f"test_array.shape: {test_array.shape}")

selected_array.shape: (150, 2)
test_array.shape: (2,)


In [35]:
distance = np.linalg.norm(selected_array - test_array, axis=1)
print(f"distance.shape: {distance.shape}, distance[0]: {distance[0]}")

distance.shape: (150,), distance[0]: 0.943398113205661


In [37]:
idx = np.argmin(distance)
idx

np.int64(61)

In [42]:
neighbour_class = selected_df.iloc[idx]['target']
item_class = test_item['target']

print(f"neighbour_class ({neighbour_class}) == item_class ({item_class}): {neighbour_class == item_class}")

neighbour_class (1.0) == item_class (2.0): False


# find k=3 nearest neighbours
* find indices of three min values
* find most repeated class value

In [47]:
idx = np.argsort(distance)[:3]
nearest_neighbours_classes = selected_df.iloc[idx]['target']
values, counts = np.unique(nearest_neighbours_classes, return_counts=True)
most_frequent = values[np.argmax(counts)]
most_frequent

np.int64(2)

In [48]:
print(f"most_frequent ({most_frequent}) == item_class ({item_class}): {most_frequent == item_class}")

most_frequent (2) == item_class (2.0): True


# The probability of finding 1 unrelated class near our desired test item is much greater than finding 2 or more items. So by considering more neighbour we lower the effect of noise