<a href="https://colab.research.google.com/github/RishabhG998/Applied-Machine-Learning/blob/main/KNN_Imputation_on_IRIS_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.datasets import load_iris
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the Iris dataset
data = load_iris()
X = data.data  # Features
y = data.target  # Target (in this case, species labels)

In [3]:
# Introduce missing values in the dataset (replace some values with NaN)
import numpy as np
np.random.seed(0)
X_with_missing = X.copy()

missing_fraction = 0.1  # 10% of missing values

missing_mask = np.random.rand(*X.shape) < missing_fraction
# 'np.random.rand(*X.shape)' generates an array of random numbers with the same shape as the array X
# '< missing_fraction' part compares each of the random values generated in step 1 to the missing_fraction value

X_with_missing[missing_mask] = np.nan

In [4]:
# Displaying intermediate results
np.random.rand(*X.shape)[:10]

array([[0.17465839, 0.327988  , 0.68034867, 0.06320762],
       [0.60724937, 0.4776465 , 0.28399998, 0.23841328],
       [0.51451274, 0.36792758, 0.45651989, 0.33747738],
       [0.97049369, 0.13343943, 0.09680395, 0.34339173],
       [0.5910269 , 0.65917647, 0.39725675, 0.99927799],
       [0.351893  , 0.72140667, 0.63758269, 0.81305386],
       [0.97622566, 0.88979366, 0.76456197, 0.69824848],
       [0.33549817, 0.14768558, 0.062636  , 0.2419017 ],
       [0.43228148, 0.52199627, 0.77308355, 0.95874092],
       [0.11732048, 0.10700414, 0.58969472, 0.74539807]])

In [5]:
# Displaying intermediate results
(np.random.rand(*X.shape) < missing_fraction)[:10]

array([[False, False, False, False],
       [False, False, False, False],
       [ True, False,  True, False],
       [False,  True, False, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False,  True, False],
       [False, False,  True,  True]])

In [6]:
# Displaying intermediate results
X_with_missing[:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, nan, nan],
       [nan, 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, nan, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [7]:
# Initialize the KNN imputer
knn_imputer = KNNImputer(n_neighbors=5)

In [8]:
# Perform KNN imputation on the dataset
X_imputed = knn_imputer.fit_transform(X_with_missing)

In [9]:
# Create a DataFrame to visualize the results
df = pd.DataFrame(X_imputed, columns=data.feature_names)

In [10]:
# Check the DataFrame with imputed values
print(df.head(10))

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0               5.10               3.5               1.40               0.2
1               4.90               3.0               1.40               0.2
2               4.70               3.2               1.30               0.2
3               4.60               3.1               3.94               1.4
4               5.12               3.6               1.40               0.2
5               5.40               3.9               1.70               0.4
6               4.60               3.4               1.40               0.3
7               5.00               3.4               1.50               0.2
8               4.40               2.9               1.28               0.2
9               4.90               3.1               1.50               0.1


In [11]:
print(data.data[:10])

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]]


In [12]:
# Calculate the Mean Absolute Error between original and imputed data
mae = mean_absolute_error(data.data, X_imputed)

print(f"Mean Absolute Error between original and KNN-imputed data: {mae}")

Mean Absolute Error between original and KNN-imputed data: 0.029400000000000003
