<a href="https://colab.research.google.com/github/RishabhG998/Applied-Machine-Learning/blob/main/KNN_Imputation_on_IRIS_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.datasets import load_iris
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the Iris dataset
data = load_iris()
X = data.data  # Features
y = data.target  # Target (in this case, species labels)

In [3]:
# Introduce missing values in the dataset (replace some values with NaN)
import numpy as np
np.random.seed(0)
X_with_missing = X.copy()

missing_fraction = 0.1  # 10% of missing values

missing_mask = np.random.rand(*X.shape) < missing_fraction
# 'np.random.rand(*X.shape)' generates an array of random numbers with the same shape as the array X
# '< missing_fraction' part compares each of the random values generated in step 1 to the missing_fraction value

X_with_missing[missing_mask] = np.nan

In [6]:
# Displaying intermediate results
np.random.rand(*X.shape)

array([[1.74658385e-01, 3.27988001e-01, 6.80348666e-01, 6.32076183e-02],
       [6.07249374e-01, 4.77646503e-01, 2.83999977e-01, 2.38413281e-01],
       [5.14512743e-01, 3.67927581e-01, 4.56519891e-01, 3.37477382e-01],
       [9.70493694e-01, 1.33439432e-01, 9.68039532e-02, 3.43391729e-01],
       [5.91026901e-01, 6.59176472e-01, 3.97256747e-01, 9.99277994e-01],
       [3.51892996e-01, 7.21406668e-01, 6.37582695e-01, 8.13053863e-01],
       [9.76225663e-01, 8.89793656e-01, 7.64561974e-01, 6.98248478e-01],
       [3.35498170e-01, 1.47685578e-01, 6.26360031e-02, 2.41901704e-01],
       [4.32281481e-01, 5.21996274e-01, 7.73083554e-01, 9.58740923e-01],
       [1.17320480e-01, 1.07004140e-01, 5.89694723e-01, 7.45398074e-01],
       [8.48150380e-01, 9.35832080e-01, 9.83426242e-01, 3.99801692e-01],
       [3.80335184e-01, 1.47808677e-01, 6.84934439e-01, 6.56761958e-01],
       [8.62062596e-01, 9.72579948e-02, 4.97776908e-01, 5.81081930e-01],
       [2.41557040e-01, 1.69025406e-01, 8.59580836e

In [7]:
# Displaying intermediate results
np.random.rand(*X.shape) < missing_fraction

array([[False, False, False, False],
       [False, False, False, False],
       [ True, False,  True, False],
       [False,  True, False, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False,  True, False],
       [False, False,  True,  True],
       [False, False, False, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False, False,  True],
       [False, False, False, False],
       [False, False, False, False],
       [False, False,  True, False],
       [False, False, False, False],
       [ True, False, False, False],
       [False, False,  True, False],
       [False, False, False, False],
       [False, False, False, False],
       [False, False, False,  True],
 

In [4]:
# Displaying intermediate results
X_with_missing

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, nan, nan],
       [nan, 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, nan, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, nan],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, nan],
       [5.1, nan, 1.4, 0.3],
       [5.7, 3.8, 1.7, nan],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, nan, 0.2],
       [5.1, 3.7, 1.5, nan],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, nan, 1.9, nan],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, nan, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [8]:
# Initialize the KNN imputer
knn_imputer = KNNImputer(n_neighbors=5)

In [9]:
# Perform KNN imputation on the dataset
X_imputed = knn_imputer.fit_transform(X_with_missing)

In [10]:
# Create a DataFrame to visualize the results
df = pd.DataFrame(X_imputed, columns=data.feature_names)

In [14]:
# Check the DataFrame with imputed values
print(df.head(10))

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0               5.10               3.5               1.40               0.2
1               4.90               3.0               1.40               0.2
2               4.70               3.2               1.30               0.2
3               4.60               3.1               3.94               1.4
4               5.12               3.6               1.40               0.2
5               5.40               3.9               1.70               0.4
6               4.60               3.4               1.40               0.3
7               5.00               3.4               1.50               0.2
8               4.40               2.9               1.28               0.2
9               4.90               3.1               1.50               0.1


In [13]:
print(data)

{'data': array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
     

In [18]:
# Calculate the Mean Absolute Error between original and imputed data
mae = mean_absolute_error(data.data, X_imputed)

print(f"Mean Absolute Error between original and KNN-imputed data: {mae}")

Mean Absolute Error between original and KNN-imputed data: 0.029400000000000003
