## Handling missing values in a dataset using different imputation methods


### Import Libraries

In [None]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
import numpy as np

### Load the data

In [None]:
# Load the Iris data shipped with scikit-learn and combine the feature matrix
# and the class labels into a single DataFrame (labels go in a 'target' column).
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

### Introduce missing values in random places

In [None]:
# Seed NumPy's global RNG so the same cells are blanked out on every run.
np.random.seed(42)

# Draw one boolean per cell of the frame; True (probability 0.2) marks a value
# that will be replaced by NaN.
mask = np.random.choice([True, False], size=iris_df.shape, p=[0.2, 0.8])

# Never blank out the class labels: only feature values should go missing.
# The mask is drawn for the full frame shape and the label column is cleared
# afterwards, so the label column is the only part of the mask that is altered.
mask[:, iris_df.columns.get_loc('target')] = False

# Replace every masked cell with NaN; the 'target' column is left untouched.
iris_df = iris_df.mask(mask)

### Split the data

In [None]:
# The label column becomes y; every remaining column is a feature in X.
y = iris_df['target']
X = iris_df.drop(columns=['target'])

In [None]:
# Hold out a 0.2 share of the rows as the test set; the fixed random_state
# makes the shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Mean Imputation

In [None]:
# Fit on the training split only, then apply the fitted imputer to both
# splits, so the fill values used on the test split come from training data.
mean_imputer = SimpleImputer(strategy='mean')
mean_imputer.fit(X_train)
X_train_mean_imputed = mean_imputer.transform(X_train)
X_test_mean_imputed = mean_imputer.transform(X_test)

### Median Imputation

In [None]:
# Same procedure as any fitted imputer: fit on the training split only, then
# transform both splits with the fitted object (strategy here is 'median').
median_imputer = SimpleImputer(strategy='median')
median_imputer.fit(X_train)
X_train_median_imputed = median_imputer.transform(X_train)
X_test_median_imputed = median_imputer.transform(X_test)

### K-nearest neighbors imputation

In [None]:
# KNN imputer configured with 5 neighbours. It is fitted on the training
# split only and then used to transform both the training and test splits.
knn_imputer = KNNImputer(n_neighbors=5)
knn_imputer.fit(X_train)
X_train_knn_imputed = knn_imputer.transform(X_train)
X_test_knn_imputed = knn_imputer.transform(X_test)

### Results


In [None]:
# Show the first rows of the training features before imputation...
print("Original Data:")
print(X_train.head())

# ...followed by the first rows of each imputed version. The imputers return
# plain arrays, so each one is wrapped in a DataFrame with the feature names.
for title, imputed in (
    ("\nMean Imputation:", X_train_mean_imputed),
    ("\nMedian Imputation:", X_train_median_imputed),
    ("\nK-nearest neighbors Imputation:", X_train_knn_imputed),
):
    print(title)
    print(pd.DataFrame(imputed, columns=X_train.columns).head())

Original Data:
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
22                4.6               NaN                1.0               0.2
15                5.7               4.4                NaN               0.4
65                6.7               3.1                4.4               1.4
11                4.8               NaN                NaN               NaN
42                4.4               3.2                1.3               0.2

Mean Imputation:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                4.6          3.045161           1.000000          0.200000
1                5.7          4.400000           3.734831          0.400000
2                6.7          3.100000           4.400000          1.400000
3                4.8          3.045161           3.734831          1.151546
4                4.4          3.200000           1.300000          0.200000

Median Imputation:
   sepal length (cm)  sepal w