In [36]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

---
# Simple Statistical Imputation
---

Mean/Median/Mode: Replace missing values with the mean (numerical), median (numerical), or mode (categorical) of the column.

In [6]:
# Single-column frame with one gap (NaN) at position 2
data = {'column': [1, 2, np.nan, 4, 5]}
df = pd.DataFrame(data)

# Compute the column mean first, then fill the gap with it.
# fillna returns a new frame, so no inplace=True is needed.
column_mean = df['column'].mean()
df = df.fillna({'column': column_mean})

print(df)

   column
0     1.0
1     2.0
2     3.0
3     4.0
4     5.0


---
# K-Nearest Neighbors (KNN) Imputation
---

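Concept: Replace each missing value with a value derived from the most similar rows (its nearest neighbors) — with scikit-learn's `KNNImputer`, by default the mean of that feature across the `n_neighbors` closest rows.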

In [11]:

# Toy data: columns A, B and C each contain exactly one NaN
data = {
    "A": [1, 2, np.nan, 4],
    "B": [5, np.nan, 7, 8],
    "C": [9, 10, 11, np.nan],
}
df = pd.DataFrame(data)

# n_neighbors=2: every gap is filled using two neighbouring rows
imputer = KNNImputer(n_neighbors=2)

# fit_transform hands back a plain array, so the column labels
# are re-attached when wrapping the result in a DataFrame again
imputed_data = imputer.fit_transform(df)
column_labels = df.columns
imputed_df = pd.DataFrame(imputed_data, columns=column_labels)

print(imputed_df)


     A    B     C
0  1.0  5.0   9.0
1  2.0  6.0  10.0
2  3.0  7.0  11.0
3  4.0  8.0  10.5


---
# Regression Imputation
---

Concept: Predict missing values using regression models based on other variables in the dataset.

In [31]:
# --- Step 1: Build the toy dataset ---
# 'Target' is unknown (NaN) for two of the five rows.
data = {
    'Feature1': [1, 2, 3, 4, 5],
    'Feature2': [10, 20, 30, 40, 50],
    'Target': [5, np.nan, 15, np.nan, 25]
}
df = pd.DataFrame(data)

# Show the frame before anything is filled in
print("Original Data:")
print(df)

# --- Step 2: Separate rows by whether 'Target' is known ---
# One boolean mask drives both the split and the later write-back.
target_missing = df['Target'].isna()
train_data = df[~target_missing]  # rows with a known Target
test_data = df[target_missing]    # rows whose Target must be predicted

# Features (X) and target (y) for fitting, plus the features to predict on
feature_columns = ['Feature1', 'Feature2']
X_train = train_data[feature_columns]
y_train = train_data['Target']
X_test = test_data[feature_columns]

# --- Step 3: Fit the regression on the rows with a known Target ---
model = LinearRegression().fit(X_train, y_train)

# --- Step 4: Predict Target for the rows where it is missing ---
predicted_targets = model.predict(X_test)

# --- Step 5: Write the predictions into the gaps ---
df.loc[target_missing, 'Target'] = predicted_targets

# Show the frame again, now with the predicted values in place
print("\nData with Imputed Targets:")
print(df)

Original Data:
   Feature1  Feature2  Target
0         1        10     5.0
1         2        20     NaN
2         3        30    15.0
3         4        40     NaN
4         5        50    25.0

Data with Imputed Targets:
   Feature1  Feature2  Target
0         1        10     5.0
1         2        20    10.0
2         3        30    15.0
3         4        40    20.0
4         5        50    25.0


---
# Multivariate Imputation by Chained Equations (MICE)
---

Concept: Iteratively predict the missing values in each column with a regression on the other columns, repeating over several rounds so the estimates can refine one another.

In [37]:
# --- Step 1: Build the toy dataset ---
# Same layout as the regression example: 'Target' is NaN in two rows.
data = {
    'Feature1': [1, 2, 3, 4, 5],
    'Feature2': [10, 20, 30, 40, 50],
    'Target': [5, np.nan, 15, np.nan, 25]
}
df = pd.DataFrame(data)

# Show the frame before imputation
print("Original Data:")
print(df)

# --- Step 2: Set up the MICE-style imputer ---
# random_state is pinned to 42 so repeated runs give the same result.
imputer = IterativeImputer(random_state=42)

# --- Step 3: Fit on the whole frame and impute it in one call ---
# The imputer is given every column, not just 'Target'.
imputed_data = imputer.fit_transform(df)

# Wrap the returned array in a DataFrame, restoring the column labels
column_labels = df.columns
df_imputed = pd.DataFrame(imputed_data, columns=column_labels)

# Show the imputed frame
print("\nData after Imputation (MICE):")
print(df_imputed)

Original Data:
   Feature1  Feature2  Target
0         1        10     5.0
1         2        20     NaN
2         3        30    15.0
3         4        40     NaN
4         5        50    25.0

Data after Imputation (MICE):
   Feature1  Feature2  Target
0       1.0      10.0     5.0
1       2.0      20.0    10.0
2       3.0      30.0    15.0
3       4.0      40.0    20.0
4       5.0      50.0    25.0
