In [None]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
import numpy as np

In [None]:
# Load the California housing dataset once and assemble a single frame that
# holds the feature columns plus the regression target under 'target'.
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['target'] = housing.target

df.head()

In [None]:
# Share of cells to blank out; each cell is dropped independently at random.
MISSING_RATE = 0.1

rng = np.random.default_rng(42)
missing_mask = rng.random(df.shape) < MISSING_RATE
# Leave the target untouched: only features should have gaps, otherwise the
# downstream models would be trained and scored against imputed labels.
missing_mask[:, df.columns.get_loc('target')] = False
df_missing = df.mask(missing_mask)

print("Missing values per column:")
print(df_missing.isnull().sum())

In [None]:
import matplotlib.pyplot as plt
import missingno as msno

# Draw missingno's nullity matrix for the masked frame to eyeball where the
# gaps fall.
nullity_ax = msno.matrix(df_missing)
plt.show()

In [None]:
from sklearn.impute import SimpleImputer

# Baseline strategy: SimpleImputer configured with strategy='mean'.
mean_imputer = SimpleImputer(strategy='mean')
mean_filled = mean_imputer.fit_transform(df_missing)
# fit_transform output is wrapped back into a frame with the original column labels.
df_mean_imputed = pd.DataFrame(mean_filled, columns=df_missing.columns)

# Remaining NaN count per column (expected to be all zeros).
df_mean_imputed.isna().sum()

In [None]:
# Same procedure as the mean variant, but with strategy='median'.
median_imputer = SimpleImputer(strategy='median')
df_median_imputed = pd.DataFrame(
    data=median_imputer.fit_transform(df_missing),
    columns=df_missing.columns,
)

# Remaining NaN count per column (expected to be all zeros).
df_median_imputed.isna().sum()

In [None]:
from sklearn.impute import KNNImputer

# Neighbour-based imputation with n_neighbors=5.
knn_imputer = KNNImputer(n_neighbors=5)
knn_filled = knn_imputer.fit_transform(df_missing)
df_knn_imputed = pd.DataFrame(knn_filled, columns=df_missing.columns)

# Remaining NaN count per column (expected to be all zeros).
df_knn_imputed.isna().sum()

In [None]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# MICE-style imputation via IterativeImputer; the fixed random_state keeps
# the run reproducible.
mice_imputer = IterativeImputer(random_state=42)
mice_filled = mice_imputer.fit_transform(df_missing)
df_mice_imputed = pd.DataFrame(mice_filled, columns=df_missing.columns)

# Remaining NaN count per column (expected to be all zeros).
df_mice_imputed.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def evaluate_model(df_imputed, name, *, target='target', test_size=0.2, random_state=42):
    """Fit a linear regression on one frame and print its held-out MSE.

    The frame is split into features and target, partitioned into train and
    test subsets, and a ``LinearRegression`` is fitted on the training part.
    The mean squared error on the test part is printed as
    ``"<name>: MSE = <value>"`` with four decimals.

    Parameters
    ----------
    df_imputed : pandas.DataFrame
        Frame holding the feature columns plus the ``target`` column.
    name : str
        Label printed in front of the score.
    target : str, default 'target'
        Name of the column to predict; all other columns are used as features.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so that every call with the
        same settings uses the same split.

    Returns
    -------
    None
        The score is only printed.
    """
    X = df_imputed.drop(columns=[target])
    y = df_imputed[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{name}: MSE = {mse:.4f}")

# Score the untouched frame first as a reference, then every imputed variant,
# in the same order the imputers were introduced.
for frame, label in [
    (df, "No Missing Values"),
    (df_mean_imputed, "Mean Imputation"),
    (df_median_imputed, "Median Imputation"),
    (df_knn_imputed, "KNN Imputation"),
    (df_mice_imputed, "MICE Imputation"),
]:
    evaluate_model(frame, label)