In [None]:
from sklearn.datasets import fetch_california_housing

In [None]:
# Fetch the California housing dataset through scikit-learn's loader.
# NOTE(review): `data` is not referenced by any later cell visible here; the
# dataset is fetched a second time into `housing` a few cells below.
data = fetch_california_housing()

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Assemble a single DataFrame: one column per dataset feature, plus the
# regression label stored under the name 'target'.
housing = fetch_california_housing()
df = pd.DataFrame(data=housing.data, columns=housing.feature_names).assign(
    target=housing.target
)

df.head()

In [None]:
# Inject synthetic missingness: every cell is blanked independently with
# probability 0.1, using a seeded generator so the pattern is reproducible.
rng = np.random.default_rng(42)
missing_mask = rng.random(df.shape) < 0.1  # 10% missing values randomly
# Never blank the label. A missing 'target' would later be imputed, and the
# models would then be trained and scored against made-up labels.
# The mask is still drawn for the full shape so the pattern on the feature
# columns is exactly what it was before this exclusion.
missing_mask[:, df.columns.get_loc('target')] = False
df_missing = df.mask(missing_mask)

print("Missing values per column:")
print(df_missing.isnull().sum())

In [None]:
import matplotlib.pyplot as plt
import missingno as msno

# Visualise where values are missing in df_missing using missingno's matrix plot.
msno.matrix(df_missing)
plt.show()

In [None]:
# Baseline strategy: discard every row that has at least one missing entry,
# reporting the frame's shape before and after.
print("Shape before:", df_missing.shape)
df_drop = df_missing.dropna(axis=0, how='any')
print("Shape after dropping rows:", df_drop.shape)

### Simple Imputers

In [None]:
# Manual mean imputation of a single column ('MedInc'). The result is a new
# frame, so df_missing itself keeps its missing values.
mean_val = df_missing['MedInc'].mean()
_df_missing = df_missing.assign(MedInc=df_missing['MedInc'].fillna(mean_val))

In [None]:
# Missingness plot of `_df_missing`, in which only the 'MedInc' column has been
# filled; compare with the earlier plot of df_missing.
msno.matrix(_df_missing)
plt.show()

In [None]:
from sklearn.impute import SimpleImputer

# Mean-impute every column of df_missing (this includes 'target' whenever that
# column holds NaNs).
mean_imputer = SimpleImputer(strategy='mean')
# By default fit_transform hands back a plain array, so the column labels and
# the row index are both re-attached explicitly. Without index= the result
# would silently get a fresh 0..n-1 index and could misalign with df_missing
# if that frame's index were ever not the default one.
df_mean_imputed = pd.DataFrame(
    mean_imputer.fit_transform(df_missing),
    columns=df_missing.columns,
    index=df_missing.index,
)

df_mean_imputed.isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Median-impute every column of df_missing (this includes 'target' whenever
# that column holds NaNs).
median_imputer = SimpleImputer(strategy='median')
# By default fit_transform hands back a plain array, so the column labels and
# the row index are both re-attached explicitly. Without index= the result
# would silently get a fresh 0..n-1 index and could misalign with df_missing
# if that frame's index were ever not the default one.
df_median_imputed = pd.DataFrame(
    median_imputer.fit_transform(df_missing),
    columns=df_missing.columns,
    index=df_missing.index,
)

df_median_imputed.isnull().sum()

In [None]:
# Missingness plot of the mean-imputed frame produced by SimpleImputer.
msno.matrix(df_mean_imputed)
plt.show()

In [None]:
# Mode imputation for a categorical series: missing labels are replaced by the
# most frequent observed label.
classes = pd.Series(['A', 'A', 'B', 'B', np.nan, 'B', np.nan])
mode_label = classes.mode().iloc[0]
classes_mode = classes.where(classes.notna(), other=mode_label)
print(classes_mode)

In [None]:
# NOTE(review): this cell repeats the median imputation already performed in an
# earlier cell and rebinds the same two names (median_imputer, df_median_imputed).
median_imputer = SimpleImputer(strategy='median')
# By default fit_transform hands back a plain array, so the column labels and
# the row index are both re-attached explicitly. Without index= the result
# would silently get a fresh 0..n-1 index.
df_median_imputed = pd.DataFrame(
    median_imputer.fit_transform(df_missing),
    columns=df_missing.columns,
    index=df_missing.index,
)

df_median_imputed.isnull().sum()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def evaluate_model(df_imputed, name, target_col='target', test_size=0.2, random_state=42):
    """Fit a linear regression on one version of the data and print its test MSE.

    The frame is separated into features and label, split into train/test
    partitions, and a ``LinearRegression`` fitted on the training part is
    scored on the held-out part with mean squared error.

    Parameters
    ----------
    df_imputed : pandas.DataFrame
        Frame holding the feature columns and the label column.
        NOTE(review): the visible callers pass complete or already-imputed
        frames; behaviour with remaining NaNs is not handled here.
    name : str
        Label printed in front of the score.
    target_col : str, default 'target'
        Name of the label column; every other column is used as a feature.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    None
        The score is reported only via ``print`` as ``"<name>: MSE = <x.xxxx>"``.
    """
    X = df_imputed.drop(columns=[target_col])
    y = df_imputed[target_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{name}: MSE = {mse:.4f}")

# Score each data-handling strategy in turn; the label is what evaluate_model
# prints in front of the MSE.
for frame, label in [
    (df, "No Missing Values"),
    (df_drop, "Drop Rows"),
    (df_mean_imputed, "Mean Imputation"),
    (df_median_imputed, "Median Imputation"),
]:
    evaluate_model(frame, label)

In [None]:
# Augment the data with one 0/1 flag per original column ('<col>_missing')
# marking the cells that were missing, appended to the right of the data.
indicators = df_missing.isna().astype(int).add_suffix('_missing')
_df_missing_row_ind = pd.concat([df_missing, indicators], axis='columns')

In [None]:
# Median-impute the indicator-augmented frame. The '<col>_missing' flag columns
# contain no NaNs, so only the original data columns are actually filled.
median_imputer = SimpleImputer(strategy='median')
# By default fit_transform hands back a plain array, so the column labels and
# the row index are both re-attached explicitly. Without index= the result
# would silently get a fresh 0..n-1 index.
_df_missing_row_ind = pd.DataFrame(
    median_imputer.fit_transform(_df_missing_row_ind),
    columns=_df_missing_row_ind.columns,
    index=_df_missing_row_ind.index,
)

_df_missing_row_ind.isnull().sum()

In [None]:
# Train and score a linear regression on the median-imputed frame that also
# carries the '<col>_missing' indicator columns.
#
# 'target_missing' is derived from the label itself (it flags the rows whose
# label was missing and then imputed), so it must not be used as a feature:
# keeping it leaks label information into X.
# errors='ignore' keeps the cell working when that flag column is absent.
X = _df_missing_row_ind.drop(columns=['target', 'target_missing'], errors='ignore')
y = _df_missing_row_ind['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(mse)