<a href="https://colab.research.google.com/github/Sam-Wadmare/ML-LAB/blob/main/lab/clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# exp1_simple_clean.py
# Minimal Data Cleaning: median imputation, IQR capping, standard scaling
# Recommended imports style (B)
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# --- Step 1: load -----------------------------------------------------------
# as_frame=True exposes the data as a pandas DataFrame (features + target);
# a copy is taken so the fetched object itself is never modified.
data = fetch_california_housing(as_frame=True)
df = data.frame.copy()

# For a quick look at the raw data, uncomment:
# print(df.shape); print(df.head())

# --- Step 2: median imputation ----------------------------------------------
# Every numeric column is passed through a median imputer, so any missing
# value is replaced by that column's median.
# NOTE(review): num_cols also contains the target column (MedHouseVal), so the
# target is imputed, capped and scaled together with the features below --
# confirm this is intended before using the CSV for modelling.
num_cols = df.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy="median").fit(df[num_cols])
df[num_cols] = imputer.transform(df[num_cols])

# --- Step 3: outlier capping ------------------------------------------------
# Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are pulled back to the nearest
# fence. The fences are computed for all columns at once and applied
# column-wise (axis=1 aligns the fence Series with the column labels).
first_quartile = df[num_cols].quantile(0.25)
third_quartile = df[num_cols].quantile(0.75)
fence_width = 1.5 * (third_quartile - first_quartile)
df[num_cols] = df[num_cols].clip(
    lower=first_quartile - fence_width,
    upper=third_quartile + fence_width,
    axis=1,
)

# --- Step 4: standardisation ------------------------------------------------
# Rescale each numeric column to mean ~0 and standard deviation ~1.
scaler = StandardScaler().fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

# --- Step 5: persist --------------------------------------------------------
# Written without the row index so the CSV holds only the data columns.
df.to_csv("exp1_cleaned_simple.csv", index=False)

# --- Step 6: sanity report --------------------------------------------------
# After scaling, the rounded means should print as 0 and the rounded standard
# deviations as 1.
print("Saved cleaned file: exp1_cleaned_simple.csv")
print("Shape:", df.shape)
print("Numeric columns mean (approx):\n", df[num_cols].mean().round(3))
print("Numeric columns std (approx):\n", df[num_cols].std().round(3))


Saved cleaned file: exp1_cleaned_simple.csv
Shape: (20640, 9)
Numeric columns mean (approx):
 MedInc        -0.0
HouseAge       0.0
AveRooms       0.0
AveBedrms     -0.0
Population     0.0
AveOccup      -0.0
Latitude      -0.0
Longitude     -0.0
MedHouseVal   -0.0
dtype: float64
Numeric columns std (approx):
 MedInc         1.0
HouseAge       1.0
AveRooms       1.0
AveBedrms      1.0
Population     1.0
AveOccup       1.0
Latitude       1.0
Longitude      1.0
MedHouseVal    1.0
dtype: float64
