<a href="https://colab.research.google.com/github/Sam-Wadmare/ML-LAB/blob/main/lab/transform.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# exp2_simple_transform.py
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Pipeline overview: load the California housing frame, derive ratio
# features, log-transform skewed columns, standardize everything, and
# write the result to CSV.

# Step 1 - load the dataset as a pandas DataFrame and work on a copy so
# the frame held by the returned bunch is left untouched.
data = fetch_california_housing(as_frame=True)
df = data.frame.copy()

# Step 2 - ratio features built from columns already present in the frame.
# A small constant is added to each denominator to guard against division
# by zero.
occupancy_denominator = df["AveOccup"] + 1e-6
rooms_denominator = df["AveRooms"] + 1e-6
df["rooms_per_occupant"] = df["AveRooms"] / occupancy_denominator
df["bedrooms_per_room"] = df["AveBedrms"] / rooms_denominator

# Step 3 - add a log1p-transformed companion column ("<name>_log") for
# each column treated as skewed.
skewed_columns = ["MedInc", "Population", "AveRooms"]
for col in skewed_columns:
    logged_name = f"{col}_log"
    df[logged_name] = np.log1p(df[col])

# Step 4 (optional) - remove the raw versions of the logged columns.
# Comment out the next line to keep both raw and logged versions.
df = df.drop(columns=skewed_columns)

# Step 5 - standardize every remaining column in place.
# NOTE: `df.columns` covers the whole frame, so the target column
# (MedHouseVal) is standardized together with the features.
scaler = StandardScaler()
all_columns = df.columns
standardized = scaler.fit_transform(df[all_columns])
df[all_columns] = standardized

# Step 6 - persist the transformed frame (without the index) and report.
df.to_csv("exp2_transformed.csv", index=False)
print("Saved exp2_transformed.csv | shape:", df.shape)
preview = df.head()
print(preview)

# Per-column summary statistics, one row per column, rounded to 3 decimals.
summary = df.describe().T.round(3)
print(summary)


Saved exp2_transformed.csv | shape: (20640, 11)
   HouseAge  AveBedrms  AveOccup  Latitude  Longitude  MedHouseVal  \
0  0.982143  -0.153758 -0.049597  1.052548  -1.327835     2.129631   
1 -0.607019  -0.263336 -0.092512  1.043185  -1.322844     1.314156   
2  1.856182  -0.049016 -0.025843  1.038503  -1.332827     1.258693   
3  1.856182  -0.049833 -0.050329  1.038503  -1.337818     1.165100   
4  1.856182  -0.032906 -0.085616  1.038503  -1.337818     1.172900   

   rooms_per_occupant  bedrooms_per_room  MedInc_log  Population_log  \
0            0.659647          -1.145833    1.995505       -1.694943   
1            0.854906          -0.987175    1.988380        1.030337   
2            0.855758          -1.440116    1.656444       -1.109604   
3            0.267179          -0.493194    1.049948       -0.949925   
4            0.787681          -0.706259    0.170631       -0.933021   

   AveRooms_log  
0      1.072436  
1      0.649505  
2      1.724704  
3      0.391271  
4      0