In [4]:
# Zadanie 13

"""
Dataset: California Housing

Wymagania:
Stwórz nowe cechy: interakcje (MedInc * AveRooms), transformacje (log, sqrt), bins
Porównaj R² przed i po feature engineering
Które nowe cechy najbardziej poprawiły model?

Oczekiwany rezultat:
Lista nowych cech
Porównanie R² (co najmniej 5% poprawa)
"""

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Wczytanie datasetu
data = fetch_california_housing(as_frame=True)
df = data.frame

X = df.drop(columns=["MedHouseVal"])
y = df["MedHouseVal"]

# Trenowanie modelu bazowego
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model_baseline = LinearRegression()
model_baseline.fit(X_train_scaled, y_train)

y_pred_baseline = model_baseline.predict(X_test_scaled)
r2_baseline = r2_score(y_test, y_pred_baseline)

print("Bazowe R²:", round(r2_baseline, 4))

# Feature engineering
X_fe = X.copy()

# Interakcja
X_fe["MedInc_AveRooms"] = X_fe["MedInc"] * X_fe["AveRooms"]

# Transformacje log
X_fe["log_MedInc"] = np.log1p(X_fe["MedInc"])
X_fe["log_AveRooms"] = np.log1p(X_fe["AveRooms"])

# Transformacje sqrt
X_fe["sqrt_HouseAge"] = np.sqrt(X_fe["HouseAge"])
X_fe["sqrt_Population"] = np.sqrt(X_fe["Population"])

# Binning
X_fe["MedInc_bins"] = pd.cut(
    X_fe["MedInc"], bins=5, labels=False
)

# Trenowanie po feature engineering
X_train_fe, X_test_fe, y_train_fe, y_test_fe = train_test_split(
    X_fe, y, test_size=0.2, random_state=42
)

scaler_fe = StandardScaler()
X_train_fe_scaled = scaler_fe.fit_transform(X_train_fe)
X_test_fe_scaled = scaler_fe.transform(X_test_fe)

model_fe = LinearRegression()
model_fe.fit(X_train_fe_scaled, y_train_fe)

y_pred_fe = model_fe.predict(X_test_fe_scaled)
r2_fe = r2_score(y_test_fe, y_pred_fe)

print("R² po Feature Engineering:", round(r2_fe, 4))

improvement = ((r2_fe - r2_baseline) / r2_baseline) * 100
print("Poprawa (%):", round(improvement, 2))

# Które cechy najbardziej pomogły
feature_importance = pd.DataFrame({
    "Feature": X_fe.columns,
    "Współczynnik": model_fe.coef_
})

feature_importance["AbsCoeff"] = feature_importance["Współczynnik"].abs()
feature_importance = feature_importance.sort_values(
    by="AbsCoeff", ascending=False
)

print("\nTop 10 cech z uwagi na współczynniki:")
print(feature_importance.head(10))

# Lista nowych cech
new_features = [
    "MedInc_AveRooms",
    "log_MedInc",
    "log_AveRooms",
    "sqrt_HouseAge",
    "sqrt_Population",
    "MedInc_bins"
]

print("\nLista nowych cech:")
for f in new_features:
    print("-", f)

Bazowe R²: 0.5758
R² po Feature Engineering: 0.5745
Poprawa (%): -0.23

Top 10 cech z uwagi na współczynniki:
          Feature  Współczynnik  AbsCoeff
6        Latitude     -0.859496  0.859496
7       Longitude     -0.835173  0.835173
0          MedInc      0.694997  0.694997
3       AveBedrms      0.314800  0.314800
1        HouseAge      0.292903  0.292903
2        AveRooms     -0.219410  0.219410
11  sqrt_HouseAge     -0.173295  0.173295
9      log_MedInc      0.143307  0.143307
10   log_AveRooms     -0.073436  0.073436
13    MedInc_bins      0.053212  0.053212

Lista nowych cech:
- MedInc_AveRooms
- log_MedInc
- log_AveRooms
- sqrt_HouseAge
- sqrt_Population
- MedInc_bins
