In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Silence every warning for the rest of the notebook run (no category or
# module filter is given, so the "ignore" action applies globally).
# NOTE(review): this presumably also hides scikit-learn convergence warnings
# from the models fitted below — confirm that is intended.
warnings.filterwarnings("ignore")
# Use seaborn's "whitegrid" style for the plots drawn later in the notebook.
sns.set_style("whitegrid")

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GroupKFold, cross_validate, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import joblib

from src.config import (
    MASTER_DF_FILE,
    RANDOM_SEED,
    MODELS_DIR,
    MUNICIPALITY_COLUMN,
    WEATHER_COLUMNS,
    SERVICES_COLUMNS,
    AGE_COLUMNS,
    INCOME_COLUMN,
    POP_DENSITY_COLUMN,
    URBAN_CLUSTER_FILE,
    CLUSTER_LABELS
)

: 

In [None]:
# Load the dataset and name the column the models will predict.
df_master = pd.read_csv(URBAN_CLUSTER_FILE)
target_column = "log_price_sqm"

# Feature groups: the first list is standardized, the second is passed
# through the preprocessor unchanged.
numerical_features = [
    *WEATHER_COLUMNS,
    *SERVICES_COLUMNS,
    INCOME_COLUMN,
    POP_DENSITY_COLUMN,
]
categorical_features = [*AGE_COLUMNS, "cluster_urban"]
feature_cols = [*numerical_features, *categorical_features]

# Number of rows per cluster id, ordered by cluster id.
cluster_counts = (
    df_master['cluster_urban']
    .value_counts()
    .sort_index()
)
print(cluster_counts)


# Column-wise preprocessing; any column not listed in a transformer is dropped.
column_transformers = [
    ("num", StandardScaler(), numerical_features),
    ("cat", "passthrough", categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers, remainder="drop")

In [None]:
# Cluster ids modelled in the rest of the notebook, echoed with their labels.
selected_clusters = [0, 1, 4]
urban_cluster_labels = CLUSTER_LABELS["cluster_urban"]
for cluster_id in selected_clusters:
    # Fall back to a generic name when the id has no configured label.
    if cluster_id in urban_cluster_labels:
        label = urban_cluster_labels[cluster_id]
    else:
        label = f"Cluster {cluster_id}"
    print(f"  Cluster {cluster_id}: {label}")


In [None]:
# Estimator class and constructor keyword arguments for each model, keyed by
# the short model name used in the result tables and plots.
model_configs = {
    "elasticnet": {
        "class": ElasticNet,
        "params": dict(
            alpha=0.1,
            l1_ratio=0.5,
            random_state=RANDOM_SEED,
            max_iter=5000,
        ),
    },
    "gbm": {
        "class": GradientBoostingRegressor,
        "params": dict(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=RANDOM_SEED,
            subsample=0.8,
            max_features="sqrt",
        ),
    },
}

In [None]:
# Fit and evaluate every configured model separately on each selected cluster.
#
# For each cluster:
#   * rows with year < 2023 form the training set, rows with year == 2023 the
#     hold-out test set;
#   * rows with a missing feature or target value are dropped;
#   * each model is scored with GroupKFold cross-validation (grouped by
#     MUNICIPALITY_COLUMN) on the training set, then refit on the full training
#     set and scored on the hold-out year.
#
# Outputs (keyed by cluster id, then by model key):
#   cluster_results   -> dict of CV / test metrics
#   cluster_pipelines -> fitted Pipeline objects
cluster_results = {}
cluster_pipelines = {}

max_cv_splits = 5
scoring = {"r2": "r2", "neg_mse": "neg_mean_squared_error", "neg_mae": "neg_mean_absolute_error"}
required_cols = feature_cols + [target_column]

for cluster_id in selected_clusters:
    cluster_label = CLUSTER_LABELS["cluster_urban"].get(cluster_id, f"Cluster {cluster_id}")
    print(f"\nCluster {cluster_id}: {cluster_label}")

    df_cluster = df_master[df_master['cluster_urban'] == cluster_id]

    # Drop incomplete rows BEFORE checking for emptiness, so a cluster whose
    # rows are all incomplete is skipped instead of failing inside fit().
    df_train = df_cluster[df_cluster['year'] < 2023].dropna(subset=required_cols)
    df_test = df_cluster[df_cluster['year'] == 2023].dropna(subset=required_cols)

    if df_train.empty or df_test.empty:
        print(f"WARNING: Insufficient data for cluster {cluster_id}, skipping...")
        continue

    # GroupKFold needs at least as many distinct groups as folds; shrink the
    # fold count for small clusters and skip when CV is impossible.
    n_groups = df_train[MUNICIPALITY_COLUMN].nunique(dropna=False)
    n_splits = min(max_cv_splits, n_groups)
    if n_splits < 2:
        print(f"WARNING: Insufficient data for cluster {cluster_id}, skipping...")
        continue
    if n_splits < max_cv_splits:
        print(f"  Only {n_groups} groups available; using {n_splits}-fold CV.")

    X_train = df_train[feature_cols].reset_index(drop=True)
    y_train = df_train[target_column].reset_index(drop=True)
    groups_train = df_train[MUNICIPALITY_COLUMN].to_numpy()

    X_test = df_test[feature_cols].reset_index(drop=True)
    y_test = df_test[target_column].reset_index(drop=True)

    cluster_results[cluster_id] = {}
    cluster_pipelines[cluster_id] = {}

    gkf = GroupKFold(n_splits=n_splits)

    for model_key, config in model_configs.items():
        # Pipeline does not copy its steps, so each pipeline gets its own
        # clone of the preprocessor. Sharing the module-level instance would
        # leave every stored pipeline with a transformer fitted on whichever
        # cluster/model happened to be trained last.
        pipeline = Pipeline([
            ("preprocessor", clone(preprocessor)),
            ("regressor", config["class"](**config["params"])),
        ])

        cv_scores = cross_validate(
            pipeline, X_train, y_train,
            cv=gkf, groups=groups_train,
            scoring=scoring, return_train_score=True, n_jobs=-1
        )

        # Refit on the full training period and evaluate on the hold-out year.
        pipeline.fit(X_train, y_train)
        y_pred_test = pipeline.predict(X_test)

        cluster_results[cluster_id][model_key] = {
            'cv_r2_mean': cv_scores["test_r2"].mean(),
            'cv_r2_std': cv_scores["test_r2"].std(),
            # Mean of the per-fold RMSE values (not the RMSE of the pooled folds).
            'cv_rmse': np.sqrt(-cv_scores["test_neg_mse"]).mean(),
            'cv_mae': (-cv_scores["test_neg_mae"]).mean(),
            'test_r2': r2_score(y_test, y_pred_test),
            'test_rmse': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'test_mae': mean_absolute_error(y_test, y_pred_test)
        }

        cluster_pipelines[cluster_id][model_key] = pipeline

print("\nTraining complete.")


In [None]:
# Flatten the nested metrics dict into one row per (cluster, model), show the
# table, and save it as CSV under MODELS_DIR.
results_summary = []

for cluster_id in selected_clusters:
    per_model_metrics = cluster_results.get(cluster_id)
    if per_model_metrics is None:
        # Cluster was skipped during training.
        continue

    cluster_label = CLUSTER_LABELS["cluster_urban"].get(cluster_id, f"Cluster {cluster_id}")

    # Iterate in model_configs order so rows are ordered consistently.
    for model_key in model_configs:
        if model_key in per_model_metrics:
            metrics = per_model_metrics[model_key]
            row = {
                "Cluster ID": cluster_id,
                "Cluster": cluster_label,
                "Model": model_key,
                "CV R² (2019-2022)": metrics['cv_r2_mean'],
                "CV R² Std": metrics['cv_r2_std'],
                "Test R² (2023)": metrics['test_r2'],
                "CV RMSE": metrics['cv_rmse'],
                "Test RMSE (2023)": metrics['test_rmse'],
                "CV MAE": metrics['cv_mae'],
                "Test MAE (2023)": metrics['test_mae'],
            }
            results_summary.append(row)

results_df = pd.DataFrame(results_summary)
display(results_df.round(4))

# Make sure the output directory exists before writing.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
results_path = MODELS_DIR / "temporal_validation_by_cluster.csv"
results_df.to_csv(results_path, index=False)


In [None]:
# Side-by-side grouped bar charts of the 2023 test metrics: one bar group per
# cluster, one bar per model.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

test_r2_data = results_df.pivot(index='Cluster', columns='Model', values='Test R² (2023)')
test_rmse_data = results_df.pivot(index='Cluster', columns='Model', values='Test RMSE (2023)')

# (axis, data, y-axis label, panel title) for each panel.
panels = [
    (axes[0], test_r2_data, 'Test R² (2023)', 'Test R² by Cluster'),
    (axes[1], test_rmse_data, 'Test RMSE (2023)', 'Test RMSE by Cluster'),
]

for ax, panel_data, y_label, panel_title in panels:
    panel_data.plot(kind='bar', ax=ax, color=['steelblue', 'coral'], alpha=0.7)
    ax.set_xlabel('Cluster', fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.legend(title='Model', fontsize=10)
    ax.grid(True, alpha=0.3, axis='y')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

In [None]:
# Collect one importance value per (cluster, model, feature) from the fitted
# regressors and save the long-format table as CSV under MODELS_DIR.
#
# Importance is |coef_| for models exposing coefficients and
# feature_importances_ for models exposing that attribute; models with
# neither are skipped with a message instead of reusing stale values.
feature_importance_data = []

# The preprocessor emits the "num" block first and the "cat" block second,
# so the regressor's inputs follow this order.
feature_names = numerical_features + categorical_features

for cluster_id in selected_clusters:
    if cluster_id not in cluster_pipelines:
        continue

    cluster_label = CLUSTER_LABELS["cluster_urban"].get(cluster_id, f"Cluster {cluster_id}")

    for model_key, pipeline in cluster_pipelines[cluster_id].items():
        regressor = pipeline.named_steps['regressor']

        # Dispatch on what the fitted estimator exposes rather than on the
        # model key, so new entries in model_configs are handled too.
        if hasattr(regressor, 'coef_'):
            importances = np.abs(np.ravel(regressor.coef_))
        elif hasattr(regressor, 'feature_importances_'):
            importances = np.asarray(regressor.feature_importances_)
        else:
            print(f"WARNING: model '{model_key}' exposes no feature importances, skipping...")
            continue

        # zip() would silently truncate and mislabel features on a mismatch.
        if len(importances) != len(feature_names):
            raise ValueError(
                f"Model '{model_key}' (cluster {cluster_id}) has {len(importances)} "
                f"importances but {len(feature_names)} feature names."
            )

        for feat_name, importance in zip(feature_names, importances):
            feature_importance_data.append({
                'Cluster ID': cluster_id,
                'Cluster': cluster_label,
                'Model': model_key,
                'Feature': feat_name,
                'Importance': importance
            })

# Fixing the columns keeps downstream cells working even when no rows exist.
importance_df = pd.DataFrame(
    feature_importance_data,
    columns=['Cluster ID', 'Cluster', 'Model', 'Feature', 'Importance'],
)

MODELS_DIR.mkdir(parents=True, exist_ok=True)
importance_path = MODELS_DIR / "feature_importance_by_cluster.csv"
importance_df.to_csv(importance_path, index=False)


In [None]:
# Show the ten highest-importance features for every trained
# (cluster, model) pair.
for cluster_id in selected_clusters:
    trained_models = cluster_pipelines.get(cluster_id)
    if trained_models is None:
        continue

    cluster_label = CLUSTER_LABELS["cluster_urban"].get(cluster_id, f"Cluster {cluster_id}")
    print(f"\n{cluster_label} - Top 10 Features:")

    in_cluster = importance_df['Cluster ID'] == cluster_id

    for model_key in model_configs:
        if model_key in trained_models:
            is_model = importance_df['Model'] == model_key
            ranked = importance_df[in_cluster & is_model].sort_values('Importance', ascending=False)
            cluster_model_df = ranked.head(10)

            print(f"\n{model_key.upper()}:")
            display(cluster_model_df[['Feature', 'Importance']].reset_index(drop=True))


In [None]:
# One row of horizontal bar charts per selected cluster: ElasticNet absolute
# coefficients on the left, GradientBoostingRegressor importances on the
# right, each limited to the 15 largest values.
n_rows = len(selected_clusters)
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(n_rows, 2, figsize=(16, 5 * n_rows), squeeze=False)

# (column index, model key, bar colour, x-axis label, title suffix)
panel_specs = [
    (0, 'elasticnet', 'steelblue', 'Absolute Coefficient', 'ElasticNet'),
    (1, 'gbm', 'coral', 'Feature Importance', 'GradientBoostingRegressor'),
]

for idx, cluster_id in enumerate(selected_clusters):
    if cluster_id not in cluster_pipelines:
        continue

    cluster_label = CLUSTER_LABELS["cluster_urban"].get(cluster_id, f"Cluster {cluster_id}")
    in_cluster = importance_df['Cluster ID'] == cluster_id

    for col, model_name, bar_color, x_label, title_suffix in panel_specs:
        top_rows = (
            importance_df[in_cluster & (importance_df['Model'] == model_name)]
            .sort_values('Importance', ascending=False)
            .head(15)
        )

        ax = axes[idx, col]
        ax.barh(top_rows['Feature'], top_rows['Importance'], color=bar_color, alpha=0.7)
        ax.set_xlabel(x_label, fontsize=11)
        ax.set_title(f'{cluster_label} - {title_suffix}', fontsize=12, fontweight='bold')
        # Largest importance at the top of the panel.
        ax.invert_yaxis()
        ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

In [None]:
# Per-model heatmap of feature importance (rows: features, columns: clusters),
# restricted to the 20 features with the highest mean importance across clusters.
pivot_importance = (
    importance_df
    .groupby(['Feature', 'Model', 'Cluster'])['Importance']
    .mean()
    .reset_index()
)

for model_key in model_configs:
    model_rows = pivot_importance[pivot_importance['Model'] == model_key]
    feature_by_cluster = model_rows.pivot(
        index='Feature',
        columns='Cluster',
        values='Importance',
    ).fillna(0)

    # Rank features by their average importance over all clusters.
    mean_importance = feature_by_cluster.mean(axis=1).sort_values(ascending=False)
    top_features = mean_importance.head(20).index
    heatmap_data = feature_by_cluster.loc[top_features]

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        heatmap_data,
        annot=True,
        fmt='.3f',
        cmap='YlOrRd',
        cbar_kws={'label': 'Importance'},
        ax=ax,
    )
    ax.set_title(f'Feature Importance Heatmap - {model_key.upper()}', fontsize=14, fontweight='bold')
    ax.set_xlabel('Cluster', fontsize=12)
    ax.set_ylabel('Feature', fontsize=12)
    fig.tight_layout()
    plt.show()
