# 🏢 PRÉDICTION DES ÉMISSIONS DE GES - BÂTIMENTS DE SEATTLE
## Notebook 04 : Modélisation Professionnelle (Version Modulaire)

---

### 📋 Informations du Projet

**Projet** : Seattle Energy Benchmarking - Prédiction des émissions de CO₂  
**Dataset** : 1,666 bâtiments non-résidentiels (2015-2016)  
**Objectif** : Comparer deux approches prédictives  
**Date** : Janvier 2026  

---

### 🎯 Stratégie de Modélisation

Ce notebook implémente une **architecture modulaire professionnelle** avec :

✅ **Séparation des responsabilités** : Code réutilisable dans `src/`  
✅ **Fonctions testées** : Modules documentés et maintenables  
✅ **Notebook épuré** : Focus sur l'analyse, pas l'implémentation  
✅ **Bonnes pratiques** : Structure de projet professionnelle  

### 📊 Les 2 Modèles

1. **Modèle 1 (Prédictif Pur)** : Variables disponibles au permis de construction
2. **Modèle 2 (Performance Optimale)** : Avec ENERGY STAR Score

---

# Section 0 : Configuration et Imports

In [None]:
# ============================================================================
# IMPORTS STANDARDS
# ============================================================================

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import os

# ============================================================================
# ADD src TO sys.path - ROBUST APPROACH
# ============================================================================


def find_project_root(start):
    """Return the project root for a session whose working directory is *start*.

    Resolution order:
      1. *start* is itself named ``notebooks``  -> its parent directory;
      2. *start* contains a ``notebooks`` entry -> *start*;
      3. otherwise, the nearest directory among *start* and its ancestors
         that contains a ``src`` directory;
      4. if nothing matches, ``start.parent`` (the historical fallback).

    Args:
        start: directory to start from (``str`` or ``pathlib.Path``).

    Returns:
        pathlib.Path: the directory to use as project root.
    """
    start = Path(start)
    if start.name == 'notebooks':
        return start.parent
    if (start / 'notebooks').exists():
        return start
    # Launched from somewhere else (e.g. a sub-folder of notebooks/): walk up
    # until a directory that actually holds src/ is found, instead of
    # assuming the root is exactly one level up.
    for candidate in (start, *start.parents):
        if (candidate / 'src').is_dir():
            return candidate
    return start.parent


try:
    current_path = Path.cwd()
    print(f"R√©pertoire courant : {current_path}")

    PROJECT_ROOT = find_project_root(current_path)
    SRC_PATH = PROJECT_ROOT / 'src'

    print(f"PROJECT_ROOT : {PROJECT_ROOT}")
    print(f"SRC_PATH : {SRC_PATH}")
    print(f"src existe : {SRC_PATH.exists()}")

    # Prepend both locations (only once each) so project modules take
    # precedence over same-named installed packages.
    if str(SRC_PATH) not in sys.path:
        sys.path.insert(0, str(SRC_PATH))

    if str(PROJECT_ROOT) not in sys.path:
        sys.path.insert(0, str(PROJECT_ROOT))

    print("‚úì Chemins ajout√©s au PATH")

except Exception as e:
    # Report the problem in the notebook output, then let it propagate:
    # nothing below can work without valid paths.
    print(f"‚ùå Erreur lors du setup des chemins : {e}")
    raise

# ============================================================================
# CUSTOM MODULE IMPORTS
# ============================================================================

import importlib.util


def load_module_from_file(module_name, file_path):
    """Load the Python source file *file_path* as module *module_name*.

    The module is registered in ``sys.modules`` before it is executed, as in
    the importlib recipe for importing a source file directly, so that code
    which looks the module up by name (e.g. pickling of objects defined in
    it) can find it. The registration is rolled back if execution fails.

    Args:
        module_name: name to give the loaded module.
        file_path: path of the ``.py`` file to load.

    Returns:
        The executed module object.

    Raises:
        ImportError: if the file does not exist or no loader can be built
            for it. Errors raised while executing the module propagate
            unchanged.
    """
    file_path = Path(file_path)
    if not file_path.is_file():
        raise ImportError(f"Module file not found: {file_path}")
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for: {file_path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        sys.modules.pop(module_name, None)
        raise
    return module


try:
    # Load each helper module straight from its file under SRC_PATH
    modeling_utils = load_module_from_file("modeling_utils", SRC_PATH / "modeling_utils.py")
    evaluation_utils = load_module_from_file("evaluation_utils", SRC_PATH / "evaluation_utils.py")
    visualization_utils = load_module_from_file("visualization_utils", SRC_PATH / "visualization_utils.py")

    # Bind the functions used by the notebook to short names
    get_model_param_grid = modeling_utils.get_model_param_grid
    optimize_model = modeling_utils.optimize_model
    train_multiple_models = modeling_utils.train_multiple_models
    save_model = modeling_utils.save_model
    load_model = modeling_utils.load_model
    create_stacking_model = modeling_utils.create_stacking_model
    get_feature_importance = modeling_utils.get_feature_importance
    compare_models = modeling_utils.compare_models

    evaluate_model = evaluation_utils.evaluate_model
    cv_evaluate_model = evaluation_utils.cv_evaluate_model
    compute_residuals_stats = evaluation_utils.compute_residuals_stats
    test_homoscedasticity = evaluation_utils.test_homoscedasticity
    evaluate_prediction_quality = evaluation_utils.evaluate_prediction_quality
    calculate_metrics_summary = evaluation_utils.calculate_metrics_summary
    compare_model_performance = evaluation_utils.compare_model_performance

    plot_predictions = visualization_utils.plot_predictions
    plot_residuals_distribution = visualization_utils.plot_residuals_distribution
    plot_feature_importance = visualization_utils.plot_feature_importance
    plot_model_comparison = visualization_utils.plot_model_comparison
    plot_learning_curves = visualization_utils.plot_learning_curves
    plot_comparison_two_models = visualization_utils.plot_comparison_two_models

    print("‚úì Modules personnalis√©s import√©s avec succ√®s !")

except (ImportError, AttributeError) as e:
    # ImportError: a module file is missing or cannot be imported.
    # AttributeError: a module loaded but lacks one of the expected functions.
    print(f"‚ùå Erreur d'import : {e}")
    raise

# ============================================================================
# SCIKIT-LEARN
# ============================================================================

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# XGBoost is an optional dependency: try the import and record the outcome
# in XGBOOST_AVAILABLE so later cells can decide whether to use it.
try:
    from xgboost import XGBRegressor
except ImportError:
    XGBOOST_AVAILABLE = False
    print("‚úó XGBoost non disponible")
else:
    XGBOOST_AVAILABLE = True
    print("‚úì XGBoost disponible")

# Closing banner for the import cell
print("\n" + "=" * 80, "IMPORTS TERMIN√âS", "=" * 80, sep="\n")

✓ PROJECT_ROOT : c:\Users\LENOVO\CO2-Emission-in-Seattle-REGO3
✓ SRC_PATH : c:\Users\LENOVO\CO2-Emission-in-Seattle-REGO3\src
✓ Chemin src ajouté au PATH


ModuleNotFoundError: No module named 'src'

In [None]:
# ============================================================================
# GLOBAL CONFIGURATION
# ============================================================================

# Single seed shared by NumPy and every estimator created later
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# pandas display: show every column, 4 decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

# matplotlib / seaborn appearance
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams['figure.figsize'] = (12, 6)
sns.set_palette("husl")

# ============================================================================
# PATHS
# ============================================================================

# Input data
DATA_DIR = PROJECT_ROOT / 'data'
INTERIM_DIR = DATA_DIR / 'interim_data'

# Output locations
MODELS_DIR = PROJECT_ROOT / 'models'
RESULTS_DIR = PROJECT_ROOT / 'results'
FIGURES_DIR = RESULTS_DIR / 'figures'

# Make sure the output folders exist (the input folder is only read)
for directory in (MODELS_DIR, RESULTS_DIR, FIGURES_DIR):
    directory.mkdir(parents=True, exist_ok=True)

print(
    "‚úì Configuration termin√©e",
    "\nChemins configur√©s :",
    f"  Donn√©es    : {INTERIM_DIR}",
    f"  Mod√®les    : {MODELS_DIR}",
    f"  R√©sultats  : {RESULTS_DIR}",
    sep="\n",
)

# Section 1 : Chargement des Données

In [None]:
print("=" * 80, "CHARGEMENT DES DONN√âES", "=" * 80, sep="\n")

# Read the train and test CSV files from the interim data folder
train_df, test_df = (
    pd.read_csv(INTERIM_DIR / csv_name)
    for csv_name in ('train_with_features.csv', 'test_with_features.csv')
)

print(
    f"\n‚úì Train : {train_df.shape}",
    f"‚úì Test  : {test_df.shape}",
    sep="\n",
)

# Quick look at the first rows of the training frame
print("\nAper√ßu des donn√©es :")
display(train_df.head(n=5))

# Section 2 : Définition des Variables

In [None]:
# ============================================================================
# VARIABLES
# ============================================================================

# Target column: log1p of TotalGHGEmissions
TARGET = 'TotalGHGEmissions_log'

# Create the target wherever it is missing. Each frame is checked on its own:
# checking only train_df would either leave test_df without the column (when
# train already has it) or overwrite an existing test column.
frames_missing_target = [
    frame for frame in (train_df, test_df) if TARGET not in frame.columns
]
if frames_missing_target:
    print(f"‚ö† {TARGET} non trouv√©e. Cr√©ation...")
    for frame in frames_missing_target:
        frame[TARGET] = np.log1p(frame['TotalGHGEmissions'])
    print(f"‚úì {TARGET} cr√©√©e")

# MODEL 1: allowed variables
variables_autorisees = [
    'BuildingType', 'PrimaryPropertyType', 'City', 'State', 'ZipCode',
    'CouncilDistrictCode', 'Neighborhood', 'Latitude', 'Longitude',
    'YearBuilt', 'NumberofBuildings', 'NumberofFloors',
    'PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)',
    'ListOfAllPropertyUseTypes', 'LargestPropertyUseType',
    'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseType',
    'ThirdLargestPropertyUseTypeGFA'
]

# Keep only the allowed variables actually present in the training frame
variables_autorisees_existantes = [
    v for v in variables_autorisees if v in train_df.columns
]

# If fewer than 5 were found, fall back to every column except identifiers,
# the target (raw and log) and ENERGYSTARScore.
# NOTE(review): this fallback may let through columns derived from the
# target — confirm against the columns of train_with_features.csv.
if len(variables_autorisees_existantes) < 5:
    print("\n‚ö† Variables pr√©d√©finies non trouv√©es. Utilisation de toutes les features.")
    exclude_cols = ['OSEBuildingID', 'DataYear', 'TotalGHGEmissions', TARGET,
                    'ENERGYSTARScore', 'PropertyName', 'Address']
    variables_autorisees_existantes = [
        col for col in train_df.columns if col not in exclude_cols
    ]

# MODEL 2: same features plus ENERGYSTARScore when available
variables_exp_tot = variables_autorisees_existantes.copy()
if 'ENERGYSTARScore' in train_df.columns:
    variables_exp_tot.append('ENERGYSTARScore')

print(f"\nüìä MOD√àLE 1 : {len(variables_autorisees_existantes)} features")
print(f"üìä MOD√àLE 2 : {len(variables_exp_tot)} features")
print(f"üéØ Target : {TARGET}")

# Section 3 : Préparation des Datasets

In [None]:
# ============================================================================
# PREPARATION - MODEL 1
# ============================================================================

print("="*80)
print("PR√âPARATION MOD√àLE 1 - SANS ENERGY STAR")
print("="*80)

# Target vectors
y_train = train_df[TARGET]
y_test = test_df[TARGET]

# Model 1 features (copies, so the source frames are never modified)
X_train_m1 = train_df[variables_autorisees_existantes].copy()
X_test_m1 = test_df[variables_autorisees_existantes].copy()

# Categorical columns are detected on the training frame only
categorical_cols = X_train_m1.select_dtypes(include=['object']).columns.tolist()

if len(categorical_cols) > 0:
    print(f"\n‚ö† {len(categorical_cols)} variables cat√©gorielles d√©tect√©es")
    print("   ‚Üí Encodage One-Hot")
    X_train_m1 = pd.get_dummies(X_train_m1, columns=categorical_cols, drop_first=True)
    # The test frame is encoded WITHOUT drop_first. With drop_first=True each
    # frame drops its own first level, so when the test split lacks the
    # training reference level a different level is dropped and, after
    # alignment, those rows are silently encoded as the reference category.
    # Keeping every test level and aligning on the training columns applies
    # the training encoding to the test rows; levels not seen in training
    # still fall back to all zeros.
    X_test_m1 = pd.get_dummies(X_test_m1, columns=categorical_cols)
    X_train_m1, X_test_m1 = X_train_m1.align(X_test_m1, join='left', axis=1, fill_value=0)
    print(f"   ‚úì {X_train_m1.shape[1]} features apr√®s encodage")

# Scaling: statistics are fitted on train only, then applied to test
print("\nüîÑ Scaling...")
scaler_m1 = StandardScaler()
X_train_m1_scaled = scaler_m1.fit_transform(X_train_m1)
X_test_m1_scaled = scaler_m1.transform(X_test_m1)

# Back to DataFrames to keep column names and row indices
X_train_m1_scaled = pd.DataFrame(X_train_m1_scaled, columns=X_train_m1.columns, index=X_train_m1.index)
X_test_m1_scaled = pd.DataFrame(X_test_m1_scaled, columns=X_test_m1.columns, index=X_test_m1.index)

print(f"\n‚úì Mod√®le 1 pr√™t :")
print(f"  X_train : {X_train_m1_scaled.shape}")
print(f"  X_test  : {X_test_m1_scaled.shape}")

In [None]:
# ============================================================================
# PREPARATION - MODEL 2
# ============================================================================

print("="*80)
print("PR√âPARATION MOD√àLE 2 - AVEC ENERGY STAR")
print("="*80)

# Model 2 features (copies, so the source frames are never modified)
X_train_m2 = train_df[variables_exp_tot].copy()
X_test_m2 = test_df[variables_exp_tot].copy()

# Impute missing ENERGYSTARScore values with the TRAINING median
if 'ENERGYSTARScore' in X_train_m2.columns:
    missing = X_train_m2['ENERGYSTARScore'].isnull().sum()
    missing_test = X_test_m2['ENERGYSTARScore'].isnull().sum()
    # Also impute when only the test split has gaps; otherwise NaN would
    # reach the scaler and the estimators.
    if missing > 0 or missing_test > 0:
        print(f"\n‚ö† {missing} valeurs manquantes ENERGYSTARScore")
        median_val = X_train_m2['ENERGYSTARScore'].median()
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # that form is chained assignment, which pandas deprecates and which
        # leaves the frame unchanged once Copy-on-Write is enabled.
        X_train_m2['ENERGYSTARScore'] = X_train_m2['ENERGYSTARScore'].fillna(median_val)
        X_test_m2['ENERGYSTARScore'] = X_test_m2['ENERGYSTARScore'].fillna(median_val)
        print(f"   ‚úì Imputation avec m√©diane = {median_val:.2f}")

# Categorical columns are detected on the training frame only
categorical_cols_m2 = X_train_m2.select_dtypes(include=['object']).columns.tolist()

if len(categorical_cols_m2) > 0:
    print(f"\n‚ö† {len(categorical_cols_m2)} variables cat√©gorielles")
    X_train_m2 = pd.get_dummies(X_train_m2, columns=categorical_cols_m2, drop_first=True)
    # The test frame is encoded WITHOUT drop_first: dropping "the first level"
    # independently in each frame can remove a different level in test than
    # in train, which mis-encodes test rows after alignment. Aligning on the
    # training columns applies the training encoding instead.
    X_test_m2 = pd.get_dummies(X_test_m2, columns=categorical_cols_m2)
    X_train_m2, X_test_m2 = X_train_m2.align(X_test_m2, join='left', axis=1, fill_value=0)
    print(f"   ‚úì {X_train_m2.shape[1]} features apr√®s encodage")

# Scaling: statistics are fitted on train only, then applied to test
print("\nüîÑ Scaling...")
scaler_m2 = StandardScaler()
X_train_m2_scaled = scaler_m2.fit_transform(X_train_m2)
X_test_m2_scaled = scaler_m2.transform(X_test_m2)

# Back to DataFrames to keep column names and row indices
X_train_m2_scaled = pd.DataFrame(X_train_m2_scaled, columns=X_train_m2.columns, index=X_train_m2.index)
X_test_m2_scaled = pd.DataFrame(X_test_m2_scaled, columns=X_test_m2.columns, index=X_test_m2.index)

print(f"\n‚úì Mod√®le 2 pr√™t :")
print(f"  X_train : {X_train_m2_scaled.shape}")
print(f"  X_test  : {X_test_m2_scaled.shape}")

print("\n" + "="*80)
print("‚úì PR√âPARATION TERMIN√âE")
print("="*80)

# Section 4 : MODÈLE 1 - Baseline

Entraînement et évaluation de 5 algorithmes (6 si XGBoost est installé).

In [None]:
# ============================================================================
# MODEL DEFINITIONS
# ============================================================================

# Candidate estimators for Model 1, keyed by display name.
# Insertion order is preserved, as in a dict literal.
models_m1 = dict([
    ('Ridge', Ridge(random_state=RANDOM_STATE)),
    ('Lasso', Lasso(random_state=RANDOM_STATE, max_iter=2000)),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)),
    ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=RANDOM_STATE)),
    ('SVR', SVR(kernel='rbf')),
])

# XGBoost joins the list only when its import succeeded
if XGBOOST_AVAILABLE:
    models_m1.update({
        'XGBoost': XGBRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1, verbosity=0),
    })

print(f"‚úì {len(models_m1)} mod√®les d√©finis pour Mod√®le 1")

In [None]:
# ============================================================================
# BASELINE TRAINING - MODEL 1
# ============================================================================

# Delegate fitting and scoring of every candidate to the project helper;
# cv=5 and verbose=True are forwarded to it as-is.
results_df_m1, trained_models_m1 = train_multiple_models(
    models_m1,
    X_train_m1_scaled, y_train,
    X_test_m1_scaled, y_test,
    cv=5,
    verbose=True,
)

# Show the main metrics, rounded to 4 decimals
print("\nüìä R√©sultats Baseline - Mod√®le 1 :\n")
display(
    results_df_m1.loc[:, [
        'model',
        'test_r2',
        'test_rmse_log',
        'test_mae_log',
        'test_rmse_original',
        'test_mape',
        'overfitting_r2',
    ]].round(4)
)

In [None]:
# ============================================================================
# COMPARISON PLOT - MODEL 1
# ============================================================================

# Plot the baseline results and save the figure under FIGURES_DIR
plot_model_comparison(
    results_df_m1,
    title="Comparaison Baseline - Mod√®le 1 (Sans ENERGY STAR)",
    save_path=FIGURES_DIR.joinpath('baseline_m1_comparison.png'),
)

# Section 5 : MODÈLE 1 - Optimisation

Optimisation du meilleur modèle.

In [None]:
# ============================================================================
# OPTIMISATION - MODEL 1
# ============================================================================

# The first row of the results table is taken as the best baseline.
# NOTE(review): this presumes train_multiple_models returns rows sorted
# best-first — confirm in src/modeling_utils.py.
best_model_name_m1 = results_df_m1['model'].iloc[0]
best_model_baseline_m1 = trained_models_m1[best_model_name_m1]

print(f"üèÜ Meilleur mod√®le baseline : {best_model_name_m1}")
print(f"   R¬≤ Test : {results_df_m1['test_r2'].iloc[0]:.4f}")

# Hyper-parameter search delegated to the project helper; it returns the
# tuned model, its parameters and a third value stored as opt_time_m1.
best_model_m1, best_params_m1, opt_time_m1 = optimize_model(
    best_model_baseline_m1,
    X_train_m1_scaled, y_train,
    n_iter=200,
    cv=5,
    random_state=RANDOM_STATE,
)

print("\n‚úì Optimisation termin√©e", "\nMeilleurs param√®tres :", sep="\n")
for param, value in best_params_m1.items():
    print(f"  {param}: {value}")

In [None]:
# Score the tuned model on the train and test sets via the project helper
optimized_metrics_m1 = evaluate_model(
    best_model_m1,
    X_train_m1_scaled,
    y_train,
    X_test_m1_scaled,
    y_test,
    model_name=f"{best_model_name_m1} (optimis√©)",
)

# Build the summary from the returned metrics and display it
print("\nüìä R√âSULTATS MOD√àLE 1 OPTIMIS√â :")
summary_m1 = calculate_metrics_summary(optimized_metrics_m1)
display(summary_m1)

# Section 6 : MODÈLE 1 - Analyse

Feature importance et analyse des résidus.

In [None]:
# ============================================================================
# FEATURE IMPORTANCE - MODEL 1
# ============================================================================

# Plot the 20 top-ranked features of the tuned model and keep the table
# returned by the helper.
importance_df_m1 = plot_feature_importance(
    best_model_m1,
    X_train_m1_scaled.columns,
    top_n=20,
    title=f"Top 20 Features - Mod√®le 1 ({best_model_name_m1})",
    save_path=FIGURES_DIR.joinpath('feature_importance_m1.png'),
)

# First ten rows of the importance table
print("\nTop 10 Features :")
display(importance_df_m1.head(n=10))

In [None]:
# ============================================================================
# RESIDUAL ANALYSIS - MODEL 1
# ============================================================================

# Test-set predictions of the tuned model
y_pred_m1 = best_model_m1.predict(X_test_m1_scaled)

# Plot actual against predicted values
plot_predictions(
    y_test,
    y_pred_m1,
    title=f"Pr√©dictions - Mod√®le 1 ({best_model_name_m1})",
    save_path=FIGURES_DIR.joinpath('predictions_m1.png'),
)

# Residuals (actual minus predicted) and their distribution
residuals_m1 = y_test - y_pred_m1
plot_residuals_distribution(
    residuals_m1,
    title="Distribution R√©sidus - Mod√®le 1",
    save_path=FIGURES_DIR.joinpath('residuals_m1.png'),
)

---

# 🎓 **SECTIONS SUIVANTES**

## À compléter :

### **Section 7-9 : MODÈLE 2 (Avec ENERGY STAR)**
- Même structure que Sections 4-6
- Changer `_m1` → `_m2`

### **Section 10 : Comparaison des 2 Modèles**
```python
# Utiliser la fonction de comparaison
comparison = compare_models(results_df_m1, results_df_m2, "Modèle 1", "Modèle 2")
display(comparison)

# Visualisation
plot_comparison_two_models(results_df_m1, results_df_m2, 
                           "Sans ENERGY STAR", "Avec ENERGY STAR")
```

### **Section 11 : Sauvegarde**
```python
# Sauvegarder modèles
save_model(best_model_m1, MODELS_DIR / 'model1_best.pkl', optimized_metrics_m1)
save_model(best_model_m2, MODELS_DIR / 'model2_best.pkl', optimized_metrics_m2)
```

---