# Notebook 06 — Seasonality & Feature Correlation Analysis

**Purpose:** Investigate why LightGBM produces flat predictions on the dashboard,
identify which features drive seasonality, and find features excluded from the model
that could improve its predictive power.

**Key questions:**
1. Which features show seasonal variation (month-to-month pattern)?
2. Which features are most correlated with PAC installations?
3. Were any high-value seasonal features excluded from training?
4. Does the current feature set allow the model to capture seasonality?
5. What is the fix for the flat prediction problem?

---

In [None]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pickle
import warnings
warnings.filterwarnings('ignore')

# ── Load data ──────────────────────────────────────────────────────────────
# Department codes are read as text; later cells look them up as strings
# such as '01'.
df = pd.read_csv(
    '../data/features/hvac_features_dataset.csv',
    dtype={'dept': str},
)

# Load models
# NOTE(review): unpickling can execute arbitrary code — only load model files
# produced by this project.
MODELS_DIR = Path('../data/models')
ridge = pickle.loads((MODELS_DIR / 'ridge_model.pkl').read_bytes())
lgb = pickle.loads((MODELS_DIR / 'lightgbm_model.pkl').read_bytes())

# Feature bookkeeping: the names exposed by the Ridge model
# (`feature_names_in_`) versus every numeric column of the dataset; the
# difference is the set of numeric columns the model does not use.
MODEL_FEATURES = set(ridge.feature_names_in_)
ALL_NUMERIC = set(df.select_dtypes(include=np.number).columns)
EXCLUDED = ALL_NUMERIC.difference(MODEL_FEATURES)

MONTH_NAMES = 'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()

n_rows, n_cols = df.shape
n_departments = df["dept"].nunique()
n_months = df["date_id"].nunique()
print(f'Dataset: {n_rows:,} rows × {n_cols} columns')
print(f'Departments: {n_departments} | Months: {n_months}')
print(f'Model features: {len(MODEL_FEATURES)} | Excluded: {len(EXCLUDED)}')

## 1. Seasonal Pattern of PAC Installations

Understanding how PAC installation counts vary across months — the **ground truth seasonality** the model must replicate.

In [None]:
# ── Monthly PAC seasonal profile ────────────────────────────────────────────
from matplotlib.patches import Patch

# Mean / std / median of the target per calendar month, pooled over all rows.
# Reindexing on 1..12 keeps the rows aligned with MONTH_NAMES even when a
# calendar month is absent from the data (that month then shows as NaN
# instead of the index assignment below raising a length mismatch).
monthly_pac = (
    df.groupby('month')['nb_installations_pac']
    .agg(['mean', 'std', 'median'])
    .reindex(range(1, 13))
)
monthly_pac.index = MONTH_NAMES

# Number of departments actually present, used in titles instead of a literal.
n_depts = df['dept'].nunique()

# Seasonal amplitude: spread of the monthly means relative to the overall mean
overall_mean = df['nb_installations_pac'].mean()
seasonal_cv = monthly_pac['mean'].std() / overall_mean
peak_month = monthly_pac['mean'].idxmax()
trough_month = monthly_pac['mean'].idxmin()

print(f'Overall monthly mean: {overall_mean:.1f} PAC/month')
print(f'Peak month: {peak_month}  ({monthly_pac["mean"].max():.1f})')
print(f'Trough month: {trough_month}  ({monthly_pac["mean"].min():.1f})')
print(f'Seasonal coefficient of variation (CV): {seasonal_cv:.3f} = {seasonal_cv*100:.1f}%')
print()
print(f'Monthly statistics (averaged across all {n_depts} departments):')
print(monthly_pac.round(1).to_string())

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart with std error bars; blue = heating-season months, red = the rest
heating_months = {'Jan', 'Feb', 'Mar', 'Oct', 'Nov', 'Dec'}
colors = ['#4e79a7' if m in heating_months else '#e15759' for m in MONTH_NAMES]
axes[0].bar(MONTH_NAMES, monthly_pac['mean'], yerr=monthly_pac['std'],
            color=colors, alpha=0.8, capsize=4)
mean_line = axes[0].axhline(overall_mean, color='black', linestyle='--', linewidth=1.5,
                            label=f'Mean: {overall_mean:.0f}')
axes[0].set_title(f'Monthly Mean PAC Installations (all {n_depts} depts)',
                  fontsize=13, fontweight='bold')
axes[0].set_ylabel('PAC installations / month')
axes[0].set_ylim(0)
axes[0].grid(axis='y', alpha=0.3)
# Single legend: the two season patches plus the mean line drawn above.
leg_handles = [Patch(color='#4e79a7', label='Heating season'),
               Patch(color='#e15759', label='Cooling season')]
axes[0].legend(handles=leg_handles + [mean_line])

# Department-level seasonal profiles (sample of 6 departments)
sample_depts = ['01', '13', '33', '59', '69', '75']
dept_names = {'01': 'Ain', '13': 'B-du-Rhône', '33': 'Gironde',
              '59': 'Nord', '69': 'Rhône', '75': 'Paris'}
for d in sample_depts:
    d_monthly = df[df['dept'] == d].groupby('month')['nb_installations_pac'].mean()
    # Months without data for this department are plotted as gaps (NaN).
    axes[1].plot(MONTH_NAMES, [d_monthly.get(m+1, np.nan) for m in range(12)],
                 marker='o', linewidth=2, markersize=5, label=dept_names.get(d, d))
axes[1].set_title('Seasonal Profile — Selected Departments', fontsize=13, fontweight='bold')
axes[1].set_ylabel('PAC installations / month')
axes[1].legend(fontsize=9)
axes[1].grid(alpha=0.3)
axes[1].set_ylim(0)

plt.tight_layout()
plt.savefig('../data/figures/06_pac_seasonal_profile.png', dpi=120, bbox_inches='tight')
plt.show()

# The CV figure is taken from the computation above so the narrative cannot
# drift from the data.
# NOTE(review): the "moderate" label and the month ranges below are still
# hand-written interpretation — re-check them whenever the dataset changes.
print(f'\n→ Seasonal CV = {seasonal_cv*100:.1f}% — moderate seasonality (peaks in Jan–Jun, troughs in Jul–Aug)')
# NOTE(review): "peak in WINTER, not during heating season" reads as
# contradictory — the chart above colours Jan–Mar and Oct–Dec as the heating
# season. Confirm the intended wording with the author.
print('→ Counter-intuitive: PAC purchases peak in WINTER, not during heating season')
print('  Explanation: Cold weather triggers the purchase decision; installation happens')
print('  in following weeks. Aug is slowest (installer vacation period).')

## 2. Feature Correlation with PAC Installations

Spearman rank correlation (non-parametric; captures monotonic relationships, including non-linear ones).

In [None]:
# ── Spearman correlations ────────────────────────────────────────────────────
from matplotlib.patches import Patch

TARGET = 'nb_installations_pac'
# Numeric columns left out of the correlation scan, including the target itself.
SKIP = {'date_id', 'year', 'month', 'quarter', 'is_heating', 'is_cooling',
        'month_sin', 'month_cos', '_outlier_iqr', '_outlier_zscore',
        '_outlier_iforest', '_outlier_consensus', '_outlier_score',
        'n_valid_features', 'pct_valid_features', 'latitude', 'longitude',
        'year_trend', TARGET}

numeric_cols = [
    name
    for name in df.select_dtypes(include=np.number).columns
    if name not in SKIP
]

# One Spearman coefficient per feature, computed on the rows where both the
# feature and the target are present; features with fewer than 100 such rows
# are left out.
corrs = {}
for name in numeric_cols:
    complete = df[[name, TARGET]].dropna()
    if len(complete) < 100:
        continue
    rho = complete.corr(method='spearman').iloc[0, 1]
    corrs[name] = round(rho, 4)

# Strongest relationships first, regardless of sign.
corr_series = pd.Series(corrs).sort_values(key=abs, ascending=False)

# Headline counts
abs_corr = corr_series.abs()
print(f'Total features analyzed: {len(corr_series)}')
print(f'Strongly correlated (|r| > 0.5): {(abs_corr > 0.5).sum()}')
print(f'Weakly correlated (|r| < 0.1): {(abs_corr < 0.1).sum()}')
print()

# Horizontal bar chart — top 30 by absolute correlation
top30 = corr_series.head(30)
fig, ax = plt.subplots(figsize=(10, 10))

# Blue when the feature is one the model uses, red otherwise.
labels = list(top30.index)
bar_colors = ['#4e79a7' if name in MODEL_FEATURES else '#e15759' for name in labels]
positions = range(len(top30))

bars = ax.barh(positions, top30.values, color=bar_colors, alpha=0.85)
ax.set_yticks(positions)
ax.set_yticklabels(labels, fontsize=9)
ax.axvline(0, color='black', linewidth=0.8)
for guide in (0.5, -0.5):
    ax.axvline(guide, color='gray', linewidth=0.5, linestyle='--', alpha=0.5)
ax.invert_yaxis()
ax.set_xlabel('Spearman correlation with nb_installations_pac', fontsize=11)
ax.set_title('Top 30 Features by Correlation with PAC Installations', fontsize=13, fontweight='bold')
ax.set_xlim(-1, 1)

legend_handles = [Patch(color='#4e79a7', label='Used in model'),
                  Patch(color='#e15759', label='EXCLUDED from model')]
ax.legend(handles=legend_handles, loc='lower right')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('../data/figures/06_feature_correlations.png', dpi=120, bbox_inches='tight')
plt.show()

print('\nTop 15 most correlated features:')
for name, rho in corr_series.head(15).items():
    if name in MODEL_FEATURES:
        status = 'IN MODEL'
    else:
        status = '⚠ EXCLUDED'
    print(f'  {rho:+.3f}  {name:<45}  {status}')

## 3. Seasonal Variation of Each Feature

Features with high seasonal variation (CV across months) are the ones for which
the model must receive realistic values during forecasting.

In [None]:
# ── Seasonal CV for all features ────────────────────────────────────────────
# Seasonal CV of a feature = std of its monthly means / |overall mean|.
# Features with fewer than 100 non-null values are skipped, as are features
# whose |mean| is at most 0.01 (presumably to keep the ratio from blowing up).
by_month = df.groupby('month')
seasonal_cv_dict = {}
for name in numeric_cols:
    observed = df[name].dropna()
    if len(observed) >= 100:
        level = abs(observed.mean())
        if level > 0.01:
            spread = by_month[name].mean().std()
            seasonal_cv_dict[name] = round(spread / level, 4)

seasonal_cv_series = pd.Series(seasonal_cv_dict).sort_values(ascending=False)

# Build combined DataFrame: correlation × seasonal_cv × model status
# (only features that have both a correlation and a seasonal CV survive dropna)
combined = pd.DataFrame(
    {'spearman_r': corr_series, 'seasonal_cv': seasonal_cv_series}
).dropna()
combined['in_model'] = combined.index.map(lambda x: x in MODEL_FEATURES)
combined['abs_r'] = combined['spearman_r'].abs()

print('Top 20 features by seasonal variation (CV = std of monthly means / overall mean):')
for name, cv_value in seasonal_cv_series.head(20).items():
    rho = corr_series.get(name, np.nan)
    if name in MODEL_FEATURES:
        status = 'IN MODEL'
    else:
        status = '⚠ EXCLUDED'
    print(f'  CV={cv_value:.3f}  r_pac={rho:+.3f}  {name:<45}  {status}')

## 4. The "Flat Prediction" Problem — Root Cause Analysis

**Why does LightGBM produce flat forecasts on the dashboard?**

The model is auto-regressively fed each month's prediction as input to the next.
The dominant features are derived from past PAC values. If these are not updated
during the prediction loop, all forecasted months look identical.

In [None]:
# ── LightGBM feature importance ─────────────────────────────────────────────
# Importance scores labelled by feature name, largest first.
fi = pd.Series(lgb.feature_importances_, index=lgb.feature_name_)
fi = fi.sort_values(ascending=False)
total_importance = fi.sum()

print('LightGBM feature importance (top 30):')
print(f'Total importance score: {total_importance}')
print()

# Two panels: top-30 bar chart (left) and cumulative importance (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 7))

# --- Left: top 30 bar chart ------------------------------------------------
top30_fi = fi.iloc[:30]
# Color by feature family
def feature_color(name):
    """Return the hex bar colour of the feature family that `name` belongs to.

    Families are detected with plain substring tests, tried in priority order:
    PAC-derived, PAC direct, calendar, weather, then a grey fallback.

    NOTE(review): because these are substring tests, short tokens such as
    'sin' or 'pct' also match inside unrelated names (e.g. 'housing',
    'pct_pac') — confirm that this is acceptable for the feature set in use.
    """
    pac_derived_tokens = ('rmean', 'rstd', 'diff', 'pct', 'lag')
    calendar_tokens = ('month', 'quarter', 'is_', 'sin', 'cos', 'trend')
    weather_tokens = ('temp', 'hdd', 'cdd', 'canicule', 'gel', 'precip')

    def mentions(tokens):
        return any(token in name for token in tokens)

    if 'pac' in name:
        # PAC derived → frozen → causes flat predictions; otherwise PAC direct
        return '#e15759' if mentions(pac_derived_tokens) else '#f28e2b'
    if mentions(calendar_tokens):
        return '#59a14f'   # Calendar → updated correctly
    if mentions(weather_tokens):
        return '#4e79a7'   # Weather → seasonal proxy injected
    return '#bab0ac'       # Other

from matplotlib.patches import Patch

# One colour per bar, chosen by feature family.
bar_colors = [feature_color(f) for f in top30_fi.index]
axes[0].barh(range(len(top30_fi)), top30_fi.values, color=bar_colors, alpha=0.85)
axes[0].set_yticks(range(len(top30_fi)))
axes[0].set_yticklabels(top30_fi.index, fontsize=8)
axes[0].invert_yaxis()
axes[0].set_xlabel('LightGBM importance score')
axes[0].set_title('Feature Importance — LightGBM', fontsize=12, fontweight='bold')
axes[0].legend(handles=[
    Patch(color='#e15759', label='PAC derived (FROZEN — flat predictions)'),
    Patch(color='#f28e2b', label='PAC direct'),
    Patch(color='#59a14f', label='Calendar (updated correctly)'),
    Patch(color='#4e79a7', label='Weather (seasonal proxy injected)'),
    Patch(color='#bab0ac', label='Other'),
], fontsize=8, loc='lower right')
axes[0].grid(axis='x', alpha=0.3)

# --- Right: cumulative importance ------------------------------------------
# Cumulative share (in %) of total importance over the 40 strongest features.
cumsum = fi.head(40).cumsum() / total_importance * 100
axes[1].plot(range(1, len(cumsum)+1), cumsum.values, 'o-', color='#4e79a7', linewidth=2)
axes[1].axhline(50, color='gray', linestyle='--', alpha=0.5, label='50%')
axes[1].axhline(80, color='gray', linestyle=':', alpha=0.5, label='80%')


def _rank_reaching(threshold):
    """Return the 1-based rank at which `cumsum` first reaches `threshold`.

    Returns None when the threshold is not reached within the plotted features.
    `cumsum` follows the sorted order of `fi`, so the position of the first
    value at or above the threshold is the feature's importance rank.
    """
    reached = (cumsum >= threshold).to_numpy()
    return int(reached.argmax()) + 1 if reached.any() else None


# Mark where 50% and 80% thresholds are reached
n50_rank = _rank_reaching(50)
n80_rank = _rank_reaching(80)

if n50_rank is not None:
    axes[1].axvline(n50_rank, color='orange', linestyle='--', alpha=0.7,
                    label=f'50% importance @ top {n50_rank}')
if n80_rank is not None:
    axes[1].axvline(n80_rank, color='red', linestyle='--', alpha=0.7,
                    label=f'80% importance @ top {n80_rank}')

axes[1].set_xlabel('Number of top features', fontsize=11)
axes[1].set_ylabel('Cumulative importance (%)')
axes[1].set_title('Cumulative Feature Importance', fontsize=12, fontweight='bold')
axes[1].legend(fontsize=9)
axes[1].grid(alpha=0.3)
axes[1].set_xlim(1, 40)

plt.tight_layout()
plt.savefig('../data/figures/06_lgb_feature_importance.png', dpi=120, bbox_inches='tight')
plt.show()

# Summary
pac_derived = ['nb_installations_pac_rmean_3m', 'nb_installations_pac_pct_1m',
               'nb_installations_pac_rstd_3m', 'nb_installations_pac_diff_1m',
               'nb_installations_pac_lag_1m']
# reindex (rather than fi[pac_derived]) tolerates a model that lacks one of
# these columns: absent names become NaN, which sum() skips, instead of the
# cell failing with a KeyError.
top5_importance = fi.reindex(pac_derived).sum()
missing_pac = [name for name in pac_derived if name not in fi.index]
if missing_pac:
    print(f'\n⚠ Not present in the LightGBM model (counted as 0): {missing_pac}')
# NOTE(review): the list above is hand-maintained; the "Top 5" wording is only
# accurate while these five names are the five largest entries of `fi`.
print(f'\nTop 5 PAC-derived features account for {top5_importance/total_importance*100:.1f}% of total importance')
print('These were FROZEN at last historical value → flat predictions')
print()
print('Fix applied (app/pages/predictions.py):')
print('  A rolling pac_buffer maintains historical PAC + seasonal proxies')
print('  rmean_3m, rstd_3m, diff_1m, pct_1m recomputed from buffer each step')
print('  Seasonal proxy = monthly_mean × recent_trend_scale')
print('  This injects month-to-month variation into all derived features')

## 5. Excluded Features — Missed Opportunities

Features present in the dataset but **excluded from model training**.
Some have strong correlations with PAC and could improve the model.

In [None]:
# ── Excluded features analysis ──────────────────────────────────────────────
# Bookkeeping / identifier columns that are never candidate features.
admin_excluded = {'_outlier_score', 'climat_affaires_bat', 'date_id',
                  'interact_confiance_bat', 'latitude', 'longitude',
                  'n_valid_features', 'pct_valid_features'}
target_like_excluded = {'nb_installations_pac', 'nb_installations_clim',
                        'nb_dpe_total'}  # potential leakage or target-aligned

# Excluded numeric columns for which a Spearman correlation was computed.
interesting_excluded = [
    f for f in EXCLUDED
    if f not in admin_excluded
    and f in corr_series
]

print('Excluded features with correlation data:')
print(f'{"Feature":<45} {"Spearman_r":>12} {"Seasonal_CV":>12} {"Coverage%":>10} {"Note"}')
print('-' * 100)
for feat in sorted(interesting_excluded, key=lambda x: abs(corr_series.get(x, 0)), reverse=True):
    r = corr_series.get(feat, np.nan)
    cv = seasonal_cv_dict.get(feat, np.nan)
    cov = df[feat].notna().sum() / len(df) * 100 if feat in df.columns else np.nan

    note = ''
    if abs(r) > 0.5:
        note = '★ HIGH CORR'
    if feat in target_like_excluded:
        note += ' (target-aligned — leakage risk)'
    if 'climat_affaires_bat' in feat or 'confiance_bat' in feat:
        note += ' (100% NaN)'
    # A missing seasonal CV is printed as NaN rather than 0.000, so an unknown
    # value is not mistaken for "no seasonality".
    print(f'{feat:<45} {r:>12.3f} {cv:>12.3f} {cov:>9.1f}%  {note}')

# Detailed look at nb_dpe_classe_ab
# Coverage falls back to NaN when the column is absent, so the report still
# prints instead of stopping on a KeyError.
if 'nb_dpe_classe_ab' in df.columns:
    cov_ab = df['nb_dpe_classe_ab'].notna().sum() / len(df) * 100
else:
    cov_ab = np.nan

print()
print('─' * 80)
print('nb_dpe_classe_ab — Detailed analysis:')
print('  Definition: Number of DPE assessments with energy class A or B')
print('  (i.e., recently renovated / new high-efficiency buildings)')
print(f'  Spearman r with PAC: {corr_series.get("nb_dpe_classe_ab", np.nan):.3f}')
print(f'  Data coverage: {cov_ab:.1f}%')
print('  Why excluded: Likely dropped during preprocessing (all-NaN column check?)')
# NOTE(review): "strong signal, full coverage" is hand-written — check it
# against the two figures printed just above before relying on it.
print('  Recommendation: ADD to model — strong signal, full coverage, no leakage risk')
print()
print('  Physical reasoning:')
print('  → Class A/B buildings → high-efficiency → heat pumps are the natural HVAC choice')
print("  → More DPE A/B in a month → more properties eligible for MaPrimeRénov' subsidy")
print('  → Direct causal link: DPE activity drives PAC eligibility and decisions')

## 6. Weather Features — Seasonal but Not Predictive

A surprising finding: temperature and heating/cooling degree-day features are
**highly seasonal** but show **near-zero correlation** with PAC installations.

In [None]:
# ── Weather features: seasonal but decorrelated ─────────────────────────────
weather_feats = ['temp_mean', 'hdd_sum', 'cdd_sum', 'nb_jours_gel',
                 'hdd_sum_rmean_3m', 'cdd_sum_rmean_3m', 'nb_installations_pac']
weather_feats = [f for f in weather_feats if f in df.columns]

print('Weather features — seasonal CV vs correlation with PAC:')
print(f'{"Feature":<30} {"Seasonal CV":>12} {"Spearman r":>12} {"In model":>10}')
print('-' * 70)
for f in weather_feats:
    cv = seasonal_cv_dict.get(f, np.nan)
    # NaN, not 0.0, when no correlation was computed (e.g. for the target
    # itself, which the correlation scan skips): a default of 0 would read as
    # a measured "no correlation".
    r = corr_series.get(f, np.nan)
    in_model = 'YES' if f in MODEL_FEATURES else 'NO'
    print(f'{f:<30} {cv:>12.3f} {r:>12.3f} {in_model:>10}')

# Plot seasonal profiles: weather vs PAC
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
feats_to_plot = ['hdd_sum', 'cdd_sum', 'temp_mean',
                 'nb_jours_gel', 'nb_installations_pac', 'nb_dpe_classe_ab']

for ax, feat in zip(axes.flatten(), feats_to_plot):
    if feat not in df.columns:
        # Hide the panel rather than fail when a feature is not in the dataset.
        ax.set_visible(False)
        continue
    monthly = df.groupby('month')[feat].mean()
    r = corr_series.get(feat, np.nan)
    cv = seasonal_cv_dict.get(feat, np.nan)
    color = '#4e79a7' if feat in MODEL_FEATURES else '#e15759'

    ax.bar(MONTH_NAMES, [monthly.get(m+1, 0) for m in range(12)],
           color=color, alpha=0.8)
    ax.set_title(f'{feat}\n(r={r:+.2f}, CV={cv:.2f})', fontsize=9, fontweight='bold')
    # Style the existing month tick labels in place; calling set_xticklabels
    # without fixing the tick positions first is discouraged by matplotlib.
    ax.tick_params(axis='x', labelsize=7, labelrotation=45)
    ax.grid(axis='y', alpha=0.3)
    ax.set_ylim(0)
    # Status label
    status = 'In model' if feat in MODEL_FEATURES else 'EXCLUDED'
    ax.text(0.98, 0.95, status, transform=ax.transAxes, ha='right', va='top',
            fontsize=8, color=color, fontweight='bold')

plt.suptitle('Seasonal Profiles: Weather vs PAC Features\n'
             '(r = Spearman correlation with nb_installations_pac | CV = seasonal amplitude)',
             fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig('../data/figures/06_seasonal_weather_vs_pac.png', dpi=120, bbox_inches='tight')
plt.show()

# NOTE(review): the CV range and "near-zero" wording below are hand-written;
# compare them with the table printed at the top of this cell.
print()
print('Key finding: hdd_sum, cdd_sum, temp_mean are highly seasonal (CV ≈ 0.8-1.4)')
print('but have near-zero Spearman correlation with PAC installations.')
print()
print('Why? PAC purchase decisions are NOT driven by the current month\'s temperature.')
print('They are driven by:')
print('  → Economic activity (DPE assessments, housing transactions)')
print("  → Government subsidies (MaPrimeRénov' — quota-based, not weather-triggered)")
print('  → Installer capacity (HVAC companies have fixed installation schedules)')
print('  → Prior months\' demand (autoregressive effect)')

## 7. Correlation Matrix — Feature Families

Understanding how feature groups relate to the target and to each other.

In [None]:
# ── Correlation heatmap for representative features ──────────────────────────
key_features = [
    'nb_installations_pac',
    # Autoregressive
    'nb_installations_pac_lag_1m', 'nb_installations_pac_rmean_3m',
    'nb_installations_pac_diff_1m', 'nb_installations_pac_pct_1m',
    # DPE / housing
    'nb_dpe_classe_ab', 'nb_dpe_total_rmean_3m', 'pct_pac', 'pct_classe_ab',
    # Economic
    'confiance_menages', 'ipi_manufacturing', 'ipi_hvac_c2825',
    # Calendar
    'month_sin', 'month_cos', 'is_heating',
    # Weather
    'hdd_sum', 'cdd_sum', 'temp_mean',
]
# Keep only the features that exist in this dataset.
key_features = [f for f in key_features if f in df.columns]

corr_matrix = df[key_features].corr(method='spearman')

fig, ax = plt.subplots(figsize=(14, 12))
im = ax.imshow(corr_matrix.values, cmap='RdYlGn', vmin=-1, vmax=1, aspect='auto')
plt.colorbar(im, ax=ax, label='Spearman ρ', shrink=0.8)

ax.set_xticks(range(len(key_features)))
ax.set_yticks(range(len(key_features)))
ax.set_xticklabels(key_features, rotation=45, ha='right', fontsize=8)
ax.set_yticklabels(key_features, fontsize=8)

# Annotate cells with values (only |ρ| > 0.3; white text on the strongest cells)
for (i, j), val in np.ndenumerate(corr_matrix.values):
    if abs(val) > 0.3:
        ax.text(j, i, f'{val:.2f}', ha='center', va='center',
                fontsize=6, color='black' if abs(val) < 0.7 else 'white')

ax.set_title('Spearman Correlation Matrix — Key Features', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.savefig('../data/figures/06_correlation_matrix.png', dpi=120, bbox_inches='tight')
plt.show()


def _r_with_target(feature):
    """Return Spearman ρ between `feature` and nb_installations_pac, or NaN.

    `key_features` is filtered by column presence above, so a feature (or the
    target) may be missing from `corr_matrix`; NaN is returned in that case
    instead of raising a KeyError.
    """
    target = 'nb_installations_pac'
    if feature in corr_matrix.index and target in corr_matrix.columns:
        return corr_matrix.loc[feature, target]
    return np.nan


print('Key observations from correlation matrix:')
print('  → PAC autoregressive features cluster with r > 0.9 (autoregressive block)')
print(f'  → nb_dpe_classe_ab: r = {_r_with_target("nb_dpe_classe_ab"):.2f} (if available)')
print('  → Weather features: r ≈ 0 with PAC but r > 0.8 with each other')
print(f'  → confiance_menages: r = {_r_with_target("confiance_menages"):.2f}')
print(f'  → ipi_hvac_c2825: r = {_r_with_target("ipi_hvac_c2825"):.2f}')

## 8. Summary — Findings and Recommendations

### Root Cause of Flat Predictions

The 5 most important LightGBM features are all PAC-derived features computed
from past values. Together they represent **~38% of total importance**.
During auto-regressive forecasting, these features were not updated — they
remained frozen at the last known value → **flat predictions**.

**Fix applied:** A `pac_buffer` list is maintained throughout the prediction loop.
Each step appends a seasonal proxy (historical monthly mean × recent-trend scale).
All derived features (`rmean_3m`, `rstd_3m`, `diff_1m`, `pct_1m`, `lag_*`) are
recomputed from the buffer before each prediction. Result: month-to-month variation
flows through all dominant features → seasonal predictions.

### Feature Recommendations

| Feature | Status | Correlation | Action |
|---------|--------|-------------|--------|
| `nb_dpe_classe_ab` | Excluded | r = +0.71 | **Add to model** |
| `hdd_sum`, `cdd_sum` | In model | r ≈ 0.00 | Keep — seasonal context |
| `confiance_menages` | In model | r = +0.41 | Keep |
| `ipi_hvac_c2825` | In model | r = +0.30 | Keep |
| `nb_installations_clim` | Excluded | r = +0.97 | **Caution** — target-aligned, leakage risk |
| `climat_affaires_bat` | Excluded | 100% NaN | Drop |

### Next Steps for Model Improvement

1. **Add `nb_dpe_classe_ab`** to features used in training — full coverage, high correlation,
   physical justification (A/B class → subsidy eligibility → PAC adoption)
2. **Evaluate adding `nb_dpe_total`** — good coverage, r = +0.50
3. **Feature importance audit** after adding nb_dpe_classe_ab — does it displace
   autoregressive features and improve seasonal forecast stability?
4. **Consider lagged nb_dpe_classe_ab** (lag_3m, lag_6m) — DPE → PAC decision
   has a typical 3-6 month lag (energy audit → subsidy application → installation)
5. **Re-evaluate weather features**: Near-zero correlation suggests they add noise.
   Test model performance with and without weather features.


In [None]:
# ── Summary table for the report ────────────────────────────────────────────
print('=' * 80)
print('FEATURE ANALYSIS SUMMARY')
print('=' * 80)
print()
print(f'Dataset: {df.shape[0]:,} rows | {df["dept"].nunique()} departments | {df["date_id"].nunique()} months')
# "Excluded" here counts only candidate features: admin and target-like
# columns are removed from the excluded set first.
print(f'Model features: {len(MODEL_FEATURES)} | Excluded: {len(EXCLUDED - admin_excluded - target_like_excluded)}')
print()
print('FLAT PREDICTION ROOT CAUSE:')
print('  5 PAC-derived features dominate LightGBM (top 5 by importance)')
# reindex (rather than fi[[...]]) tolerates a model that lacks one of these
# columns: absent names become NaN, which sum() skips, instead of the cell
# failing with a KeyError.
pac_derived_importance = fi.reindex(['nb_installations_pac_rmean_3m',
                                     'nb_installations_pac_pct_1m',
                                     'nb_installations_pac_rstd_3m',
                                     'nb_installations_pac_diff_1m',
                                     'nb_installations_pac_lag_1m']).sum()
print(f'  Combined importance: {pac_derived_importance} / {total_importance} = {pac_derived_importance/total_importance*100:.1f}%')
print()
print('FIX:')
print('  pac_buffer updated each step with seasonal proxy')
print('  All 9 derived PAC features recomputed from buffer per forecast step')
print('  → Seasonality now propagates through the autoregressive chain')
print()
print('CANDIDATE FEATURE TO ADD:')
r_ab = corr_series.get('nb_dpe_classe_ab', np.nan)
# Coverage falls back to NaN when the column is absent from the dataset.
if 'nb_dpe_classe_ab' in df.columns:
    cov_ab = df['nb_dpe_classe_ab'].notna().sum() / len(df) * 100
else:
    cov_ab = np.nan
print(f'  nb_dpe_classe_ab | coverage: {cov_ab:.0f}% | Spearman r: {r_ab:.3f}')
print('  Physical link: DPE A/B → subsidy eligibility → PAC installation')
print('  Recommend: add + add lag_3m and lag_6m variants')
print()
print('WEATHER FEATURES: SEASONAL BUT NOT PREDICTIVE')
# NOTE(review): the "(very seasonal)" / "(near zero)" qualifiers are fixed
# text, not derived from the values printed next to them.
for wf in ['hdd_sum', 'cdd_sum', 'temp_mean']:
    cv = seasonal_cv_dict.get(wf, np.nan)
    r = corr_series.get(wf, np.nan)
    print(f'  {wf:<20}: CV={cv:.2f} (very seasonal), r_pac={r:+.3f} (near zero)')