In [2]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.metrics import r2_score, mean_squared_error

# ================= Configuration =================
# CSV of model results loaded by run_comprehensive_evaluation(); it is read
# with a 'Date' column parsed as datetimes.
CSV_FILE_PATH = "../../results/05_p_m_a_model/a_model_4/LSTM_Full_Results_SEQLEN400_20251229_221056.csv"
# Column holding the observed values that every forecast is scored against.
TARGET_COL = 'Raw_SSN'
# Last date of the evaluation window (inclusive).
TEST_END_DATE = '2019-11-30'

# Model name -> split date. Each model is evaluated on rows dated strictly
# AFTER its split date and up to TEST_END_DATE. The model name is also the
# suffix of that model's result columns (e.g. 'Total_Pred_<name>').
MODEL_CONFIGS = {
    'M8+2': '1996-08-01',
    'M8+3': '1986-08-01',
    'M0+3': '1986-09-01',
    'M0+2': '1996-08-01'
}

# ================= Core Metric Functions =================

def dm_test(actual, pred_bench, pred_hybrid, h=1, criterion='MSE'):
    """Two-sided Diebold-Mariano test comparing two forecasts of one series.

    The loss differential is ``d_t = L(e_bench,t) - L(e_hybrid,t)``, so a
    positive statistic means the benchmark has the larger loss.

    Args:
        actual: Observed values (1-D array-like).
        pred_bench: Benchmark forecasts, aligned with ``actual``.
        pred_hybrid: Competing forecasts, aligned with ``actual``.
        h: Forecast horizon; autocovariances of ``d`` at lags ``0 .. h-1``
            enter the variance estimate. Must be at least 1.
        criterion: ``'MSE'`` for squared-error loss; any other value selects
            absolute-error loss.

    Returns:
        ``(dm_stat, p_value)``. The p-value uses the standard normal
        approximation. ``(0.0, 1.0)`` is returned when the variance estimate
        is not positive, and ``(nan, nan)`` when the inputs are empty.

    Raises:
        ValueError: If ``h`` is smaller than 1.
    """
    if h < 1:
        raise ValueError("h must be a positive integer")

    err_bench = np.asarray(actual, dtype=float) - np.asarray(pred_bench, dtype=float)
    err_hybrid = np.asarray(actual, dtype=float) - np.asarray(pred_hybrid, dtype=float)

    if criterion == 'MSE':
        d = err_bench**2 - err_hybrid**2
    else:
        d = np.abs(err_bench) - np.abs(err_hybrid)

    n_obs = len(d)
    if n_obs == 0:
        # Nothing to compare; report "undefined" without NumPy runtime warnings.
        return float('nan'), float('nan')

    mean_d = np.mean(d)
    centered = d - mean_d  # computed once instead of once per lag

    # Biased (divide-by-n) autocovariances at lags 0 .. h-1.
    gamma = [np.var(d)]
    for lag in range(1, h):
        gamma.append(np.sum(centered[:-lag] * centered[lag:]) / n_obs)

    # Variance of the mean loss differential.
    v_d = (gamma[0] + 2 * sum(gamma[1:])) / float(n_obs)

    # The estimate can be zero (identical losses) or negative (h > 1);
    # treat both as "no evidence of a difference".
    if v_d <= 0:
        return 0.0, 1.0

    dm_stat = mean_d / np.sqrt(v_d)
    # Survival function instead of 1 - cdf: the latter cancels to exactly 0.0
    # once |dm_stat| exceeds roughly 8, hiding the true (tiny) p-value.
    p_value = 2 * stats.norm.sf(np.abs(dm_stat))
    return dm_stat, p_value

def calculate_da(y_true, y_pred):
    """Directional Accuracy (Trend consistency)[cite: 18, 19]."""
    actual_diff = np.sign(np.diff(y_true))
    pred_diff = np.sign(np.diff(y_pred))
    return np.mean(actual_diff == pred_diff) * 100

def _evaluate_model(df, model_name, split_date):
    """Compute one summary-table row for a single model.

    Args:
        df: Results frame sorted by 'Date', containing TARGET_COL,
            'Persistence_Pred' and the model's 'Total_Pred_*', 'Fit_SSN_*'
            and 'LGBM_Pred_*' columns.
        model_name: Suffix that identifies the model's columns.
        split_date: Rows dated strictly after this date and up to
            TEST_END_DATE (inclusive) form the test set.

    Returns:
        dict with the keys used as columns of the summary table.
    """
    # 1. Column mapping
    hybrid_col = f'Total_Pred_{model_name}'
    p_model_col = f'Fit_SSN_{model_name}'
    lgbm_col = f'LGBM_Pred_{model_name}'
    bench_col = f'Bench_{model_name}'

    # 2. Filter the test set
    in_window = (
        (df['Date'] > pd.to_datetime(split_date))
        & (df['Date'] <= pd.to_datetime(TEST_END_DATE))
    )
    test_df = (
        df.loc[in_window]
        .dropna(subset=[TARGET_COL, hybrid_col, 'Persistence_Pred'])
        .copy()
    )
    # Benchmark for the DM test: P-model fit plus LGBM prediction, with a
    # missing LGBM value treated as zero.
    test_df[bench_col] = test_df[p_model_col] + test_df[lgbm_col].fillna(0)

    y_true = test_df[TARGET_COL].values
    y_hybrid = test_df[hybrid_col].values
    y_bench = test_df[bench_col].values
    y_pers = test_df['Persistence_Pred'].values

    # 3. Metrics
    # PSS (prediction skill score) relative to the persistence baseline.
    mse_hybrid = mean_squared_error(y_true, y_hybrid)
    mse_pers = mean_squared_error(y_true, y_pers)
    pss = 1 - (mse_hybrid / mse_pers)

    # DM test (hybrid vs. benchmark). The benchmark column is not part of the
    # dropna subset above, so restrict the test to rows where it is finite;
    # otherwise a single NaN would turn the statistic into NaN.
    has_bench = np.isfinite(y_bench)
    dm_stat, p_val = dm_test(
        y_true[has_bench], y_bench[has_bench], y_hybrid[has_bench]
    )

    # Directional accuracy at daily, weekly and monthly resolution.
    da_daily = calculate_da(y_true, y_hybrid)

    by_date = test_df.set_index('Date')[[TARGET_COL, hybrid_col]]
    # Bins with no rows resample to NaN; NaN directions never compare equal,
    # so they would be counted as misses. Drop them before scoring.
    weekly = by_date.resample('W').mean().dropna()
    monthly = by_date.resample('ME').mean().dropna()
    da_weekly = calculate_da(weekly[TARGET_COL].values, weekly[hybrid_col].values)
    da_monthly = calculate_da(monthly[TARGET_COL].values, monthly[hybrid_col].values)

    return {
        'Model': model_name,
        'R2': r2_score(y_true, y_hybrid),
        'PSS': pss,
        'DM_Stat': dm_stat,
        'p-value': p_val,
        'DA_Daily%': da_daily,
        'DA_Weekly%': da_weekly,
        'DA_Monthly%': da_monthly
    }


def run_comprehensive_evaluation():
    """Load the results CSV, score every configured model and print a table.

    Reads CSV_FILE_PATH, adds a persistence (previous-value) baseline, and
    evaluates each entry of MODEL_CONFIGS on its own test window.

    Returns:
        pandas.DataFrame with one row per model and the columns printed in
        the summary table.
    """
    print(f"Loading results from: {CSV_FILE_PATH}")
    df = pd.read_csv(CSV_FILE_PATH, parse_dates=['Date'])
    df = df.sort_values('Date')

    # Persistence (naive) baseline: y_t = y_{t-1}. Computed on the date-sorted
    # frame so "previous" means the previous row in time.
    df['Persistence_Pred'] = df[TARGET_COL].shift(1)

    summary_results = []
    for model_name, split_date in MODEL_CONFIGS.items():
        print(f"Evaluating Model: {model_name}...")
        summary_results.append(_evaluate_model(df, model_name, split_date))

    # 4. Generate the final table
    res_df = pd.DataFrame(summary_results)
    print("\n" + "="*95)
    print("FINAL MODEL VERIFICATION SUMMARY TABLE")
    print("="*95)
    # Format floats for this table only, rather than changing the global
    # pandas display option for everything printed afterwards.
    print(res_df.to_string(index=False, float_format='{:,.4f}'.format))
    print("="*95)

    # Save to CSV for LaTeX formatting
    # res_df.to_csv("model_verification_summary.csv", index=False)
    # print("\nResults saved to 'model_verification_summary.csv'.")

    return res_df

# Run the evaluation only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    run_comprehensive_evaluation()

Loading results from: ../../results/05_p_m_a_model/a_model_4/LSTM_Full_Results_SEQLEN400_20251229_221056.csv
Evaluating Model: M8+2...
Evaluating Model: M8+3...
Evaluating Model: M0+3...
Evaluating Model: M0+2...

FINAL MODEL VERIFICATION SUMMARY TABLE
Model     R2    PSS  DM_Stat  p-value  DA_Daily%  DA_Weekly%  DA_Monthly%
 M8+2 0.9538 0.0441  41.1802   0.0000    45.3873     87.9211      95.6989
 M8+3 0.9602 0.0572  61.8111   0.0000    47.1535     89.9367      95.2381
 M0+3 0.9612 0.0827  55.6150   0.0000    47.5539     89.5617      95.2261
 M0+2 0.9555 0.0784  52.3949   0.0000    45.5047     88.1676      96.4158
