# Verification 1: Series r2, peaks and troughs, etc.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from scipy.signal import find_peaks
from sklearn.metrics import r2_score, mean_squared_error
import warnings
import os

# --- 1. Initialization and Configuration ---
# This section sets global options, defines file paths and date boundaries,
# and loads the two input tables used by the rest of the cell:
#   * df             - the model results CSV, indexed by its 'Date' column
#   * known_peaks /
#     known_valleys  - reference solar-cycle extrema, indexed by 'date'
#                      with a single 'SSN' column

# Suppress warnings
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

# Apply a standard plot style (optional, good for readability).
# Only the "style not found" failure (an OSError) is tolerated; a bare
# `except:` would also hide KeyboardInterrupt and genuine bugs.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    pass  # Fallback to default if style not found

# File Paths
DATA_FILE = "../../results/05_p_m_a_model/a_model_4/LSTM_Full_Results_SEQLEN400_20251229_221056.csv"
SOLAR_CYCLE_FILE = '../../data/ready/solar_cycle_minmax.csv'

PLOT_OUTPUT_DIR = "../../results/05_p_m_a_model/final_plots"
if not os.path.exists(PLOT_OUTPUT_DIR):
    # exist_ok guards against the directory appearing between the check and
    # the creation.
    os.makedirs(PLOT_OUTPUT_DIR, exist_ok=True)
    print(f"Created directory: {PLOT_OUTPUT_DIR}")

# Date Ranges
TRAIN_START_DATE = '1855-12-02'
TEST_END_DATE = '2019-11-30'
FUTURE_KNOWN_END_DATE = '2025-07-31'
FUTURE_PRED_END_DATE = '2050-12-31'

# Start dates for OOTF (O3) and Future (F4): one day after the preceding
# range ends, so the label-based slices used later never overlap.
OOTF_START_DATE = (pd.to_datetime(TEST_END_DATE) + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
FUTURE_START_DATE = (pd.to_datetime(FUTURE_KNOWN_END_DATE) + pd.Timedelta(days=1)).strftime('%Y-%m-%d')

# Model Split Dates (last date of the training slice for each model label)
model_splits = {
    'M8+3': '1986-09-01',
    'M0+3': '1986-09-01',
    'M8+2': '1996-08-01',
    'M0+2': '1996-08-01'
}

# Load Main Data
try:
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    print(f"Error: Data file {DATA_FILE} not found.")
    df = pd.DataFrame()
else:
    # Ensure Date column is parsed. Assumes column name is 'Date' based on instruction.
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index('Date')

# Load SIDC Solar Cycle Data (Ground Truth for Peaks/Valleys)
try:
    df_solar_cycle = pd.read_csv(SOLAR_CYCLE_FILE)
except FileNotFoundError:
    print("Warning: Solar cycle min/max file not found. Phase deviations cannot be calculated.")
    # Empty placeholders with the same layout as the loaded tables.
    known_peaks = pd.DataFrame(columns=['SSN']).set_index(pd.to_datetime([])).rename_axis('date')
    known_valleys = pd.DataFrame(columns=['SSN']).set_index(pd.to_datetime([])).rename_axis('date')
else:
    # Build each index from the *filtered* rows. Taking the dates from the
    # unfiltered frame would raise a length-mismatch ValueError as soon as
    # dropna() removed a row.
    peak_rows = df_solar_cycle.dropna(subset=['Max'])
    known_peaks = (
        peak_rows.set_index(pd.to_datetime(peak_rows['Max']))[['Max_SSN']]
        .rename(columns={'Max_SSN': 'SSN'})
    )
    valley_rows = df_solar_cycle.dropna(subset=['start_Min'])
    known_valleys = (
        valley_rows.set_index(pd.to_datetime(valley_rows['start_Min']))[['Min_SSN']]
        .rename(columns={'Min_SSN': 'SSN'})
    )
    known_peaks.index.name = 'date'
    known_valleys.index.name = 'date'


# --- 2. Helper Functions ---

def _extrema_frame(series: pd.Series, positions) -> pd.DataFrame:
    """Build a 'date'-indexed frame with one 'SSN' column from positional hits."""
    return pd.DataFrame(
        {'SSN': series.iloc[positions].to_numpy()},
        index=pd.Index(series.index[positions], name='date'),
    )


def find_peaks_valleys(data_series: pd.Series,
                       distance_days: int = 365 * 8,
                       prominence_peaks: int = 40,
                       prominence_valleys: int = 5):
    """
    Identify peaks and valleys in a time series using prominence.

    Parameters
    ----------
    data_series : pd.Series
        Values to scan. NaNs are dropped before the search; the series index
        supplies the 'date' of every extremum found.
    distance_days : int
        Minimum separation between neighbouring extrema. It is handed to
        ``scipy.signal.find_peaks`` as ``distance``, which counts *samples*,
        so it only equals days when the input has one sample per day.
    prominence_peaks : int
        Minimum prominence required for a maximum.
    prominence_valleys : int
        Minimum prominence required for a minimum (evaluated on the negated
        series).

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(df_peaks, df_valleys)``. Each frame is indexed by 'date' and has a
        single 'SSN' column. When no usable data remains after dropping NaNs,
        both frames are empty but keep that same layout.
    """
    data_series = data_series.dropna()
    if data_series.empty:
        # Same schema as the non-empty result, so callers can treat both alike.
        def _empty():
            return pd.DataFrame(
                {'SSN': np.array([], dtype=float)},
                index=pd.DatetimeIndex([], name='date'),
            )
        return _empty(), _empty()

    signal = data_series.to_numpy(dtype=float)

    peaks_idx, _ = find_peaks(signal, distance=distance_days, prominence=prominence_peaks)
    # Valleys are the peaks of the sign-flipped signal.
    valleys_idx, _ = find_peaks(-signal, distance=distance_days, prominence=prominence_valleys)

    return _extrema_frame(data_series, peaks_idx), _extrema_frame(data_series, valleys_idx)

def plot_diagnostic(model_name, df_plot, p_col, pm_col, pma_col, 
                    p_peaks, p_valleys, 
                    pm_peaks, pm_valleys, 
                    pma_peaks, pma_valleys, 
                    period_name):
    """
    Draw the P, PM and PMA curves with their detected extrema and save a PNG.

    The figure is written to PLOT_OUTPUT_DIR as
    ``Diagnostic_<model_name>_<period_name>.png`` and closed afterwards.
    Each ``*_peaks`` / ``*_valleys`` argument is a frame whose index gives
    the x positions and whose 'SSN' column gives the y positions of the
    markers. If ``df_plot`` has a 'Raw_SSN' column it is drawn first as
    grey dots.
    """
    plt.figure(figsize=(20, 10))

    # Observed values, when present, go in first so the model curves sit on top.
    if 'Raw_SSN' in df_plot.columns:
        plt.plot(df_plot.index, df_plot['Raw_SSN'], '.', color='gray', alpha=0.5, label='Raw SSN (Observed)')

    # One row per model stage:
    # (label prefix, column, colour, peaks, valleys, line kwargs, marker kwargs).
    # P and PM use translucent lines with hollow markers; PMA uses a thicker
    # line with filled markers.
    hollow = {'mfc': 'none'}
    layers = (
        ('P', p_col, 'b', p_peaks, p_valleys, {'alpha': 0.7}, hollow),
        ('PM', pm_col, 'g', pm_peaks, pm_valleys, {'alpha': 0.7}, hollow),
        ('PMA', pma_col, 'r', pma_peaks, pma_valleys, {'linewidth': 2}, {}),
    )
    for tag, column, colour, peaks, valleys, line_kw, marker_kw in layers:
        plt.plot(df_plot.index, df_plot[column], f'{colour}-',
                 label=f'{tag} Model ({column})', **line_kw)
        plt.plot(peaks.index, peaks['SSN'], f'{colour}o', markersize=12,
                 label=f'{tag} Peak', mew=2, **marker_kw)
        plt.plot(valleys.index, valleys['SSN'], f'{colour}v', markersize=12,
                 label=f'{tag} Valley', mew=2, **marker_kw)

    plt.title(f'Diagnostic Plot: {model_name} - {period_name}', fontsize=18)
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('SSN', fontsize=12)

    # X axis: one major tick every five years, labelled with the year only.
    axis = plt.gca()
    axis.xaxis.set_major_locator(mdates.YearLocator(5))
    axis.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    plt.xticks(rotation=45)

    # Legend is anchored just outside the right edge of the axes.
    plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()

    # Save to the configured output directory.
    save_path = os.path.join(PLOT_OUTPUT_DIR, f'Diagnostic_{model_name}_{period_name}.png')
    plt.savefig(save_path)
    print(f"--- [Plot] Saved: {save_path} ---")
    plt.close()

def calculate_deviation_vs_known_extrema(model_extrema: pd.DataFrame,
                                         known_extrema: pd.DataFrame,
                                         tolerance_days: int = 365 * 5):
    """
    Mean absolute timing error, in days, between model and reference extrema.

    Every model extremum is paired with the nearest reference extremum (by
    date, in either direction) that lies within ``tolerance_days``. Model
    extrema without such a partner are ignored.

    Parameters
    ----------
    model_extrema : pd.DataFrame
        Extrema detected on a model curve; only its index (dates) is used.
    known_extrema : pd.DataFrame
        Reference extrema; only its index (dates) is used.
    tolerance_days : int
        Largest date gap, in days, still accepted as a match.

    Returns
    -------
    float
        Mean of the absolute deviations in days, or NaN when either input is
        empty or no model extremum has a reference partner within tolerance.
    """
    if model_extrema.empty or known_extrema.empty:
        return np.nan

    # Work from the indexes directly, so the result does not depend on the
    # index name ('date', unnamed, or anything else) and no other columns
    # take part in the merge.
    left_df = pd.DataFrame({'model_date': pd.to_datetime(model_extrema.index)})
    right_df = pd.DataFrame({'known_date': pd.to_datetime(known_extrema.index)})

    # merge_asof requires sorted, non-null keys.
    left_df = left_df.dropna().sort_values('model_date')
    right_df = right_df.dropna().sort_values('known_date')
    if left_df.empty or right_df.empty:
        return np.nan

    merged_df = pd.merge_asof(
        left_df,
        right_df,
        left_on='model_date',
        right_on='known_date',
        direction='nearest',
        tolerance=pd.Timedelta(days=tolerance_days)
    )
    # Keep only rows that found a partner. Dropping on the date column alone
    # means a missing SSN amplitude can never discard a valid date match.
    merged_df = merged_df.dropna(subset=['known_date'])

    if merged_df.empty:
        return np.nan

    phase_deviation_days = (merged_df['model_date'] - merged_df['known_date']).dt.days
    return phase_deviation_days.abs().mean()

def calculate_regression_metrics(y_true, y_pred):
    """
    Return ``(R2, RMSE)`` for the pairs where both inputs are non-missing.

    The inputs are combined into a two-column frame and rows containing a NaN
    are discarded. ``(nan, nan)`` is returned when fewer than two complete
    pairs remain or when the metric computation raises.
    """
    undefined = (np.nan, np.nan)

    paired = pd.DataFrame({'true': y_true, 'pred': y_pred}).dropna()
    if len(paired) < 2:
        return undefined

    actual = paired['true']
    predicted = paired['pred']
    try:
        score = r2_score(actual, predicted)
        error = np.sqrt(mean_squared_error(actual, predicted))
    except Exception:
        # Best effort: any failure in the metric functions yields NaNs.
        return undefined
    return score, error


# --- 3. Main Loop and Metric Calculation ---
# For every model label in model_splits this loop fills results[model_name]
# with: R2/RMSE per time slice, mean extrema timing deviations per slice,
# the detected extrema themselves ('History_Extrema', 'Future_Extrema'),
# and it writes diagnostic plots for the O3 and F4 ranges.

results = {}

# Prominence configuration: P and PM curves use the base thresholds, the
# final PMA curve uses a higher peak prominence.
prominence_config_base = dict(prominence_peaks=10, prominence_valleys=5)
prominence_config_final = dict(prominence_peaks=40, prominence_valleys=5)

for model_name, split_date in model_splits.items():
    if df.empty:
        print(f"Skipping {model_name}: Data is empty.")
        continue

    print(f"--- Processing Model: {model_name} (Split: {split_date}) ---")
    results[model_name] = {}
    model_metrics = results[model_name]

    # --- A. Define Column Names (Using English headers from file) ---
    target_ssn_orig = 'Raw_SSN'
    target_ssn_smooth = 'Smoothed_SSN'

    # Residual targets and per-stage prediction columns for this model label
    target_p_resid = f'Residual_{model_name}'
    target_m_resid = f'LGBM_Resid_{model_name}'

    pred_p = f'Fit_SSN_{model_name}'
    pred_m = f'LGBM_Pred_{model_name}'
    pred_a = f'LSTM_Mean_Pred_{model_name}'
    pred_pma = f'Total_Pred_{model_name}'

    pred_pm_series_name = f'PM_Pred_{model_name}'
    pred_pm = pred_pm_series_name

    # The PM series is not read from the file; it is derived as P + M, or
    # filled with NaN when either ingredient column is absent.
    if pred_p not in df.columns or pred_m not in df.columns:
        print(f"Warning: Columns for P or M model missing for {model_name}")
        df[pred_pm] = np.nan
    else:
        df[pred_pm] = df[pred_p] + df[pred_m]

    # --- B. Define Time Slices ---
    # Slices are taken after the PM column was added so they include it.
    try:
        df_t = df.loc[TRAIN_START_DATE:split_date]
        cv_start_date = (pd.to_datetime(split_date) + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
        df_cv = df.loc[cv_start_date:TEST_END_DATE]
        df_ootf = df.loc[OOTF_START_DATE:FUTURE_KNOWN_END_DATE]
        df_in_sample = df.loc[TRAIN_START_DATE:TEST_END_DATE]  # T1 + V2
        df_future = df.loc[FUTURE_START_DATE:FUTURE_PRED_END_DATE]
    except KeyError as e:
        print(f"Date slicing error for {model_name}: {e}")
        continue

    time_ranges = {
        'T1': df_t,    # Train
        'V2': df_cv,   # Validation
        'O3': df_ootf  # Out-of-Time
    }

    # --- C. Calculate R2 and RMSE (T1, V2, O3) ---
    # (result-key prefix, target column, prediction column); the order here
    # fixes the row order of the summary table printed later.
    metric_pairs = (
        ('P_vs_Raw', target_ssn_orig, pred_p),
        ('P_vs_Smooth', target_ssn_smooth, pred_p),
        ('M_vs_P_Resid', target_p_resid, pred_m),
        ('PM_vs_Raw', target_ssn_orig, pred_pm),
        ('A_vs_M_Resid', target_m_resid, pred_a),
        ('PMA_vs_Raw', target_ssn_orig, pred_pma),
    )
    for split_name, df_split in time_ranges.items():
        if df_split.empty:
            continue
        for prefix, truth_col, guess_col in metric_pairs:
            r2, rmse = calculate_regression_metrics(df_split[truth_col], df_split[guess_col])
            model_metrics[f'{prefix}_{split_name}_R2'] = r2
            model_metrics[f'{prefix}_{split_name}_RMSE'] = rmse

    # --- D. Peak/Valley Analysis (T1, V2, O3) ---
    # (stage label, prediction column, prominence settings)
    stage_specs = (
        ('P', pred_p, prominence_config_base),
        ('PM', pred_pm, prominence_config_base),
        ('PMA', pred_pma, prominence_config_final),
    )

    model_metrics['History_Extrema'] = {}
    history_extrema = model_metrics['History_Extrema']

    for range_name, df_range in time_ranges.items():
        if df_range.empty:
            continue

        found = {}
        for stage, column, prominence in stage_specs:
            stage_peaks, stage_valleys = find_peaks_valleys(df_range[column], **prominence)
            model_metrics[f'{stage}_{range_name}_Peak_Dev'] = calculate_deviation_vs_known_extrema(stage_peaks, known_peaks)
            model_metrics[f'{stage}_{range_name}_Valley_Dev'] = calculate_deviation_vs_known_extrema(stage_valleys, known_valleys)
            history_extrema[f'{stage}_{range_name}_Peaks'] = stage_peaks
            history_extrema[f'{stage}_{range_name}_Valleys'] = stage_valleys
            found[stage] = (stage_peaks, stage_valleys)

        # Diagnostic Plot for O3
        if range_name == 'O3':
            plot_diagnostic(model_name, df_range, pred_p, pred_pm, pred_pma,
                            *found['P'], *found['PM'], *found['PMA'],
                            'O3_Out_of_Sample')

    # --- E. Future Predictions (F4) ---
    model_metrics['Future_Extrema'] = {}
    future_extrema = model_metrics['Future_Extrema']

    if df_future.empty:
        print(f"Model {model_name}: No data in F4 range ({FUTURE_START_DATE} to {FUTURE_PRED_END_DATE}).")
    else:
        found_future = {}
        for stage, column, prominence in stage_specs:
            stage_peaks_f, stage_valleys_f = find_peaks_valleys(df_future[column], **prominence)
            future_extrema[f'{stage}_Peaks'] = stage_peaks_f
            future_extrema[f'{stage}_Valleys'] = stage_valleys_f
            found_future[stage] = (stage_peaks_f, stage_valleys_f)

        # Diagnostic Plot for F4
        plot_diagnostic(model_name, df_future, pred_p, pred_pm, pred_pma,
                        *found_future['P'], *found_future['PM'], *found_future['PMA'],
                        'F4_Future')


# --- 4. Output Results ---
# Prints two reports from `results`: a metrics table (one column per model)
# and the detected extrema per model and time range.

# Show every row/column and format floats with four decimals for the metrics.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 1000),
    ('display.float_format', '{:.4f}'.format),
):
    pd.set_option(option_name, option_value)

banner_rule = "=" * 80

# === Output 1: Performance Metrics Summary ===

# One flat row per model; nested dicts (the extrema containers) are left out.
flat_results = [
    {'Model': model_name,
     **{key: value for key, value in metrics.items() if not isinstance(value, dict)}}
    for model_name, metrics in results.items()
]

if not flat_results:
    print("No summary metrics generated.")
else:
    # Transposed so metrics are rows and models are columns.
    df_results = pd.DataFrame(flat_results).set_index('Model').T

    print("\n" + banner_rule)
    print(" " * 25 + "Model Performance Summary (Metrics)")
    print(banner_rule)
    # to_string() rather than to_markdown() to avoid the tabulate dependency
    print(df_results.to_string())

# === Output 2: Extrema Details (T1, V2, O3, F4) ===

# Two decimals are enough for the extrema listings.
pd.set_option('display.float_format', '{:.2f}'.format)

print("\n" + banner_rule)
print(" " * 26 + "Extrema Details (T1, V2, O3, F4)")
print(banner_rule)

# (range tag, section heading, tables to print as (caption, stage, kind)).
# A section is printed in full only when its PMA peaks entry exists.
pma_both = (
    ('PMA (Total)', 'PMA', 'Peaks'),
    ('PMA (Total)', 'PMA', 'Valleys'),
)
history_sections = (
    ('T1', " T1 (Train Set) ", pma_both + (
        ('PM (P+M)', 'PM', 'Peaks'),
        ('P (Planetary)', 'P', 'Peaks'),
    )),
    ('V2', " V2 (Validation Set) ", pma_both),
    ('O3', " O3 (Out-of-Sample) ", pma_both),
)
future_tables = pma_both + (
    ('PM (P+M)', 'PM', 'Peaks'),
    ('P (Planetary)', 'P', 'Peaks'),
)

for model_name, data in results.items():
    print(f"\n--- Model: {model_name} ---")

    # --- Historical Extrema (T1, V2, O3) ---
    history = data.get('History_Extrema')
    if not history:
        print("\n(No History Extrema Data)")
    else:
        for tag, heading, tables in history_sections:
            print("\n" + "-"*30 + heading + "-"*30)
            if f'PMA_{tag}_Peaks' not in history:
                print(f"({tag} data empty or not processed)")
                continue
            for caption, stage, kind in tables:
                print(f"\n[{caption} {tag} {kind}]:")
                print(history[f'{stage}_{tag}_{kind}'].to_string())

    # --- Future Extrema (F4) ---
    future = data.get('Future_Extrema')
    if not future:
        print("\n(No Future Extrema Data)")
    else:
        print("\n" + "-"*30 + " F4 (Future Prediction) " + "-"*30)
        for caption, stage, kind in future_tables:
            print(f"\n[{caption} Future {kind}]:")
            print(future[f'{stage}_{kind}'].to_string())

print("\n" + banner_rule)
print("Execution Completed.")

  known_valleys = df_solar_cycle.dropna(subset=['start_Min']).set_index(pd.to_datetime(df_solar_cycle['start_Min']))[['Min_SSN']].rename(columns={'Min_SSN': 'SSN'})


--- Processing Model: M8+3 (Split: 1986-09-01) ---
--- [Plot] Saved: ../../results/05_p_m_a_model/final_plots\Diagnostic_M8+3_O3_Out_of_Sample.png ---
--- [Plot] Saved: ../../results/05_p_m_a_model/final_plots\Diagnostic_M8+3_F4_Future.png ---
--- Processing Model: M0+3 (Split: 1986-09-01) ---
--- [Plot] Saved: ../../results/05_p_m_a_model/final_plots\Diagnostic_M0+3_O3_Out_of_Sample.png ---
--- [Plot] Saved: ../../results/05_p_m_a_model/final_plots\Diagnostic_M0+3_F4_Future.png ---
--- Processing Model: M8+2 (Split: 1996-08-01) ---
--- [Plot] Saved: ../../results/05_p_m_a_model/final_plots\Diagnostic_M8+2_O3_Out_of_Sample.png ---
--- [Plot] Saved: ../../results/05_p_m_a_model/final_plots\Diagnostic_M8+2_F4_Future.png ---
--- Processing Model: M0+2 (Split: 1996-08-01) ---
--- [Plot] Saved: ../../results/05_p_m_a_model/final_plots\Diagnostic_M0+2_O3_Out_of_Sample.png ---
--- [Plot] Saved: ../../results/05_p_m_a_model/final_plots\Diagnostic_M0+2_F4_Future.png ---

                       

# Verification 2: Final Residuals

In [4]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import os

# --- 1. Constants & Configuration ---
# This cell checks the residuals (observed minus total prediction) of each
# model with three diagnostics: mean/std, a first-half vs second-half
# variance comparison, and a Ljung-Box autocorrelation test.
MODEL_LABELS = ['M8+2', 'M8+3', 'M0+3', 'M0+2']
# Using the path from the previous conversation
DATA_FILE = "../../results/05_p_m_a_model/a_model_4/LSTM_Full_Results_SEQLEN400_20251229_221056.csv"
ORIGINAL_COL = "Raw_SSN"  # Updated to English column name
LAG_NUM = 50  # Lag order for Ljung-Box test

# --- 2. File Validation ---
if not os.path.exists(DATA_FILE):
    print(f"Error: File '{DATA_FILE}' not found.")
    print("Please check the path.")
else:
    print(f"File found: {DATA_FILE}\n")
    
    # --- 3. Load Data ---
    try:
        df = pd.read_csv(DATA_FILE)
        print("CSV loaded successfully.")

        # --- 4. Column Validation ---
        required_cols = [ORIGINAL_COL] + [f"Total_Pred_{label}" for label in MODEL_LABELS]
        missing_cols = [col for col in required_cols if col not in df.columns]

        if missing_cols:
            print(f"\nError: Missing required columns in DataFrame: {missing_cols}")
            print(f"Available columns: {df.columns.tolist()}")
        else:
            print("All required columns found.\n")
            
            results_summary = {}

            # The observed series is the same for every model, so convert it once.
            original_data = pd.to_numeric(df[ORIGINAL_COL], errors='coerce')

            # --- 5. Analysis Loop ---
            print("--- Starting Residual Analysis ---")
            
            for label in MODEL_LABELS:
                print("\n" + "="*45)
                print(f"  Analyzing Model: {label}")
                print("="*45)
                
                pred_col = f"Total_Pred_{label}"
                
                try:
                    # --- 6. Calculate Residuals ---
                    predicted_data = pd.to_numeric(df[pred_col], errors='coerce')
                    
                    # Residuals = Actual - Predicted; rows where either side
                    # is missing are dropped.
                    residuals = (original_data - predicted_data).dropna()
                    
                    if residuals.empty:
                        print("  Error: Residuals are empty (possibly all-NaN). Skipping.")
                        continue

                    # --- 7. Statistical Tests ---
                    
                    # Test 1: Mean and Standard Deviation
                    mean = residuals.mean()
                    std_dev = residuals.std()
                    print("  [Test 1] Basic Statistics:")
                    print(f"    Mean: {mean:.4f}")
                    print(f"    Std Dev: {std_dev:.4f}")

                    # Test 2: Variance Constancy (Simple split test)
                    half_point = len(residuals) // 2
                    var1 = residuals.iloc[:half_point].var()
                    var2 = residuals.iloc[half_point:].var()
                    print("\n  [Test 2] Variance Stability (Split Test):")
                    print(f"    Variance (First Half): {var1:.4f}")
                    print(f"    Variance (Second Half): {var2:.4f}")
                    
                    var_ratio = (var2 / var1) if var1 != 0 else np.nan
                    if pd.isna(var_ratio):
                        # A NaN ratio compares False against any threshold, so
                        # without this branch it would be reported as "stable".
                        print("    -> Observation: Variance ratio is undefined (zero or NaN variance); stability cannot be assessed.")
                    elif abs(1 - var_ratio) > 0.5:
                        print("    -> Observation: Variance might be unstable (>50% change).")
                    else:
                        print("    -> Observation: Variance appears relatively stable.")

                    # Test 3: Ljung-Box Test (White Noise Check)
                    # H0: The data is independently distributed (White Noise)
                    # H1: The data is not independently distributed (Serial Correlation exists)
                    print("\n  [Test 3] Ljung-Box Test (Autocorrelation):")
                    
                    try:
                        # Run test up to LAG_NUM
                        ljung_box_result_df = sm.stats.acorr_ljungbox(residuals, lags=LAG_NUM, return_df=True)
                        
                        # Get p-value for the specific lag (LAG_NUM): the last row
                        p_value = ljung_box_result_df.iloc[-1]['lb_pvalue']
                    except Exception as e:
                        # Keep the basic statistics even when the test itself fails.
                        print(f"    Ljung-Box Test Failed: {e}")
                        results_summary[label] = {
                            "Mean": mean, "Std_Dev": std_dev, "Var_Ratio": var_ratio,
                            "LB_P_Value": np.nan, "Is_White_Noise": "Error"
                        }
                    else:
                        print(f"    Ljung-Box Test (at lag={LAG_NUM}):")
                        print(f"    P-value: {p_value:.6f}")
                        
                        if p_value < 0.05:
                            conclusion = "p < 0.05: Reject H0. Residuals are NOT white noise (Correlation exists)."
                            is_white_noise = False
                        else:
                            conclusion = "p >= 0.05: Fail to reject H0. Residuals ARE white noise."
                            is_white_noise = True
                        
                        print(f"    -> Conclusion: {conclusion}")
                        
                        # Store results
                        results_summary[label] = {
                            "Mean": mean,
                            "Std_Dev": std_dev,
                            "Var_Ratio": var_ratio,
                            "LB_P_Value": p_value,
                            "Is_White_Noise": is_white_noise
                        }

                except Exception as e:
                    print(f"  --- Unexpected error processing {label}: {e} ---")

            # --- 8. Final Summary Table ---
            print("\n\n" + "="*80)
            print(f"{'RESIDUAL ANALYSIS SUMMARY':^80}")
            print("="*80)
            print("Note: Ljung-Box H0 = White Noise (No Autocorrelation)")
            print("-" * 80)
            
            # Create a DataFrame for the summary to print nicely
            if results_summary:
                summary_data = []
                for label, metrics in results_summary.items():
                    
                    # Is_White_Noise is True, False, or the string "Error";
                    # identity checks keep the three cases apart.
                    wn_status = metrics.get('Is_White_Noise')
                    if wn_status is True:
                        wn_str = "YES (p>=0.05)"
                    elif wn_status is False:
                        wn_str = "NO (p<0.05)"
                    else:
                        wn_str = "Error"

                    summary_data.append({
                        'Model': label,
                        'Mean': metrics['Mean'],
                        'Std Dev': metrics['Std_Dev'],
                        'Var Ratio': metrics['Var_Ratio'],
                        'LB P-Value': metrics['LB_P_Value'],
                        'White Noise?': wn_str
                    })
                
                df_summary = pd.DataFrame(summary_data)
                
                # Format options for clean printing
                pd.set_option('display.max_columns', None)
                pd.set_option('display.width', 1000)
                pd.set_option('display.float_format', lambda x: '%.4f' % x)
                
                print(df_summary.to_string(index=False))
            else:
                print("No results to display.")

            print("\nAnalysis Completed.")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

File found: ../../results/05_p_m_a_model/a_model_4/LSTM_Full_Results_SEQLEN400_20251229_221056.csv

CSV loaded successfully.
All required columns found.

--- Starting Residual Analysis ---

  Analyzing Model: M8+2
  [Test 1] Basic Statistics:
    Mean: 1.1282
    Std Dev: 20.8793

  [Test 2] Variance Stability (Split Test):
    Variance (First Half): 591.1284
    Variance (Second Half): 280.7167
    -> Observation: Variance might be unstable (>50% change).

  [Test 3] Ljung-Box Test (Autocorrelation):
    Ljung-Box Test (at lag=50):
    P-value: 0.000000
    -> Conclusion: p < 0.05: Reject H0. Residuals are NOT white noise (Correlation exists).

  Analyzing Model: M8+3
  [Test 1] Basic Statistics:
    Mean: 0.9539
    Std Dev: 21.1970

  [Test 2] Variance Stability (Split Test):
    Variance (First Half): 614.9560
    Variance (Second Half): 283.6844
    -> Observation: Variance might be unstable (>50% change).

  [Test 3] Ljung-Box Test (Autocorrelation):
    Ljung-Box Test (at lag=50