In [None]:
from src.data.dataLoader import StructuralBreakDataLoader
# Structural Break Detection: Basic vs Synthetic Data Generation Approach
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from scipy import stats
from scipy.stats import ks_2samp, mannwhitneyu, levene, anderson_ksamp
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings('ignore')

# Import your existing data loader
import sys
sys.path.append('src')  # Adjust path as needed

print("Loading data...")
data_loader = StructuralBreakDataLoader()
data_loader.load_data(use_crunch=False)
train_data = data_loader.get_all_train_series()

print(f"Loaded {len(train_data)} time series")

# =============================================================================
# APPROACH 1: Basic Statistical Tests Between Pre/Post Break
# =============================================================================

def extract_basic_features(ts_data):
    """Extract basic statistical test features between pre and post break periods"""
    pre_break = ts_data.period_0_values
    post_break = ts_data.period_1_values
    
    if len(pre_break) < 5 or len(post_break) < 5:
        # Return NaN features for very short series
        return {
            'ks_statistic': np.nan, 'ks_pvalue': np.nan,
            'mw_statistic': np.nan, 'mw_pvalue': np.nan,
            'levene_statistic': np.nan, 'levene_pvalue': np.nan,
            'anderson_statistic': np.nan, 'anderson_pvalue': np.nan
        }
    
    features = {}
    
    # 1. Kolmogorov-Smirnov test (distribution difference)
    try:
        ks_stat, ks_pval = ks_2samp(pre_break, post_break)
        features['ks_statistic'] = ks_stat
        features['ks_pvalue'] = ks_pval
    except:
        features['ks_statistic'] = np.nan
        features['ks_pvalue'] = np.nan
    
    # 2. Mann-Whitney U test (location difference)
    try:
        mw_stat, mw_pval = mannwhitneyu(pre_break, post_break, alternative='two-sided')
        features['mw_statistic'] = mw_stat
        features['mw_pvalue'] = mw_pval
    except:
        features['mw_statistic'] = np.nan
        features['mw_pvalue'] = np.nan
    
    # 3. Levene's test (variance difference)
    try:
        levene_stat, levene_pval = levene(pre_break, post_break)
        features['levene_statistic'] = levene_stat
        features['levene_pvalue'] = levene_pval
    except:
        features['levene_statistic'] = np.nan
        features['levene_pvalue'] = np.nan
    
    # 4. Anderson-Darling test (distribution difference, sensitive to tails)
    try:
        anderson_result = anderson_ksamp([pre_break, post_break])
        features['anderson_statistic'] = anderson_result.statistic
        features['anderson_pvalue'] = anderson_result.pvalue
    except:
        features['anderson_statistic'] = np.nan
        features['anderson_pvalue'] = np.nan
    
    return features

print("Extracting basic features...")
basic_features = []
labels = []
series_ids = []

for series_id, ts_data in train_data.items():
    features = extract_basic_features(ts_data)
    basic_features.append(features)
    labels.append(ts_data.has_break)
    series_ids.append(series_id)

basic_df = pd.DataFrame(basic_features, index=series_ids)
y = pd.Series(labels, index=series_ids)

# Remove rows with all NaN features
basic_df_clean = basic_df.dropna()
y_clean = y.loc[basic_df_clean.index]

print(f"Basic features shape: {basic_df_clean.shape}")
print(f"Features: {list(basic_df_clean.columns)}")

# =============================================================================
# APPROACH 2: Synthetic Data Generation Method
# =============================================================================

def fit_ar_model(series, max_lags=10):
    """Fit AR model to time series, selecting optimal lag via AIC"""
    if len(series) < max_lags + 5:
        max_lags = max(1, len(series) - 5)
    
    best_aic = np.inf
    best_model = None
    best_lag = 1
    
    for lag in range(1, max_lags + 1):
        try:
            model = AutoReg(series, lags=lag, trend='c')
            fitted = model.fit()
            if fitted.aic < best_aic:
                best_aic = fitted.aic
                best_model = fitted
                best_lag = lag
        except:
            continue
    
    return best_model, best_lag

def generate_synthetic_continuations(pre_break_data, continuation_length, n_simulations=100):
    """Generate synthetic continuations using fitted AR model"""
    
    # Fit AR model
    model, lag = fit_ar_model(pre_break_data)
    
    synthetic_continuations = []
    for _ in range(n_simulations):
        # Generate forecast
        forecast = model.forecast(steps=continuation_length)
        
        # Add noise based on residual variance
        noise = np.random.normal(0, np.sqrt(model.sigma2), continuation_length)
        synthetic_series = forecast + noise
        
        synthetic_continuations.append(synthetic_series)
    
    return synthetic_continuations

def extract_synthetic_features(ts_data, n_simulations=50):
    """Extract features using synthetic data generation approach"""
    pre_break = ts_data.period_0_values
    post_break = ts_data.period_1_values
    
    if len(pre_break) < 5 or len(post_break) < 5:
        return {f'synth_{feat}': np.nan for feat in [
            'ks_percentile', 'ks_zscore', 'mw_percentile', 'mw_zscore',
            'levene_percentile', 'levene_zscore', 'anderson_percentile', 'anderson_zscore'
        ]}
    
    # Generate synthetic continuations
    synthetic_continuations = generate_synthetic_continuations(
        pre_break, len(post_break), n_simulations
    )
    
    features = {}
    
    # Define test functions
    test_functions = {
        'ks': lambda x, y: ks_2samp(x, y)[0],  # statistic only
        'mw': lambda x, y: mannwhitneyu(x, y, alternative='two-sided')[0],
        'levene': lambda x, y: levene(x, y)[0],
        'anderson': lambda x, y: anderson_ksamp([x, y]).statistic
    }
    
    for test_name, test_func in test_functions.items():
        try:
            # Build null distribution of test statistics
            null_stats = []
            for synthetic in synthetic_continuations:
                try:
                    stat = test_func(pre_break, synthetic)
                    null_stats.append(stat)
                except:
                    continue
            
            if len(null_stats) < 10:  # Need enough samples for meaningful distribution
                features[f'synth_{test_name}_percentile'] = np.nan
                features[f'synth_{test_name}_zscore'] = np.nan
                continue
            
            null_stats = np.array(null_stats)
            
            # Compute actual test statistic
            actual_stat = test_func(pre_break, post_break)
            
            # Compute percentile (what fraction of null stats are <= actual)
            percentile = np.mean(null_stats <= actual_stat)
            features[f'synth_{test_name}_percentile'] = percentile
            
            # Compute z-score
            if np.std(null_stats) > 0:
                zscore = (actual_stat - np.mean(null_stats)) / np.std(null_stats)
                features[f'synth_{test_name}_zscore'] = zscore
            else:
                features[f'synth_{test_name}_zscore'] = 0
                
        except Exception as e:
            features[f'synth_{test_name}_percentile'] = np.nan
            features[f'synth_{test_name}_zscore'] = np.nan
    
    return features

print("Extracting synthetic features (this may take a few minutes)...")
synthetic_features = []

# Process a subset for faster testing - remove this limit for full dataset
subset_size = min(1000, len(train_data))  
train_data_subset = dict(list(train_data.items())[:subset_size])

for i, (series_id, ts_data) in enumerate(train_data_subset.items()):
    if i % 5 == 0:
        print(f"Processing series {i+1}/{len(train_data_subset)}")
    
    features = extract_synthetic_features(ts_data, n_simulations=30)  # Reduced for speed
    synthetic_features.append(features)

synthetic_df = pd.DataFrame(synthetic_features, index=list(train_data_subset.keys()))
y_subset = pd.Series([ts.has_break for ts in train_data_subset.values()], 
                     index=list(train_data_subset.keys()))

# Clean synthetic features
synthetic_df_clean = synthetic_df.dropna()
y_synthetic_clean = y_subset.loc[synthetic_df_clean.index]

print(f"Synthetic features shape: {synthetic_df_clean.shape}")
print(f"Features: {list(synthetic_df_clean.columns)}")

# =============================================================================
# MODEL EVALUATION WITH CROSS-VALIDATION
# =============================================================================

def evaluate_features(X, y, feature_type_name):
    """Evaluate features using cross-validation"""
    if len(X) == 0:
        print(f"No valid data for {feature_type_name}")
        return None
    
    model = XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
        eval_metric='logloss'
    )
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    cv_results = cross_validate(
        model, X, y,
        cv=cv,
        scoring='roc_auc',
        return_train_score=True,
        n_jobs=-1
    )
    
    val_scores = cv_results['test_score']
    train_scores = cv_results['train_score']
    
    results = {
        'feature_type': feature_type_name,
        'n_samples': len(X),
        'n_features': X.shape[1],
        'val_auc_mean': np.mean(val_scores),
        'val_auc_std': np.std(val_scores),
        'train_auc_mean': np.mean(train_scores),
        'train_auc_std': np.std(train_scores),
        'overfitting_gap': np.mean(train_scores) - np.mean(val_scores)
    }
    
    return results

In [4]:
# Evaluate both approaches
print("\n" + "="*60)
print("EVALUATING BOTH APPROACHES")
print("="*60)

# Basic features evaluation
print("\n1. Basic Statistical Tests Approach:")
basic_results = evaluate_features(basic_df_clean, y_clean, "Basic Tests")
if basic_results:
    print(f"   Samples: {basic_results['n_samples']}")
    print(f"   Features: {basic_results['n_features']}")
    print(f"   CV ROC AUC: {basic_results['val_auc_mean']:.4f} ± {basic_results['val_auc_std']:.4f}")
    print(f"   Train ROC AUC: {basic_results['train_auc_mean']:.4f} ± {basic_results['train_auc_std']:.4f}")
    print(f"   Overfitting Gap: {basic_results['overfitting_gap']:.4f}")

# Synthetic features evaluation
print("\n2. Synthetic Data Generation Approach:")
synthetic_results = evaluate_features(synthetic_df_clean, y_synthetic_clean, "Synthetic Tests")
if synthetic_results:
    print(f"   Samples: {synthetic_results['n_samples']}")
    print(f"   Features: {synthetic_results['n_features']}")
    print(f"   CV ROC AUC: {synthetic_results['val_auc_mean']:.4f} ± {synthetic_results['val_auc_std']:.4f}")
    print(f"   Train ROC AUC: {synthetic_results['train_auc_mean']:.4f} ± {synthetic_results['train_auc_std']:.4f}")
    print(f"   Overfitting Gap: {synthetic_results['overfitting_gap']:.4f}")

# Combined approach (if we have overlapping samples)
common_indices = basic_df_clean.index.intersection(synthetic_df_clean.index)
if len(common_indices) > 100:
    print("\n3. Combined Approach (Basic + Synthetic):")
    basic_subset = basic_df_clean.loc[common_indices]
    synthetic_subset = synthetic_df_clean.loc[common_indices]
    combined_features = pd.concat([basic_subset, synthetic_subset], axis=1)
    y_combined = y_clean.loc[common_indices]
    
    combined_results = evaluate_features(combined_features, y_combined, "Combined")
    if combined_results:
        print(f"   Samples: {combined_results['n_samples']}")
        print(f"   Features: {combined_results['n_features']}")
        print(f"   CV ROC AUC: {combined_results['val_auc_mean']:.4f} ± {combined_results['val_auc_std']:.4f}")
        print(f"   Train ROC AUC: {combined_results['train_auc_mean']:.4f} ± {combined_results['train_auc_std']:.4f}")
        print(f"   Overfitting Gap: {combined_results['overfitting_gap']:.4f}")

# =============================================================================
# FEATURE IMPORTANCE ANALYSIS
# =============================================================================

def get_feature_importance(X, y, top_n=10):
    """Get feature importance from trained model"""
    model = XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
        eval_metric='logloss'
    )
    
    model.fit(X, y)
    
    importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    return importance_df.head(top_n)

print(f"\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*60)

if synthetic_results:
    print(f"\nTop 10 Most Important Synthetic Features:")
    synth_importance = get_feature_importance(synthetic_df_clean, y_synthetic_clean, 10)
    for i, (_, row) in enumerate(synth_importance.iterrows(), 1):
        print(f"{i:2d}. {row['feature']:<30} {row['importance']:.6f}")

if basic_results:
    print(f"\nTop 10 Most Important Basic Features:")
    basic_importance = get_feature_importance(basic_df_clean, y_clean, 10)
    for i, (_, row) in enumerate(basic_importance.iterrows(), 1):
        print(f"{i:2d}. {row['feature']:<30} {row['importance']:.6f}")

print(f"\n" + "="*60)
print("SUMMARY")
print("="*60)

if basic_results and synthetic_results:
    improvement = synthetic_results['val_auc_mean'] - basic_results['val_auc_mean']
    print(f"Synthetic approach improves AUC by: {improvement:.4f}")
    if improvement > 0:
        print("✅ Synthetic data generation method shows improvement!")
    else:
        print("❌ Basic approach performs better (or try tuning synthetic parameters)")
else:
    print("Could not compare both approaches - check data and feature extraction")

print(f"\nKey insights:")
print(f"- Basic tests directly compare pre/post break distributions")
print(f"- Synthetic method creates null hypothesis of 'no break' scenario")
print(f"- Percentile features show how extreme actual test stats are vs null")
print(f"- Z-score features normalize for different null distribution scales")


EVALUATING BOTH APPROACHES

1. Basic Statistical Tests Approach:
   Samples: 10001
   Features: 8
   CV ROC AUC: 0.6062 ± 0.0206
   Train ROC AUC: 0.8040 ± 0.0047
   Overfitting Gap: 0.1978

2. Synthetic Data Generation Approach:
   Samples: 1000
   Features: 8
   CV ROC AUC: 0.5887 ± 0.0200
   Train ROC AUC: 0.9827 ± 0.0011
   Overfitting Gap: 0.3940

3. Combined Approach (Basic + Synthetic):
   Samples: 1000
   Features: 16
   CV ROC AUC: 0.5722 ± 0.0169
   Train ROC AUC: 0.9991 ± 0.0005
   Overfitting Gap: 0.4269

FEATURE IMPORTANCE ANALYSIS

Top 10 Most Important Synthetic Features:
 1. synth_levene_zscore            0.161303
 2. synth_ks_zscore                0.156033
 3. synth_anderson_zscore          0.145570
 4. synth_ks_percentile            0.122698
 5. synth_mw_zscore                0.111069
 6. synth_levene_percentile        0.106068
 7. synth_anderson_percentile      0.105747
 8. synth_mw_percentile            0.091512

Top 10 Most Important Basic Features:
 1. anderson_p

In [2]:
# Structural Break Detection: GARCH-Based Synthetic Data Generation
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from scipy import stats
from scipy.stats import ks_2samp, mannwhitneyu, levene, anderson_ksamp
from arch import arch_model
import warnings
warnings.filterwarnings('ignore')

from src.data.dataLoader import StructuralBreakDataLoader

print("Loading data...")
data_loader = StructuralBreakDataLoader()
data_loader.load_data(use_crunch=False)
train_data = data_loader.get_all_train_series()

# Use same subset size for fair comparison
SUBSET_SIZE = 1000
np.random.seed(42)
train_data_subset = dict(list(train_data.items())[:SUBSET_SIZE])

print(f"Using {SUBSET_SIZE} time series for comparison")

# =============================================================================
# APPROACH 1: Basic Statistical Tests
# =============================================================================

def extract_basic_features(ts_data):
    """Extract basic statistical test features between pre and post break periods"""
    pre_break = ts_data.period_0_values
    post_break = ts_data.period_1_values
    
    features = {}
    
    # Statistical tests
    test_configs = [
        ('ks', lambda: ks_2samp(pre_break, post_break)),
        ('mw', lambda: mannwhitneyu(pre_break, post_break, alternative='two-sided')),
        ('levene', lambda: levene(pre_break, post_break)),
        ('anderson', lambda: anderson_ksamp([pre_break, post_break]))
    ]
    
    for test_name, test_func in test_configs:
        try:
            if test_name == 'anderson':
                result = test_func()
                features[f'basic_{test_name}_statistic'] = result.statistic
                features[f'basic_{test_name}_pvalue'] = result.pvalue
            else:
                stat, pval = test_func()
                features[f'basic_{test_name}_statistic'] = stat
                features[f'basic_{test_name}_pvalue'] = pval
        except:
            features[f'basic_{test_name}_statistic'] = np.nan
            features[f'basic_{test_name}_pvalue'] = np.nan
    
    return features

# =============================================================================
# APPROACH 2: GARCH-Based Synthetic Data Generation
# =============================================================================

class GARCHGenerator:
    """GARCH-based time series generator"""
    
    def __init__(self):
        self.fitted_model = None
        self.mean_level = 0.0
        self.is_fitted = False
    
    def fit(self, series):
        """Fit GARCH model to time series"""
        try:
            # Store mean level
            self.mean_level = np.mean(series)
            
            # Demean the series for GARCH (GARCH models the variance, not the mean)
            demeaned_series = series - self.mean_level
            
            # Try GARCH(1,1) first - most common and robust
            try:
                garch_model = arch_model(demeaned_series, vol='GARCH', p=1, q=1, mean='Zero')
                self.fitted_model = garch_model.fit(disp='off', show_warning=False)
                self.is_fitted = True
                return self
            except:
                pass
            
            # Try simpler ARCH(1) if GARCH fails
            try:
                arch_model_simple = arch_model(demeaned_series, vol='ARCH', p=1, mean='Zero')
                self.fitted_model = arch_model_simple.fit(disp='off', show_warning=False)
                self.is_fitted = True
                return self
            except:
                pass
            
            # Try GARCH with constant mean
            try:
                garch_with_mean = arch_model(series, vol='GARCH', p=1, q=1, mean='Constant')
                self.fitted_model = garch_with_mean.fit(disp='off', show_warning=False)
                self.mean_level = 0  # Model handles mean internally
                self.is_fitted = True
                return self
            except:
                pass
            
            self.is_fitted = False
            
        except Exception as e:
            self.is_fitted = False
        
        return self
    
    def generate_continuations(self, continuation_length, n_simulations=100):
        """Generate synthetic continuations using fitted GARCH model"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before generating continuations")
        
        continuations = []
        
        for _ in range(n_simulations):
            try:
                # Generate simulation from GARCH model
                sim_result = self.fitted_model.simulate(continuation_length, burn=100)
                
                # Extract the simulated data
                if hasattr(sim_result, 'data'):
                    simulated_series = sim_result['data'].values
                else:
                    simulated_series = sim_result.values
                
                # Add back mean level if we demeaned
                continuation = simulated_series + self.mean_level
                continuations.append(continuation)
                
            except Exception as e:
                # If simulation fails, generate using residuals
                try:
                    residuals = self.fitted_model.resid
                    if len(residuals) > 0:
                        # Bootstrap from residuals and add mean
                        continuation = np.random.choice(residuals, size=continuation_length, replace=True) + self.mean_level
                        continuations.append(continuation)
                    else:
                        # Last resort: normal noise with estimated volatility
                        volatility = np.sqrt(self.fitted_model.conditional_volatility.iloc[-1]) if hasattr(self.fitted_model, 'conditional_volatility') else 1.0
                        continuation = np.random.normal(self.mean_level, volatility, continuation_length)
                        continuations.append(continuation)
                except:
                    # Final fallback
                    continuation = np.random.normal(self.mean_level, 1.0, continuation_length)
                    continuations.append(continuation)
        
        return continuations

def extract_garch_synthetic_features(ts_data, n_simulations=100):
    """Extract features using GARCH synthetic data generation"""
    pre_break = ts_data.period_0_values
    post_break = ts_data.period_1_values
    
    # Fit GARCH generator
    generator = GARCHGenerator()
    generator.fit(pre_break)
    
    if not generator.is_fitted:
        # Return NaN features if GARCH fitting failed
        return {f'garch_{feat}': np.nan for feat in [
            'ks_percentile', 'ks_zscore', 'mw_percentile', 'mw_zscore',
            'levene_percentile', 'levene_zscore', 'anderson_percentile', 'anderson_zscore',
            'fit_success'
        ]}
    
    # Generate synthetic continuations
    try:
        synthetic_continuations = generator.generate_continuations(len(post_break), n_simulations)
    except:
        return {f'garch_{feat}': np.nan for feat in [
            'ks_percentile', 'ks_zscore', 'mw_percentile', 'mw_zscore',
            'levene_percentile', 'levene_zscore', 'anderson_percentile', 'anderson_zscore',
            'fit_success'
        ]}
    
    features = {'garch_fit_success': 1.0}  # Successfully fitted and generated
    
    # Statistical tests with null distribution approach
    test_functions = {
        'ks': lambda x, y: ks_2samp(x, y)[0],
        'mw': lambda x, y: mannwhitneyu(x, y, alternative='two-sided')[0],
        'levene': lambda x, y: levene(x, y)[0],
        'anderson': lambda x, y: anderson_ksamp([x, y]).statistic
    }
    
    for test_name, test_func in test_functions.items():
        try:
            # Build null distribution of test statistics
            null_stats = []
            for synthetic in synthetic_continuations:
                try:
                    stat = test_func(pre_break, synthetic)
                    if not np.isnan(stat) and not np.isinf(stat):
                        null_stats.append(stat)
                except:
                    continue
            
            if len(null_stats) < 50:  # Need sufficient samples for stable distribution
                features[f'garch_{test_name}_percentile'] = np.nan
                features[f'garch_{test_name}_zscore'] = np.nan
                continue
            
            null_stats = np.array(null_stats)
            
            # Compute actual test statistic
            actual_stat = test_func(pre_break, post_break)
            
            if np.isnan(actual_stat) or np.isinf(actual_stat):
                features[f'garch_{test_name}_percentile'] = np.nan
                features[f'garch_{test_name}_zscore'] = np.nan
                continue
            
            # Percentile: what fraction of null stats are <= actual
            percentile = np.mean(null_stats <= actual_stat)
            features[f'garch_{test_name}_percentile'] = percentile
            
            # Z-score: how many standard deviations from null mean
            null_mean = np.mean(null_stats)
            null_std = np.std(null_stats)
            if null_std > 1e-8:
                zscore = (actual_stat - null_mean) / null_std
                features[f'garch_{test_name}_zscore'] = zscore
            else:
                features[f'garch_{test_name}_zscore'] = 0.0
                
        except Exception as e:
            features[f'garch_{test_name}_percentile'] = np.nan
            features[f'garch_{test_name}_zscore'] = np.nan
    
    return features

# =============================================================================
# EXTRACT FEATURES FOR BOTH APPROACHES
# =============================================================================

print("Extracting basic features...")
basic_features = []
labels = []
series_ids = []

for series_id, ts_data in train_data_subset.items():
    features = extract_basic_features(ts_data)
    basic_features.append(features)
    labels.append(ts_data.has_break)
    series_ids.append(series_id)

basic_df = pd.DataFrame(basic_features, index=series_ids)
y = pd.Series(labels, index=series_ids)

print("Extracting GARCH synthetic features...")
garch_features = []

for i, (series_id, ts_data) in enumerate(train_data_subset.items()):
    if i % 100 == 0:
        print(f"Processing series {i+1}/{len(train_data_subset)}")
    
    features = extract_garch_synthetic_features(ts_data, n_simulations=100)
    garch_features.append(features)

garch_df = pd.DataFrame(garch_features, index=series_ids)

# Clean datasets - only remove rows where ALL features are NaN
basic_df_clean = basic_df.dropna(how='all')
y_basic_clean = y.loc[basic_df_clean.index]

garch_df_clean = garch_df.dropna(how='all')
y_garch_clean = y.loc[garch_df_clean.index]

print(f"Basic features shape: {basic_df_clean.shape}")
print(f"GARCH features shape: {garch_df_clean.shape}")

# Check GARCH fitting success rate
garch_success_rate = garch_df_clean['garch_fit_success'].mean()
print(f"GARCH fitting success rate: {garch_success_rate:.1%}")

# =============================================================================
# MODEL EVALUATION
# =============================================================================

def evaluate_features(X, y, feature_type_name):
    """Evaluate features using cross-validation"""
    if len(X) == 0:
        print(f"No valid data for {feature_type_name}")
        return None
    
    # Drop columns with too many NaNs (>50%)
    nan_threshold = 0.5
    valid_cols = X.columns[X.isnull().mean() < nan_threshold]
    X_clean = X[valid_cols].fillna(X[valid_cols].median())
    
    if X_clean.shape[1] == 0:
        print(f"No valid features for {feature_type_name} after cleaning")
        return None
    
    model = XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
        eval_metric='logloss'
    )
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    cv_results = cross_validate(
        model, X_clean, y,
        cv=cv,
        scoring='roc_auc',
        return_train_score=True,
        n_jobs=-1
    )
    
    val_scores = cv_results['test_score']
    train_scores = cv_results['train_score']
    
    return {
        'feature_type': feature_type_name,
        'n_samples': len(X_clean),
        'n_features': X_clean.shape[1],
        'val_auc_mean': np.mean(val_scores),
        'val_auc_std': np.std(val_scores),
        'train_auc_mean': np.mean(train_scores),
        'train_auc_std': np.std(train_scores),
        'overfitting_gap': np.mean(train_scores) - np.mean(val_scores),
        'clean_features': list(X_clean.columns)
    }

print("\n" + "="*60)
print("COMPARING BASIC vs GARCH APPROACHES")
print("="*60)

# Basic features
basic_results = evaluate_features(basic_df_clean, y_basic_clean, "Basic Tests")
print(f"\n1. Basic Statistical Tests:")
print(f"   Samples: {basic_results['n_samples']}")
print(f"   Features: {basic_results['n_features']}")
print(f"   CV ROC AUC: {basic_results['val_auc_mean']:.4f} ± {basic_results['val_auc_std']:.4f}")
print(f"   Train ROC AUC: {basic_results['train_auc_mean']:.4f} ± {basic_results['train_auc_std']:.4f}")
print(f"   Overfitting Gap: {basic_results['overfitting_gap']:.4f}")

# GARCH synthetic features
garch_results = evaluate_features(garch_df_clean, y_garch_clean, "GARCH Synthetic")
print(f"\n2. GARCH Synthetic Generation:")
print(f"   Samples: {garch_results['n_samples']}")
print(f"   Features: {garch_results['n_features']}")
print(f"   CV ROC AUC: {garch_results['val_auc_mean']:.4f} ± {garch_results['val_auc_std']:.4f}")
print(f"   Train ROC AUC: {garch_results['train_auc_mean']:.4f} ± {garch_results['train_auc_std']:.4f}")
print(f"   Overfitting Gap: {garch_results['overfitting_gap']:.4f}")

# Combined approach
common_indices = basic_df_clean.index.intersection(garch_df_clean.index)
if len(common_indices) > 100:
    print(f"\n3. Combined Approach:")
    basic_subset = basic_df_clean.loc[common_indices]
    garch_subset = garch_df_clean.loc[common_indices]
    combined_features = pd.concat([basic_subset, garch_subset], axis=1)
    y_combined = y.loc[common_indices]
    
    combined_results = evaluate_features(combined_features, y_combined, "Combined")
    print(f"   Samples: {combined_results['n_samples']}")
    print(f"   Features: {combined_results['n_features']}")
    print(f"   CV ROC AUC: {combined_results['val_auc_mean']:.4f} ± {combined_results['val_auc_std']:.4f}")
    print(f"   Train ROC AUC: {combined_results['train_auc_mean']:.4f} ± {combined_results['train_auc_std']:.4f}")
    print(f"   Overfitting Gap: {combined_results['overfitting_gap']:.4f}")

# =============================================================================
# FEATURE IMPORTANCE ANALYSIS
# =============================================================================

def get_feature_importance(X, y, top_n=10):
    # Clean data for feature importance
    nan_threshold = 0.5
    valid_cols = X.columns[X.isnull().mean() < nan_threshold]
    X_clean = X[valid_cols].fillna(X[valid_cols].median())
    
    model = XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
    model.fit(X_clean, y)
    
    importance_df = pd.DataFrame({
        'feature': X_clean.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    return importance_df.head(top_n)

print(f"\n" + "="*60)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*60)

print(f"\nTop 10 GARCH Synthetic Features:")
garch_importance = get_feature_importance(garch_df_clean, y_garch_clean, 10)
for i, (_, row) in enumerate(garch_importance.iterrows(), 1):
    print(f"{i:2d}. {row['feature']:<35} {row['importance']:.6f}")

print(f"\nTop 10 Basic Features:")
basic_importance = get_feature_importance(basic_df_clean, y_basic_clean, 10)
for i, (_, row) in enumerate(basic_importance.iterrows(), 1):
    print(f"{i:2d}. {row['feature']:<35} {row['importance']:.6f}")

print(f"\n" + "="*60)
print("FINAL RESULTS")
print("="*60)

improvement = garch_results['val_auc_mean'] - basic_results['val_auc_mean']
print(f"GARCH synthetic approach vs Basic: {improvement:+.4f} AUC improvement")
print(f"GARCH fitting success rate: {garch_success_rate:.1%}")

if improvement > 0.01:
    print("✅ GARCH synthetic method shows meaningful improvement!")
elif improvement > 0:
    print("🟡 Slight improvement with GARCH approach")
else:
    print("❌ GARCH approach still not outperforming basic tests")

# Analyze what makes GARCH work well
print(f"\nDiagnostics:")
print(f"- Basic approach overfitting gap: {basic_results['overfitting_gap']:.3f}")
print(f"- GARCH approach overfitting gap: {garch_results['overfitting_gap']:.3f}")

if garch_success_rate < 0.8:
    print(f"- Low GARCH success rate ({garch_success_rate:.1%}) may be limiting performance")
else:
    print(f"- Good GARCH success rate ({garch_success_rate:.1%})")

Loading data...


INFO:src.data.dataLoader:Data loaded successfully from local files


Using 1000 time series for comparison
Extracting basic features...
Extracting GARCH synthetic features...
Processing series 1/1000
Processing series 101/1000
Processing series 201/1000
Processing series 301/1000
Processing series 401/1000
Processing series 501/1000
Processing series 601/1000
Processing series 701/1000
Processing series 801/1000
Processing series 901/1000
Basic features shape: (1000, 8)
GARCH features shape: (1000, 9)
GARCH fitting success rate: 100.0%

COMPARING BASIC vs GARCH APPROACHES

1. Basic Statistical Tests:
   Samples: 1000
   Features: 8
   CV ROC AUC: 0.5559 ± 0.0420
   Train ROC AUC: 0.9903 ± 0.0030
   Overfitting Gap: 0.4344

2. GARCH Synthetic Generation:
   Samples: 1000
   Features: 9
   CV ROC AUC: 0.5365 ± 0.0397
   Train ROC AUC: 0.9818 ± 0.0029
   Overfitting Gap: 0.4454

3. Combined Approach:
   Samples: 1000
   Features: 17
   CV ROC AUC: 0.5482 ± 0.0202
   Train ROC AUC: 0.9975 ± 0.0013
   Overfitting Gap: 0.4493

FEATURE IMPORTANCE ANALYSIS

Top