In [None]:
import json
import os
import pickle
import sys
from collections import defaultdict
from datetime import datetime

import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Add parent directory to path to access custom modules
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.modelling_functions import create_target_variable, check_feature_stability, detect_market_regimes, regime_aware_validation, calculate_sharpe_with_costs, calculate_max_drawdown

ImportError: cannot import name 'regimes_aware_validation' from 'src.modelling_functions' (c:\Users\epoch_bpjmdqk\Documents\Code\src\modelling_functions.py)

In [None]:
# --- CONFIGURATION ---
# Resolve the project root the same way the sys.path entry for the `src`
# imports does (one level above the working directory), so the notebook does
# not depend on a user-specific absolute path.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

CONFIG = {
    # Input CSV read in the data-preparation cell (indexed by its 'Date' column).
    'data_path': os.path.join(PROJECT_ROOT, 'data', 'processed', 'consumer_staples_data.csv'),
    # Directory that receives the pickled pipeline and the JSON metadata.
    'model_dir': os.path.join(PROJECT_ROOT, 'models'),
    # `window` and `threshold` are forwarded to create_target_variable and are
    # also embedded in the name of the target-return column.
    'window': 5,
    'threshold': 0.005,
    'target_ticker': 'WMT',
    # Split boundaries: train <= train_end < validation <= val_end < test.
    'train_end': '2020-12-31',
    'val_end': '2021-12-31',
    # NOTE(review): 'test_start' is not read by the split cell, which derives
    # the test set from 'val_end'; kept for reference.
    'test_start': '2022-01-01',
    'n_trials': 100,  # number of Optuna trials
    'transaction_cost': 0.001,  # 0.1% per trade
    'random_state': 42
}

In [None]:
# --- 1. DATA PREPARATION ---
try:
    data = pd.read_csv(CONFIG['data_path'], index_col='Date', parse_dates=True)
    print(f"✅ Data loaded successfully: {data.shape}")
except FileNotFoundError:
    print(f"❌ Error: Data file not found at {CONFIG['data_path']}")
    # Re-raise instead of sys.exit(): inside a notebook kernel SystemExit does
    # not give a clean, informative failure for this cell.
    raise

# Create target variable
data_target = create_target_variable(data.copy(), CONFIG['target_ticker'],
                                     window=CONFIG['window'], threshold=CONFIG['threshold'])
target_col = f"{CONFIG['target_ticker']}_Target"
target_return_col = f"{CONFIG['target_ticker']}_target_return_{CONFIG['window']}D_{CONFIG['threshold']}"

# Define features (exclude target-related columns and the target ticker's raw columns)
exclude_cols = [
    target_col,
    target_return_col,
    f"Open_{CONFIG['target_ticker']}",
    f"High_{CONFIG['target_ticker']}",
    f"Low_{CONFIG['target_ticker']}",
    f"Close_{CONFIG['target_ticker']}",
    f"Volume_{CONFIG['target_ticker']}",
    f"Dividends_{CONFIG['target_ticker']}",
    f"Stock Splits_{CONFIG['target_ticker']}"
]

features = [col for col in data_target.columns if col not in exclude_cols]
# Rebind instead of mutating in place so re-running the cell is idempotent.
data_target = data_target.dropna()

y = data_target[target_col]
returns_full = data_target[target_return_col]

# Clean data: map +/-inf to NaN on a fresh frame (no in-place edit of a slice
# of data_target), then impute.
X_features = data_target[features].replace([np.inf, -np.inf], np.nan)
# Imputation means come from the training period only, so validation/test
# statistics never leak into the features; columns with no usable training
# mean fall back to 0.
train_period = X_features.index <= CONFIG['train_end']
X_features = X_features.fillna(X_features.loc[train_period].mean()).fillna(0)

# Calculate class imbalance
n_positive = (y == 1).sum()
if n_positive == 0:
    raise ValueError("Target has no positive samples; cannot compute the class imbalance ratio.")
neg_to_pos_ratio = (y == 0).sum() / n_positive

print(f"\n--- Data Setup ---")
print(f"Window: {CONFIG['window']}, Threshold: {CONFIG['threshold']}")
print(f"Features: {len(features)}, Samples: {len(X_features)}")
print(f"Class imbalance ratio (0/1): {neg_to_pos_ratio:.2f}")


In [None]:
# --- 2. THREE-WAY TIME SERIES SPLIT ---
print(f"\n--- Time Series Split ---")

# Boolean masks over the shared index: train <= train_end < val <= val_end < test
sample_dates = X_features.index
train_mask = sample_dates <= CONFIG['train_end']
val_mask = (sample_dates > CONFIG['train_end']) & (sample_dates <= CONFIG['val_end'])
test_mask = sample_dates > CONFIG['val_end']

# Apply the same three masks to features, labels and returns
X_train, X_val, X_test = (X_features[mask] for mask in (train_mask, val_mask, test_mask))
y_train, y_val, y_test = (y[mask] for mask in (train_mask, val_mask, test_mask))
returns_train, returns_val, returns_test = (
    returns_full[mask] for mask in (train_mask, val_mask, test_mask)
)

# Labels are padded so the three report lines stay column-aligned
for split_label, split_frame in (("Train:", X_train), ("Val:  ", X_val), ("Test: ", X_test)):
    print(f"{split_label} {len(split_frame)} samples ({split_frame.index.min()} to {split_frame.index.max()})")


In [None]:
# --- 3. FEATURE STABILITY ANALYSIS ---
# Stability is assessed on the training split only.
stable_features = check_feature_stability(
    X_train,
    y_train,
    X_train.columns,
    CONFIG['random_state'],
)
print(f"\nUsing {len(stable_features)} most stable features for modeling")

# Restrict every split to the stable feature subset
X_train_stable, X_val_stable, X_test_stable = (
    split_frame[stable_features] for split_frame in (X_train, X_val, X_test)
)

In [None]:
# --- 4. REGIME DETECTION ---
# NOTE(review): returns_full is the target-return column; if that column is a
# forward-looking return, the detected regimes use future information - confirm.
market_returns = returns_full  # Or use market index if available
regimes = detect_market_regimes(market_returns)

# Report how many samples fall into each regime, ordered by regime label
regime_counts = regimes.value_counts().sort_index()
print(f"\nMarket regimes detected:")
print(regime_counts)

In [None]:
# --- 5. ENHANCED OPTUNA OPTIMIZATION ---
def enhanced_objective(trial):
    """Optuna objective: mean cost-adjusted Sharpe ratio over time-series CV folds.

    For each of 3 TimeSeriesSplit folds of the training data, fits
    scaler -> SelectFromModel -> PCA on the fold's training part, fits an
    XGBoost classifier with early stopping on the transformed data, and scores
    the fold by the Sharpe ratio of ``prediction * return`` after costs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the XGBoost hyperparameters, the number of PCA
        components and the SelectFromModel threshold.

    Returns
    -------
    float
        Mean Sharpe ratio across folds (NaN fold scores count as 0), or -10 if
        any fold raises or no fold was scored.
    """
    # Expanded search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 800),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'scale_pos_weight': neg_to_pos_ratio,
        'eval_metric': 'logloss',
        'random_state': CONFIG['random_state'],
        'n_jobs': -1,
        'tree_method': 'hist'
    }

    # Tunable pipeline components
    pca_components = trial.suggest_int('pca_n_components', 2, min(10, len(stable_features)))
    selector_threshold = trial.suggest_categorical('selector_threshold', ['mean', 'median', '0.75*mean'])

    # Time series cross-validation on training data only
    tscv = TimeSeriesSplit(n_splits=3)
    sharpe_scores = []

    for train_idx, val_idx in tscv.split(X_train_stable):
        X_cv_train = X_train_stable.iloc[train_idx]
        X_cv_val = X_train_stable.iloc[val_idx]
        y_cv_train = y_train.iloc[train_idx]
        y_cv_val = y_train.iloc[val_idx]
        returns_cv_val = returns_train.iloc[val_idx]

        # Preprocessing is fitted separately from the classifier: a Pipeline
        # forwards `model__eval_set` to the last step untransformed, so the
        # eval set would not match the PCA output the model is trained on.
        # The selector gets its own estimator without early stopping because
        # it is fitted without an eval_set.
        preprocessor = Pipeline([
            ('scaler', StandardScaler()),
            ('feature_select', SelectFromModel(XGBClassifier(**params), threshold=selector_threshold)),
            ('pca', PCA(n_components=pca_components)),
        ])
        # Early stopping is configured on the estimator; recent XGBoost
        # releases no longer accept it as a fit() keyword.
        model = XGBClassifier(**params, early_stopping_rounds=20)

        try:
            X_cv_train_t = preprocessor.fit_transform(X_cv_train, y_cv_train)
            X_cv_val_t = preprocessor.transform(X_cv_val)
            # NOTE(review): early stopping monitors the same fold that is
            # scored below, which makes the fold score somewhat optimistic.
            model.fit(X_cv_train_t, y_cv_train,
                      eval_set=[(X_cv_val_t, y_cv_val)], verbose=False)
            preds = model.predict(X_cv_val_t)
            strategy_returns = preds * returns_cv_val
            sharpe = calculate_sharpe_with_costs(strategy_returns, CONFIG['transaction_cost'])
            sharpe_scores.append(sharpe if not np.isnan(sharpe) else 0)
        except Exception as e:
            print(f"Trial failed: {e}")
            return -10  # Penalty for failed trials

    return np.mean(sharpe_scores) if sharpe_scores else -10

print("\n--- Starting Optuna Optimization ---")

# Run Optuna study with a TPE sampler seeded from the config
tpe_sampler = optuna.samplers.TPESampler(seed=CONFIG['random_state'])
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)
study.optimize(
    enhanced_objective,
    n_trials=CONFIG['n_trials'],
    show_progress_bar=True,
)

print(f"\n--- Optimization Results ---")
print(f"Best Sharpe ratio: {study.best_value:.4f}")
print(f"Best parameters: {study.best_params}")

# Work on a copy and split off the two pipeline-level settings, leaving only
# the XGBoost hyperparameters in best_params.
best_params = dict(study.best_params)
pca_components = best_params.pop('pca_n_components')
selector_threshold = best_params.pop('selector_threshold')


In [None]:
# --- 6. TRAIN FINAL MODEL ---
print("\n--- Training Final Model ---")

# Settings shared by the classifier and the feature-selection estimator.
xgb_fixed_params = {
    'scale_pos_weight': neg_to_pos_ratio,
    'eval_metric': 'logloss',
    'random_state': CONFIG['random_state'],
    'n_jobs': -1,
    'tree_method': 'hist',
}

# Early stopping is configured on the estimator; recent XGBoost releases no
# longer accept it as a fit() keyword.
final_model = XGBClassifier(**best_params, **xgb_fixed_params, early_stopping_rounds=20)

# The selector uses its own estimator without early stopping because it is
# fitted without an eval_set.
preprocessor = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_select', SelectFromModel(XGBClassifier(**best_params, **xgb_fixed_params),
                                       threshold=selector_threshold)),
    ('pca', PCA(n_components=pca_components)),
])

# Fit preprocessing first so the early-stopping eval set can be passed through
# the same transforms as the training data. A Pipeline forwards
# `model__eval_set` untransformed, which does not match the PCA output.
X_train_transformed = preprocessor.fit_transform(X_train_stable, y_train)
X_val_transformed = preprocessor.transform(X_val_stable)

final_model.fit(X_train_transformed, y_train,
                eval_set=[(X_val_transformed, y_val)], verbose=True)

# Reassemble the fitted steps into a single pipeline so that predict,
# named_steps and pickling work on raw (untransformed) feature frames.
final_pipeline = Pipeline(preprocessor.steps + [('model', final_model)])
print("✅ Final model trained with early stopping")


In [None]:
# --- 7. COMPREHENSIVE MODEL EVALUATION ---
def evaluate_model_comprehensive(pipeline, X_test, y_test, returns_test, regimes_test, set_name="Test"):
    """Print and return classification and strategy metrics for one data split.

    Strategy returns are computed as ``prediction * returns_test``.

    Parameters
    ----------
    pipeline : fitted estimator exposing ``predict``
    X_test : features passed to ``pipeline.predict``
    y_test : true labels, used for accuracy
    returns_test : per-sample returns multiplied by the predictions
    regimes_test : per-sample regime labels, or None to skip the regime
        breakdown; used as a boolean mask source over the strategy returns
    set_name : label used in the printed header

    Returns
    -------
    dict
        Keys 'accuracy', 'sharpe', 'total_return', 'max_drawdown', 'win_rate'
        and 'n_trades' (a plain int so it serializes as a JSON number).
    """
    print(f"\n--- {set_name} Set Evaluation ---")

    # Basic predictions
    y_pred = pipeline.predict(X_test)

    # Strategy returns
    strategy_returns = y_pred * returns_test

    # Performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    sharpe = calculate_sharpe_with_costs(strategy_returns, CONFIG['transaction_cost'])
    max_dd = calculate_max_drawdown(strategy_returns.cumsum())
    total_return = strategy_returns.sum()

    # Win rate and trade statistics; samples with a zero strategy return are
    # left out of the win-rate denominator.
    winning_trades = strategy_returns[strategy_returns > 0]
    losing_trades = strategy_returns[strategy_returns < 0]
    n_nonzero = len(strategy_returns[strategy_returns != 0])
    win_rate = len(winning_trades) / n_nonzero if n_nonzero > 0 else 0
    n_trades = int((y_pred != 0).sum())

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Sharpe Ratio (w/ costs): {sharpe:.4f}")
    print(f"Total Return: {total_return:.4f}")
    print(f"Max Drawdown: {max_dd:.4f}")
    print(f"Win Rate: {win_rate:.4f}")
    print(f"Avg Winning Trade: {winning_trades.mean():.6f}")
    print(f"Avg Losing Trade: {losing_trades.mean():.6f}")
    print(f"Number of Trades: {n_trades}")

    # Regime-specific performance
    if regimes_test is not None:
        regime_names = {0: 'Low Vol', 1: 'Normal Vol', 2: 'High Vol'}
        print(f"\nRegime Performance:")
        for regime in regimes_test.unique():
            regime_mask = regimes_test == regime
            # Regimes with 10 or fewer samples are not reported.
            if regime_mask.sum() > 10:
                regime_returns = strategy_returns[regime_mask]
                regime_sharpe = calculate_sharpe_with_costs(regime_returns, CONFIG['transaction_cost'])
                print(f"  {regime_names.get(regime, f'Regime {regime}')}: Sharpe = {regime_sharpe:.3f} ({regime_mask.sum()} samples)")

    return {
        'accuracy': accuracy,
        'sharpe': sharpe,
        'total_return': total_return,
        'max_drawdown': max_dd,
        'win_rate': win_rate,
        'n_trades': n_trades
    }

# Evaluate on validation set; an empty regime slice is passed as None so the
# regime breakdown is skipped.
val_regimes = regimes[val_mask]
if not len(val_regimes) > 0:
    val_regimes = None
val_results = evaluate_model_comprehensive(
    final_pipeline, X_val_stable, y_val, returns_val, val_regimes, "Validation"
)

# Evaluate on test set (final, unbiased evaluation)
test_regimes = regimes[test_mask]
if not len(test_regimes) > 0:
    test_regimes = None
test_results = evaluate_model_comprehensive(
    final_pipeline, X_test_stable, y_test, returns_test, test_regimes, "Test"
)


In [None]:
# --- 8. ROBUST PERMUTATION IMPORTANCE ---
print("\n--- Permutation Importance Analysis ---")

# Permutation importance is computed on the training data. The columns of
# X_train_stable are permuted one at a time, so the result arrays have one
# entry per X_train_stable column.
pi_result = permutation_importance(final_pipeline, X_train_stable, y_train,
                                   n_repeats=10, random_state=CONFIG['random_state'], n_jobs=-1)

pipeline_input_features = X_train_stable.columns

# Features kept by the pipeline's SelectFromModel step (recorded in the metadata)
selected_features = pipeline_input_features[final_pipeline.named_steps['feature_select'].get_support()]
sorted_idx = pi_result.importances_mean.argsort()[::-1]

print("Top 10 Most Important Features:")
for i in sorted_idx[:10]:
    # Importances are indexed by pipeline input column, not by the
    # post-selection subset, so names must come from the input columns.
    feature_name = pipeline_input_features[i]
    importance_mean = pi_result.importances_mean[i]
    importance_std = pi_result.importances_std[i]
    print(f"  {feature_name}: {importance_mean:.4f} ± {importance_std:.4f}")


In [None]:
# --- 9. OVERFITTING CHECKS ---
print("\n--- Overfitting Analysis ---")

# In-sample Sharpe, computed the same way as for the validation/test splits
train_pred = final_pipeline.predict(X_train_stable)
train_strategy_returns = train_pred * returns_train
train_sharpe = calculate_sharpe_with_costs(train_strategy_returns, CONFIG['transaction_cost'])

print(f"Training Sharpe: {train_sharpe:.4f}")
print(f"Validation Sharpe: {val_results['sharpe']:.4f}")
print(f"Test Sharpe: {test_results['sharpe']:.4f}")

# Relative drop from train to test; defined as 0 when the train Sharpe is 0
if train_sharpe != 0:
    sharpe_degradation = (train_sharpe - test_results['sharpe']) / abs(train_sharpe)
else:
    sharpe_degradation = 0
print(f"Sharpe degradation (train→test): {sharpe_degradation:.2%}")

# Thresholds are checked from most to least severe; the first one exceeded wins
degradation_verdicts = (
    (0.5, "⚠️  Warning: Significant performance degradation detected. Model may be overfitting."),
    (0.2, "⚠️  Caution: Moderate performance degradation. Monitor in backtesting."),
)
print(next(
    (verdict for limit, verdict in degradation_verdicts if sharpe_degradation > limit),
    "✅ Performance degradation within acceptable limits.",
))

In [None]:
# --- 10. MODEL PERSISTENCE ---
os.makedirs(CONFIG['model_dir'], exist_ok=True)

# Save the final pipeline
model_filename = os.path.join(CONFIG['model_dir'], "final_xgb_optuna_pipeline_enhanced.pkl")
with open(model_filename, 'wb') as f:
    pickle.dump(final_pipeline, f)

# Permutation importances have one entry per X_train_stable column, so the
# top-20 names are looked up in those columns (not in the post-selection
# `selected_features` subset, whose positions do not line up with sorted_idx).
top_importance_idx = sorted_idx[:20]
top_importance_names = X_train_stable.columns[top_importance_idx]

# Save model metadata
metadata = {
    'config': CONFIG,
    'best_params': {**best_params, 'pca_n_components': pca_components, 'selector_threshold': selector_threshold},
    'class_imbalance_ratio': neg_to_pos_ratio,
    'selected_features': list(selected_features),
    # Stored as a list so it is written as a JSON array rather than going
    # through the default=str fallback.
    'stable_features': list(stable_features),
    'validation_results': val_results,
    'test_results': test_results,
    'train_sharpe': train_sharpe,
    'feature_importance': {
        feature: float(importance)
        for feature, importance in zip(top_importance_names,
                                       pi_result.importances_mean[top_importance_idx])
    },
    'training_date': datetime.now().isoformat()
}

metadata_filename = os.path.join(CONFIG['model_dir'], "model_metadata_enhanced.json")
with open(metadata_filename, 'w') as f:
    # default=str is the fallback for values json cannot encode natively.
    json.dump(metadata, f, indent=2, default=str)

print(f"\n✅ Enhanced pipeline saved as '{model_filename}'")
print(f"✅ Model metadata saved as '{metadata_filename}'")


In [None]:
# --- 11. REGIME-AWARE FINAL VALIDATION ---
# Guard on the test-period regimes themselves: test_regimes is None when no
# regime labels fall in the test period, even if `regimes` overall is non-empty.
if test_regimes is not None and len(test_regimes) > 0:
    print("\n--- Final Regime Analysis ---")
    regime_performance = regime_aware_validation(X_test_stable, y_test, returns_test, test_regimes, final_pipeline, CONFIG['transaction_cost'])

print("\n--- Summary ---")
print(f"Final Test Sharpe Ratio: {test_results['sharpe']:.4f}")
print(f"Final Test Accuracy: {test_results['accuracy']:.4f}")
print("Model ready for backtesting phase.")