In [None]:
import sys
import os
import pandas as pd
import numpy as np
import pickle
import optuna
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from collections import defaultdict
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Add parent directory to path to access custom modules
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.modelling_functions import calculate_sharpe_ratio, create_target_variable


In [None]:
# --- 1. DATA PREPARATION ---
# Loads the processed price data, builds the classification target for
# `target_ticker`, and derives the feature matrix / label vector / forward
# returns that every later cell consumes (X_full, y_full, returns_full).

# NOTE(review): absolute, machine-specific path. Consider a configurable
# DATA_DIR (e.g. relative to the repository root) so the notebook is portable.
DATA_PATH = r'C:\Users\epoch_bpjmdqk\Documents\Code\data\processed\consumer_staples_data.csv'

try:
    data = pd.read_csv(DATA_PATH, index_col='Date', parse_dates=True)
except FileNotFoundError as exc:
    # Re-raise rather than sys.exit(): the cell still stops, but the reader
    # gets a real traceback that names the offending path.
    raise FileNotFoundError(
        f"Error: The data file was not found. Please update the file path. ({DATA_PATH})"
    ) from exc


window = 5
threshold = 0.005
target_ticker = 'WMT'
split_date = '2021-01-01'


# Create target variable (on a copy, so `data` stays as loaded)
data_target = create_target_variable(data.copy(), target_ticker, window=window, threshold=threshold)
target_return_col = f'{target_ticker}_target_return_{window}D_{threshold}'
target_col = f'{target_ticker}_Target'


# Columns that are the label itself, the forward return, or the target
# ticker's own raw price/volume fields are kept out of the feature set.
exclude_cols = [
    target_col,
    target_return_col,
    f'Open_{target_ticker}',
    f'High_{target_ticker}',
    f'Low_{target_ticker}',
    f'Close_{target_ticker}',
    f'Volume_{target_ticker}',
    f'Dividends_{target_ticker}',
    f'Stock Splits_{target_ticker}'
]
features = [col for col in data_target.columns if col not in exclude_cols]

# Rebind instead of mutating in place so re-running the cell is idempotent.
data_target = data_target.dropna()


y = data_target[target_col]
returns_full = data_target[target_return_col]


# Clean data: +/-inf -> NaN, then impute with the column mean (0 if the whole
# column is NaN).
# NOTE(review): the mean is taken over the full sample, so later rows leak
# into earlier ones. Only rows that contained inf are affected; confirm this
# is acceptable or move the imputation inside the modelling pipeline.
X_features = data_target[features].replace([np.inf, -np.inf], np.nan)
X_features = X_features.fillna(X_features.mean()).fillna(0)


X_full, y_full = X_features.copy(), y.copy()

# Class-imbalance weight handed to XGBoost as scale_pos_weight.
n_negative = int((y_full == 0).sum())
n_positive = int((y_full == 1).sum())
if n_positive == 0:
    # Warnings are globally silenced in this notebook, so a zero denominator
    # would otherwise slip through as inf/nan without any message.
    raise ValueError(
        f"No positive labels found in '{target_col}'; cannot compute the class imbalance ratio."
    )
neg_to_pos_ratio = n_negative / n_positive


print(f"\n--- Data setup for Window={window}, Threshold={threshold} ---")
print(f"Class imbalance ratio (0/1): {neg_to_pos_ratio:.2f}")

In [None]:
# --- 2. OPTUNA HYPERPARAMETER OPTIMIZATION ---
def objective(trial):
    """Optuna objective: mean out-of-fold Sharpe ratio over a 3-split time-series CV.

    For every fold the preprocessing chain (StandardScaler -> SelectFromModel
    -> PCA) is fitted on the training part only, an XGBoost classifier is
    trained on the transformed data with early stopping, and the fold's
    predictions are multiplied by the realised target returns before being
    scored with ``calculate_sharpe_ratio``.

    Reads the notebook-level ``X_full``, ``y_full``, ``returns_full`` and
    ``neg_to_pos_ratio``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``learning_rate``,
        ``max_depth`` and ``gamma``.

    Returns
    -------
    float
        Mean of the per-fold Sharpe ratios.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'gamma': trial.suggest_float('gamma', 0, 5),
        'scale_pos_weight': neg_to_pos_ratio,
        'eval_metric': 'logloss',
        'random_state': 42,
        'n_jobs': -1,
        'tree_method': 'hist'
    }
    early_stopping_rounds = 20
    val_fraction = 0.2  # tail share of each training fold reserved for early stopping

    tscv = TimeSeriesSplit(n_splits=3)
    sharpe_scores = []

    for train_idx, test_idx in tscv.split(X_full):
        # Early stopping must not look at the fold we score on, so the most
        # recent slice of the training fold serves as the validation set.
        n_val = max(1, int(len(train_idx) * val_fraction))
        fit_idx, val_idx = train_idx[:-n_val], train_idx[-n_val:]

        X_fit, y_fit = X_full.iloc[fit_idx], y_full.iloc[fit_idx]
        X_val, y_val = X_full.iloc[val_idx], y_full.iloc[val_idx]
        X_test = X_full.iloc[test_idx]
        returns_test = returns_full.iloc[test_idx]

        # The selector gets its own estimator *without* early stopping:
        # SelectFromModel fits it with no eval_set.
        preprocess = Pipeline([
            ('scaler', StandardScaler()),
            ('feature_select', SelectFromModel(XGBClassifier(**params), threshold="median")),
            ('pca', PCA(n_components=3)),
        ])
        X_fit_t = preprocess.fit_transform(X_fit, y_fit)
        # The eval set has to live in the same (scaled/selected/PCA) space as
        # the training matrix; passing raw columns here mismatches the model.
        X_val_t = preprocess.transform(X_val)

        # early_stopping_rounds goes on the estimator (like eval_metric above);
        # the fit() keyword is deprecated in XGBoost's sklearn API.
        model = XGBClassifier(**params, early_stopping_rounds=early_stopping_rounds)
        model.fit(X_fit_t, y_fit, eval_set=[(X_val_t, y_val)], verbose=False)

        preds = model.predict(preprocess.transform(X_test))
        # Predicted class times realised return: presumably 1 = hold the
        # position, 0 = stay flat -- confirm against create_target_variable.
        strategy_returns = preds * returns_test
        sharpe_scores.append(calculate_sharpe_ratio(strategy_returns))

    return np.mean(sharpe_scores)


In [None]:
# --- 3. TRAIN FINAL MODEL WITH BEST PARAMS + EARLY STOPPING ---
# Unused in this cell; kept so any code relying on the name keeps working.
from sklearn.model_selection import train_test_split

# `best_params` is not assigned anywhere in the visible notebook, so a fresh
# "Restart & Run All" would fail with NameError. Run the study when it is
# missing; an already-present `best_params` is reused as-is.
N_TRIALS = 50  # NOTE(review): placeholder trial budget -- adjust to taste
if 'best_params' not in globals():
    study = optuna.create_study(
        direction='maximize',  # objective() returns a Sharpe ratio
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=N_TRIALS)
    best_params = study.best_params
    print(f"Best trial value: {study.best_value:.4f} | params: {best_params}")

# Split full data into train + validation (time-aware, last 20% as validation)
split_idx = int(len(X_full) * 0.8)
X_train, X_val = X_full.iloc[:split_idx], X_full.iloc[split_idx:]
y_train, y_val = y_full.iloc[:split_idx], y_full.iloc[split_idx:]

# study.best_params only carries the *searched* values. objective() also
# fixes subsample / colsample_bytree at 0.8, so repeat them here to train the
# configuration that was actually tuned (entries in best_params win on clash).
xgb_params = {
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    **best_params,
    'scale_pos_weight': neg_to_pos_ratio,
    'eval_metric': 'logloss',
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'hist',
}

# Preprocessing is fitted on the training part only. The selector owns a
# separate estimator with no early stopping, because SelectFromModel fits it
# without an eval_set.
preprocess = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_select', SelectFromModel(XGBClassifier(**xgb_params), threshold="median")),
    ('pca', PCA(n_components=3)),
])
X_train_t = preprocess.fit_transform(X_train, y_train)
# The validation set must be pushed through the same transforms; the model
# only ever sees the 3 PCA components, never the raw columns.
X_val_t = preprocess.transform(X_val)

# Fit with early stopping on the (transformed) validation set.
# early_stopping_rounds is a constructor argument, like eval_metric; the
# fit() keyword is deprecated in XGBoost's sklearn API.
final_model = XGBClassifier(**xgb_params, early_stopping_rounds=20)
final_model.fit(
    X_train_t, y_train,
    eval_set=[(X_val_t, y_val)],
    verbose=True,   # can set False if you don't want logs
)

# Freeze the tree count found by early stopping and switch early stopping
# off, so a later plain `final_pipeline.fit(X, y)` (no eval_set) still works.
final_model.set_params(
    n_estimators=final_model.best_iteration + 1,
    early_stopping_rounds=None,
)

# Re-assemble the already-fitted steps into one pipeline for predict/pickle.
final_pipeline = Pipeline(preprocess.steps + [('model', final_model)])

print("\n--- Final model trained with early stopping ---")


In [None]:
# --- 4. PERMUTATION IMPORTANCE ---
# Fits a *copy* of the final pipeline on data before `split_date` and measures
# permutation importance of every input column on the data from `split_date` on.
from sklearn.base import clone

X_test_pi, y_test_pi = X_full.loc[split_date:], y_full.loc[split_date:]
X_train_pi, y_train_pi = X_full.loc[:split_date], y_full.loc[:split_date]
# Label slices are inclusive on both ends, so rows dated exactly `split_date`
# would land in train AND test; keep them in the test set only.
boundary_overlap = X_train_pi.index.isin(X_test_pi.index)
X_train_pi, y_train_pi = X_train_pi.loc[~boundary_overlap], y_train_pi.loc[~boundary_overlap]

# Work on an unfitted clone: re-fitting `final_pipeline` itself would replace
# the model trained in section 3, and that object is what section 5 pickles.
pi_pipeline = clone(final_pipeline)
pi_pipeline.fit(X_train_pi, y_train_pi)
pi_result = permutation_importance(pi_pipeline, X_test_pi, y_test_pi, n_repeats=10, random_state=42, n_jobs=-1)


# Columns that survived the pipeline's feature-selection step (informational).
selected_features = X_train_pi.columns[pi_pipeline.named_steps['feature_select'].get_support()]

# permutation_importance shuffles the pipeline's *input* columns, so the
# result arrays line up with X_test_pi.columns -- not with `selected_features`,
# which is a shorter subset.
feature_names = X_test_pi.columns
sorted_idx = pi_result.importances_mean.argsort()[::-1]
print("\nPermutation Importance:")
for i in sorted_idx[:10]:
    print(f" {feature_names[i]}: {pi_result.importances_mean[i]:.4f} +/- {pi_result.importances_std[i]:.4f}")

In [None]:
# --- 5. SAVE FINAL MODEL ---
# Pickles `final_pipeline` to disk, creating the target directory if needed.
# NOTE(review): absolute, machine-specific directory; consider a configurable
# MODEL_DIR so the notebook is portable.
model_dir = "C:\\Users\\epoch_bpjmdqk\\Documents\\Code\\models"
os.makedirs(model_dir, exist_ok=True)
# os.path.join instead of a hand-written "\\" separator, so the file is placed
# inside `model_dir` whatever the platform's separator is.
model_filename = os.path.join(model_dir, "final_xgb_optuna_pipeline.pkl")
with open(model_filename, 'wb') as f:
    pickle.dump(final_pipeline, f)
print(f"\n✅ Final production pipeline saved as '{model_filename}'")