In [None]:
import sys
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from xgboost import XGBClassifier
from collections import defaultdict
from itertools import product

# Add parent directory to path to access custom modules
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.stock_features import create_target_variable

In [6]:
# --- 1. DATA PREPARATION ---
# Loads the processed feature table, builds the binary target for
# `target_ticker`, keeps only the PCA feature columns, and splits the rows
# chronologically: training rows are strictly before `split_date`, test rows
# are on or after it.

# NOTE(review): machine-specific absolute path; consider deriving it from a
# configurable data directory so the notebook can run on another machine.
data_path = r'C:\Users\epoch_bpjmdqk\Documents\Code\data\processed\consumer_staples_data.csv'
try:
    data = pd.read_csv(data_path, index_col='Date', parse_dates=True)
except FileNotFoundError as exc:
    # Re-raise with an actionable message rather than calling sys.exit():
    # the cell fails with a readable traceback and later cells do not run.
    raise FileNotFoundError(
        f"Error: The data file was not found. Please update the file path. (looked for: {data_path})"
    ) from exc

# Define the optimal data preparation parameters from the coarse modeling stage
window = 5
threshold = 0.005
target_ticker = 'WMT'
split_date = '2021-01-01'

# Create the target variable based on the selected parameters.
# A copy is passed so the raw `data` frame stays untouched.
data_target = create_target_variable(data.copy(), target_ticker, window=window, threshold=threshold)

# Column names associated with the target ticker.
# NOTE: `columns_to_drop` is not applied in this cell, because the feature set
# below is selected by name ('PCA'). It is kept for any later cell that uses it.
target_col_name = f'{target_ticker}_Target'
target_return_col_name = f'{target_ticker}_target_return_{window}D_{threshold}'
columns_to_drop = [
    target_col_name,
    target_return_col_name,
    f'Open_{target_ticker}',
    f'High_{target_ticker}',
    f'Low_{target_ticker}',
    f'Close_{target_ticker}'
]

# Drop rows containing NaN values (reassignment instead of inplace mutation,
# so re-running the cell is idempotent).
data_target = data_target.dropna()

# Filter for 'pca_3' features, as this was the top-performing feature set
features_pca_3 = [col for col in data_target.columns if 'PCA' in col]
X = data_target[features_pca_3]
y = data_target[target_col_name]

# Create a fixed training and testing set for the final evaluation.
# Label slicing (`.loc[:split_date]` / `.loc[split_date:]`) is inclusive on
# both ends, which would place a row dated exactly `split_date` in BOTH sets.
# A half-open boolean mask guarantees the two sets are disjoint.
# NOTE(review): if the target of a row is computed from prices up to `window`
# days ahead, the last training rows overlap the test period — confirm against
# create_target_variable and consider purging them.
train_mask = X.index < split_date
X_train_full = X.loc[train_mask].copy()
y_train_full = y.loc[train_mask].copy()
X_test_full = X.loc[~train_mask].copy()
y_test_full = y.loc[~train_mask].copy()

# Ratio of negative to positive training samples (used to weight classes).
n_negative = int((y_train_full == 0).sum())
n_positive = int((y_train_full == 1).sum())
if n_positive == 0:
    raise ValueError(
        "Training set contains no positive samples; cannot compute the class imbalance ratio."
    )
neg_to_pos_ratio = n_negative / n_positive
print(f"\n--- Data setup for Window={window}, Threshold={threshold} ---")
print(f"Class imbalance ratio (0/1): {neg_to_pos_ratio:.2f}")


--- Data setup for Window=5, Threshold=0.005 ---
Class imbalance ratio (0/1): 1.20


In [7]:
# --- 2. DEFINE REFINED XGBOOST EXPERIMENT CONFIGURATION ---
# Single-model experiment description: which estimator class to build, the
# fixed constructor arguments, and the hyperparameter grid to search.
# The grid is narrowed down from the earlier coarse search.
# Key order inside `param_grid` matters: it determines the order in which
# parameter combinations are generated downstream.
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases —
# confirm against the installed version.
refined_experiment_config = dict(
    model_name='XGBoost',
    model_class=XGBClassifier,
    # Constructor arguments shared by every candidate model
    initial_params=dict(
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42,
        scale_pos_weight=neg_to_pos_ratio,
    ),
    # Refined param grid based on coarse search results and best practices
    param_grid=dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.05, 0.1, 0.2],
        max_depth=[3, 5, 7],
        subsample=[0.7, 0.8, 0.9, 1.0],
        colsample_bytree=[0.7, 0.8, 0.9, 1.0],
        gamma=[0, 0.1, 0.5],
    ),
)

In [8]:
# --- 3. FINE-TUNING AND WALK-FORWARD VALIDATION ---
# For every hyperparameter combination, fit a model on a sliding training
# window and score it on the window that immediately follows. The combination
# with the highest mean macro-F1 across all walks is selected.
model_name = refined_experiment_config['model_name']
model_class = refined_experiment_config['model_class']
initial_params = refined_experiment_config['initial_params']
param_grid = refined_experiment_config['param_grid']

print(f"\n--- Starting Walk-Forward Validation for {model_name} ---")

# Generate all possible parameter combinations
param_combinations = [dict(zip(param_grid.keys(), v)) for v in product(*param_grid.values())]

# Map each string key used in `cv_metrics` back to its parameter dict, so the
# winning combination can be recovered without eval().
params_by_key = {str(params): params for params in param_combinations}

# Define the walk-forward validation settings
train_window_size = 100 # Number of samples in the training window
test_window_size = 20 # Number of samples in the testing window
num_walks = 10 # Number of walk-forward steps

# cv_metrics[str(params)][metric_name] -> list of per-walk scores
cv_metrics = defaultdict(lambda: defaultdict(list))

# Loop through each walk-forward step
for i in range(num_walks):
    # The training window slides forward by one test window per walk
    start_train_index = i * test_window_size
    end_train_index = start_train_index + train_window_size
    start_test_index = end_train_index
    end_test_index = start_test_index + test_window_size

    # Stop once the test window would run past the available training data
    if end_test_index > len(X_train_full):
        break

    X_train_walk = X_train_full.iloc[start_train_index:end_train_index]
    y_train_walk = y_train_full.iloc[start_train_index:end_train_index]
    X_test_walk = X_train_full.iloc[start_test_index:end_test_index]
    y_test_walk = y_train_full.iloc[start_test_index:end_test_index]

    print(f"Walk {i+1}/{num_walks}: Training from index {start_train_index} to {end_train_index}, testing from {start_test_index} to {end_test_index}")

    for params in param_combinations:
        params_key = str(params)

        # Create and train the model with the current parameters
        model = model_class(**initial_params, **params)
        model.fit(X_train_walk, y_train_walk)

        # Make predictions and evaluate
        y_pred = model.predict(X_test_walk)

        # Store metrics for this parameter combination and walk
        cv_metrics[params_key]['accuracy'].append(accuracy_score(y_test_walk, y_pred))
        cv_metrics[params_key]['f1_macro'].append(f1_score(y_test_walk, y_pred, average='macro', zero_division=0))

# Find the best parameters based on average f1_macro score across all walks.
# Strict '>' keeps the first combination encountered when scores tie.
best_params = None
best_avg_f1 = -1
for params_str, metrics in cv_metrics.items():
    avg_f1 = np.mean(metrics['f1_macro'])
    if avg_f1 > best_avg_f1:
        best_avg_f1 = avg_f1
        # Copy so later changes to best_params cannot alter param_combinations
        best_params = dict(params_by_key[params_str])

# Fail here with a clear message instead of letting a later cell crash on
# `**best_params` when no walk could be evaluated.
if best_params is None:
    raise ValueError(
        "Walk-forward validation produced no results: the training data has "
        f"{len(X_train_full)} rows but one walk needs at least "
        f"{train_window_size + test_window_size}, or the parameter grid is empty."
    )

print("\n--- Walk-Forward Validation Results Summary ---")
print(f"Best parameters found through walk-forward validation: {best_params}")
print(f"Average F1-macro score for best parameters: {best_avg_f1:.4f}")


--- Starting Walk-Forward Validation for XGBoost ---


NameError: name 'product' is not defined

In [None]:
# --- 4. FINAL MODEL TRAINING AND EVALUATION ---
# Fit one model with the selected hyperparameters on the entire training
# span, then score it once on the held-out test span.
print("\n--- Training final model on full training data with best parameters ---")

# Fixed constructor arguments plus the winning hyperparameter combination
best_model = model_class(**initial_params, **best_params)
best_model.fit(X_train_full, y_train_full)

# Predictions on data the model has not seen during training or tuning
y_pred_final = best_model.predict(X_test_full)
report_str = classification_report(
    y_test_full,
    y_pred_final,
    zero_division=0,
)

# One print call; the newline separator reproduces the three-line layout
print(
    f"\n--- Final Model Evaluation on Unseen Test Data for {model_name} ---",
    "\n--- Final Classification Report ---",
    report_str,
    sep="\n",
)

In [None]:
# --- 5. SAVE THE FINAL MODEL ---
# Serializes the fitted `best_model` to disk with pickle. The file name encodes
# the model name and the target parameters (`window`, `threshold`).

# NOTE(review): machine-specific absolute path; consider deriving it from a
# configurable project directory so the notebook can run on another machine.
model_dir = "C:\\Users\\epoch_bpjmdqk\\Documents\\Code\\models"

# Ensure the models directory exists
os.makedirs(model_dir, exist_ok=True)

# Build the file path with os.path.join rather than hand-written separators
model_filename = os.path.join(
    model_dir,
    f"{model_name.lower()}_w{window}_t{threshold}.pkl",
)

# Save the best model using pickle.
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
with open(model_filename, 'wb') as file:
    pickle.dump(best_model, file)
print(f"\n✅ Refined model saved as '{model_filename}'")