In [1]:
import sys
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict

# Add parent directory to path to access custom modules
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.stock_features import create_target_variable

In [2]:
# --- 1. DATA PREPARATION ---
# Loads the prepared stock/macro dataset, builds the binary target for
# `target_ticker`, and produces a chronological train/test split.
#
# NOTE(review): this is an absolute, machine-specific path. Consider deriving
# it from a configurable project/data directory so the notebook can be re-run
# on another machine.
data_path = r'C:\Users\epoch_bpjmdqk\Documents\Code\data\processed\stock_and_macro.csv'
try:
    data = pd.read_csv(data_path, index_col='Date', parse_dates=True)
except FileNotFoundError as err:
    # Re-raise instead of calling sys.exit(): a regular exception stops
    # "Run All" with a clear traceback and names the path that was tried.
    raise FileNotFoundError(
        f"Error: The data file was not found at '{data_path}'. Please update the file path."
    ) from err

# Define the optimal data preparation parameters from the coarse modeling stage
results = []
window = 10
threshold = 0.005
target_ticker = 'WMT'
split_date = '2021-01-01'

# Create the target variable based on the selected parameters.
# A copy is passed so the raw `data` frame stays untouched.
data_target = create_target_variable(data.copy(), target_ticker, window=window, threshold=threshold)

# Define columns to drop to create the feature set (X)
target_col_name = f'{target_ticker}_Target'
target_return_col_name = f'{target_ticker}_target_return_{window}D_{threshold}'
columns_to_drop = [
    target_col_name,
    target_return_col_name,
    f'Open_{target_ticker}',
    f'High_{target_ticker}',
    f'Low_{target_ticker}',
    f'Close_{target_ticker}'
]

# Drop rows with NaN values (reassignment instead of inplace mutation keeps the
# cell idempotent when re-run).
data_target = data_target.dropna()

# `errors='ignore'` silently skips names that do not exist. If the target-return
# column were named differently it would stay in X and leak the label, so make
# any skipped column visible.
missing_drop_columns = [col for col in columns_to_drop if col not in data_target.columns]
if missing_drop_columns:
    print(f"Warning: columns not found and therefore not dropped: {missing_drop_columns}")

X = data_target.drop(columns=columns_to_drop, errors='ignore')
y = data_target[target_col_name]

# Create a fixed, chronological training and testing set for the final evaluation.
# Label slices (`.loc[:d]` / `.loc[d:]`) are inclusive on both ends, which would
# place a row dated exactly on `split_date` in BOTH sets. A strict boolean mask
# guarantees the two sets are disjoint: train < split_date <= test.
# NOTE(review): presumably the target looks `window` rows ahead (confirm in
# create_target_variable); if so, the last `window` training labels depend on
# prices from the test period and may warrant an embargo.
is_train_row = X.index < pd.Timestamp(split_date)
X_train_full = X.loc[is_train_row].copy()
y_train_full = y.loc[is_train_row].copy()
X_test_full = X.loc[~is_train_row].copy()
y_test_full = y.loc[~is_train_row].copy()

# Class imbalance in the training set (used as a positive-class weight by some models).
n_negative = int((y_train_full == 0).sum())
n_positive = int((y_train_full == 1).sum())
if n_positive == 0:
    raise ValueError("Training set contains no positive (1) samples; cannot compute class ratio.")
neg_to_pos_ratio = n_negative / n_positive

print(f"\n--- Data setup for Window={window}, Threshold={threshold} ---")
print(f"Class imbalance ratio (0/1): {neg_to_pos_ratio:.2f}")


--- Data setup for Window=10, Threshold=0.005 ---
Class imbalance ratio (0/1): 0.96


In [3]:
# --- 2. DEFINE REFINED EXPERIMENT CONFIGURATIONS ---
# Each entry describes one model family to fine-tune:
#   model_name     - label used in printed output and the saved file name
#   model_class    - estimator class, instantiated with `initial_params`
#   initial_params - fixed constructor arguments (seed, class weighting, ...)
#   param_grid     - hyperparameter grid, narrowed using the coarse search results
# Only RandomForest is currently active; the other configurations are kept
# below, commented out, so they can be re-enabled.
refined_experiment_configs = [
    # # Logistic Regression: Good baseline model.
    # {
    #     'model_name': 'LogisticRegression',
    #     'model_class': LogisticRegression,
    #     'initial_params': {'random_state': 42, 'class_weight': 'balanced'},
    #     # Refined param grid based on coarse search results
    #     'param_grid': {
    #         'C': [0.05, 0.1, 0.5, 1.0],
    #         'solver': ['liblinear']
    #     }
    # },
    # # XGBoost: A powerful, gradient-boosting model often used for tabular data.
    # {
    #     'model_name': 'XGBoost',
    #     'model_class': XGBClassifier,
    #     'initial_params': {'eval_metric': 'logloss', 'use_label_encoder': False, 'random_state': 42, 'scale_pos_weight': neg_to_pos_ratio},
    #     # Refined param grid based on coarse search results
    #     'param_grid': {
    #         'n_estimators': [75, 100, 125, 150],
    #         'learning_rate': [0.08, 0.1, 0.12],
    #         'max_depth': [3, 4],
    #         'subsample': [0.8, 1.0],
    #         'colsample_bytree': [0.8, 1.0]
    #     }
    # },
    # # CatBoost: Another robust gradient-boosting model that handles categorical features automatically.
    # {
    #     'model_name': 'CatBoost',
    #     'model_class': CatBoostClassifier,
    #     'initial_params': {'verbose': False, 'random_state': 42, 'early_stopping_rounds': 50, 'class_weights': [1, neg_to_pos_ratio]},
    #     # Refined param grid based on coarse search results
    #     'param_grid': {
    #         'n_estimators': [75, 100, 125, 150],
    #         'learning_rate': [0.08, 0.1, 0.12],
    #         'depth': [3, 4]
    #     }
    # },
    # RandomForest: ensemble of trees, tuned over forest size, depth and
    # node-size constraints.
    dict(
        model_name='RandomForest',
        model_class=RandomForestClassifier,
        initial_params=dict(random_state=42, class_weight='balanced'),
        param_grid=dict(
            n_estimators=[100, 150, 200, 250],
            max_depth=[4, 5, 6],
            min_samples_split=[2, 3],
            min_samples_leaf=[1, 2],
        ),
    ),
]

In [4]:
# --- 3. FINE-TUNING LOOP WITH TIME-SERIES CROSS-VALIDATION ---
# For every configured model: run a nested, chronological cross-validation on
# the training period. The outer folds estimate out-of-sample performance; the
# inner GridSearchCV picks hyperparameters using only each outer fold's
# training slice.

# Outer splitter: chronological folds. It does not depend on the experiment,
# so it is created once rather than on every loop iteration.
# NOTE(review): TimeSeriesSplit accepts a `gap` argument. If the target looks
# `window` rows ahead (presumably — confirm in create_target_variable), labels
# at the end of each training fold overlap the following test fold; consider
# `gap=window` for both the outer and inner splitters.
tscv = TimeSeriesSplit(n_splits=5)

# Per-model fold metrics, keyed by model name, so results are not lost when
# more than one configuration is enabled.
cv_results_by_model = {}

for exp in refined_experiment_configs:
    model_name = exp['model_name']
    model_class = exp['model_class']
    initial_params = exp['initial_params']
    param_grid = exp['param_grid']

    print(f"\n--- Starting Time-Series Cross-Validation for {model_name} ---")

    cv_metrics = defaultdict(list)

    # Outer cross-validation loop: each fold trains on an expanding window and
    # tests on the block that immediately follows it.
    for fold, (train_index, test_index) in enumerate(tscv.split(X_train_full)):
        X_train, X_test = X_train_full.iloc[train_index], X_train_full.iloc[test_index]
        y_train, y_test = y_train_full.iloc[train_index], y_train_full.iloc[test_index]

        print(f"Fold {fold+1}/{tscv.n_splits}: Training on {len(X_train)} samples, testing on {len(X_test)} samples")

        # Inner grid search, validated with its own chronological split so that
        # hyperparameters are chosen without seeing this fold's test block.
        grid_search_fold = GridSearchCV(
            estimator=model_class(**initial_params),
            param_grid=param_grid,
            scoring='f1_macro', # Use macro F1 for a balanced evaluation
            cv=TimeSeriesSplit(n_splits=2), # Nested TimeSeriesSplit for internal CV
            n_jobs=-1
        )
        grid_search_fold.fit(X_train, y_train)
        best_model_fold = grid_search_fold.best_estimator_

        # Evaluate this fold's best model on the fold's held-out block
        y_pred = best_model_fold.predict(X_test)

        # Collect metrics for this fold. precision/recall/f1 use the default
        # binary averaging (positive class = 1). `f1_macro` is recorded as well
        # because it is the metric the grid search optimises, which makes the
        # outer-fold estimate directly comparable with `best_score_`.
        cv_metrics['accuracy'].append(accuracy_score(y_test, y_pred))
        cv_metrics['precision'].append(precision_score(y_test, y_pred, zero_division=0))
        cv_metrics['recall'].append(recall_score(y_test, y_pred, zero_division=0))
        cv_metrics['f1'].append(f1_score(y_test, y_pred, zero_division=0))
        cv_metrics['f1_macro'].append(f1_score(y_test, y_pred, average='macro', zero_division=0))
        cv_metrics['best_params'].append(grid_search_fold.best_params_)

    cv_results_by_model[model_name] = dict(cv_metrics)

    print("\n--- Cross-Validation Results Summary ---")
    for metric, values in cv_metrics.items():
        if metric == 'best_params':
            # We don't average parameters, just print a list of the best found per fold
            print(f"Best params per fold: {values}")
        else:
            print(f"Average {metric}: {np.mean(values):.4f} (+/- {np.std(values):.4f})")



--- Starting Time-Series Cross-Validation for RandomForest ---
Fold 1/5: Training on 455 samples, testing on 451 samples
Fold 2/5: Training on 906 samples, testing on 451 samples
Fold 3/5: Training on 1357 samples, testing on 451 samples
Fold 4/5: Training on 1808 samples, testing on 451 samples
Fold 5/5: Training on 2259 samples, testing on 451 samples

--- Cross-Validation Results Summary ---
Average accuracy: 0.5162 (+/- 0.0600)
Average precision: 0.4753 (+/- 0.2408)
Average recall: 0.1817 (+/- 0.1800)
Average f1: 0.2351 (+/- 0.1827)
Best params per fold: [{'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}, {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}, {'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}, {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}, {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 200}]


In [5]:
# --- 4. FINAL MODEL TRAINING AND EVALUATION ---
# Select the configuration to finalise explicitly instead of relying on
# variables left over from the cross-validation loop. The last entry of
# `refined_experiment_configs` is used, which is the same configuration the
# leftover loop variables pointed at.
if not refined_experiment_configs:
    raise ValueError("refined_experiment_configs is empty; there is no model to train.")
final_config = refined_experiment_configs[-1]
model_name = final_config['model_name']
model_class = final_config['model_class']
initial_params = final_config['initial_params']
param_grid = final_config['param_grid']

# Chronological 5-fold splitter for the final hyperparameter search, created
# here so this cell does not depend on a splitter defined in another cell.
final_cv = TimeSeriesSplit(n_splits=5)

# Re-run a final GridSearchCV on the full training data to find the best parameters overall.
# With the default refit=True, `best_estimator_` is refitted on all of X_train_full.
final_grid_search = GridSearchCV(
    estimator=model_class(**initial_params),
    param_grid=param_grid,
    scoring='f1_macro',
    cv=final_cv,
    n_jobs=-1
)
final_grid_search.fit(X_train_full, y_train_full)
best_model = final_grid_search.best_estimator_

# Evaluate the final best model on the unseen test set.
# zero_division=0 matches the per-fold metrics and avoids warnings when a
# class is never predicted.
y_pred_final = best_model.predict(X_test_full)
report_str = classification_report(y_test_full, y_pred_final, zero_division=0)

print(f"\n--- Final Model Evaluation on Unseen Test Data for {model_name} ---")
print(f"Best parameters found: {final_grid_search.best_params_}")
print(f"Best cross-validation score: {final_grid_search.best_score_:.2f}")
print("\n--- Final Classification Report ---")
print(report_str)


--- Final Model Evaluation on Unseen Test Data for RandomForest ---
Best parameters found: {'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation score: 0.43

--- Final Classification Report ---
              precision    recall  f1-score   support

           0       0.56      0.59      0.58       250
           1       0.56      0.53      0.54       243

    accuracy                           0.56       493
   macro avg       0.56      0.56      0.56       493
weighted avg       0.56      0.56      0.56       493



In [6]:
# --- 5. SAVE THE FINAL MODEL ---
# Persists `best_model` to disk; the file name encodes the model name and the
# target parameters (window, threshold) it was trained with.
#
# NOTE(review): absolute, machine-specific directory. Consider deriving it
# from a configurable project directory.
model_dir = "C:\\Users\\epoch_bpjmdqk\\Documents\\Code\\models"
os.makedirs(model_dir, exist_ok=True)

# Build the path with os.path.join rather than hand-concatenating separators,
# and name the file after `model_name` (the model that was actually evaluated)
# instead of a leftover loop variable.
model_filename = os.path.join(model_dir, f"{model_name.lower()}_w{window}_t{threshold}.pkl")

# Save the best model using pickle.
# Only unpickle this file from a trusted location: pickle.load can execute
# arbitrary code.
with open(model_filename, 'wb') as file:
    pickle.dump(best_model, file)
print(f"\n✅ Refined model saved as '{model_filename}'")


✅ Refined model saved as 'C:\Users\epoch_bpjmdqk\Documents\Code\models\randomforest_w10_t0.005.pkl'
