In [None]:
# Import necessary libraries
import sys
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

# Make the parent of the kernel's current working directory importable so the
# local `src` package can be found. This depends on where the kernel was started:
# the notebook is presumably run from a folder one level below the project root.
_project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if _project_root not in sys.path:
    sys.path.append(_project_root)
from src.stock_features import create_target_variable

In [None]:
# --- 1. DATA PREPARATION ---
# Location of the processed dataset. The default is the path the notebook was
# originally written against; set the STOCK_DATA_PATH environment variable to
# point at a different copy without editing this cell.
data_path = os.environ.get(
    'STOCK_DATA_PATH',
    r'C:\Users\epoch_bpjmdqk\Documents\Code\data\processed\stock_and_macro_filtered.csv',
)

try:
    # The 'Date' column is parsed and used as the index; the date-based
    # train/test split later in the notebook relies on this.
    data = pd.read_csv(data_path, index_col='Date', parse_dates=True)
except FileNotFoundError as err:
    # Raise a descriptive error rather than calling sys.exit(): the cell still
    # stops, but the reader gets a normal traceback that names the path tried.
    raise FileNotFoundError(
        "Error: The data file was not found. Please update the file path to point to your processed data. "
        f"(looked for: {data_path})"
    ) from err

In [None]:
# --- 2. DEFINE THE EXPERIMENTS ---
# One dictionary per candidate model. Each row below supplies, in order: the
# display name, the estimator class, the constructor arguments that stay fixed,
# and the hyperparameter grid handed to the grid search.
experiment_configs = [
    {
        'model_name': name,
        'model_class': estimator_cls,
        'initial_params': fixed_params,
        'param_grid': grid,
    }
    for name, estimator_cls, fixed_params, grid in (
        (
            'XGBoost',
            XGBClassifier,
            {'eval_metric': 'logloss', 'random_state': 42},
            {'n_estimators': [100], 'learning_rate': [0.1], 'max_depth': [3]},
        ),
        (
            'CatBoost',
            CatBoostClassifier,
            {'verbose': False, 'random_state': 42, 'early_stopping_rounds': 50},
            {'n_estimators': [100], 'learning_rate': [0.1], 'depth': [3]},
        ),
        (
            'RandomForest',
            RandomForestClassifier,
            {'random_state': 42, 'class_weight': 'balanced'},
            {'n_estimators': [100, 200], 'max_depth': [5, 10]},
        ),
        (
            'LogisticRegression',
            LogisticRegression,
            {'random_state': 42, 'class_weight': 'balanced'},
            {'C': [0.1, 1.0, 10.0], 'solver': ['liblinear']},
        ),
    )
]

# Data-preparation settings swept alongside the models; each combination is
# passed to create_target_variable as its `window` and `threshold` arguments.
window_sizes = [5, 10]
thresholds = [0.005, 0.01]

In [None]:
# --- 3. THE EXPERIMENTAL LOOP (MODEL SCREENING) ---
# For every (window, threshold) pair: rebuild the target, split the rows
# chronologically at `split_date`, tune each configured model with a small grid
# search on the training period, then score the tuned model on the held-out
# period and append one summary row per model to `results`.
results = []
target_ticker = 'WMT'
split_date = '2021-01-01'

for window in window_sizes:
    for threshold in thresholds:
        print(f"\n--- Starting Model Screening for Window={window}, Threshold={threshold} ---")

        # A. Dynamically create the target variable for this experiment.
        # A copy is passed so the loaded `data` frame is never modified between runs.
        data_target = create_target_variable(data.copy(), target_ticker, window=window, threshold=threshold)

        # B. Define features (X) and target (y)
        target_col_name = f'{target_ticker}_Target'
        # NOTE(review): this column name is assumed to match what create_target_variable
        # emits. Because the drop below uses errors='ignore', a mismatch would silently
        # leave the forward-return column in X — confirm against src/stock_features.
        target_return_col_name = f'{target_ticker}_target_return_{window}D_{threshold}'

        # Columns removed from the feature set: the target itself, the return it is
        # derived from, and the target ticker's own OHLC prices.
        columns_to_drop = [
            target_col_name,
            target_return_col_name,
            f'Open_{target_ticker}',
            f'High_{target_ticker}',
            f'Low_{target_ticker}',
            f'Close_{target_ticker}'
        ]

        X = data_target.drop(columns=columns_to_drop, errors='ignore')
        y = data_target[target_col_name]

        # Chronological split. A boolean mask is used instead of the label slices
        # `.loc[:split_date]` / `.loc[split_date:]`, because label slicing is inclusive
        # on both ends and would put any row dated exactly `split_date` into BOTH sets.
        # Here the training set is strictly before the split date; the test set starts on it.
        is_test = X.index >= split_date
        X_train = X.loc[~is_test].copy()
        y_train = y.loc[~is_test].copy()
        X_test = X.loc[is_test].copy()
        y_test = y.loc[is_test].copy()

        # C. Handle Class Imbalance
        n_neg = int((y_train == 0).sum())
        n_pos = int((y_train == 1).sum())
        if n_neg == 0 or n_pos == 0:
            # Without both classes the ratio is undefined (division by zero) or useless,
            # so this (window, threshold) combination is skipped rather than fitted.
            print(f"Skipping: class 0 or class 1 has no training samples (0: {n_neg}, 1: {n_pos}).")
            continue
        neg_to_pos_ratio = n_neg / n_pos
        print(f"Class imbalance ratio (0/1): {neg_to_pos_ratio:.2f}")

        for exp in experiment_configs:
            print(f"-> Running model: {exp['model_name']}")

            # Create a new model instance for this run
            model = exp['model_class'](**exp['initial_params'])

            # Copy the grid before adding to it so the shared config entry is not mutated.
            param_grid = exp['param_grid'].copy()
            if exp['model_name'] in ['XGBoost', 'CatBoost']:
                param_grid['scale_pos_weight'] = [neg_to_pos_ratio]
            # Logistic Regression and RandomForest are configured with
            # `class_weight='balanced'` in their initial_params instead.

            # D. Fit with a lightweight GridSearchCV
            # An explicit TimeSeriesSplit is used: a bare integer `cv=3` gives a classifier
            # stratified K-fold, which would validate on data older than the training
            # folds. TimeSeriesSplit always validates on rows that come after the rows
            # trained on (assumes the index is in chronological order — TODO confirm).
            grid_search = GridSearchCV(
                estimator=model,
                param_grid=param_grid,
                scoring='f1_macro',  # macro-averaged F1 weights both classes equally
                cv=TimeSeriesSplit(n_splits=3),
                verbose=0,
                n_jobs=-1
            )

            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # E. Evaluate the model and collect results
            y_pred = best_model.predict(X_test)
            # NOTE(review): the lookups below assume the labels are the integers 0/1,
            # so the positive class is reported under the key '1' — confirm.
            report = classification_report(y_test, y_pred, output_dict=True)

            # F. Extract and store top features
            top_features = {}
            if hasattr(best_model, 'feature_importances_'):
                # Estimators exposing feature_importances_ (the tree ensembles)
                importances = pd.Series(best_model.feature_importances_, index=X_train.columns)
                top_features = importances.nlargest(10).to_dict()
            elif hasattr(best_model, 'coef_'):
                # Linear models: rank by absolute coefficient (the sign is discarded).
                # NOTE(review): features are not scaled in this cell, so coefficient
                # magnitudes may not be comparable across features.
                coefficients = pd.Series(best_model.coef_[0], index=X_train.columns)
                top_features = coefficients.abs().nlargest(10).to_dict()

            print("   Top 10 Features (or Coefficients):")
            for feature, score in top_features.items():
                print(f"   - {feature}: {score:.4f}")

            results.append({
                'Model': exp['model_name'],
                'Window': window,
                'Threshold': threshold,
                'Test_Accuracy': report['accuracy'],
                'Test_Precision_1': report['1']['precision'],
                'Test_Recall_1': report['1']['recall'],
                'Test_F1_1': report['1']['f1-score'],
                'Best_Params': grid_search.best_params_,
                'Top_Features': top_features
            })

In [None]:
# --- 4. DISPLAY FINAL RESULTS ---
# Tabulate every recorded run, rank by positive-class F1 (best first), renumber
# the rows, and leave out the bulky Top_Features column so the printed table
# stays readable.
results_df = (
    pd.DataFrame(results)
    .sort_values(by='Test_F1_1', ascending=False)
    .reset_index(drop=True)
    .drop(columns=['Top_Features'])
)

print("\n--- Final Experiment Results Summary ---")
print(results_df.to_string())