In [1]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, mean_squared_error, f1_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LinearRegression
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier, CatBoostRegressor
from collections import defaultdict
from sklearn.impute import SimpleImputer
import pickle

# Add parent directory to path to access custom modules
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.stock_features import create_target_variable
from src.modelling_functions import create_multi_class_target

In [2]:
# --- 1. Load Data ---
try:
    data = pd.read_csv(r'C:\Users\epoch_bpjmdqk\Documents\Code\data\processed\consumer_staples_data.csv', index_col='Date', parse_dates=True)
except FileNotFoundError:
    print("Error: The data file was not found. Please update the file path.")
    sys.exit()

# Define the target and split date
target_ticker = 'WMT'
# Split date set earlier to ensure test set is not empty with longer windows
split_date = '2023-01-01'

In [3]:
# --- 2. DEFINE THE EXPERIMENTS ---
# Define feature engineering configurations
feature_configs = {
    'baseline': None,
    'poly_2': {'name': 'PolynomialFeatures', 'params': {'degree': 2, 'include_bias': False}},
    'pca_3': {'name': 'PCA', 'params': {'n_components': 3}},
}

# Define target configurations
target_configs = [
    {'type': 'binary', 'window': 5, 'threshold': 0.005},
    {'type': 'multi_class', 'window': 5, 'threshold': 0.01},
    {'type': 'regression', 'window': 5},
]

# Define model configurations for classification and regression
models_cls = [
    {'name': 'XGBoost', 'class': XGBClassifier, 'params': {'eval_metric': 'logloss', 'random_state': 42, 'n_estimators': 100}},
    {'name': 'CatBoost', 'class': CatBoostClassifier, 'params': {'verbose': False, 'random_state': 42, 'n_estimators': 100}},
    {'name': 'RandomForest', 'class': RandomForestClassifier, 'params': {'random_state': 42, 'n_estimators': 100}},
]

models_reg = [
    {'name': 'XGBoost', 'class': XGBRegressor, 'params': {'objective': 'reg:squarederror', 'random_state': 42, 'n_estimators': 100}},
    {'name': 'LinearRegression', 'class': LinearRegression, 'params': {}},
]

In [None]:
# --- 3. THE EXPERIMENTAL LOOP ---
all_results = []

for target_conf in target_configs:
    # A. Create the target variable
    data_target = data.copy()
    
    if target_conf['type'] == 'binary':
        data_target = create_target_variable(data_target, target_ticker, window=target_conf['window'], threshold=target_conf['threshold'])
        target_col = f'{target_ticker}_Target'
        model_list = models_cls
    elif target_conf['type'] == 'multi_class':
        # Custom multi-class target creation
        # (You need to implement this function)
        data_target = create_multi_class_target(data_target, target_ticker, window=target_conf['window'], threshold=target_conf['threshold'])
        target_col = f'{target_ticker}_Target_Multi'
        model_list = models_cls
    else: # regression
        data_target[f'{target_ticker}_target_return'] = data_target[f'Close_{target_ticker}'].pct_change(periods=target_conf['window']).shift(-target_conf['window'])
        target_col = f'{target_ticker}_target_return'
        model_list = models_reg

    # Drop rows with NaN in the target and drop target-related columns
    data_target.dropna(subset=[target_col], inplace=True)
    columns_to_drop = [col for col in data_target.columns if target_ticker in col and 'Target' not in col]
    X_full = data_target.drop(columns=columns_to_drop, errors='ignore')
    y_full = data_target[target_col]

    # Split data chronologically
    X_train_full = X_full.loc[:split_date]
    y_train_full = y_full.loc[:split_date]
    X_test_full = X_full.loc[split_date:]
    y_test_full = y_full.loc[split_date:]

    # B. Loop through feature engineering configurations
for feat_name, feat_config in feature_configs.items():
    print(f"\n--- Starting Experiment: Target={target_conf['type']}, Features={feat_name} ---")

    # Create a copy to avoid side effects
    X_train_transformed = X_train_full.copy()
    X_test_transformed = X_test_full.copy()

    # CRITICAL: Convert inf to nan before imputation
    X_train_transformed.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_test_transformed.replace([np.inf, -np.inf], np.nan, inplace=True)

    # D. Apply feature engineering inside the loop to prevent leakage
    if feat_name != 'baseline':
        # Create an imputer to handle NaNs and Infs before other transformations
        # Using 'mean' as the strategy, you could also use 'median'
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

        if feat_name == 'poly_2':
            # Apply imputer and then the polynomial features
            # The imputer learns from the training data and applies to both
            X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_transformed))
            X_test_imputed = pd.DataFrame(imputer.transform(X_test_transformed))
            
            transformer = PolynomialFeatures(**feat_config['params'])
            X_train_transformed = pd.DataFrame(transformer.fit_transform(X_train_imputed))
            X_test_transformed = pd.DataFrame(transformer.transform(X_test_imputed))

        elif feat_name == 'pca_3':
            # Apply imputer, then standard scaler, then PCA
            imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
            X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_transformed))
            X_test_imputed = pd.DataFrame(imputer.transform(X_test_transformed))
            
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train_imputed)
            X_test_scaled = scaler.transform(X_test_imputed)

            transformer = PCA(**feat_config['params'])
            X_train_transformed = pd.DataFrame(transformer.fit_transform(X_train_scaled))
            X_test_transformed = pd.DataFrame(transformer.transform(X_test_scaled))

        # E. Loop through models
        for model_conf in model_list:
            print(f"-> Running model: {model_conf['name']}")
            
            # F. Apply walk-forward validation (on the training data)
            tscv = TimeSeriesSplit(n_splits=5)
            cv_results = defaultdict(list)
            
            for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train_transformed)):
                X_train, y_train = X_train_transformed.iloc[train_idx], y_train_full.iloc[train_idx]
                X_val, y_val = X_train_transformed.iloc[val_idx], y_train_full.iloc[val_idx]
                
                model = model_conf['class'](**model_conf['params'])
                
                if model_conf['name'] in ['XGBoost', 'CatBoost'] and target_conf['type'] in ['binary', 'multi_class']:
                    neg_to_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()
                    model.set_params(scale_pos_weight=neg_to_pos_ratio)
                
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                
                if target_conf['type'] == 'regression':
                    cv_results['rmse'].append(np.sqrt(mean_squared_error(y_val, y_pred)))
                else:
                    cv_results['f1'].append(f1_score(y_val, y_pred, average='macro', zero_division=0))
            
            # G. Store results
            avg_performance = np.mean(cv_results['f1']) if 'f1' in cv_results else np.mean(cv_results['rmse'])
            all_results.append({
                'Feature_Set': feat_name,
                'Target_Type': target_conf['type'],
                'Window': target_conf['window'],
                'Model': model_conf['name'],
                'CV_Performance': avg_performance,
                'Model_Params': model_conf['params']
            })


--- Starting Experiment: Target=binary, Features=baseline ---
-> Running model: XGBoost
-> Running model: CatBoost
-> Running model: RandomForest

--- Starting Experiment: Target=binary, Features=poly_2 ---


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:
# --- 4. Display Final Summary ---
results_df = pd.DataFrame(all_results)
results_df['CV_Performance'] = results_df['CV_Performance'].round(4)
print("\n--- Final Coarse Modeling Results Summary ---")
print(results_df.sort_values(by='CV_Performance', ascending=False).to_string())