In [43]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

from src.stock_features import create_target_variable

In [44]:
# Load the prepared data
data = pd.read_csv(r'C:\Users\epoch_bpjmdqk\Documents\Code\data\processed\stock_and_macro.csv', index_col='Date', parse_dates=True)

In [None]:
# --- 1. DEFINE YOUR EXPERIMENTS ---
experiment_configs = [
    {
        'model_name': 'XGBoost',
        'model_class': XGBClassifier,
        'initial_params': {'eval_metric': 'logloss', 'random_state': 42},
        'param_grid': {
            'n_estimators': [100], 
            'learning_rate': [0.1], 
            'max_depth': [3],
        }
    },
    {
        'model_name': 'CatBoost',
        'model_class': CatBoostClassifier,
        'initial_params': {'verbose': False, 'random_state': 42, 'early_stopping_rounds': 50},
        'param_grid': {
            'n_estimators': [100], 
            'learning_rate': [0.1], 
            'depth': [3],
        }
    },
    {
        'model_name': 'RandomForest',
        'model_class': RandomForestClassifier,
        'initial_params': {'random_state': 42, 'class_weight': 'balanced'},
        'param_grid': {
            'n_estimators': [100, 200], 
            'max_depth': [5, 10],
        }
    },
    {
        'model_name': 'LogisticRegression',
        'model_class': LogisticRegression,
        'initial_params': {'random_state': 42, 'class_weight': 'balanced'},
        'param_grid': {
            'C': [0.1, 1.0, 10.0], 
            'solver': ['liblinear']
        }
    }
]

# Define the different hyperparameters for the data preparation step
window_sizes = [5, 10]
thresholds = [0.005, 0.01]

In [None]:
# --- 2. THE EXPERIMENTAL LOOP (STAGE 1: MODEL SCREENING) ---
results = []
target_ticker = 'WMT'
split_date = '2021-01-01'

for window in window_sizes:
    for threshold in thresholds:
        # A. Dynamically create the target variable for this experiment
        data_target = create_target_variable(data.copy(), target_ticker, window=window, threshold=threshold)

        # B. Separate features (X) and target (y)
        target_col_name = f'{target_ticker}_Target'
        # Dynamically create the target return column name to match the function's output
        target_return_col_name = f'{target_ticker}_target_return_{window}D_{threshold}'
        
        columns_to_drop = [
            target_col_name,
            target_return_col_name,
            f'Open_{target_ticker}',
            f'High_{target_ticker}',
            f'Low_{target_ticker}',
            f'Close_{target_ticker}'
        ]

        X = data_target.drop(columns=columns_to_drop)
        y = data_target[target_col_name]
        
        X_train = X.loc[:split_date].copy()
        y_train = y.loc[:split_date].copy()
        X_test = X.loc[split_date:].copy()
        y_test = y.loc[split_date:].copy()

        # C. Handle Class Imbalance
        neg_to_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()
        print(f"\n--- Data setup for Window={window}, Threshold={threshold} ---")
        print(f"Class imbalance ratio (0/1): {neg_to_pos_ratio:.2f}")

        for exp in experiment_configs:
            print(f"-> Running model: {exp['model_name']}")
            
            # Create a new model instance for this run
            model = exp['model_class'](**exp['initial_params'])
            
            # Get the param_grid and add the scale_pos_weight if necessary
            param_grid = exp['param_grid'].copy()
            if exp['model_name'] in ['XGBoost', 'CatBoost']:
                param_grid['scale_pos_weight'] = [neg_to_pos_ratio]

            # D. Fit with a lightweight GridSearchCV
            grid_search = GridSearchCV(
                estimator=model,
                param_grid=param_grid,
                scoring='f1_macro',
                cv=3,
                verbose=0,
                n_jobs=-1
            )
            
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # E. Evaluate the model and collect results
            y_pred = best_model.predict(X_test)
            report = classification_report(y_test, y_pred, output_dict=True)
            
            # F. Extract and store top features
            top_features = {}
            if hasattr(best_model, 'feature_importances_'):
                importances = pd.Series(best_model.feature_importances_, index=X_train.columns)
                top_features = importances.nlargest(10).to_dict()
            elif hasattr(best_model, 'coef_'):
                coefficients = pd.Series(best_model.coef_[0], index=X_train.columns)
                top_features = coefficients.abs().nlargest(10).to_dict()
            
            print("   Top 10 Features (or Coefficients):")
            for feature, score in top_features.items():
                print(f"   - {feature}: {score:.4f}")
            
            results.append({
                'Model': exp['model_name'],
                'Window': window,
                'Threshold': threshold,
                'Test_Accuracy': report['accuracy'],
                'Test_Precision_1': report['1']['precision'],
                'Test_Recall_1': report['1']['recall'],
                'Test_F1_1': report['1']['f1-score'],
                'Best_Params': grid_search.best_params_,
                'Top_Features': top_features
            })

# --- 3. DISPLAY FINAL RESULTS ---
results_df = pd.DataFrame(results)
# Sort the results by F1 score in descending order and reset the index
results_df = results_df.sort_values(by='Test_F1_1', ascending=False).reset_index(drop=True)
print("\n--- Final Experiment Results Summary ---")
print(results_df.to_string())



--- Data setup for Window=5, Threshold=0.005 ---
Class imbalance ratio (0/1): 1.21
-> Running model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


-> Running model: CatBoost
-> Running model: RandomForest
-> Running model: LogisticRegression

--- Data setup for Window=5, Threshold=0.01 ---
Class imbalance ratio (0/1): 1.88
-> Running model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


-> Running model: CatBoost
-> Running model: RandomForest
-> Running model: LogisticRegression

--- Data setup for Window=10, Threshold=0.005 ---
Class imbalance ratio (0/1): 0.97
-> Running model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


-> Running model: CatBoost
-> Running model: RandomForest
-> Running model: LogisticRegression

--- Data setup for Window=10, Threshold=0.01 ---
Class imbalance ratio (0/1): 1.30
-> Running model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


-> Running model: CatBoost
-> Running model: RandomForest
-> Running model: LogisticRegression

--- Final Experiment Results Summary ---
                 Model  Window  Threshold  Test_Accuracy  Test_Precision_1  Test_Recall_1  Test_F1_1                                                                                          Best_Params
0   LogisticRegression      10      0.005       0.548708          0.517467       0.975309   0.676177                                                                    {'C': 0.1, 'solver': 'liblinear'}
1   LogisticRegression      10      0.010       0.491054          0.454348       0.976636   0.620178                                                                    {'C': 1.0, 'solver': 'liblinear'}
2   LogisticRegression       5      0.005       0.497018          0.465596       0.910314   0.616085                                                                    {'C': 1.0, 'solver': 'liblinear'}
3              XGBoost      10      0.005       0.57455

In [47]:
# Sort the results by F1 score in descending order and reset the index
results_df = results_df.sort_values(by='Test_F1_1', ascending=False).reset_index(drop=True)
print("\n--- Final Experiment Results Summary ---")
print(results_df.to_string())


--- Final Experiment Results Summary ---
                 Model  Window  Threshold  Test_Accuracy  Test_Precision_1  Test_Recall_1  Test_F1_1                                                                                          Best_Params
0   LogisticRegression      10      0.005       0.548708          0.517467       0.975309   0.676177                                                                    {'C': 0.1, 'solver': 'liblinear'}
1   LogisticRegression      10      0.010       0.491054          0.454348       0.976636   0.620178                                                                    {'C': 1.0, 'solver': 'liblinear'}
2   LogisticRegression       5      0.005       0.497018          0.465596       0.910314   0.616085                                                                    {'C': 1.0, 'solver': 'liblinear'}
3              XGBoost      10      0.005       0.574553          0.560166       0.555556   0.557851  {'learning_rate': 0.1, 'max_depth': 3, 'n_estima

In [48]:

#print all data coloums
print("\n--- All Data Columns ---")
print(data.columns.tolist())



--- All Data Columns ---
['Close_COST', 'Close_KO', 'Close_PEP', 'Close_PG', 'Close_WMT', 'Close_^GSPC', 'High_COST', 'High_KO', 'High_PEP', 'High_PG', 'High_WMT', 'High_^GSPC', 'Low_COST', 'Low_KO', 'Low_PEP', 'Low_PG', 'Low_WMT', 'Low_^GSPC', 'Open_COST', 'Open_KO', 'Open_PEP', 'Open_PG', 'Open_WMT', 'Open_^GSPC', 'Volume_COST', 'Volume_KO', 'Volume_PEP', 'Volume_PG', 'Volume_WMT', 'Volume_^GSPC', 'COST_HighLow_Range', 'COST_OpenClose_Range', 'COST_Close_to_Range_Ratio', 'COST_True_Range', 'COST_ATR14', 'COST_Volume_Daily_Change', 'COST_Volume_MA_20D', 'COST_Volume_MA_Ratio', 'COST_OBV', 'COST_RSI14', 'COST_MACD_Line', 'COST_MACD_Signal', 'COST_MACD_Hist', 'COST_SMA_10', 'COST_SMA_20', 'COST_SMA_50', 'COST_EMA_12', 'COST_EMA_26', 'COST_BB_Middle20', 'COST_BB_Upper20', 'COST_BB_Lower20', 'COST_BB_Bandwidth20', 'COST_BB_PctB20', 'COST_Stoch_K_14', 'COST_Stoch_D_14_3', 'COST_PlusDI_14', 'COST_MinusDI_14', 'COST_DX_14', 'COST_ADX_14', 'KO_HighLow_Range', 'KO_OpenClose_Range', 'KO_Close_