In [None]:
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

from stock_features import prepare_data_for_ml, create_target_variable

warnings.filterwarnings('ignore')

In [None]:
# Data-pipeline configuration: the ticker universe, the date range, and the
# CSV filename handed to the pipeline as `output_engineered_csv`.
tickers_list = ['PG', 'KO', 'PEP', 'WMT', 'COST', '^GSPC']
start_date_str, end_date_str = '1986-01-01', '2023-01-01'
output_filename = "consumer_stocks_final_engineered.csv"

# Collect the keyword arguments in one place so the call itself stays a one-liner.
pipeline_kwargs = dict(
    tickers=tickers_list,
    start_date=start_date_str,
    end_date=end_date_str,
    output_engineered_csv=output_filename,
)

# Single entry point into stock_features; what it downloads/engineers/writes is
# defined in that module (not visible in this notebook).
final_engineered_df = prepare_data_for_ml(**pipeline_kwargs)

In [None]:
# --- Build the target -------------------------------------------------------
# Label construction is delegated to stock_features.create_target_variable; the
# exact meaning of `window` / `threshold` lives in that module (not shown here).
# A copy is passed so the engineered frame from the previous cell is not mutated.
target_ticker = 'WMT'
data_target = create_target_variable(final_engineered_df.copy(), target_ticker, window=5, threshold=0.01)

# --- Separate features (X) and target (y) -----------------------------------
target_col_name = f'{target_ticker}_Target'
target_return_prefix = f'{target_ticker}_target_return_'
target_return_cols = [col for col in data_target.columns if col.startswith(target_return_prefix)]
if not target_return_cols:
    # Fail with a readable message instead of a bare IndexError on `[0]`.
    raise ValueError(f"No column starting with '{target_return_prefix}' found in data_target")
target_return_col_name = target_return_cols[0]

# Remove the label, the return column it was matched with, and the target
# ticker's own OHLC columns from the feature matrix.
columns_to_drop = [
    target_col_name,
    target_return_col_name,
    f'Open_{target_ticker}',
    f'High_{target_ticker}',
    f'Low_{target_ticker}',
    f'Close_{target_ticker}',
]
X = data_target.drop(columns=columns_to_drop)
y = data_target[target_col_name]

# --- Chronological train/test split -----------------------------------------
# Label-based slices (`.loc[:d]` and `.loc[d:]`) are inclusive on both ends, so
# a row dated exactly on the split date would land in BOTH sets. A single
# boolean mask guarantees the two partitions are disjoint and exhaustive.
SPLIT_DATE = '2021-01-01'
is_train = X.index < SPLIT_DATE
X_train = X.loc[is_train].copy()
y_train = y.loc[is_train].copy()
X_test = X.loc[~is_train].copy()
y_test = y.loc[~is_train].copy()
assert len(X_train) + len(X_test) == len(X), "train/test split must partition the data"
# NOTE(review): if the target looks `window` rows ahead, the last few training
# labels depend on prices from the test period — consider an embargo; confirm
# against create_target_variable.

# --- Model and hyperparameter search ----------------------------------------
print(f"\n--- Starting Refined CatBoost Grid Search ---")

# Class imbalance is delegated to CatBoost via `auto_class_weights`, so no
# manual negative/positive ratio is computed here.
catboost_model = CatBoostClassifier(
    random_state=42,
    verbose=0,  # Set to a higher number to see training output
    auto_class_weights='Balanced',
)

# Hyperparameter grid (CatBoost parameter names).
param_grid = {
    'iterations': [150, 200, 250],
    'learning_rate': [0.05, 0.1, 0.2],
    'depth': [3, 5, 7],
}

# An integer `cv` yields contiguous K-fold splits, which on time-ordered rows
# means some folds train on later data and validate on earlier data.
# TimeSeriesSplit always validates on rows that come after the training rows.
# It relies on X_train being in chronological order.
grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=TimeSeriesSplit(n_splits=3),
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# --- Evaluate the refit best model on the held-out period --------------------
y_pred_tuned = best_model.predict(X_test)
print("\n--- Tuned CatBoost Model Evaluation on Test Data ---")
print(classification_report(y_test, y_pred_tuned))

# Print final results
print("\nBest parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))