In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

from stock_features import create_target_variable, prepare_data_for_ml

In [None]:
# Inputs for the data pipeline: the tickers to process, the date range, and the
# file name passed to the pipeline as `output_engineered_csv`.
tickers_list = ['PG', 'KO', 'PEP', 'WMT', 'COST', '^GSPC']
start_date_str, end_date_str = '1986-01-01', '2023-01-01'
output_filename = "consumer_stocks_final_engineered.csv"

# Bundle the keyword arguments, then run the entire pipeline with one call.
pipeline_kwargs = {
    'tickers': tickers_list,
    'start_date': start_date_str,
    'end_date': end_date_str,
    'output_engineered_csv': output_filename,
}
final_engineered_df = prepare_data_for_ml(**pipeline_kwargs)

In [None]:
# --- 1. DEFINE YOUR EXPERIMENTS ---
def _make_experiment(model, window, threshold, model_params=None):
    """Build one experiment definition.

    Args:
        model: Model name, e.g. 'XGBoost' or 'RandomForest'.
        window: Target window passed on as the experiment's 'window'.
        threshold: Target threshold passed on as the experiment's 'threshold'.
        model_params: Optional dict of extra model parameters; copied so every
            experiment owns its own dict.

    Returns:
        A dict with the keys 'model', 'window', 'threshold', 'model_params'.
    """
    return {
        'model': model,
        'window': window,
        'threshold': threshold,
        'model_params': dict(model_params or {}),
    }


# Add new experiments by appending another _make_experiment(...) entry here.
experiments = [
    # Weekly target (5-row window) with a 1% threshold, for both models
    _make_experiment('XGBoost', window=5, threshold=0.01),
    _make_experiment('RandomForest', window=5, threshold=0.01),

    # Different target definition: shorter window, lower threshold
    _make_experiment('XGBoost', window=3, threshold=0.005),
    _make_experiment('RandomForest', window=3, threshold=0.005),

    # Different target definition: longer window, higher threshold
    _make_experiment('XGBoost', window=10, threshold=0.02),
]

In [None]:
# --- 2. THE EXPERIMENTAL LOOP ---
# For every experiment: build the target, split the rows at `split_date`, tune
# the model with a time-ordered grid search, and record hold-out metrics for
# class 1. One dict per completed experiment is appended to `results`.
results = []
target_ticker = 'WMT'
split_date = '2021-01-01'  # rows up to and including this label train; later rows test


def _build_model_and_grid(model_name, neg_to_pos_ratio, model_params):
    """Return ``(estimator, param_grid)`` for one experiment.

    Args:
        model_name: 'XGBoost' or 'RandomForest'.
        neg_to_pos_ratio: Number of class-0 training rows divided by the number
            of class-1 training rows; given to XGBoost as ``scale_pos_weight``.
        model_params: Extra constructor keyword arguments taken from the
            experiment definition. They override the defaults below; any key
            that also appears in the returned grid is set by the grid search.

    Returns:
        Tuple of the unfitted estimator and its hyper-parameter grid.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.
    """
    if model_name == 'XGBoost':
        defaults = {
            'objective': 'binary:logistic',
            'use_label_encoder': False,
            'eval_metric': 'logloss',
            'random_state': 42,
            'scale_pos_weight': neg_to_pos_ratio,
        }
        estimator = xgb.XGBClassifier(**{**defaults, **model_params})
        grid = {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]}
    elif model_name == 'RandomForest':
        defaults = {'random_state': 42, 'class_weight': 'balanced'}
        estimator = RandomForestClassifier(**{**defaults, **model_params})
        grid = {'n_estimators': [100, 200], 'max_depth': [10, 20], 'min_samples_leaf': [1, 5]}
    else:
        # Without this branch an unknown name would silently reuse the model
        # left over from the previous iteration.
        raise ValueError(f"Unknown model {model_name!r}; expected 'XGBoost' or 'RandomForest'")
    return estimator, grid


for exp in experiments:
    print(f"\n--- Running experiment: Model={exp['model']}, Window={exp['window']}, Threshold={exp['threshold']} ---")

    # A. Dynamically create the target variable for this experiment
    data_target = create_target_variable(final_engineered_df.copy(), target_ticker, window=exp['window'], threshold=exp['threshold'])

    # B. Separate features (X) and target (y)
    target_col_name = f'{target_ticker}_Target'
    target_return_col_name = [col for col in data_target.columns if col.startswith(f'{target_ticker}_target_return_')][0]

    # Drop the target, the return column it is paired with, and the target
    # ticker's own OHLC price columns from the feature matrix.
    columns_to_drop = [target_col_name, target_return_col_name, f'Open_{target_ticker}', f'High_{target_ticker}', f'Low_{target_ticker}', f'Close_{target_ticker}']

    X = data_target.drop(columns=columns_to_drop)
    y = data_target[target_col_name]

    X_train = X.loc[:split_date].copy()
    y_train = y.loc[:split_date].copy()

    # Label-based slices include both endpoints, so a row labelled exactly
    # `split_date` would land in train AND test. Keep it in train only.
    X_after = X.loc[split_date:]
    is_new = ~X_after.index.isin(X_train.index)
    X_test = X_after.loc[is_new].copy()
    y_test = y.loc[split_date:].loc[is_new].copy()
    # NOTE(review): if the target for a row is computed from later rows, the
    # last training labels still depend on test-period prices — confirm against
    # create_target_variable and consider dropping the final `window` train rows.

    # C. Handle Class Imbalance
    n_neg = int((y_train == 0).sum())
    n_pos = int((y_train == 1).sum())
    if n_neg == 0 or n_pos == 0:
        # A single-class training set makes the ratio 0 or a division by zero;
        # skip this experiment instead of fitting a degenerate model.
        print(f"Skipping experiment: training set has {n_neg} negative and {n_pos} positive rows")
        continue
    neg_to_pos_ratio = n_neg / n_pos
    print(f"Class imbalance ratio (0/1): {neg_to_pos_ratio:.2f}")

    # D. Set up the correct model and GridSearchCV for this experiment
    model, param_grid = _build_model_and_grid(
        exp['model'], neg_to_pos_ratio, exp.get('model_params') or {}
    )

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='f1_macro',
        # Every validation fold comes after its training rows, so
        # hyper-parameters are never chosen by training on the future.
        cv=TimeSeriesSplit(n_splits=3),
        verbose=3,
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # E. Evaluate the model and collect results
    y_pred = best_model.predict(X_test)

    # The metric functions avoid a KeyError from looking up a '1' row in a
    # report dict; zero_division=0 yields 0.0 when class 1 is never predicted
    # or never present.
    results.append({
        'Model': exp['model'],
        'Window': exp['window'],
        'Threshold': exp['threshold'],
        'Test_Accuracy': accuracy_score(y_test, y_pred),
        'Test_Precision_1': precision_score(y_test, y_pred, pos_label=1, zero_division=0),
        'Test_Recall_1': recall_score(y_test, y_pred, pos_label=1, zero_division=0),
        'Test_F1_1': f1_score(y_test, y_pred, pos_label=1, zero_division=0),
        'Best_Params': grid_search.best_params_,
    })

# --- 3. DISPLAY FINAL RESULTS ---
# Turn the collected per-experiment dicts into one table and print it in full
# under a heading.
results_df = pd.DataFrame(results)
summary_table = results_df.to_string()
print("\n--- Final Experiment Results Summary ---", summary_table, sep="\n")