In [4]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, make_scorer

current_dir = os.getcwd()
if current_dir.endswith("notebooks"):
    project_root = os.path.dirname(current_dir)
elif current_dir.endswith("signal_synthesiser"):
    project_root = current_dir
else:
    project_root = os.path.dirname(current_dir)

data_path = os.path.join(project_root, "data", "processed", "labelled_data.csv")

if os.path.exists(data_path):
    df = pd.read_csv(data_path, index_col=0, parse_dates=True)
    print(f"Data Loaded. Shape: {df.shape}")
else:
    raise FileNotFoundError(f"Could not find data at {data_path}. Run 01_data_prep first.")

Data Loaded. Shape: (2161, 24)


In [5]:
# Prepare Training Data
train_end = "2020-01-01"
train = df[df.index < train_end]

# FEATURES
features = [
    'Signal_RSI', 'Signal_Trend', 'Signal_MACD', 'Signal_Bollinger', # The Strategies
    'Volatility', 'VIX_Norm', 'Is_Friday', 'Volume'                  # The Weather
]
target = 'Target_Label'

X_train = train[features]
y_train = train[target]

print(f"Training with {len(features)} features: {features}")
print(f"Training Samples: {len(X_train)}")

precision_scorer = make_scorer(precision_score)

# ==========================================
# PART 1: Random Forest Optimisation
# ==========================================

print("\n Starting Random Forest Grid Search...")

# The Grid: We test different depths and strictness levels
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],       
    'min_samples_leaf': [10, 50, 100]  
}

rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, class_weight='balanced_subsample'),
    param_grid=rf_params,
    scoring=precision_scorer,
    cv=3,       
    n_jobs=-1,  
    verbose=1
)

rf_grid.fit(X_train, y_train)

print(f" RF Best Params: {rf_grid.best_params_}")
print(f" RF Best Precision: {rf_grid.best_score_:.2%}")

# Save the model
rf_best = rf_grid.best_estimator_
rf_path = os.path.join(project_root, "models", "rf_opt.joblib")
joblib.dump(rf_best, rf_path)
print(f" Saved Optimized RF to {rf_path}")

# ==========================================
# PART 2: XGBoost Optimization
# ==========================================

print("\n Starting XGBoost Grid Search...")

ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)

xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],         
    'learning_rate': [0.01, 0.05, 0.1], 
    'scale_pos_weight': [ratio]    
}

xgb_grid = GridSearchCV(
    estimator=XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    param_grid=xgb_params,
    scoring=precision_scorer,
    cv=3,
    n_jobs=-1,
    verbose=1
)

xgb_grid.fit(X_train, y_train)

print(f" XGB Best Params: {xgb_grid.best_params_}")
print(f" XGB Best Precision: {xgb_grid.best_score_:.2%}")

# Save the model
xgb_best = xgb_grid.best_estimator_
xgb_path = os.path.join(project_root, "models", "xgb_opt.joblib")
joblib.dump(xgb_best, xgb_path)
print(f"Saved Optimized XGBoost to {xgb_path}")

Training with 8 features: ['Signal_RSI', 'Signal_Trend', 'Signal_MACD', 'Signal_Bollinger', 'Volatility', 'VIX_Norm', 'Is_Friday', 'Volume']
Training Samples: 1160

 Starting Random Forest Grid Search...
Fitting 3 folds for each of 18 candidates, totalling 54 fits
 RF Best Params: {'max_depth': 10, 'min_samples_leaf': 10, 'n_estimators': 200}
 RF Best Precision: 54.93%
 Saved Optimized RF to /mnt/c/Users/sohan/Projects/signal_synthesiser/models/rf_opt.joblib

 Starting XGBoost Grid Search...
Fitting 3 folds for each of 18 candidates, totalling 54 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


 XGB Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'scale_pos_weight': np.float64(2.558282208588957)}
 XGB Best Precision: 53.91%
Saved Optimized XGBoost to /mnt/c/Users/sohan/Projects/signal_synthesiser/models/xgb_opt.joblib
