# In this notebook we train the CatBoost model

In [None]:
# Columns earmarked for removal before CatBoost training, organised by the
# reason each group was judged redundant. The inline remarks are the author's
# own correlation / importance findings. The flat list is assembled from the
# groups in declaration order.
_drop_groups = {
    # Raw OHLC (massively correlated with close)
    "raw_ohlc": ['open', 'high', 'low'],

    # ATR components
    "atr_components": ['high_low', 'high_close', 'low_close'],

    # Helper calculations
    "helper_calculations": [
        'typical_price',   # 99.99% corr with close
        'true_range',      # captured in atr_14
        'volume_mean_20',  # used in ratio
    ],

    # Highly correlated MAs (keep EMA_7, drop others)
    "correlated_moving_averages": [
        'EMA_21',    # 99.99% corr with SMA_20
        'SMA_20',    # 99.99% corr with vwap_24h
        'vwap_24h',  # redundant with SMA_20
        'close_4h',  # 99.99% corr with close
    ],

    # Bollinger components (keep bb_position, width); 99.98% corr with SMA_20
    "bollinger_components": ['bollinger_upper', 'bollinger_lower'],

    # MACD components (keep histogram); 95% corr, histogram captures key info
    "macd_components": ['MACD_line', 'MACD_signal'],

    # Support/resistance (99.85% corr with each other)
    "support_resistance": ['resistance_level', 'support_level'],

    # Zero-importance binary flags (each had 0 LightGBM splits)
    "zero_importance_flags": [
        'vol_spike_1_5x',
        'near_upper_band', 'near_lower_band',
        'break_upper_band', 'break_lower_band',
        'rsi_oversold',
        'above_sma20',     # also derivable
        'macd_positive',
        'volume_breakout', 'volume_breakdown',
    ],

    # Highly correlated position flags (77% corr, derivable)
    "correlated_position_flags": ['above_sma50', 'ema7_above_ema21'],

    # Low-importance cross signals (keep best ones)
    "low_importance_cross_signals": [
        'ema_cross_down',  # lower importance than ema_cross_up
    ],

    # Some oscillator extremes (keep the reversals)
    "oscillator_extremes": [
        'rsi_overbought', 'stoch_overbought', 'stoch_oversold',
        'cci_overbought', 'cci_oversold',
    ],

    # Trend flags vs continuous values (keep ADX value instead)
    "trend_flags": ['trending_market'],

    # Scenario features (very low importance)
    # NOTE(review): 'bearish_scenario_5' is not listed although the original
    # heading said "all" scenario features — confirm whether that is intended.
    "scenario_features": [
        'bullish_scenario_1', 'bullish_scenario_2', 'bullish_scenario_3',
        'bullish_scenario_4', 'bullish_scenario_5', 'bullish_scenario_6',
        'bearish_scenario_1', 'bearish_scenario_2', 'bearish_scenario_3',
        'bearish_scenario_4', 'bearish_scenario_6',
    ],
}

drop_catboost_data_driven = [
    column for group in _drop_groups.values() for column in group
]

In [4]:
import numpy as np
import pandas as pd
from pathlib import Path
from itertools import product
from catboost import CatBoostClassifier, Pool, CatBoostError
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, f1_score
)

# ───── CONFIG ─────
# Feature table read by the grid search below.
CSV_PATH = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")

# Columns removed from the frame before training; names that are absent from
# the CSV are skipped by the loading code.
# NOTE(review): compared with `drop_catboost_data_driven`, this list leaves out
# 'true_range' and 'close_4h' and additionally drops 'close' — confirm the
# difference is intentional.
DROP_COLS = [
    # raw OHLC
    'open', 'high', 'low',
    # ATR components
    'high_low', 'high_close', 'low_close',
    # helper calculations
    'typical_price', 'volume_mean_20',
    # moving averages / VWAP
    'EMA_21', 'SMA_20', 'vwap_24h',
    # Bollinger band edges
    'bollinger_upper', 'bollinger_lower',
    # MACD components
    'MACD_line', 'MACD_signal',
    # support / resistance levels
    'resistance_level', 'support_level',
    # binary indicator flags
    'vol_spike_1_5x',
    'near_upper_band', 'near_lower_band',
    'break_upper_band', 'break_lower_band',
    'rsi_oversold', 'above_sma20', 'macd_positive',
    'volume_breakout', 'volume_breakdown',
    'above_sma50', 'ema7_above_ema21',
    'ema_cross_down',
    # oscillator extremes
    'rsi_overbought', 'stoch_overbought', 'stoch_oversold',
    'cci_overbought', 'cci_oversold',
    # trend flag
    'trending_market',
    # scenario flags (bullish 1–6; bearish 1–4 and 6)
    *(f'bullish_scenario_{i}' for i in range(1, 7)),
    *(f'bearish_scenario_{i}' for i in (1, 2, 3, 4, 6)),
    # the price level itself
    'close',
]

# Share of rows, taken from the end of the frame, held out for validation.
VAL_FRAC = 0.20
# Seed handed to CatBoost so repeated runs are comparable.
RANDOM_SEED = 42

# ───── LOAD & PREPARE DATA ─────
# Read the feature table (first column becomes a parsed datetime index) and
# keep only rows dated 2018-01-01 or later.
df = pd.read_csv(CSV_PATH, index_col=0, parse_dates=True)
df = df[df.index >= "2018-01-01"]

if 'target' not in df.columns:
    raise ValueError("Target column not found!")
if len(df) == 0:
    raise ValueError("DataFrame is empty after filtering!")

# Build a new frame rather than dropping in place: the frame above is a
# boolean-filtered selection, so in-place mutation can trigger pandas'
# SettingWithCopyWarning and makes the cell non-idempotent on re-run.
# errors="ignore" skips DROP_COLS entries that are not present in the CSV.
df = df.drop(columns=DROP_COLS, errors="ignore")

# Restrict to numeric columns *before* removing NaN rows, so that missing
# values in columns that are discarded anyway do not cost training rows.
df = df.select_dtypes(include=[np.number]).dropna()

X = df.drop(columns=["target"])
y = df["target"].astype(int).values

# Positional split: the leading (1 - VAL_FRAC) share of rows trains the model
# and the trailing share validates it. No shuffling is applied.
# (Chronological only if the CSV is sorted by time — assumption, confirm.)
split_row = int(len(df) * (1 - VAL_FRAC))
if not 0 < split_row < len(df):
    raise ValueError(
        "Not enough rows left after NaN removal to build both a training "
        "and a validation split!"
    )

X_train, X_val = X.iloc[:split_row], X.iloc[split_row:]
y_train, y_val = y[:split_row], y[split_row:]

train_pool = Pool(X_train, y_train)
val_pool = Pool(X_val, y_val)

# ───── GRID SEARCH ─────
# Candidate hyper-parameter values; every combination (3 × 3 × 3 = 27) is
# trained once.
depths = [4, 6, 8]
learning_rates = [0.01, 0.03, 0.05]
l2_regs = [3, 5, 7]

# Start below any attainable F1 so that the first successful run is always
# recorded as the incumbent, even when its class-1 F1 is exactly 0.
best_f1 = float("-inf")
best_config = {}
results = []

# NOTE(review): val_pool both drives early stopping / best-iteration selection
# and is scored below, so the reported metrics are optimistic estimates; a
# separate hold-out set would be needed for an unbiased figure.
print("\n🧪 Starting grid search...\n")
for depth, lr, l2 in product(depths, learning_rates, l2_regs):
    print(f"🔍 Testing config: depth={depth}, lr={lr}, l2={l2}")

    # Only the CatBoost calls can raise CatBoostError, so only they are guarded.
    # The exception is not GPU-specific even though the message says so.
    try:
        model = CatBoostClassifier(
            iterations=3000,            # upper bound; early stopping usually ends sooner
            learning_rate=lr,
            depth=depth,
            l2_leaf_reg=l2,
            loss_function="Logloss",
            eval_metric="F1",
            random_seed=RANDOM_SEED,
            early_stopping_rounds=300,
            task_type="GPU",
            verbose=False
        )
        model.fit(train_pool, eval_set=val_pool, use_best_model=True)
        y_prob = model.predict_proba(X_val)[:, 1]
    except CatBoostError as e:
        print(f"❌ Failed with GPU error: {e}")
        continue

    # Hard labels at the default 0.5 probability threshold.
    y_pred = (y_prob >= 0.5).astype(int)

    acc = accuracy_score(y_val, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_val, y_pred, labels=[0, 1], zero_division=0
    )
    # zero_division=0 mirrors the per-class call above and avoids warnings when
    # one class is never predicted.
    macro_f1 = f1_score(y_val, y_pred, average='macro', zero_division=0)

    print(f"→ F1: {f1[1]:.4f} | Acc: {acc:.4f} | Prec: {prec[1]:.4f} | Rec: {rec[1]:.4f}")

    # Index 1 of the per-class arrays is the positive class.
    results.append({
        "depth": depth, "learning_rate": lr, "l2_leaf_reg": l2,
        "F1": round(f1[1], 4),
        "Accuracy": round(acc, 4),
        "Precision": round(prec[1], 4),
        "Recall": round(rec[1], 4),
        "Macro-F1": round(macro_f1, 4)
    })

    # Strict comparison: on ties the earliest configuration is kept.
    if f1[1] > best_f1:
        best_f1 = f1[1]
        best_config = results[-1]

# ───── SUMMARY ─────
print("\n\n✅ Grid Search Completed.\n")

if results:
    # One row per successfully trained configuration, best class-1 F1 first.
    summary = (
        pd.DataFrame(results)
        .sort_values(by="F1", ascending=False)
        .reset_index(drop=True)
    )
    print(summary.to_string(index=False))

    print("\n🏆 Best Configuration:")
    for k, v in best_config.items():
        print(f"{k:15s}: {v}")
else:
    # Every configuration raised during training: an empty frame has no "F1"
    # column, so sorting by it would raise KeyError. Report the situation
    # instead and leave an empty summary behind.
    summary = pd.DataFrame()
    print("⚠️ No configuration finished training; nothing to summarise.")



🧪 Starting grid search...

🔍 Testing config: depth=4, lr=0.01, l2=3
→ F1: 0.5396 | Acc: 0.5383 | Prec: 0.5630 | Rec: 0.5181
🔍 Testing config: depth=4, lr=0.01, l2=5
→ F1: 0.5393 | Acc: 0.5399 | Prec: 0.5652 | Rec: 0.5157
🔍 Testing config: depth=4, lr=0.01, l2=7
→ F1: 0.5385 | Acc: 0.5405 | Prec: 0.5663 | Rec: 0.5133
🔍 Testing config: depth=4, lr=0.03, l2=3
→ F1: 0.5325 | Acc: 0.5393 | Prec: 0.5664 | Rec: 0.5024
🔍 Testing config: depth=4, lr=0.03, l2=5
→ F1: 0.5365 | Acc: 0.5380 | Prec: 0.5635 | Rec: 0.5121
🔍 Testing config: depth=4, lr=0.03, l2=7
→ F1: 0.5352 | Acc: 0.5421 | Prec: 0.5695 | Rec: 0.5048
🔍 Testing config: depth=4, lr=0.05, l2=3
→ F1: 0.5325 | Acc: 0.5333 | Prec: 0.5583 | Rec: 0.5091
🔍 Testing config: depth=4, lr=0.05, l2=5
→ F1: 0.5345 | Acc: 0.5402 | Prec: 0.5671 | Rec: 0.5054
🔍 Testing config: depth=4, lr=0.05, l2=7
→ F1: 0.5375 | Acc: 0.5339 | Prec: 0.5578 | Rec: 0.5187
🔍 Testing config: depth=6, lr=0.01, l2=3
→ F1: 0.5139 | Acc: 0.5323 | Prec: 0.5620 | Rec: 0.4734
🔍 

In [6]:
import numpy as np, pandas as pd
from pathlib import Path
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, fbeta_score

# ─── CONFIG ─────────────────────────────────────────────
# Input feature table and destination for the validation predictions.
CSV_PATH = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")
SAVE_PATH = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\src\Models\models\models\catboost_val_predictions_final.csv")
VAL_FRAC = 0.20   # trailing share of rows used for validation
BETA = 2.0        # beta of the F-beta score reported in the metrics section

# ─── LOAD DATA ──────────────────────────────────────────
# Read the CSV with a parsed datetime index, keep rows from 2018-01-01 on,
# restrict to numeric columns and discard the very last row. Per the original
# note that row is presumably the one whose target is undefined after the
# shift; NaNs anywhere else are left in place.
# NOTE(review): unlike the grid-search cell, neither DROP_COLS nor dropna() is
# applied here, so this model sees a different feature set and row set than
# the one the hyper-parameters were tuned on — confirm this is intended.
df = (
    pd.read_csv(CSV_PATH, index_col=0, parse_dates=True)
    .loc[lambda frame: frame.index >= "2018-01-01"]
    .select_dtypes(include=[np.number])
    .iloc[:-1]
)

if 'target' not in df.columns:
    raise ValueError("Missing 'target' column in the dataset.")

X = df.drop(columns=["target"])
y = df["target"].astype(int).values

# Positional split point: rows before `cut` train, rows from `cut` on validate.
cut = int(len(df) * (1 - VAL_FRAC))

train_pool = Pool(data=X.iloc[:cut], label=y[:cut])
val_pool = Pool(data=X.iloc[cut:], label=y[cut:])

# ─── FINAL TRAINING (best from grid) ───────────────────
# Hyper-parameters are collected in one mapping so they can be inspected or
# logged before the model is built.
# NOTE(review): the run recorded under this cell reports `bestIteration = 0`
# and "Shrink model to first 1 iterations": validation F1 never improved on
# the first tree, so the retained model is a single iteration. Consider
# selecting the best iteration on a smoother metric (e.g. Logloss).
final_params = {
    "iterations": 1000,               # upper bound
    "learning_rate": 0.03,
    "depth": 4,
    "l2_leaf_reg": 7,
    "loss_function": "Logloss",
    "eval_metric": "F1",
    "early_stopping_rounds": 300,
    "use_best_model": True,           # keep the iteration with the best eval metric
    "random_seed": 42,
    "task_type": "GPU",
    "verbose": 100,                   # log every 100th iteration
}

model = CatBoostClassifier(**final_params)
model.fit(train_pool, eval_set=val_pool)

# ─── METRICS ────────────────────────────────────────────
# Score the validation slice (rows from `cut` onwards) at a 0.50 threshold.
y_true_val = y[cut:]
y_prob = model.predict_proba(val_pool)[:, 1]   # probability of class 1 (Up)
y_pred = (y_prob >= 0.50).astype(int)

acc  = accuracy_score(y_true_val, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true_val, y_pred, labels=[0, 1], zero_division=0
)
# fbeta_score uses its default binary averaging, i.e. this is the F-beta of
# the positive class only — not a class-weighted F1.
f2_score = fbeta_score(y_true_val, y_pred, beta=BETA, zero_division=0)

# Label derived from BETA so the printed name always matches the metric.
fbeta_label = f"F{BETA:g} Up (β={BETA:g})"

print("\n──── Validation metrics (thr = 0.50) ────")
print(f"Accuracy          : {acc:6.3f}")
print(f"Class 0 (Down) →  Precision: {prec[0]:6.3f}  Recall: {rec[0]:6.3f}  F1: {f1[0]:6.3f}")
print(f"Class 1 (Up  ) →  Precision: {prec[1]:6.3f}  Recall: {rec[1]:6.3f}  F1: {f1[1]:6.3f}")
print(f"Macro-F1          : {f1.mean():6.3f}")
print(f"{fbeta_label:<18}: {f2_score:6.3f}")

# ─── SAVE PREDICTIONS ──────────────────────────────────
# Make sure the destination folder exists first; otherwise to_csv raises and
# the predictions of a finished training run are lost.
SAVE_PATH.parent.mkdir(parents=True, exist_ok=True)

# One row per validation sample, indexed like the validation slice of X:
# the predicted probability of class 1 and the label at the 0.50 threshold.
pd.DataFrame({
    "prob_up": y_prob,
    "pred_0.50": y_pred
}, index=X.iloc[cut:].index).to_csv(SAVE_PATH)

print("✓ Saved predictions →", SAVE_PATH.resolve())


0:	learn: 0.5952616	test: 0.5716788	best: 0.5716788 (0)	total: 14.9ms	remaining: 14.8s
100:	learn: 0.5738574	test: 0.4579371	best: 0.5716788 (0)	total: 1.49s	remaining: 13.2s
200:	learn: 0.5913030	test: 0.4301401	best: 0.5716788 (0)	total: 2.99s	remaining: 11.9s
300:	learn: 0.6087157	test: 0.4195912	best: 0.5716788 (0)	total: 4.46s	remaining: 10.4s
bestTest = 0.5716788321
bestIteration = 0
Shrink model to first 1 iterations.

──── Validation metrics (thr = 0.50) ────
Accuracy          :  0.537
Class 0 (Down) →  Precision:  0.517  Recall:  0.479  F1:  0.497
Class 1 (Up  ) →  Precision:  0.554  Recall:  0.591  F1:  0.572
Macro-F1          :  0.534
Weighted-F1 (β=2) :  0.583
✓ Saved predictions → C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\src\Models\models\models\catboost_val_predictions_final.csv
