# In this notebook we train the CatBoost model

In [2]:
"""
catboost_gpu_btc_direction.py
─────────────────────────────
GPU-accelerated CatBoostClassifier that predicts whether the *next-hour* BTC
close will go Up (1) or Down (0).  It mirrors the preprocessing of your other
models, respects chronological order, computes the β = 2 F-score (reported as
"Weighted-F1"; β > 1 favours recall over precision) you’ve been using, and
prints full validation metrics.  It also saves raw probabilities so you can
apply any threshold later.

Dependencies
------------
pip install catboost pandas numpy scikit-learn
(CatBoost wheels include CUDA kernels; RTX 3060 is fully supported.)
"""

# ════════════════════════════════════════════════════════════════════════
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support
)

# ────── Configuration ───────────────────────────────────────────────────
# Absolute path of the processed BTC feature table that the script reads.
CSV_PATH = r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_data_final_version_with_features_2016_final.csv"

# Feature columns removed before training (only those actually present).
DROP_COLS = ["vol_ratio_24h", "macd_diff", "macd_line", "upper_shadow", "lower_shadow"]

# Share of the most recent rows held out for validation (the rest is train).
VAL_FRAC = 0.20

# β of the F-beta score computed on class 1.  In the F-beta formula β > 1
# puts more weight on recall than on precision.
# NOTE(review): if precision was meant to dominate, β should be < 1 — confirm.
BETA = 2.0

# Write the validation probabilities to a CSV when True.
SAVE_PROB = True

# ────── Load & preprocess ───────────────────────────────────────────────
# The first CSV column becomes the index; pandas tries to parse it as dates.
# NOTE(review): the shift(-1) label and the positional split below assume the
# rows are already in ascending time order — confirm for this CSV.
df = pd.read_csv(CSV_PATH, index_col=0, parse_dates=True)

# Remove the unwanted feature columns, ignoring any that are absent.
df.drop(columns=[c for c in DROP_COLS if c in df.columns], inplace=True)
# log(1 + x) transform of the volume column.
df["Volume BTC"] = np.log1p(df["Volume BTC"])

# label: 1 if next close > current close, else 0
next_close = df["close"].shift(-1)
df["target"] = (next_close > df["close"]).astype(int)
# A row whose next close is unknown (always the last row) would otherwise be
# labelled 0, because `NaN > x` is False and the int cast hides the NaN from
# dropna().  Drop such rows so no fabricated "Down" label reaches the model.
df = df.loc[next_close.notna().to_numpy()]
df = df.dropna().select_dtypes(include=[np.number])

X = df.drop(columns=["target"])
y = df["target"].astype(int).values

# chronological split: the first (1 - VAL_FRAC) of the rows train the model,
# the remaining tail is the validation set (no shuffling).
split_row = int(len(df) * (1 - VAL_FRAC))
X_train, X_val = X.iloc[:split_row], X.iloc[split_row:]
y_train, y_val = y[:split_row], y[split_row:]

train_pool = Pool(X_train, y_train)
val_pool   = Pool(X_val,   y_val)

# ────── CatBoost GPU model ──────────────────────────────────────────────
# Hyper-parameters are collected in one mapping and unpacked into the
# classifier, so the whole configuration can be read (or logged) in one place.
catboost_params = {
    "iterations": 5000,              # upper bound; early stopping trims it
    "learning_rate": 0.05,
    "depth": 6,
    "l2_leaf_reg": 3,
    "loss_function": "Logloss",
    "eval_metric": "Logloss",
    "random_seed": 42,
    "early_stopping_rounds": 300,
    "task_type": "GPU",              # train on the GPU
    "devices": "0",                  # change if multi-GPU
    "verbose": 200,                  # log every 200 iterations
}
model = CatBoostClassifier(**catboost_params)

# The validation pool is the eval set, and use_best_model=True keeps the model
# at its best eval-set iteration.
# NOTE(review): the same validation rows are later used for the reported
# metrics, so those numbers are tuned on the data they are measured on.
model.fit(train_pool, eval_set=val_pool, use_best_model=True)

# ────── Predict & evaluate ──────────────────────────────────────────────
# Column 1 of predict_proba is the probability of class 1 ("Up").
y_prob = model.predict_proba(X_val)[:, 1]
# Hard labels at the default 0.50 cut-off.
y_pred = (y_prob >= 0.50).astype(int)

acc = accuracy_score(y_val, y_pred)
# Per-class arrays: index 0 is "Down", index 1 is "Up".
prec, rec, f1, _ = precision_recall_fscore_support(
    y_val,
    y_pred,
    labels=[0, 1],
    zero_division=0,
)

# F-beta of class 1; the tiny epsilon avoids 0/0 when precision and recall
# are both zero.
beta_sq = BETA**2
weighted_f1 = (1 + beta_sq) * prec[1] * rec[1] / (beta_sq * prec[1] + rec[1] + 1e-12)

report = [
    "\n──── Validation metrics (thr = 0.50) ────",
    f"Accuracy          : {acc:6.3f}",
    f"Class 0 (Down) →  Precision: {prec[0]:6.3f}  Recall: {rec[0]:6.3f}  F1: {f1[0]:6.3f}",
    f"Class 1 (Up  ) →  Precision: {prec[1]:6.3f}  Recall: {rec[1]:6.3f}  F1: {f1[1]:6.3f}",
    f"Macro-F1          : {f1.mean():6.3f}",
    f"Weighted-F1 (β=2) : {weighted_f1:6.3f}",
]
for line in report:
    print(line)

# ────── Save probabilities for threshold sweeps ─────────────────────────
if SAVE_PROB:
    # One row per validation sample, keyed by the validation frame's index,
    # holding the class-1 probability and the 0.50-threshold label.
    columns = {"prob_up": y_prob, "pred_0.50": y_pred}
    pred_df = pd.DataFrame(columns, index=X_val.index)
    pred_df.to_csv("catboost_val_predictions.csv")
    print("Saved → catboost_val_predictions.csv")


0:	learn: 0.6926472	test: 0.6928779	best: 0.6928779 (0)	total: 13.7ms	remaining: 1m 8s
200:	learn: 0.6743876	test: 0.6989597	best: 0.6910961 (15)	total: 879ms	remaining: 21s
bestTest = 0.6910961062
bestIteration = 15
Shrink model to first 16 iterations.

──── Validation metrics (thr = 0.50) ────
Accuracy          :  0.530
Class 0 (Down) →  Precision:  0.517  Recall:  0.589  F1:  0.551
Class 1 (Up  ) →  Precision:  0.546  Recall:  0.473  F1:  0.507
Macro-F1          :  0.529
Weighted-F1 (β=2) :  0.486
Saved → catboost_val_predictions.csv


In [3]:
"""
catboost_final_gpu.py
---------------------
Train CatBoost on GPU with the tuned best-iteration parameters
(bestIteration = 15  ⇒  Shrunk model uses 16 trees).

Dependencies
------------
pip install catboost pandas numpy scikit-learn
"""

import numpy as np, pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ───────── Config ─────────
# Absolute path of the processed BTC feature table that the script reads.
CSV_PATH = r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_data_final_version_with_features_2016_final.csv"

# Feature columns removed before training (only those actually present).
DROP_COLS = [
    "vol_ratio_24h",
    "macd_diff",
    "macd_line",
    "upper_shadow",
    "lower_shadow",
]

VAL_FRAC = 0.20    # share of the last rows held out for validation
BETA = 2.0         # β of the class-1 F-beta score (β > 1 favours recall)
SAVE_PROB = True   # write the validation probabilities to CSV when True

# ───────── Load & preprocess ─────────
# The first CSV column becomes the index; pandas tries to parse it as dates.
# NOTE(review): the shift(-1) label and the positional split below assume the
# rows are already in ascending time order — confirm for this CSV.
df = pd.read_csv(CSV_PATH, index_col=0, parse_dates=True)
# Remove the unwanted feature columns, ignoring any that are absent.
df.drop(columns=[c for c in DROP_COLS if c in df.columns], inplace=True)
# log(1 + x) transform of the volume column.
df["Volume BTC"] = np.log1p(df["Volume BTC"])

# Label: 1 if the next row's close is above this row's close, else 0.
next_close = df["close"].shift(-1)
df["target"] = (next_close > df["close"]).astype(int)
# A row whose next close is unknown (always the last row) would otherwise be
# labelled 0, because `NaN > x` is False and the int cast hides the NaN from
# dropna().  Drop such rows so no fabricated "Down" label reaches the model.
df = df.loc[next_close.notna().to_numpy()]
df = df.dropna().select_dtypes(include=[np.number])

X = df.drop(columns=["target"])
y = df["target"].astype(int).values

# Positional split with no shuffling: the head trains, the tail validates.
split = int(len(df) * (1 - VAL_FRAC))
X_train, X_val = X.iloc[:split], X.iloc[split:]
y_train, y_val = y[:split], y[split:]

train_pool = Pool(X_train, y_train)
val_pool   = Pool(X_val,   y_val)

# ───────── Fixed best-iteration CatBoost ─────────
# Same hyper-parameters as the earlier run, but with the tree count pinned
# and no early stopping.
catboost_params = {
    "iterations": 16,                # bestIteration 15 + 1
    "learning_rate": 0.05,           # same as earlier run
    "depth": 6,
    "l2_leaf_reg": 3,
    "loss_function": "Logloss",
    "eval_metric": "Logloss",
    "random_seed": 42,
    "task_type": "GPU",
    "devices": "0",
    "verbose": False,                # silences per-iteration logging
}
model = CatBoostClassifier(**catboost_params)

# The eval set is still passed, but use_best_model=False means the model is
# not shrunk to a best iteration.
model.fit(train_pool, eval_set=val_pool, use_best_model=False)

# ───────── Predict & metrics ─────────
# Column 1 of predict_proba is the probability of class 1 ("Up").
y_prob = model.predict_proba(X_val)[:, 1]
# Hard labels at the 0.50 cut-off.
y_pred = (y_prob >= 0.50).astype(int)

acc = accuracy_score(y_val, y_pred)
# Per-class arrays: index 0 is "Down", index 1 is "Up".
prec, rec, f1, _ = precision_recall_fscore_support(
    y_val,
    y_pred,
    labels=[0, 1],
    zero_division=0,
)

# F-beta of class 1; the tiny epsilon avoids 0/0 when precision and recall
# are both zero.
beta_sq = BETA**2
weighted_f1 = (1 + beta_sq) * prec[1] * rec[1] / (beta_sq * prec[1] + rec[1] + 1e-12)

report = [
    "\n──── Validation metrics (thr = 0.50) ────",
    f"Accuracy          : {acc:6.3f}",
    f"Class 0 (Down) →  Precision: {prec[0]:6.3f}  Recall: {rec[0]:6.3f}  F1: {f1[0]:6.3f}",
    f"Class 1 (Up  ) →  Precision: {prec[1]:6.3f}  Recall: {rec[1]:6.3f}  F1: {f1[1]:6.3f}",
    f"Macro-F1          : {f1.mean():6.3f}",
    f"Weighted-F1 (β=2) : {weighted_f1:6.3f}",
]
for line in report:
    print(line)

# ───────── Save probabilities ─────────
if SAVE_PROB:
    # One row per validation sample, keyed by the validation frame's index,
    # holding the class-1 probability and the 0.50-threshold label.
    columns = {"prob_up": y_prob, "pred_0.50": y_pred}
    pred_df = pd.DataFrame(columns, index=X_val.index)
    pred_df.to_csv("catboost_val_predictions_final.csv")
    print("Saved → catboost_val_predictions_final.csv")



──── Validation metrics (thr = 0.50) ────
Accuracy          :  0.530
Class 0 (Down) →  Precision:  0.517  Recall:  0.588  F1:  0.550
Class 1 (Up  ) →  Precision:  0.545  Recall:  0.474  F1:  0.507
Macro-F1          :  0.529
Weighted-F1 (β=2) :  0.487
Saved → catboost_val_predictions_final.csv
