# In this notebook we would create the LIGHTGBM model


In [None]:
DROP_COLS = [ 
    'open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
    'volume_breakout', 'volume_breakdown', 'break_upper_band', 'break_lower_band',
    'vol_spike_1_5x', 'rsi_oversold', 'rsi_overbought', 'stoch_overbought',
    'stoch_oversold', 'cci_overbought', 'cci_oversold', 'near_upper_band',
    'near_lower_band', 'overbought_reversal', 'oversold_reversal',
    'ema_cross_up', 'ema_cross_down', 'macd_cross_up', 'macd_cross_down',
    'trending_market', 'trend_alignment', 'ema7_above_ema21', 'macd_rising',
    'bollinger_upper', 'bollinger_lower', 'bullish_scenario_1',
    'bullish_scenario_5', 'bearish_scenario_1'
]

In [18]:
import numpy as np
import pandas as pd
import optuna
import json
import warnings
from pathlib import Path
from sklearn.metrics import fbeta_score
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMClassifier
from lightgbm.callback import early_stopping

# ───── CONFIG ─────────────────────────────────────────
SEED         = 42
FOLDS        = 3
GAP_ROWS     = 12
EARLY_STOP   = 200
N_TRIALS     = 60
TIMEOUT      = 1800  # seconds (30 min max)
MAX_TREES    = 2500
BETA         = 2
CSV_PATH     = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")

DROP_COLS = [
    'open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
    'volume_mean_20', 'EMA_21', 'SMA_20', 'vwap_24h', 'bollinger_upper', 'bollinger_lower',
    'MACD_line', 'MACD_signal', 'resistance_level', 'support_level', 'vol_spike_1_5x',
    'near_upper_band', 'near_lower_band', 'break_upper_band', 'break_lower_band',
    'rsi_oversold', 'above_sma20', 'macd_positive', 'volume_breakout', 'volume_breakdown',
    'above_sma50', 'ema7_above_ema21', 'ema_cross_down', 'rsi_overbought', 'stoch_overbought',
    'stoch_oversold', 'cci_overbought', 'cci_oversold', 'trending_market',
    'bullish_scenario_1', 'bullish_scenario_2', 'bullish_scenario_3', 'bullish_scenario_4',
    'bullish_scenario_5', 'bullish_scenario_6', 'bearish_scenario_1', 'bearish_scenario_2',
    'bearish_scenario_3', 'bearish_scenario_4', 'bearish_scenario_6', 'close'
]

warnings.filterwarnings("ignore")
np.random.seed(SEED)

# ───── Load & preprocess data ───────────────────────
def load_data():
    df = pd.read_csv(CSV_PATH, index_col=0, parse_dates=True)
    df = df[df.index >= "2018-01-01"]
    df.drop(columns=[c for c in DROP_COLS if c in df.columns], inplace=True)
    df = df[df["target"].notna()]
    df = df.dropna().select_dtypes(include=[np.number])
    X = df.drop(columns=["target"]).values
    y = df["target"].astype(int).values
    return X, y

# ───── Custom F2 eval function ───────────────────────
def f2_score_eval(y_true, y_pred):
    y_pred_binary = (y_pred >= 0.5).astype(int)
    f2 = fbeta_score(y_true, y_pred_binary, beta=BETA, zero_division=0)
    return 'f2', f2, True

# ───── Objective function ─────────────────────────────
def objective(trial, data):
    X, y = data
    boosting_type = trial.suggest_categorical("boosting_type", ["gbdt", "dart"])

    params = {
        "boosting_type": boosting_type,
        "device":        "gpu",
        "gpu_use_dp":    False,
        "random_state":  SEED,
        "verbosity":     -1,

        # Optimized search space
        "learning_rate":     trial.suggest_float("lr", 0.02, 0.2, log=True),
        "num_leaves":        trial.suggest_int("leaves", 31, 256, step=8),
        "max_depth":         trial.suggest_int("depth", 4, 12),
        "min_child_samples": trial.suggest_int("min_child", 20, 300),
        "feature_fraction":  trial.suggest_float("feat_frac", 0.6, 1.0),
        "bagging_fraction":  trial.suggest_float("bag_frac", 0.6, 1.0),
        "bagging_freq":      trial.suggest_int("bag_freq", 1, 5),
        "reg_alpha":         trial.suggest_float("l1", 0.0, 6.0),
        "reg_lambda":        trial.suggest_float("l2", 0.0, 6.0),
        "min_split_gain":    trial.suggest_float("gamma", 0.0, 3.0),
        "extra_trees":       trial.suggest_categorical("extra_t", [True, False]),
        "max_bin":           trial.suggest_int("max_bin", 127, 255)
    }

    if boosting_type == "dart":
        params.update({
            "drop_rate": trial.suggest_float("drop", 0.05, 0.4),
            "skip_drop": trial.suggest_float("skip", 0.2, 0.8)
        })

    cv = TimeSeriesSplit(n_splits=FOLDS, gap=GAP_ROWS)
    scores = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X)):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        clf = LGBMClassifier(**params, n_estimators=MAX_TREES)

        clf.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric=f2_score_eval,
            callbacks=[early_stopping(EARLY_STOP, verbose=False)]
        )

        preds = clf.predict(X_val)
        f2 = fbeta_score(y_val, preds, beta=BETA, zero_division=0)
        scores.append(f2)

        trial.report(f2, fold)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return -np.mean(scores)

# ───── Optuna runner ─────────────────────────────
def run_optuna():
    data = load_data()
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=SEED),
        pruner=optuna.pruners.SuccessiveHalvingPruner(
            min_resource=1,
            reduction_factor=3,
            min_early_stopping_rate=0
        ),
        study_name="lgbm_gpu_fast"
    )

    print("\n🚀 Starting LightGBM GPU Optimization...")
    study.optimize(lambda trial: objective(trial, data), n_trials=N_TRIALS, timeout=TIMEOUT, show_progress_bar=True)

    print("\n🏆 Best F2 score:", -study.best_value)
    print("🔧 Best params:")
    for k, v in study.best_params.items():
        print(f"  {k:20}: {v}")

    with open("best_lgbm_params.json", "w") as f:
        json.dump(study.best_params, f, indent=2)

if __name__ == "__main__":
    run_optuna()


[I 2025-06-06 19:38:17,572] A new study created in memory with name: lgbm_gpu_fast



🚀 Starting LightGBM GPU Optimization...


Best trial: 0. Best value: -0.444346:   2%|▏         | 1/60 [00:26<25:55, 26.37s/it, 26.37/1800 seconds]

[I 2025-06-06 19:38:43,937] Trial 0 finished with value: -0.4443464670992617 and parameters: {'boosting_type': 'dart', 'lr': 0.10790061933340454, 'leaves': 167, 'depth': 5, 'min_child': 63, 'feat_frac': 0.6232334448672797, 'bag_frac': 0.9464704583099741, 'bag_freq': 4, 'l1': 4.248435466776273, 'l2': 0.12350696577481468, 'gamma': 2.909729556485983, 'extra_t': True, 'max_bin': 150, 'drop': 0.11419157844870184, 'skip': 0.38254534577572263}. Best is trial 0 with value: -0.4443464670992617.


Best trial: 0. Best value: -0.444346:   3%|▎         | 2/60 [00:28<11:41, 12.10s/it, 28.48/1800 seconds]

[I 2025-06-06 19:38:46,055] Trial 1 pruned. 


Best trial: 0. Best value: -0.444346:   5%|▌         | 3/60 [00:41<11:50, 12.46s/it, 41.37/1800 seconds]

[I 2025-06-06 19:38:58,940] Trial 2 pruned. 


Best trial: 0. Best value: -0.444346:   7%|▋         | 4/60 [01:10<17:55, 19.20s/it, 70.90/1800 seconds]

[I 2025-06-06 19:39:28,469] Trial 3 pruned. 


Best trial: 0. Best value: -0.444346:   8%|▊         | 5/60 [01:42<21:41, 23.66s/it, 102.47/1800 seconds]

[I 2025-06-06 19:40:00,041] Trial 4 pruned. 


Best trial: 0. Best value: -0.444346:  10%|█         | 6/60 [02:12<23:13, 25.81s/it, 132.46/1800 seconds]

[I 2025-06-06 19:40:30,033] Trial 5 pruned. 


Best trial: 0. Best value: -0.444346:  12%|█▏        | 7/60 [02:47<25:33, 28.93s/it, 167.80/1800 seconds]

[I 2025-06-06 19:41:05,370] Trial 6 pruned. 


Best trial: 0. Best value: -0.444346:  13%|█▎        | 8/60 [03:26<27:42, 31.97s/it, 206.28/1800 seconds]

[I 2025-06-06 19:41:43,849] Trial 7 pruned. 


Best trial: 0. Best value: -0.444346:  15%|█▌        | 9/60 [03:28<19:14, 22.64s/it, 208.39/1800 seconds]

[I 2025-06-06 19:41:45,964] Trial 8 pruned. 


Best trial: 0. Best value: -0.444346:  17%|█▋        | 10/60 [03:57<20:30, 24.61s/it, 237.43/1800 seconds]

[I 2025-06-06 19:42:15,000] Trial 9 pruned. 


Best trial: 0. Best value: -0.444346:  18%|█▊        | 11/60 [03:58<14:16, 17.49s/it, 238.77/1800 seconds]

[I 2025-06-06 19:42:16,337] Trial 10 pruned. 


Best trial: 11. Best value: -0.530335:  20%|██        | 12/60 [04:42<20:18, 25.38s/it, 282.18/1800 seconds]

[I 2025-06-06 19:42:59,755] Trial 11 finished with value: -0.5303350102272654 and parameters: {'boosting_type': 'dart', 'lr': 0.12676220987029993, 'leaves': 247, 'depth': 10, 'min_child': 195, 'feat_frac': 0.9753113105533291, 'bag_frac': 0.6026501737613297, 'bag_freq': 1, 'l1': 3.1666733581801925, 'l2': 1.4371765823465306, 'gamma': 1.2071455195192666, 'extra_t': False, 'max_bin': 169, 'drop': 0.16165720755103122, 'skip': 0.5318994586248574}. Best is trial 11 with value: -0.5303350102272654.


Best trial: 11. Best value: -0.530335:  22%|██▏       | 13/60 [05:28<24:45, 31.61s/it, 328.14/1800 seconds]

[I 2025-06-06 19:43:45,714] Trial 12 pruned. 


Best trial: 11. Best value: -0.530335:  23%|██▎       | 14/60 [05:53<22:53, 29.85s/it, 353.94/1800 seconds]

[I 2025-06-06 19:44:11,510] Trial 13 finished with value: -0.529039998044602 and parameters: {'boosting_type': 'dart', 'lr': 0.15805092543420288, 'leaves': 55, 'depth': 4, 'min_child': 146, 'feat_frac': 0.694303701978972, 'bag_frac': 0.9657522852506555, 'bag_freq': 5, 'l1': 3.0844711427992384, 'l2': 0.849505449914107, 'gamma': 2.1836119479312397, 'extra_t': True, 'max_bin': 157, 'drop': 0.1672800830475732, 'skip': 0.41314460808411185}. Best is trial 11 with value: -0.5303350102272654.


Best trial: 11. Best value: -0.530335:  25%|██▌       | 15/60 [05:55<15:53, 21.20s/it, 355.08/1800 seconds]

[I 2025-06-06 19:44:12,647] Trial 14 pruned. 


Best trial: 11. Best value: -0.530335:  27%|██▋       | 16/60 [06:25<17:41, 24.11s/it, 385.96/1800 seconds]

[I 2025-06-06 19:44:43,532] Trial 15 pruned. 


Best trial: 11. Best value: -0.530335:  28%|██▊       | 17/60 [06:37<14:33, 20.30s/it, 397.40/1800 seconds]

[I 2025-06-06 19:44:54,973] Trial 16 pruned. 


Best trial: 11. Best value: -0.530335:  30%|███       | 18/60 [06:51<12:50, 18.34s/it, 411.19/1800 seconds]

[I 2025-06-06 19:45:08,756] Trial 17 pruned. 


Best trial: 11. Best value: -0.530335:  32%|███▏      | 19/60 [06:53<09:14, 13.51s/it, 413.45/1800 seconds]

[I 2025-06-06 19:45:11,021] Trial 18 pruned. 


Best trial: 11. Best value: -0.530335:  33%|███▎      | 20/60 [07:27<13:03, 19.58s/it, 447.18/1800 seconds]

[I 2025-06-06 19:45:44,748] Trial 19 finished with value: -0.5287690761445184 and parameters: {'boosting_type': 'dart', 'lr': 0.19622267588053344, 'leaves': 55, 'depth': 9, 'min_child': 184, 'feat_frac': 0.7282074739953629, 'bag_frac': 0.8800573936900216, 'bag_freq': 3, 'l1': 5.5927318326913245, 'l2': 0.8385567276019783, 'gamma': 2.5490779353507316, 'extra_t': False, 'max_bin': 216, 'drop': 0.1381048517028889, 'skip': 0.3025942211349937}. Best is trial 11 with value: -0.5303350102272654.


Best trial: 11. Best value: -0.530335:  35%|███▌      | 21/60 [08:02<15:49, 24.34s/it, 482.59/1800 seconds]

[I 2025-06-06 19:46:20,164] Trial 20 finished with value: -0.4844737542094904 and parameters: {'boosting_type': 'dart', 'lr': 0.14366343487874117, 'leaves': 103, 'depth': 6, 'min_child': 138, 'feat_frac': 0.9476327125032618, 'bag_frac': 0.8168514801545984, 'bag_freq': 3, 'l1': 2.192624840012526, 'l2': 5.843541733207559, 'gamma': 1.3850853703975743, 'extra_t': True, 'max_bin': 163, 'drop': 0.3085348954909881, 'skip': 0.4633395791221595}. Best is trial 11 with value: -0.5303350102272654.


Best trial: 21. Best value: -0.574594:  37%|███▋      | 22/60 [08:35<17:01, 26.88s/it, 515.41/1800 seconds]

[I 2025-06-06 19:46:52,978] Trial 21 finished with value: -0.5745937549832779 and parameters: {'boosting_type': 'dart', 'lr': 0.19828032454088634, 'leaves': 55, 'depth': 9, 'min_child': 175, 'feat_frac': 0.7130442375531619, 'bag_frac': 0.8882038735166922, 'bag_freq': 3, 'l1': 5.864295384664072, 'l2': 0.7590459155477124, 'gamma': 2.4479447773503455, 'extra_t': False, 'max_bin': 221, 'drop': 0.14341673941852526, 'skip': 0.3156528480563624}. Best is trial 21 with value: -0.5745937549832779.


Best trial: 21. Best value: -0.574594:  38%|███▊      | 23/60 [08:50<14:26, 23.41s/it, 530.72/1800 seconds]

[I 2025-06-06 19:47:08,295] Trial 22 pruned. 


Best trial: 21. Best value: -0.574594:  40%|████      | 24/60 [09:05<12:34, 20.96s/it, 545.98/1800 seconds]

[I 2025-06-06 19:47:23,550] Trial 23 pruned. 


Best trial: 21. Best value: -0.574594:  42%|████▏     | 25/60 [09:42<14:52, 25.51s/it, 582.08/1800 seconds]

[I 2025-06-06 19:47:59,652] Trial 24 pruned. 


Best trial: 21. Best value: -0.574594:  43%|████▎     | 26/60 [10:30<18:20, 32.38s/it, 630.49/1800 seconds]

[I 2025-06-06 19:48:48,061] Trial 25 finished with value: -0.5035997466640948 and parameters: {'boosting_type': 'dart', 'lr': 0.17106981624259426, 'leaves': 127, 'depth': 11, 'min_child': 168, 'feat_frac': 0.6776891612614915, 'bag_frac': 0.8858536194005484, 'bag_freq': 3, 'l1': 2.6922932406091227, 'l2': 1.3871241508686056, 'gamma': 1.3570366679836776, 'extra_t': False, 'max_bin': 177, 'drop': 0.16732520353905456, 'skip': 0.43292476347426767}. Best is trial 21 with value: -0.5745937549832779.


Best trial: 21. Best value: -0.574594:  45%|████▌     | 27/60 [10:31<12:40, 23.05s/it, 631.77/1800 seconds]

[I 2025-06-06 19:48:49,345] Trial 26 pruned. 


Best trial: 21. Best value: -0.574594:  47%|████▋     | 28/60 [10:53<12:07, 22.75s/it, 653.82/1800 seconds]

[I 2025-06-06 19:49:11,394] Trial 27 pruned. 


Best trial: 21. Best value: -0.574594:  48%|████▊     | 29/60 [11:19<12:17, 23.78s/it, 679.99/1800 seconds]

[I 2025-06-06 19:49:37,564] Trial 28 pruned. 


Best trial: 21. Best value: -0.574594:  50%|█████     | 30/60 [11:38<11:06, 22.21s/it, 698.56/1800 seconds]

[I 2025-06-06 19:49:56,135] Trial 29 pruned. 


Best trial: 21. Best value: -0.574594:  52%|█████▏    | 31/60 [11:57<10:15, 21.22s/it, 717.47/1800 seconds]

[I 2025-06-06 19:50:15,044] Trial 30 finished with value: -0.49466769699828134 and parameters: {'boosting_type': 'dart', 'lr': 0.10761211260586466, 'leaves': 151, 'depth': 5, 'min_child': 137, 'feat_frac': 0.9201680901239078, 'bag_frac': 0.7509120029700046, 'bag_freq': 1, 'l1': 5.0285383795071805, 'l2': 0.036465093241190405, 'gamma': 2.3330605490276817, 'extra_t': False, 'max_bin': 168, 'drop': 0.05801894978507775, 'skip': 0.6522605935159739}. Best is trial 21 with value: -0.5745937549832779.


Best trial: 21. Best value: -0.574594:  53%|█████▎    | 32/60 [12:31<11:39, 24.97s/it, 751.20/1800 seconds]

[I 2025-06-06 19:50:48,772] Trial 31 finished with value: -0.4732620536775931 and parameters: {'boosting_type': 'dart', 'lr': 0.19715388960035612, 'leaves': 55, 'depth': 9, 'min_child': 186, 'feat_frac': 0.7269016027677196, 'bag_frac': 0.8758231758613796, 'bag_freq': 3, 'l1': 5.493543406792641, 'l2': 0.9585284338177564, 'gamma': 2.509541525694733, 'extra_t': False, 'max_bin': 220, 'drop': 0.13737787295532883, 'skip': 0.2894412011581523}. Best is trial 21 with value: -0.5745937549832779.


Best trial: 21. Best value: -0.574594:  55%|█████▌    | 33/60 [12:47<10:07, 22.48s/it, 767.88/1800 seconds]

[I 2025-06-06 19:51:05,446] Trial 32 pruned. 


Best trial: 21. Best value: -0.574594:  57%|█████▋    | 34/60 [13:07<09:21, 21.61s/it, 787.44/1800 seconds]

[I 2025-06-06 19:51:25,011] Trial 33 pruned. 


Best trial: 21. Best value: -0.574594:  58%|█████▊    | 35/60 [14:00<12:59, 31.17s/it, 840.91/1800 seconds]

[I 2025-06-06 19:52:18,479] Trial 34 finished with value: -0.5029066853852731 and parameters: {'boosting_type': 'dart', 'lr': 0.16768608002003843, 'leaves': 95, 'depth': 8, 'min_child': 120, 'feat_frac': 0.7052026823251575, 'bag_frac': 0.8450810952170513, 'bag_freq': 3, 'l1': 3.0800854772763295, 'l2': 0.38799472025485227, 'gamma': 1.9093252387759643, 'extra_t': False, 'max_bin': 214, 'drop': 0.12294092252358453, 'skip': 0.270186900295351}. Best is trial 21 with value: -0.5745937549832779.


Best trial: 35. Best value: -0.579402:  60%|██████    | 36/60 [14:28<12:05, 30.24s/it, 868.99/1800 seconds]

[I 2025-06-06 19:52:46,564] Trial 35 finished with value: -0.579401987893854 and parameters: {'boosting_type': 'dart', 'lr': 0.19731201682078386, 'leaves': 55, 'depth': 11, 'min_child': 160, 'feat_frac': 0.6724592869258237, 'bag_frac': 0.9489949933149696, 'bag_freq': 5, 'l1': 4.4227827642916, 'l2': 2.0060690532095347, 'gamma': 2.8417089291092066, 'extra_t': False, 'max_bin': 237, 'drop': 0.21483212147621572, 'skip': 0.33142968843519877}. Best is trial 35 with value: -0.579401987893854.


Best trial: 35. Best value: -0.579402:  62%|██████▏   | 37/60 [14:30<08:14, 21.49s/it, 870.05/1800 seconds]

[I 2025-06-06 19:52:47,627] Trial 36 pruned. 


Best trial: 35. Best value: -0.579402:  63%|██████▎   | 38/60 [14:47<07:25, 20.24s/it, 887.39/1800 seconds]

[I 2025-06-06 19:53:04,964] Trial 37 pruned. 


Best trial: 35. Best value: -0.579402:  65%|██████▌   | 39/60 [15:10<07:21, 21.03s/it, 910.27/1800 seconds]

[I 2025-06-06 19:53:27,844] Trial 38 pruned. 


Best trial: 35. Best value: -0.579402:  67%|██████▋   | 40/60 [15:30<06:53, 20.69s/it, 930.17/1800 seconds]

[I 2025-06-06 19:53:47,742] Trial 39 pruned. 


Best trial: 35. Best value: -0.579402:  68%|██████▊   | 41/60 [16:04<07:51, 24.84s/it, 964.68/1800 seconds]

[I 2025-06-06 19:54:22,249] Trial 40 pruned. 


Best trial: 35. Best value: -0.579402:  70%|███████   | 42/60 [16:23<06:52, 22.92s/it, 983.11/1800 seconds]

[I 2025-06-06 19:54:40,680] Trial 41 pruned. 


Best trial: 35. Best value: -0.579402:  72%|███████▏  | 43/60 [16:48<06:41, 23.64s/it, 1008.44/1800 seconds]

[I 2025-06-06 19:55:06,013] Trial 42 pruned. 


Best trial: 35. Best value: -0.579402:  73%|███████▎  | 44/60 [17:01<05:28, 20.51s/it, 1021.66/1800 seconds]

[I 2025-06-06 19:55:19,227] Trial 43 pruned. 


Best trial: 35. Best value: -0.579402:  75%|███████▌  | 45/60 [17:26<05:27, 21.81s/it, 1046.49/1800 seconds]

[I 2025-06-06 19:55:44,057] Trial 44 pruned. 


Best trial: 35. Best value: -0.579402:  77%|███████▋  | 46/60 [17:28<03:40, 15.76s/it, 1048.15/1800 seconds]

[I 2025-06-06 19:55:45,721] Trial 45 pruned. 


Best trial: 35. Best value: -0.579402:  78%|███████▊  | 47/60 [17:55<04:09, 19.22s/it, 1075.43/1800 seconds]

[I 2025-06-06 19:56:13,006] Trial 46 finished with value: -0.5243960725739402 and parameters: {'boosting_type': 'dart', 'lr': 0.13387857591396937, 'leaves': 87, 'depth': 12, 'min_child': 128, 'feat_frac': 0.7580833768533604, 'bag_frac': 0.8137059558238351, 'bag_freq': 5, 'l1': 4.296021804156931, 'l2': 0.3289493928002679, 'gamma': 2.1002111898163283, 'extra_t': False, 'max_bin': 241, 'drop': 0.08085218841275547, 'skip': 0.542365983124359}. Best is trial 35 with value: -0.579401987893854.


Best trial: 35. Best value: -0.579402:  80%|████████  | 48/60 [18:12<03:42, 18.56s/it, 1092.45/1800 seconds]

[I 2025-06-06 19:56:30,022] Trial 47 pruned. 


Best trial: 35. Best value: -0.579402:  82%|████████▏ | 49/60 [18:43<04:04, 22.24s/it, 1123.27/1800 seconds]

[I 2025-06-06 19:57:00,837] Trial 48 pruned. 


Best trial: 35. Best value: -0.579402:  83%|████████▎ | 50/60 [18:44<02:39, 15.91s/it, 1124.41/1800 seconds]

[I 2025-06-06 19:57:01,982] Trial 49 pruned. 


Best trial: 35. Best value: -0.579402:  85%|████████▌ | 51/60 [19:03<02:31, 16.79s/it, 1143.27/1800 seconds]

[I 2025-06-06 19:57:20,842] Trial 50 finished with value: -0.5731660669133289 and parameters: {'boosting_type': 'dart', 'lr': 0.14803727800052846, 'leaves': 79, 'depth': 9, 'min_child': 252, 'feat_frac': 0.9716185423051529, 'bag_frac': 0.7152266410799213, 'bag_freq': 1, 'l1': 4.764360268184389, 'l2': 1.1430180159256793, 'gamma': 1.8172740615908736, 'extra_t': True, 'max_bin': 197, 'drop': 0.24429708379758275, 'skip': 0.3839023388585794}. Best is trial 35 with value: -0.579401987893854.


Best trial: 35. Best value: -0.579402:  87%|████████▋ | 52/60 [19:15<02:02, 15.29s/it, 1155.04/1800 seconds]

[I 2025-06-06 19:57:32,614] Trial 51 pruned. 


Best trial: 35. Best value: -0.579402:  88%|████████▊ | 53/60 [19:34<01:56, 16.68s/it, 1174.99/1800 seconds]

[I 2025-06-06 19:57:52,557] Trial 52 finished with value: -0.4457555423648038 and parameters: {'boosting_type': 'dart', 'lr': 0.14028089428631552, 'leaves': 55, 'depth': 8, 'min_child': 257, 'feat_frac': 0.6901631436250277, 'bag_frac': 0.6434824875411712, 'bag_freq': 1, 'l1': 4.946560006795655, 'l2': 1.5829001810813677, 'gamma': 1.7594531141416327, 'extra_t': True, 'max_bin': 206, 'drop': 0.3146529798494843, 'skip': 0.29971641454366993}. Best is trial 35 with value: -0.579401987893854.


Best trial: 53. Best value: -0.580691:  90%|█████████ | 54/60 [19:55<01:47, 17.89s/it, 1195.68/1800 seconds]

[I 2025-06-06 19:58:13,256] Trial 53 finished with value: -0.580690514282784 and parameters: {'boosting_type': 'dart', 'lr': 0.19857041309331078, 'leaves': 255, 'depth': 9, 'min_child': 172, 'feat_frac': 0.9754014272237063, 'bag_frac': 0.66931809977155, 'bag_freq': 1, 'l1': 1.9563928569633862, 'l2': 0.7561089183959027, 'gamma': 2.1785543202146207, 'extra_t': True, 'max_bin': 175, 'drop': 0.22119536646782006, 'skip': 0.2483700005269152}. Best is trial 53 with value: -0.580690514282784.


Best trial: 53. Best value: -0.580691:  92%|█████████▏| 55/60 [20:16<01:34, 18.89s/it, 1216.91/1800 seconds]

[I 2025-06-06 19:58:34,477] Trial 54 finished with value: -0.5386418941949923 and parameters: {'boosting_type': 'dart', 'lr': 0.17278485622604162, 'leaves': 255, 'depth': 9, 'min_child': 169, 'feat_frac': 0.9665754958840989, 'bag_frac': 0.6701935547802167, 'bag_freq': 1, 'l1': 1.9860084328663687, 'l2': 3.642280112429058, 'gamma': 2.1519107336898293, 'extra_t': True, 'max_bin': 174, 'drop': 0.24398044304913172, 'skip': 0.24423692645766468}. Best is trial 53 with value: -0.580690514282784.


Best trial: 53. Best value: -0.580691:  93%|█████████▎| 56/60 [20:39<01:19, 19.99s/it, 1239.48/1800 seconds]

[I 2025-06-06 19:58:57,052] Trial 55 finished with value: -0.5189864428886858 and parameters: {'boosting_type': 'dart', 'lr': 0.17997301322594697, 'leaves': 255, 'depth': 8, 'min_child': 208, 'feat_frac': 0.9606503848652664, 'bag_frac': 0.6676608018913398, 'bag_freq': 1, 'l1': 2.0740830368378194, 'l2': 3.825626819062116, 'gamma': 1.8337757773731396, 'extra_t': True, 'max_bin': 175, 'drop': 0.28884900900925903, 'skip': 0.24559219858137765}. Best is trial 53 with value: -0.580690514282784.


Best trial: 53. Best value: -0.580691:  95%|█████████▌| 57/60 [21:01<01:01, 20.64s/it, 1261.64/1800 seconds]

[I 2025-06-06 19:59:19,213] Trial 56 finished with value: -0.43434988797024504 and parameters: {'boosting_type': 'dart', 'lr': 0.19950882297860167, 'leaves': 247, 'depth': 9, 'min_child': 166, 'feat_frac': 0.9445145750412491, 'bag_frac': 0.6212522415402298, 'bag_freq': 1, 'l1': 1.1406543686193789, 'l2': 3.970590873968383, 'gamma': 2.149734647561914, 'extra_t': True, 'max_bin': 181, 'drop': 0.22184364159662018, 'skip': 0.20053149494413514}. Best is trial 53 with value: -0.580690514282784.


Best trial: 53. Best value: -0.580691:  97%|█████████▋| 58/60 [21:25<00:43, 21.71s/it, 1285.84/1800 seconds]

[I 2025-06-06 19:59:43,410] Trial 57 finished with value: -0.5218909908319844 and parameters: {'boosting_type': 'dart', 'lr': 0.14823970926612529, 'leaves': 247, 'depth': 10, 'min_child': 225, 'feat_frac': 0.9803010207329786, 'bag_frac': 0.6647017582754398, 'bag_freq': 1, 'l1': 1.838193186625527, 'l2': 2.3507208400940076, 'gamma': 1.4409611118918684, 'extra_t': True, 'max_bin': 184, 'drop': 0.251556402496246, 'skip': 0.2412691572429682}. Best is trial 53 with value: -0.580690514282784.


Best trial: 53. Best value: -0.580691:  98%|█████████▊| 59/60 [21:38<00:19, 19.09s/it, 1298.80/1800 seconds]

[I 2025-06-06 19:59:56,372] Trial 58 pruned. 


Best trial: 53. Best value: -0.580691: 100%|██████████| 60/60 [21:55<00:00, 21.92s/it, 1315.38/1800 seconds]

[I 2025-06-06 20:00:12,951] Trial 59 pruned. 

🏆 Best F2 score: 0.580690514282784
🔧 Best params:
  boosting_type       : dart
  lr                  : 0.19857041309331078
  leaves              : 255
  depth               : 9
  min_child           : 172
  feat_frac           : 0.9754014272237063
  bag_frac            : 0.66931809977155
  bag_freq            : 1
  l1                  : 1.9563928569633862
  l2                  : 0.7561089183959027
  gamma               : 2.1785543202146207
  extra_t             : True
  max_bin             : 175
  drop                : 0.22119536646782006
  skip                : 0.2483700005269152





In [19]:
"""
lgbm_dart_final.py
==================
Train an optimal LightGBM-DART model for 4-hour BTC direction.

Requires:
    pip install lightgbm>=3.3 numpy pandas scikit-learn
    # GPU: build LightGBM with CUDA support
"""

import numpy as np
import pandas as pd
import json, time, warnings
from pathlib import Path
from sklearn.metrics import (
    fbeta_score, accuracy_score, precision_recall_fscore_support, roc_auc_score
)
from lightgbm import LGBMClassifier
from lightgbm.callback import early_stopping, log_evaluation

warnings.filterwarnings("ignore")

# ───── Config ──────────────────────────────────────────────────────────
CSV_PATH = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction"
                r"\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")

DROP_COLS = [
    'open','high','low','close','high_low','high_close','low_close','true_range',
    'typical_price','volume_mean_20','EMA_21','SMA_20','vwap_24h','close_4h',
    'bollinger_upper','bollinger_lower','MACD_line','MACD_signal',
    'resistance_level','support_level','vol_spike_1_5x','near_upper_band',
    'near_lower_band','break_upper_band','break_lower_band','rsi_oversold',
    'rsi_overbought','above_sma20','macd_positive','volume_breakout',
    'volume_breakdown','above_sma50','ema7_above_ema21','ema_cross_down',
    'ema_cross_up','stoch_overbought','stoch_oversold','cci_overbought',
    'cci_oversold','trending_market','trend_alignment','macd_cross_up',
    'macd_cross_down','macd_rising','overbought_reversal','oversold_reversal',
    'bullish_scenario_1','bullish_scenario_2','bullish_scenario_3',
    'bullish_scenario_4','bullish_scenario_5','bullish_scenario_6',
    'bearish_scenario_1','bearish_scenario_2','bearish_scenario_3',
    'bearish_scenario_4','bearish_scenario_6','timestamp','date','Unnamed: 0'
]

TEST_FRACTION      = 0.20   # chronological split
EARLY_STOP_ROUNDS  = 300
N_ESTIMATORS       = 3000
BETA               = 2.0
SEED               = 42

OPTIMAL_PARAMS = dict(
    boosting_type   = "dart",
    learning_rate   = 0.19857041309331078,
    num_leaves      = 255,
    max_depth       = 9,
    min_child_samples = 172,
    feature_fraction= 0.9754014272237063,
    bagging_fraction= 0.66931809977155,
    bagging_freq    = 1,
    reg_alpha       = 1.9563928569633862,
    reg_lambda      = 0.7561089183959027,
    min_split_gain  = 2.1785543202146207,
    extra_trees     = True,
    max_bin         = 175,
    drop_rate       = 0.22119536646782006,
    skip_drop       = 0.2483700005269152,
    objective       = "binary",
    n_estimators    = N_ESTIMATORS,
    random_state    = SEED,
    verbosity       = -1,
    device          = "gpu"     # auto-fallback to CPU handled below
)

# ───── Data load ───────────────────────────────────────────────────────
df = pd.read_csv(CSV_PATH, index_col=0, parse_dates=True)
df = df[df.index >= "2020-01-01"]
df.drop(columns=[c for c in DROP_COLS if c in df.columns], inplace=True)
df = df[df["target"].notna()].dropna()

X = df.drop(columns="target")
y = df["target"].astype(int).values
dates = df.index

split = int(len(df) * (1 - TEST_FRACTION))
X_train, X_val = X.iloc[:split], X.iloc[split:]
y_train, y_val = y[:split],      y[split:]
dates_val      = dates[split:]

print(f"Train rows: {len(X_train)} | Val rows: {len(X_val)}")

# ───── Model fit ───────────────────────────────────────────────────────
def fit_lightgbm(params):
    model = LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="binary_logloss",
        callbacks=[
            early_stopping(EARLY_STOP_ROUNDS, verbose=False),
            log_evaluation(period=200)
        ]
    )
    return model

try:
    print("Trying GPU training …")
    model = fit_lightgbm(OPTIMAL_PARAMS)
except Exception as gpu_err:
    print("⚠ GPU failed:", gpu_err, "\n→ Falling back to CPU.")
    cpu_params = OPTIMAL_PARAMS.copy(); cpu_params["device"] = "cpu"
    model = fit_lightgbm(cpu_params)

print(f"Best iteration: {model.best_iteration_}")

# ───── Evaluation ─────────────────────────────────────────────────────
prob = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:, 1]
pred = (prob >= 0.5).astype(int)

acc  = accuracy_score(y_val, pred)
f2   = fbeta_score(y_val, pred, beta=BETA, zero_division=0)
auc  = roc_auc_score(y_val, prob)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_val, pred, labels=[0,1], zero_division=0
)

print("\n── Validation ──")
print(f"Accuracy : {acc:.3f}")
print(f"AUC      : {auc:.3f}")
print(f"F{int(BETA)}-score : {f2:.3f}")

# ───── Feature importance ─────────────────────────────────────────────
imp_df = pd.DataFrame({
    "feature": X.columns,
    "importance": model.feature_importances_
}).sort_values("importance", ascending=False)

print("\nTop 10 features")
print(imp_df.head(10).to_string(index=False))

# ───── Save artefacts ─────────────────────────────────────────────────
model.booster_.save_model("lgbm_btc_direction_optimal.txt")
pd.DataFrame({
    "prob_up": prob,
    "pred_0.50": pred,
    "actual": y_val
}, index=dates_val).to_csv("lgbm_optimal_predictions.csv")
imp_df.to_csv("lgbm_feature_importance.csv", index=False)

summary = {
    "metrics": {
        "accuracy": float(acc),
        "auc":      float(auc),
        "f2":       float(f2),
        "precision_0": float(prec[0]),
        "precision_1": float(prec[1]),
        "recall_0":    float(rec[0]),
        "recall_1":    float(rec[1])
    },
    "best_iteration": int(model.best_iteration_),
    "n_features": int(X.shape[1]),
    "train_rows": int(len(X_train)),
    "val_rows": int(len(X_val)),
    "top_features": imp_df.head(15)["feature"].tolist(),
    "params": OPTIMAL_PARAMS
}

with open("lgbm_training_summary.json", "w") as f:
    json.dump(summary, f, indent=2)

print("\n✔ All artefacts saved")


Train rows: 9180 | Val rows: 2296
Trying GPU training …
[200]	valid_0's binary_logloss: 0.690121
[400]	valid_0's binary_logloss: 0.691164
[600]	valid_0's binary_logloss: 0.695676
[800]	valid_0's binary_logloss: 0.699606
[1000]	valid_0's binary_logloss: 0.701675
[1200]	valid_0's binary_logloss: 0.703902
[1400]	valid_0's binary_logloss: 0.705691
[1600]	valid_0's binary_logloss: 0.707865
[1800]	valid_0's binary_logloss: 0.713544
[2000]	valid_0's binary_logloss: 0.715526
[2200]	valid_0's binary_logloss: 0.717318
[2400]	valid_0's binary_logloss: 0.714333
[2600]	valid_0's binary_logloss: 0.716142
[2800]	valid_0's binary_logloss: 0.716964
[3000]	valid_0's binary_logloss: 0.714762
Best iteration: 0

── Validation ──
Accuracy : 0.494
AUC      : 0.504
F2-score : 0.136

Top 10 features
         feature  importance
          roc_4h         324
 buying_pressure         223
             CCI         165
             OBV         143
  obv_rising_24h         138
fear_greed_score         137
         ro

In [21]:
"""
lgbm_dart_final.py
==================
Final LightGBM-DART model for 4h BTC direction, trained on 2018+ data.
Validation is strictly the last 20% (starting 2023-08-08), to mimic real deployment.
"""

import numpy as np
import pandas as pd
import warnings, json
from pathlib import Path
from sklearn.metrics import (
    fbeta_score, accuracy_score, precision_recall_fscore_support, roc_auc_score
)
from lightgbm import LGBMClassifier
from lightgbm.callback import early_stopping, log_evaluation

warnings.filterwarnings("ignore")

# ───── Configuration ─────────────────────────────────────────────────────
CSV_PATH = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction"
                r"\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")

DROP_COLS = [
    'open','high','low','close','high_low','high_close','low_close','true_range',
    'typical_price','volume_mean_20','EMA_21','SMA_20','vwap_24h','close_4h',
    'bollinger_upper','bollinger_lower','MACD_line','MACD_signal',
    'resistance_level','support_level','vol_spike_1_5x','near_upper_band',
    'near_lower_band','break_upper_band','break_lower_band','rsi_oversold',
    'rsi_overbought','above_sma20','macd_positive','volume_breakout',
    'volume_breakdown','above_sma50','ema7_above_ema21','ema_cross_down',
    'ema_cross_up','stoch_overbought','stoch_oversold','cci_overbought',
    'cci_oversold','trending_market','trend_alignment','macd_cross_up',
    'macd_cross_down','macd_rising','overbought_reversal','oversold_reversal',
    'bullish_scenario_1','bullish_scenario_2','bullish_scenario_3',
    'bullish_scenario_4','bullish_scenario_5','bullish_scenario_6',
    'bearish_scenario_1','bearish_scenario_2','bearish_scenario_3',
    'bearish_scenario_4','bearish_scenario_6','timestamp','date','Unnamed: 0'
]

VALIDATION_START_DATE = "2023-08-08"
BETA               = 2.0
EARLY_STOP_ROUNDS  = 300
N_ESTIMATORS       = 3000
SEED               = 42

# Optimal params (lowered LR slightly for stability)
PARAMS = dict(
    boosting_type   = "dart",
    learning_rate   = 0.09,  # lowered from 0.19 for better generalization
    num_leaves      = 255,
    max_depth       = 9,
    min_child_samples = 172,
    feature_fraction= 0.975,
    bagging_fraction= 0.669,
    bagging_freq    = 1,
    reg_alpha       = 1.956,
    reg_lambda      = 0.756,
    min_split_gain  = 2.178,
    extra_trees     = True,
    max_bin         = 175,
    drop_rate       = 0.221,
    skip_drop       = 0.248,
    objective       = "binary",
    n_estimators    = N_ESTIMATORS,
    random_state    = SEED,
    verbosity       = -1,
    device          = "gpu"
)

# ───── Data Loading ─────────────────────────────────────────────────────
df = pd.read_csv(CSV_PATH, index_col=0, parse_dates=True)
df = df[df.index >= "2018-01-01"]
df.drop(columns=[c for c in DROP_COLS if c in df.columns], inplace=True)
df = df[df["target"].notna()].dropna()

X = df.drop(columns="target")
y = df["target"].astype(int).values
dates = df.index

# ───── Train/Validation Split ───────────────────────────────────────────
val_start = pd.to_datetime(VALIDATION_START_DATE)
X_train, X_val = X[dates < val_start], X[dates >= val_start]
y_train, y_val = y[dates < val_start], y[dates >= val_start]
dates_val = dates[dates >= val_start]

print(f"Train rows: {len(X_train)} | Val rows: {len(X_val)}")

# ───── Training ─────────────────────────────────────────────────────────
def train_model(params):
    model = LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="binary_logloss",
        callbacks=[
            early_stopping(EARLY_STOP_ROUNDS, verbose=False),
            log_evaluation(period=200)
        ]
    )
    return model

try:
    print("🚀 Training with GPU …")
    model = train_model(PARAMS)
except Exception as err:
    print("⚠ GPU failed:", err, "\n→ Falling back to CPU.")
    cpu_params = PARAMS.copy(); cpu_params["device"] = "cpu"
    model = train_model(cpu_params)

print(f"Best iteration: {model.best_iteration_}")

# ───── Evaluation ──────────────────────────────────────────────────────
prob = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:, 1]
pred = (prob >= 0.5).astype(int)

acc  = accuracy_score(y_val, pred)
f2   = fbeta_score(y_val, pred, beta=BETA, zero_division=0)
auc  = roc_auc_score(y_val, prob)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_val, pred, labels=[0,1], zero_division=0
)

print("\n── Validation ──")
print(f"Accuracy : {acc:.3f}")
print(f"AUC      : {auc:.3f}")
print(f"F{int(BETA)}-score : {f2:.3f}")
print(f"Precision (1): {prec[1]:.3f} | Recall (1): {rec[1]:.3f} | F1 (1): {f1[1]:.3f}")

# ───── Save Results ────────────────────────────────────────────────────
pd.DataFrame({
    "prob_up": prob,
    "pred_0.50": pred,
    "actual": y_val
}, index=dates_val).to_csv("lgbm_optimal_predictions.csv")

model.booster_.save_model("lgbm_btc_direction_optimal.txt")

imp_df = pd.DataFrame({
    "feature": X.columns,
    "importance": model.feature_importances_
}).sort_values("importance", ascending=False)
imp_df.to_csv("lgbm_feature_importance.csv", index=False)

summary = {
    "metrics": {
        "accuracy": float(acc),
        "auc":      float(auc),
        "f2":       float(f2),
        "precision_1": float(prec[1]),
        "recall_1":    float(rec[1])
    },
    "best_iteration": int(model.best_iteration_),
    "n_features": int(X.shape[1]),
    "train_rows": int(len(X_train)),
    "val_rows": int(len(X_val)),
    "top_features": imp_df.head(15)["feature"].tolist(),
    "params": PARAMS
}

with open("lgbm_training_summary.json", "w") as f:
    json.dump(summary, f, indent=2)

print("\n✔ All artefacts saved")


Train rows: 12266 | Val rows: 3589
🚀 Training with GPU …
[200]	valid_0's binary_logloss: 0.688358
[400]	valid_0's binary_logloss: 0.688617
[600]	valid_0's binary_logloss: 0.688596
[800]	valid_0's binary_logloss: 0.689414
[1000]	valid_0's binary_logloss: 0.69066
[1200]	valid_0's binary_logloss: 0.691415
[1400]	valid_0's binary_logloss: 0.691624
[1600]	valid_0's binary_logloss: 0.692386
[1800]	valid_0's binary_logloss: 0.69353
[2000]	valid_0's binary_logloss: 0.693635
[2200]	valid_0's binary_logloss: 0.694973
[2400]	valid_0's binary_logloss: 0.694637
[2600]	valid_0's binary_logloss: 0.695546
[2800]	valid_0's binary_logloss: 0.695825
[3000]	valid_0's binary_logloss: 0.696568
Best iteration: 0

── Validation ──
Accuracy : 0.527
AUC      : 0.537
F2-score : 0.422
Precision (1): 0.559 | Recall (1): 0.398 | F1 (1): 0.465

✔ All artefacts saved
