# In this notebook we would create the LIGHTGBM model


In [8]:
"""
Mini LightGBM hyper-parameter search (15 random trials) for BTC direction
------------------------------------------------------------------------
* Keeps chronological 80 / 20 split
* Evaluates weighted-F1 (precision ×2)
* Prints the best parameter set and its weighted-F1
"""

import numpy as np, pandas as pd, random, itertools, time
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import precision_recall_fscore_support
from pathlib import Path

# ───────── config ─────────
CSV_PATH  = r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_data_final_version_with_features_2016_final.csv"
DROP_COLS = ["vol_ratio_24h", "macd_diff", "macd_line", "upper_shadow", "lower_shadow"]
VAL_FRAC  = 0.20
W_PREC    = 2.0
N_TRIALS  = 15          # “a little bit more runs”

# ───────── data prep (same as before) ─────────
df = pd.read_csv(CSV_PATH, index_col=0, parse_dates=True)
df.drop(columns=[c for c in DROP_COLS if c in df.columns], inplace=True)
df["Volume BTC"] = np.log1p(df["Volume BTC"])
df["target"] = (df["close"].shift(-1) > df["close"]).astype(int)
df = df.dropna().select_dtypes(include=[np.number])

X = df.drop(columns=["target"])
y = df["target"].astype(int)
split_idx = int(len(df) * (1 - VAL_FRAC))
X_train, X_val = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_val = y.iloc[:split_idx], y.iloc[split_idx:]

# ───────── search space ─────────
space = {
    "learning_rate"   : [0.01, 0.02, 0.05],
    "num_leaves"      : [31, 63, 127],
    "feature_fraction": [0.8, 0.9],
    "bagging_fraction": [0.7, 0.8],
    "bagging_freq"    : [1],
    "min_child_samples": [20, 40]
}

def random_param():
    return {k: random.choice(v) for k, v in space.items()}

# ───────── weighted-F1 helper ─────────
def weighted_f1(y_true, y_pred_prob, thr=0.5, w=2.0):
    y_pred = (y_pred_prob >= thr).astype(int)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", pos_label=1, zero_division=0
    )
    return (1 + w) * prec * rec / (w * prec + rec + 1e-12)

# ───────── mini search ─────────
best_score, best_params = -1, None
tic = time.time()

for t in range(1, N_TRIALS + 1):
    params = random_param()
    model = LGBMClassifier(
        objective="binary",
        n_estimators=2000,
        **params,
        verbose=-1,
        random_state=42+t
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="binary_logloss",
        callbacks=[
            early_stopping(stopping_rounds=150),
            log_evaluation(0)  # silent
        ]
    )
    y_prob = model.predict_proba(X_val)[:, 1]
    score = weighted_f1(y_val, y_prob, w=W_PREC)

    print(f"Trial {t:02d} → weighted-F1 = {score:.4f}  |  params: {params}")
    if score > best_score:
        best_score, best_params = score, params

toc = time.time()
print(f"\nSearch finished in {toc - tic:.1f} s")

print("\n──────── Best parameters ────────")
for k, v in best_params.items():
    print(f"{k:<17}: {v}")
print(f"Best weighted-F1 : {best_score:.4f}")


Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[22]	valid_0's binary_logloss: 0.69157
Trial 01 → weighted-F1 = 0.4786  |  params: {'learning_rate': 0.02, 'num_leaves': 63, 'feature_fraction': 0.8, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'min_child_samples': 40}
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[5]	valid_0's binary_logloss: 0.692375
Trial 02 → weighted-F1 = 0.5135  |  params: {'learning_rate': 0.05, 'num_leaves': 127, 'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'min_child_samples': 20}
Training until validation scores don't improve for 150 rounds
Early stopping, best iteration is:
[22]	valid_0's binary_logloss: 0.691437
Trial 03 → weighted-F1 = 0.4799  |  params: {'learning_rate': 0.02, 'num_leaves': 63, 'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'min_child_samples': 40}
Training until validation scores don't improve for

In [9]:
"""
Final LightGBM model
--------------------
Uses the best mini-search parameters:

    learning_rate     = 0.05
    num_leaves        = 63
    feature_fraction  = 0.9
    bagging_fraction  = 0.8
    bagging_freq      = 1
    min_child_samples = 20

Prints accuracy, precision, recall and F1 for each class.
"""

# ───────────────────── imports ─────────────────────
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ────────── file path & columns to drop ────────────
CSV_PATH  = r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_data_final_version_with_features_2016_final.csv"
DROP_COLS = ["vol_ratio_24h", "macd_diff", "macd_line",
             "upper_shadow", "lower_shadow"]

VAL_FRAC = 0.20         # 80 % train · 20 % validation
PREC_W   = 2.0          # precision weight for weighted-F1

# ───────────────── data preparation ────────────────
df = pd.read_csv(CSV_PATH, index_col=0, parse_dates=True)
df.drop(columns=[c for c in DROP_COLS if c in df.columns], inplace=True)
df["Volume BTC"] = np.log1p(df["Volume BTC"])

df["target"] = (df["close"].shift(-1) > df["close"]).astype(int)   # 1 = Up
df = df.dropna().select_dtypes(include=[np.number])

X = df.drop(columns=["target"])
y = df["target"].astype(int)

split_idx = int(len(df) * (1 - VAL_FRAC))
X_train, X_val = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_val = y.iloc[:split_idx], y.iloc[split_idx:]

# ─────────── best-parameter LightGBM model ─────────
best_params = dict(
    objective         = "binary",
    learning_rate     = 0.05,
    num_leaves        = 63,
    feature_fraction  = 0.9,
    bagging_fraction  = 0.8,
    bagging_freq      = 1,
    min_child_samples = 20,
    n_estimators      = 4000,   # large upper bound – early stop will trim
    verbose           = -1,
    random_state      = 42
)

model = LGBMClassifier(**best_params)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="binary_logloss",
    callbacks=[
        early_stopping(stopping_rounds=300, first_metric_only=True),
        log_evaluation(100)
    ]
)

# ─────────────────── evaluation ────────────────────
y_prob = model.predict_proba(X_val)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

acc  = accuracy_score(y_val, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_val, y_pred, labels=[0, 1], zero_division=0
)

print("\n──── Validation metrics (thr = 0.50) ────")
print(f"Accuracy          : {acc:6.3f}")
print(f"Class 0 (Down) →  Precision: {prec[0]:6.3f}  Recall: {rec[0]:6.3f}  F1: {f1[0]:6.3f}")
print(f"Class 1 (Up  ) →  Precision: {prec[1]:6.3f}  Recall: {rec[1]:6.3f}  F1: {f1[1]:6.3f}")
print(f"Macro-F1          : {f1.mean():6.3f}")


Training until validation scores don't improve for 300 rounds
[100]	valid_0's binary_logloss: 0.701439
[200]	valid_0's binary_logloss: 0.705638
[300]	valid_0's binary_logloss: 0.709782
Early stopping, best iteration is:
[6]	valid_0's binary_logloss: 0.691879
Evaluated only: binary_logloss

──── Validation metrics (thr = 0.50) ────
Accuracy          :  0.529
Class 0 (Down) →  Precision:  0.517  Recall:  0.580  F1:  0.547
Class 1 (Up  ) →  Precision:  0.544  Recall:  0.481  F1:  0.511
Macro-F1          :  0.529
