# BTC 5‑Minute Breakout Prediction
This notebook builds a predictive model for breakout events in Bitcoin prices at 5‑minute granularity.
It follows a structured pipeline:
1. Load data
2. Feature engineering (indicators, time features)
3. Breakout labeling
4. Train LightGBM model with time series cross‑validation
5. Evaluate performance
6. Save trained model


In [2]:

import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve, average_precision_score
from lightgbm import LGBMClassifier, log_evaluation
import matplotlib.pyplot as plt
import joblib


## 1. Load Data

In [3]:

def load_data(path):
    df = pd.read_csv(path)
    if "datetime" in df.columns:
        df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
    elif "timestamp" in df.columns:
        df['datetime'] = pd.to_datetime(df['timestamp'], unit='s', errors='coerce')
    else:
        df.iloc[:,0] = pd.to_datetime(df.iloc[:,0], errors='coerce')
        df = df.rename(columns={df.columns[0]:'datetime'})
    df = df.set_index('datetime').sort_index()
    return df

df = load_data("btc_5min.csv")
df.head()


Unnamed: 0_level_0,timestamp,gmtoffset,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-05-30 00:00:00+00:00,1748563200,0,105593.25,105727.351562,105593.25,105727.351562,
2025-05-30 00:05:00+00:00,1748563500,0,105730.40625,105866.625,105730.40625,105866.625,178528256.0
2025-05-30 00:10:00+00:00,1748563800,0,105881.734375,105896.65625,105876.710937,105896.65625,
2025-05-30 00:15:00+00:00,1748564100,0,105918.382812,105985.171875,105918.382812,105981.375,
2025-05-30 00:20:00+00:00,1748564400,0,105959.453125,106071.84375,105959.453125,106071.84375,97390592.0


## 2. Feature Engineering

In [9]:

def add_indicators(df):
    df['return'] = df['close'].pct_change().fillna(0)
    df['r_mean_12'] = df['close'].rolling(12).mean()
    df['r_std_12'] = df['close'].rolling(12).std().fillna(0)
    df['r_max_12'] = df['high'].rolling(12).max()
    df['r_min_12'] = df['low'].rolling(12).min()
    df['tr'] = np.maximum(df['high'] - df['low'], 
                          np.maximum(df['high'] - df['close'].shift(), 
                                     df['close'].shift() - df['low']))
    df['atr_14'] = df['tr'].rolling(14).mean().fillna(method='bfill')
    df['mom_6'] = df['close'] / df['close'].shift(6) - 1
    df['hour'] = df.index.hour
    df['minute'] = df.index.minute
    df['dow'] = df.index.dayofweek
    return df.fillna(method='ffill').fillna(method='bfill').fillna(0)

df = add_indicators(df)
df.head()


  df['return'] = df['close'].pct_change().fillna(0)
  df['atr_14'] = df['tr'].rolling(14).mean().fillna(method='bfill')
  return df.fillna(method='ffill').fillna(method='bfill').fillna(0)


Unnamed: 0_level_0,timestamp,gmtoffset,open,high,low,close,volume,label,return,r_mean_12,r_std_12,r_max_12,r_min_12,tr,atr_14,mom_6,hour,minute,dow
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2025-05-30 00:00:00+00:00,1748563200,0,105593.25,105727.351562,105593.25,105727.351562,178528256.0,0,0.0,105764.905599,0.0,106150.414062,104832.375,139.273438,195.176339,0.001268,0,0,4
2025-05-30 00:05:00+00:00,1748563500,0,105730.40625,105866.625,105730.40625,105866.625,178528256.0,0,0.001317,105764.905599,0.0,106150.414062,104832.375,139.273438,195.176339,0.001268,0,5,4
2025-05-30 00:10:00+00:00,1748563800,0,105881.734375,105896.65625,105876.710937,105896.65625,178528256.0,0,0.000284,105764.905599,0.0,106150.414062,104832.375,30.03125,195.176339,0.001268,0,10,4
2025-05-30 00:15:00+00:00,1748564100,0,105918.382812,105985.171875,105918.382812,105981.375,178528256.0,0,0.0008,105764.905599,0.0,106150.414062,104832.375,88.515625,195.176339,0.001268,0,15,4
2025-05-30 00:20:00+00:00,1748564400,0,105959.453125,106071.84375,105959.453125,106071.84375,97390592.0,0,0.000854,105764.905599,0.0,106150.414062,104832.375,112.390625,195.176339,0.001268,0,20,4


## 2bis. Feature engineering 2 (causing a loss of accuracy)

In [4]:
import numpy as np
import pandas as pd

def add_indicators(df):
    # ── Price-return statistics ───────────────────────────────────────────────
    df['return']   = df['close'].pct_change().fillna(0)
    df['r_mean_12'] = df['close'].rolling(12).mean()
    df['r_std_12']  = df['close'].rolling(12).std().fillna(0)
    df['r_max_12']  = df['high'].rolling(12).max()
    df['r_min_12']  = df['low'].rolling(12).min()

    # ── True Range & ATR ──────────────────────────────────────────────────────
    df['tr'] = np.maximum(df['high'] - df['low'],
                  np.maximum(df['high'] - df['close'].shift(),
                             df['close'].shift() - df['low']))
    df['atr_14'] = df['tr'].rolling(14).mean().bfill()

    # ── Simple momentum ───────────────────────────────────────────────────────
    df['mom_6'] = df['close'] / df['close'].shift(6) - 1

    # ── Time-of-day dummies ───────────────────────────────────────────────────
    df['hour']   = df.index.hour
    df['minute'] = df.index.minute
    df['dow']    = df.index.dayofweek

    # ── NEW: RSI (14) ─────────────────────────────────────────────────────────
    delta      = df['close'].diff()
    gain       = delta.clip(lower=0)
    loss       = (-delta).clip(lower=0)
    avg_gain   = gain.rolling(14).mean()
    avg_loss   = loss.rolling(14).mean()
    rs         = avg_gain / (avg_loss + 1e-9)
    df['rsi_14'] = 100 - (100 / (1 + rs))
    df['rsi_14'].fillna(50, inplace=True)   # neutral where insufficient history

    # ── NEW: Bollinger-band width (20, 2σ) ────────────────────────────────────
    ma_20    = df['close'].rolling(20).mean()
    std_20   = df['close'].rolling(20).std()
    df['bb_width'] = (2 * std_20 * 2) / ma_20   # (upper-lower)/mid = 4·σ / MA

    # ── NEW: Volume activity ratios (if volume exists) ────────────────────────
    if 'volume' in df.columns:
        df['vol_avg_20'] = df['volume'].rolling(20).mean()
        df['vol_ratio']  = df['volume'] / (df['vol_avg_20'] + 1e-9)
        df['vol_std_20'] = df['volume'].rolling(20).std().fillna(0)

    # ── Final cleanup ─────────────────────────────────────────────────────────
    return (df
            .ffill()   # forward-fill indicators
            .bfill()   # back-fill any leading NAs
            .fillna(0))


## 3. Breakout Labeling

In [10]:

def label_breakouts(df, lookback=12, horizon=6, thresh=0.002):
    r_max = df['high'].rolling(lookback).max().shift(1)
    r_min = df['low'].rolling(lookback).min().shift(1)
    future_max = df['high'].rolling(horizon).max().shift(- (horizon - 1))
    future_min = df['low'].rolling(horizon).min().shift(- (horizon - 1))
    up_break = (future_max > (1 + thresh) * r_max)
    down_break = (future_min < (1 - thresh) * r_min)
    label = np.where(up_break, 1, np.where(down_break, -1, 0))
    df['label'] = label
    return df

df = label_breakouts(df)
df['label'].value_counts(normalize=True)


label
 0    0.841035
-1    0.083128
 1    0.075837
Name: proportion, dtype: float64

## 4. Prepare Dataset

In [11]:

def prepare_dataset(df):
    feature_cols = [c for c in df.columns if c not in ['label','tr']]
    X = df[feature_cols].copy()
    y = df['label'].copy()
    mask = ~y.isna()
    return X.loc[mask], y.loc[mask]

X, y = prepare_dataset(df)
X.shape, y.shape


((34429, 17), (34429,))

## 5. Train LightGBM Model

In [None]:

def train_lgbm(X, y, params=None, n_splits=5):
    if params is None:
        params = {
            'n_estimators': 1000,
            'learning_rate': 0.05,
            'num_leaves': 31,
            'max_depth': -1,
            'n_jobs': -1
        }
    clf = LGBMClassifier(
    **params,
    class_weight="balanced")
    tscv = TimeSeriesSplit(n_splits=n_splits)
    oof_preds = np.zeros((len(y), 3))
    for train_idx, val_idx in tscv.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        clf.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[log_evaluation(0)]
        )
        proba = clf.predict_proba(X_val)
        oof_preds[val_idx, :] = proba
    return clf, oof_preds

clf, oof = train_lgbm(X, y, n_splits=5)


## 5bis. Train LightGBM Model

In [12]:
from collections import Counter
import numpy as np
from lightgbm import LGBMClassifier, log_evaluation
from sklearn.model_selection import TimeSeriesSplit

def train_lgbm(X, y, params=None, n_splits=5):
    # 1️⃣ Default hyper-parameters (unchanged)
    if params is None:
        params = {
            'n_estimators': 1_000,
            'learning_rate': 0.05,
            'num_leaves': 31,
            'max_depth': -1,
            'n_jobs': -1
        }

    # 2️⃣ Custom class weights → favor precision
    #    - Heavier weight on “no-breakout” (-1) to curb false signals
    #    - Slight boost for “up-breakout” (+1) to retain sensitivity
    freq   = Counter(y)           # class frequencies
    total  = sum(freq.values())
    weights = {
        -1: total / freq[-1] * 2.5,   # “no” breakout
         1: total / freq[ 1] * 1.2,   # up breakout
         0: total / freq[ 0]          # down breakout
    }

    clf = LGBMClassifier(
        **params,
        class_weight=weights
    )

    # 3️⃣ Time-series cross-validation
    tscv = TimeSeriesSplit(n_splits=n_splits)
    oof_preds = np.zeros((len(y), 3))

    for train_idx, val_idx in tscv.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        clf.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[log_evaluation(0)]
        )

        proba = clf.predict_proba(X_val)
        oof_preds[val_idx, :] = proba

    return clf, oof_preds

clf, oof = train_lgbm(X, y, n_splits=5)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000689 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3358
[LightGBM] [Info] Number of data points in the train set: 5739, number of used features: 16
[LightGBM] [Info] Start training from score -0.585596
[LightGBM] [Info] Start training from score -1.644342
[LightGBM] [Info] Start training from score -1.385949
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3360
[LightGBM] [Info] Number of data points in the train set: 11477, number of used features: 16
[LightGBM] [Info] Start training from score -0.647793
[LightGBM] [Info] Start training from score -1.560872
[LightGBM] [Info] Start training from score -1.321077
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000633 seco

## 6. Evaluate Model

In [None]:

def evaluate_predictions(y_true, proba_preds):
    y_pred = np.argmax(proba_preds, axis=1) - 1
    precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
    avg_prec = average_precision_score(pd.get_dummies(y_true), proba_preds)
    return {'precision_macro': precision, 'recall_macro': recall, 
            'f1_macro': f1, 'average_precision': avg_prec}

metrics = evaluate_predictions(y.values, oof)
metrics


## 7. Save Model

In [None]:

joblib.dump(clf, "lgbm_breakout.pkl")
