In [1]:
# import os
# from datetime import datetime, timedelta
# import pandas as pd
# from binance.client import Client
# from dotenv import load_dotenv

# load_dotenv()
# api_key = os.getenv("API_KEY")
# api_secret = os.getenv("API_SECRET")

# client = Client(api_key, api_secret)

# three_months_ago = datetime.now() - timedelta(days=90)

# klines = client.get_historical_klines(
#     symbol='BTCUSDT',
#     interval=Client.KLINE_INTERVAL_5MINUTE,
#     start_str=three_months_ago.strftime("%Y-%m-%d %H:%M:%S")
# )

# df = pd.DataFrame(klines, columns=[
#     'Open time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Close time',
#     'Quote asset volume', 'Number of trades', 'Taker buy base asset volume',
#     'Taker buy quote asset volume', 'Ignore'
# ])

# df['Open time'] = pd.to_datetime(df['Open time'], unit='ms')
# price_df = df[['Open time', 'Close']].copy()
# price_df['Close'] = pd.to_numeric(price_df['Close'])

# print(price_df)

In [2]:
import os
import numpy as np
import pandas as pd
import pandas_ta as ta
import xgboost as xgb
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pyarrow.parquet as pq

from sklearn.metrics import accuracy_score, classification_report
from binance.client import Client
from dotenv import load_dotenv
from warnings import simplefilter

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from xgboost import plot_importance

# Suppress FutureWarning and DeprecationWarning messages for the rest of the session.
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

# Load variables from a .env file (if any) into the environment, then read the
# Binance credentials; each value is None when its variable is not set.
load_dotenv()
api_key = os.getenv("API_KEY")
api_secret = os.getenv("API_SECRET")

  from pkg_resources import get_distribution, DistributionNotFound


In [3]:
# Download 5-minute BTCUSDT candles from Binance and cache them as parquet.
client = Client(api_key, api_secret)
# NOTE: despite its name, this timestamp lies 365*5 days (about five years) in the past.
one_year_ago = datetime.now() - timedelta(days=365*5)
klines = client.get_historical_klines(
    symbol='BTCUSDT',
    interval=Client.KLINE_INTERVAL_5MINUTE,
    start_str=one_year_ago.strftime("%Y-%m-%d %H:%M:%S")
)

# Name the twelve fields of each kline record; the first one becomes 'Date'.
df = pd.DataFrame(klines, columns=[
    'Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Close time',
    'Quote asset volume', 'Number of trades', 'Taker buy base asset volume',
    'Taker buy quote asset volume', 'Ignore'
])

# 'Date' arrives as epoch milliseconds; unparseable numeric values become NaN
# and the affected rows are dropped below.
df['Date'] = pd.to_datetime(df['Date'], unit='ms')
numeric_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Number of trades']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df[['Date', 'Open', 'High', 'Low', 'Close', 'Volume']].dropna()
print(f"{df.shape}")
# Write under the file name that the later cells read ("btc_5m_5years.parquet").
# The previous name ("btc_5m.parquet") was never read back, so a fresh
# Restart & Run All failed at the next cell.
df.to_parquet("btc_5m_5years.parquet")

(525230, 6)


In [None]:
# Load the cached price data from a local parquet file.
df = pd.read_parquet("btc_5m_5years.parquet")

def create_volume_bars(df, volume_threshold=1000):
    """Aggregate consecutive rows of ``df`` into bars of at least ``volume_threshold`` volume.

    Rows are consumed in order. A bar opens on the first row after the previous
    bar closed, tracks the running High/Low, and closes on the row where the
    accumulated ``Volume`` reaches ``volume_threshold``. A trailing bar that
    never reaches the threshold is discarded.

    Args:
        df: frame with ``Date``, ``Open``, ``High``, ``Low``, ``Close`` and
            ``Volume`` columns.
        volume_threshold: accumulated volume at which a bar is closed.

    Returns:
        DataFrame with one row per completed bar and the columns ``Date``
        (taken from the bar's first row), ``Open``, ``High``, ``Low``,
        ``Close`` and ``Volume`` (the accumulated volume, which may exceed the
        threshold). When no bar completes the frame is empty but still carries
        these columns.
    """
    columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
    bars = []
    current_bar = None
    cumulative_volume = 0
    # Walk the columns in lockstep instead of DataFrame.iterrows(), which
    # builds a full Series object for every input row.
    for date, open_, high, low, close, volume in zip(*(df[name] for name in columns)):
        if current_bar is None:
            current_bar = {'Date': date, 'Open': open_, 'High': high, 'Low': low}
        else:
            current_bar['High'] = max(current_bar['High'], high)
            current_bar['Low'] = min(current_bar['Low'], low)
        cumulative_volume += volume
        if cumulative_volume >= volume_threshold:
            current_bar['Close'] = close
            current_bar['Volume'] = cumulative_volume
            bars.append(current_bar)
            current_bar = None
            cumulative_volume = 0
    # Passing `columns` keeps the schema stable even when `bars` is empty.
    return pd.DataFrame(bars, columns=columns)

# Build volume bars, closing each bar once 1000 units of Volume have accumulated.
df_vb = create_volume_bars(df, volume_threshold=1000)

def get_weights_ffd(d, size):
    """Return the fractional-differencing weights for order ``d`` as a column vector.

    The weights follow the recursion ``w_0 = 1``, ``w_k = -w_{k-1} * (d - k + 1) / k``
    for ``k = 1 .. size - 1``. The result has shape ``(n, 1)`` and is stored in
    reverse order, so the last entry is ``w_0`` (the weight of the most recent
    observation). At least one weight is always returned.
    """
    weights = [1.]
    k = 1
    while k < size:
        weights.append(-weights[-1] / k * (d - k + 1))
        k += 1
    weights.reverse()
    return np.array(weights).reshape(-1, 1)

def frac_diff_ffd(series, d, thres=1e-5):
    """Fractionally differentiate every column of ``series`` with order ``d``.

    One weight per input row is generated with ``get_weights_ffd``. Despite the
    ``ffd`` suffix the window is not truncated to a fixed width: the value at
    row position ``p`` is the dot product of the ``p + 1`` most recent weights
    with all forward-filled observations up to and including ``p``.

    Args:
        series: DataFrame whose columns are differentiated independently.
        d: differencing order passed to ``get_weights_ffd``.
        thres: rows are skipped while the normalised cumulative absolute weight
            (accumulated in stored order) is still below this value.

    Returns:
        DataFrame indexed like ``series`` with one column per input column;
        skipped rows and rows whose original value is not finite stay NaN.
    """
    weights = get_weights_ffd(d, len(series))
    cum_weight = np.cumsum(np.abs(weights))
    cum_weight /= cum_weight[-1]
    first_pos = np.searchsorted(cum_weight, thres)

    result = {}
    for column in series.columns:
        # Forward-fill gaps, then drop whatever is still missing at the start.
        filled = series[[column]].ffill().dropna()
        diffed = pd.Series(index=series.index, dtype=float)
        for pos in range(first_pos, filled.shape[0]):
            label = filled.index[pos]
            # Only emit a value where the original (un-filled) observation exists.
            if np.isfinite(series.loc[label, column]):
                history = filled.iloc[:pos + 1]
                diffed[label] = np.dot(weights[-(pos + 1):, :].T, history)[0, 0]
        result[column] = diffed
    return pd.concat(result, axis=1)

print("Applying Fractional Differentiation...")
# Close prices of the volume bars, indexed by each bar's Date.
close_prices_vb = df_vb.set_index('Date')[['Close']]
frac_close = frac_diff_ffd(close_prices_vb, d=0.4).dropna()
# Keep only the bars that received a differentiated value and attach it.
df_vb = df_vb.set_index('Date').loc[frac_close.index].reset_index()
df_vb = df_vb.assign(frac_close=frac_close['Close'].values)

Applying Fractional Differentiation...


In [5]:
def calculate_features(df):
    """Append indicator and price-shape features to ``df`` and return it.

    ``df`` is modified in place (the pandas_ta calls use ``append=True`` and
    further columns are assigned directly), so pass a copy to keep the input
    untouched. The frame must provide ``High``, ``Low`` and ``Close`` columns
    plus whatever the pandas_ta indicators require.

    Columns added directly by this function:
        slope_24: least-squares slope of ``Close`` over each 24-row window.
        bbw_pct: (upper band - lower band) / middle band, read from the
            ``BBU_20_2.0``, ``BBL_20_2.0`` and ``BBM_20_2.0`` columns.
        clv: ((Close - Low) - (High - Close)) / (High - Low + 1e-12).
        clv_mean_12: 12-row rolling mean of ``clv``.
    """
    print("Calculating features...")
    df.ta.adx(length=14, append=True)
    df.ta.aroon(length=25, append=True)
    df.ta.bbands(length=20, append=True)
    df.ta.atr(length=14, append=True)
    df.ta.obv(append=True)

    def _window_slope(values):
        # `values` is a plain ndarray because the rolling apply uses raw=True.
        if np.isnan(values).any():
            return np.nan
        return np.polyfit(np.arange(len(values)), values, 1)[0]

    # raw=True hands each window over as an ndarray instead of building a
    # Series per window; the fitted slope is numerically the same.
    df['slope_24'] = df['Close'].rolling(window=24).apply(_window_slope, raw=True)
    df['bbw_pct'] = (df['BBU_20_2.0'] - df['BBL_20_2.0']) / df['BBM_20_2.0']
    df.ta.chop(length=14, append=True)
    # The tiny constant avoids a division by zero when High == Low.
    df['clv'] = ((df['Close'] - df['Low']) - (df['High'] - df['Close'])) / (df['High'] - df['Low'] + 1e-12)
    df['clv_mean_12'] = df['clv'].rolling(window=12).mean()
    return df

# Compute the features on a copy so that df_vb itself is left unchanged.
df_features = calculate_features(df_vb.copy())

Calculating features...


In [6]:
def get_regime_labels(close_series, look_forward_period=20):
    """Label each row by the dispersion of the closes that follow it.

    For every row the standard deviation of the next ``look_forward_period``
    closes is computed. The 33% and 66% quantiles of that measure, taken over
    the whole series passed in, serve as cut-offs.

    Returns:
        Integer Series indexed like ``close_series``:
        1 where the forward dispersion is at or above the 66% quantile,
        0 where it is at or below the 33% quantile (0 wins if both hold),
        -1 everywhere else, including rows where the measure is undefined.
    """
    print("Creating regime labels...")
    shifted_close = close_series.shift(-look_forward_period)
    future_vol = shifted_close.rolling(look_forward_period).std()
    low_cut = future_vol.quantile(0.33)
    high_cut = future_vol.quantile(0.66)
    # np.select picks the first matching condition, so the low bucket is listed first.
    codes = np.select(
        [(future_vol <= low_cut).to_numpy(), (future_vol >= high_cut).to_numpy()],
        [0, 1],
        default=-1,
    )
    return pd.Series(codes, index=close_series.index, dtype=int)

# Attach the forward-looking regime label, then keep only rows labelled 0 or 1
# (-1 means no regime was assigned) and drop every row that has a missing value.
df_features['regime'] = get_regime_labels(df_features['Close'])
df_model_data = df_features[df_features['regime'] != -1].dropna()


Creating regime labels...


In [11]:
print("Preparing data for the model...")
# Target is the regime label; features are all remaining columns except the
# raw bar fields, the unsmoothed clv and frac_close.
y = df_model_data['regime']
X = df_model_data.drop(columns=['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'clv', 'regime', 'frac_close'])

# Positional hold-out without shuffling: the first 80% of the rows train the
# model, the remaining 20% are kept for testing.
test_size = 0.2
split_index = int(len(X) * (1 - test_size))
X_train, y_train = X.iloc[:split_index], y.iloc[:split_index]
X_test, y_test = X.iloc[split_index:], y.iloc[split_index:]

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Shape before dropping NaNs from features: {df_features.shape}")
# NOTE: this mutates df_features in place after X and y were already built,
# so on a second run of this cell both printed shapes are identical.
df_features.dropna(inplace=True)
print(f"Shape after dropping NaNs: {df_features.shape}")

Preparing data for the model...
Training data shape: (55172, 17)
Test data shape: (13794, 17)
Shape before dropping NaNs from features: (102958, 26)
Shape after dropping NaNs: (102958, 26)


In [None]:
import os
import numpy as np
import pandas as pd
import pandas_ta as ta
import xgboost as xgb
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from binance.client import Client
from dotenv import load_dotenv
from warnings import simplefilter
from xgboost import plot_importance

# Suppress FutureWarning and DeprecationWarning messages for the rest of the session.
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

# Load variables from a .env file (if any) into the environment, then read the
# Binance credentials; each value is None when its variable is not set.
load_dotenv()
api_key = os.getenv("API_KEY")
api_secret = os.getenv("API_SECRET")

def download_and_cache_data(file_path="btc_5m_5years.parquet", years=5):
    """Return 5-minute BTCUSDT candles, using ``file_path`` as an on-disk cache.

    If ``file_path`` exists it is read and returned as is. Otherwise the
    candles starting ``365 * years`` days before now are requested from
    Binance (using the module-level ``api_key`` / ``api_secret``), reduced to
    the Date/Open/High/Low/Close/Volume columns, written to ``file_path`` and
    returned.

    Args:
        file_path: parquet file used as cache.
        years: look-back length in 365-day years; only used when downloading.

    Returns:
        DataFrame of candles. When freshly downloaded, ``Date`` is a datetime
        column, the other columns are numeric and rows with missing values
        have been removed.
    """
    if not os.path.exists(file_path):
        print("Downloading historical data (this may take a while)...")
        binance = Client(api_key, api_secret)
        lookback_start = datetime.now() - timedelta(days=365 * years)
        raw_klines = binance.get_historical_klines(
            symbol='BTCUSDT',
            interval=Client.KLINE_INTERVAL_5MINUTE,
            start_str=lookback_start.strftime("%Y-%m-%d %H:%M:%S")
        )
        kline_fields = [
            'Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Close time',
            'Quote asset volume', 'Number of trades', 'Taker buy base asset volume',
            'Taker buy quote asset volume', 'Ignore'
        ]
        candles = pd.DataFrame(raw_klines, columns=kline_fields)
        # The first field holds epoch milliseconds.
        candles['Date'] = pd.to_datetime(candles['Date'], unit='ms')
        ohlcv = ['Open', 'High', 'Low', 'Close', 'Volume']
        # Values that cannot be parsed become NaN and are dropped below.
        candles[ohlcv] = candles[ohlcv].apply(pd.to_numeric, errors='coerce')
        candles = candles[['Date'] + ohlcv].dropna()
        candles.to_parquet(file_path)
        return candles

    print(f"Loading data from cached file: {file_path}")
    return pd.read_parquet(file_path)

# Fetch the time bars, reading the parquet cache when it already exists.
df_time_bars = download_and_cache_data()

def create_volume_bars(df, volume_threshold=1000):
    """Aggregate consecutive rows of ``df`` into volume bars indexed by ``Date``.

    Rows are consumed in order. A bar opens on the first row after the previous
    bar closed, tracks the running High/Low, and closes on the row where the
    accumulated ``Volume`` reaches ``volume_threshold``. A trailing bar that
    never reaches the threshold is discarded.

    Args:
        df: frame with ``Date``, ``Open``, ``High``, ``Low``, ``Close`` and
            ``Volume`` columns.
        volume_threshold: accumulated volume at which a bar is closed.

    Returns:
        DataFrame indexed by ``Date`` (taken from the bar's first row) with the
        columns ``Open``, ``High``, ``Low``, ``Close`` and ``Volume`` (the
        accumulated volume, which may exceed the threshold). When no bar
        completes, an empty frame with the same layout is returned instead of
        failing on the missing ``Date`` column.
    """
    print(f"Creating Volume Bars with threshold: {volume_threshold} BTC...")
    columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
    bars = []
    current_bar = None
    cumulative_volume = 0
    # Walk the columns in lockstep instead of DataFrame.iterrows(), which
    # builds a full Series object for every input row.
    for date, open_, high, low, close, volume in zip(*(df[name] for name in columns)):
        if current_bar is None:
            current_bar = {'Date': date, 'Open': open_, 'High': high, 'Low': low}
        else:
            current_bar['High'] = max(current_bar['High'], high)
            current_bar['Low'] = min(current_bar['Low'], low)
        cumulative_volume += volume
        if cumulative_volume >= volume_threshold:
            current_bar['Close'] = close
            current_bar['Volume'] = cumulative_volume
            bars.append(current_bar)
            current_bar = None
            cumulative_volume = 0
    # Passing `columns` guarantees that 'Date' exists even when `bars` is empty.
    return pd.DataFrame(bars, columns=columns).set_index('Date')

# Build volume bars, closing each bar once 2000 units of Volume have accumulated.
df_vb = create_volume_bars(df_time_bars, volume_threshold=2000)

def get_weights_ffd(d, size):
    """Build the fractional-differencing weights of order ``d``.

    Starting from ``w_0 = 1`` each further weight is derived from its
    predecessor as ``w_k = -w_{k-1} / k * (d - k + 1)``. The weights are
    returned as an ``(n, 1)`` array in reverse order (``w_0`` last); the array
    always holds at least the single weight ``1``.
    """
    coeffs = [1.]
    for lag in range(1, size):
        previous = coeffs[-1]
        coeffs.append(-previous / lag * (d - lag + 1))
    column = np.array(list(reversed(coeffs)))
    return column.reshape(-1, 1)

def frac_diff_ffd(series, d, thres=1e-5):
    """Apply fractional differentiation of order ``d`` to each column of ``series``.

    ``get_weights_ffd`` supplies one weight per input row. The window is not
    cut to a fixed width (despite the ``ffd`` suffix): the output at row
    position ``p`` combines the ``p + 1`` most recent weights with every
    forward-filled observation up to and including ``p``.

    Args:
        series: DataFrame; each column is processed on its own.
        d: differencing order handed to ``get_weights_ffd``.
        thres: leading rows are skipped while the normalised cumulative
            absolute weight (accumulated in stored order) stays below it.

    Returns:
        DataFrame with the index of ``series`` and one column per input
        column. Skipped rows and rows whose original value is not finite are NaN.
    """
    w = get_weights_ffd(d, len(series))
    share = np.cumsum(np.abs(w))
    share /= share[-1]
    start_at = np.searchsorted(share, thres)

    columns_out = {}
    for col in series.columns:
        # Fill gaps forward; rows still missing at the start are removed.
        observed = series[[col]].ffill().dropna()
        target = pd.Series(index=series.index, dtype=float)
        for position in range(start_at, observed.shape[0]):
            stamp = observed.index[position]
            if np.isfinite(series.loc[stamp, col]):
                tail_weights = w[-(position + 1):, :].T
                target[stamp] = np.dot(tail_weights, observed.iloc[:position + 1])[0, 0]
        columns_out[col] = target
    return pd.concat(columns_out, axis=1)

print("Applying Fractional Differentiation to the 'Close' price...")
# Differentiate Close, expose it as 'frac_close' and attach it to the bars.
# Joining a second time would fail because 'frac_close' then already exists.
frac_close = frac_diff_ffd(df_vb[['Close']], d=0.4).rename(columns={'Close': 'frac_close'})
df_vb = df_vb.join(frac_close)
print("Fractional Differentiation complete.")

def get_daily_vol(close, lookback=100):
    """Exponentially weighted standard deviation of the row-to-row returns of ``close``.

    Returns are simple percentage changes between consecutive rows (so the
    sampling of ``close`` decides what one "period" means); ``lookback`` is the
    span of the exponential weighting. The result is indexed like ``close``.
    """
    return close.pct_change().ewm(span=lookback).std()

def apply_pt_sl_on_t1(close, events, pt_sl, molecule):
    """Find, per event, the first time the price path crosses each horizontal barrier.

    Args:
        close: price Series; its index must be sorted so it can be sliced by label.
        events: DataFrame indexed by event start with columns ``t1`` (end of
            the scan window, may be missing) and ``trgt`` (barrier unit width).
        pt_sl: pair ``(pt_mult, sl_mult)``; the profit-taking barrier sits at
            ``+pt_mult * trgt`` and the stop-loss barrier at ``-sl_mult * trgt``
            of return relative to the price at the event start.
        molecule: labels of the events to process.

    Returns:
        DataFrame indexed like the selected events with columns ``t1`` (copied
        unchanged from ``events``), ``sl`` and ``pt``. ``sl`` / ``pt`` hold the
        first index label at which the return path is below / above the
        respective barrier, or a missing value when it is never crossed.
    """
    events_ = events.loc[molecule]
    out = events_[['t1']].copy(deep=True)
    pt_mult, sl_mult = pt_sl
    pt = events_['trgt'] * pt_mult
    sl = events_['trgt'] * sl_mult
    # An event without an end time is scanned up to the last available price
    # instead of being sliced with a missing label.
    last_label = close.index[-1] if len(close) else pd.NaT
    scan_end = events_['t1'].fillna(last_label)
    sl_hits, pt_hits = [], []
    for loc, t1 in scan_end.items():
        # .loc makes both lookups label-based regardless of the index type.
        path = close.loc[loc:t1]
        path = path / close.loc[loc] - 1
        sl_hits.append(path[path < -sl[loc]].index.min())
        pt_hits.append(path[path > pt[loc]].index.min())
    # Assign whole columns at once so their dtype is inferred from all values
    # rather than from cell-by-cell enlargement.
    out['sl'] = sl_hits
    out['pt'] = pt_hits
    return out

def get_events(close, t_events, pt_sl, trgt, min_ret, t1=None):
    """Assemble the events table and set each event's end to its first barrier touch.

    Args:
        close: price Series handed to ``apply_pt_sl_on_t1``.
        t_events: labels of the candidate event start times.
        pt_sl: ``(pt_mult, sl_mult)`` pair handed to ``apply_pt_sl_on_t1``.
        trgt: Series of barrier unit widths; it is looked up at ``t_events``
            and only entries greater than ``min_ret`` are kept.
        min_ret: minimum target an event needs in order to be kept.
        t1: optional Series of end times per event; when omitted, every event
            starts with a missing end time.

    Returns:
        DataFrame with columns ``t1`` and ``trgt``, where ``t1`` is the
        earliest of the supplied end time and the two barrier-touch times.
    """
    target = trgt.loc[t_events]
    target = target[target > min_ret]
    end_times = pd.Series(pd.NaT, index=t_events) if t1 is None else t1
    # Outer-align both series, then discard events that have no usable target.
    events = pd.concat({'t1': end_times, 'trgt': target}, axis=1).dropna(subset=['trgt'])
    touches = apply_pt_sl_on_t1(close, events, pt_sl, events.index)
    events['t1'] = touches.min(axis=1)
    return events

def get_bins(events, close):
    """Compute the realised return and its sign for every event that has an end time.

    Args:
        events: DataFrame indexed by event start with a ``t1`` column (event
            end); rows whose ``t1`` is missing are ignored.
        close: price Series with a sorted, unique index.

    Returns:
        DataFrame indexed by event start with
        ``ret``: price at ``t1`` divided by price at the start, minus 1;
        ``bin``: sign of ``ret``, with a zero return mapped to -1.
    """
    events_ = events.dropna(subset=['t1'])
    lookup = events_.index.union(events_['t1'].values)
    # Take, for every requested timestamp, the last close observed at or before
    # it in the full price series. Forward-filling only after reindexing would
    # instead copy the price of the previous event/end timestamp, which can be
    # several bars older than the true last close.
    px = close.ffill().reindex(lookup, method='ffill')
    out = pd.DataFrame(index=events_.index)
    out['ret'] = px.loc[events_['t1'].values].values / px.loc[events_.index] - 1
    out['bin'] = np.sign(out['ret'])
    out.loc[out['bin'] == 0, 'bin'] = -1
    return out

print("\nStarting Triple-Barrier Labeling...")
volatility = get_daily_vol(df_vb['Close'], lookback=50)
# Every bar after the first 50 is a candidate event.
t_events = df_vb.index[50:]
# Vertical barrier: five days after each event start, kept only while it lies
# before the last bar. The index is chronological, so the surviving barriers
# belong to the first len(vertical_barrier) events. The filter must be applied
# before that length is taken; otherwise data and index lengths differ and
# pd.Series raises a ValueError.
vertical_barrier = t_events + pd.Timedelta(days=5)
vertical_barrier = vertical_barrier[vertical_barrier < df_vb.index[-1]]
vertical_barrier = pd.Series(vertical_barrier, index=t_events[:len(vertical_barrier)])
events = get_events(df_vb['Close'], t_events, [2.0, 1.0], volatility, 0.001, vertical_barrier)
labels = get_bins(events, df_vb['Close'])
print("Labeling complete.")

def calculate_all_features(df):
    """Append the pandas_ta indicators and the band-width ratio to ``df`` and return it.

    ``df`` is modified in place (every indicator call uses ``append=True``), so
    pass a copy to keep the caller's frame untouched. Besides the indicator
    columns, ``bbw_pct`` is added as (upper band - lower band) / middle band,
    read from the ``BBU_20_2.0``, ``BBL_20_2.0`` and ``BBM_20_2.0`` columns.
    """
    print("Calculating features...")
    indicators = df.ta
    indicators.adx(length=14, append=True)
    indicators.aroon(length=25, append=True)
    indicators.bbands(length=20, append=True)
    # Relative width of the bands just appended above.
    band_width = df['BBU_20_2.0'] - df['BBL_20_2.0']
    df['bbw_pct'] = band_width / df['BBM_20_2.0']
    indicators.atr(length=14, append=True)
    indicators.chop(length=14, append=True)
    indicators.obv(append=True)
    return df

# Compute the features on a copy so that df_vb itself is left unchanged.
df_features = calculate_all_features(df_vb.copy())

# Keep only bars that have both a complete feature row and a label.
data = df_features.join(labels['bin']).dropna()
X = data.drop(columns=['Open', 'High', 'Low', 'Close', 'Volume', 'bin'])
# Binary target: positive bins stay 1, everything else becomes 0. Building a
# new Series avoids assigning into a column taken from `data`, whose effect on
# `data` depends on the pandas version.
y = data['bin'].where(data['bin'] > 0, 0)
print(f"Final dataset shape for modeling: {X.shape}")

class PurgedKFold(KFold):
    """K-fold splitter for observations whose labels span an interval.

    Each observation starts at its index label in ``X`` and its label is
    resolved at ``t1``. Folds are contiguous blocks in the order of ``X``
    (no shuffling). For every test block the training set consists of

    * observations that start before the block and whose ``t1`` is also before
      the block's first start, and
    * observations that start after the latest ``t1`` found in the block, minus
      the first ``int(len(X) * pct_embargo)`` of them (the embargo).

    ``X`` must be sorted by its index.
    """

    def __init__(self, n_splits=10, t1=None, pct_embargo=0.):
        """Store the label end times ``t1`` (Series indexed like ``X``) and the embargo share."""
        super().__init__(n_splits, shuffle=False)
        self.t1 = t1
        self.pct_embargo = pct_embargo

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` positional index arrays per fold.

        Raises:
            ValueError: if no ``t1`` Series was supplied.
        """
        if self.t1 is None:
            raise ValueError("PurgedKFold needs t1: label end times indexed like X")
        # Align by label: t1 may cover more events than the rows left in X,
        # so positions of X must not be applied to the unaligned series.
        t1 = self.t1.reindex(X.index)
        starts = X.index
        indices = np.arange(X.shape[0])
        embargo = int(X.shape[0] * self.pct_embargo)
        for chunk in np.array_split(indices, self.n_splits):
            if len(chunk) == 0:
                # More splits than observations: nothing to test in this fold.
                continue
            first, stop = chunk[0], chunk[-1] + 1
            test_indices = indices[first:stop]
            t0 = starts[first]
            # Earlier observations are usable only if their label is already
            # resolved before the test block begins.
            before = np.asarray(starts < t0) & (t1 < t0).to_numpy()
            train_before = indices[before]
            # Later observations are usable once they start after every label
            # of the test block has been resolved, plus the embargo.
            t1_test_max = t1.iloc[test_indices].max()
            if pd.isna(t1_test_max):
                resume = stop
            else:
                resume = max(stop, starts.searchsorted(t1_test_max, side='right'))
            train_after = indices[resume + embargo:]
            yield np.concatenate((train_before, train_after)), test_indices

# Evaluate an XGBoost classifier with the purged splitter and collect one
# out-of-sample F1 score per fold.
print("\nStarting Purged K-Fold Cross-Validation...")
cv_splitter = PurgedKFold(n_splits=5, t1=events['t1'], pct_embargo=0.01)
oos_scores = []
for train_idx, test_idx in cv_splitter.split(X):
    # Folds without training or test rows cannot be fitted or scored.
    if len(train_idx) == 0 or len(test_idx) == 0:
        continue
    X_train, X_test = X.iloc[train_idx, :], X.iloc[test_idx, :]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    # Ratio of class-0 to class-1 rows in the training fold; a missing class counts as 1.
    scale_pos_weight = y_train.value_counts().get(0, 1) / y_train.value_counts().get(1, 1)
    model = xgb.XGBClassifier(
        objective='binary:logistic', eval_metric='logloss', use_label_encoder=False,
        n_estimators=500, learning_rate=0.01, max_depth=4,
        random_state=42, scale_pos_weight=scale_pos_weight
    )
    # The test fold doubles as the early-stopping evaluation set, so the F1
    # below is measured on data that influenced when training stopped.
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds in the
    # estimator constructor rather than in fit() - confirm against the installed version.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=50, verbose=False)
    y_pred = model.predict(X_test)
    oos_scores.append(f1_score(y_test, y_pred))
    print(f"Fold F1-Score: {oos_scores[-1]:.4f}")
print(f"\nAverage F1-Score from Purged K-Fold CV: {np.mean(oos_scores):.4f}")

print("\nTraining final model...")
# Weight the classes the same way as in the cross-validation folds, so the
# reported CV score describes the same model configuration that is fitted here.
scale_pos_weight = y.value_counts().get(0, 1) / y.value_counts().get(1, 1)
final_model = xgb.XGBClassifier(
    objective='binary:logistic', eval_metric='logloss', use_label_encoder=False,
    n_estimators=500, learning_rate=0.01, max_depth=4, random_state=42,
    scale_pos_weight=scale_pos_weight
)
# Fit on the complete dataset; no rows are held out at this point.
final_model.fit(X, y, verbose=False)
print("Plotting Feature Importance...")
# Show the 20 features with the highest gain.
fig, ax = plt.subplots(figsize=(15, 10))
plot_importance(final_model, ax=ax, max_num_features=20, importance_type='gain')
plt.title("Feature Importance (Gain)")
plt.show()


Loading data from cached file: btc_5m_5years.parquet
Creating Volume Bars with threshold: 2000 BTC...
Applying Fractional Differentiation to the 'Close' price...
