In [None]:
# Import packages
import pandas as pd
import numpy as np
import yfinance as yf

# Import MlFinLab tools
from mlfinlab.ensemble.sb_bagging import SequentiallyBootstrappedBaggingClassifier
from mlfinlab.util.volatility import get_daily_vol
from mlfinlab.filters.filters import cusum_filter
from mlfinlab.labeling.labeling import add_vertical_barrier, get_events, get_bins
from sklearn.ensemble import RandomForestClassifier

# --- Market data -------------------------------------------------------------
# One month of 2-minute bars for the S&P 500 index ("^GSPC") from Yahoo Finance.
sp500 = yf.Ticker("^GSPC")
data = sp500.history(interval="2m", period="1mo")

# --- Event sampling ----------------------------------------------------------
# The volatility series is used twice: half of its mean is the CUSUM filter
# threshold, and the series itself is the `target` handed to get_events below.
daily_vol = get_daily_vol(close=data["Close"], lookback=50)
cusum_events = cusum_filter(data["Close"], threshold=0.5 * daily_vol.mean())

# --- Triple-barrier labelling ------------------------------------------------
# `pt_sl` is forwarded unchanged to get_events; the vertical barriers are
# built for every bar timestamp with a one-hour horizon.
pt_sl = [1, 1]
vertical_barriers = add_vertical_barrier(
    t_events=data.index,
    close=data["Close"],
    num_hours=1,
)
triple_barrier_events = get_events(
    close=data["Close"],
    t_events=cusum_events,
    pt_sl=pt_sl,
    target=daily_vol,
    num_threads=1,
    vertical_barrier_times=vertical_barriers,
)

# Turn the barrier events into labels using the close prices.
labels = get_bins(triple_barrier_events, data["Close"])

# Feature Engineering
# The feature frame shares the bar index; every feature derives from the
# log-return series stored on `data` as the "log_ret" column.
x = pd.DataFrame(index=data.index)
data["log_ret"] = np.log(data["Close"]).diff()

# Volatility: rolling standard deviation of log returns over full windows
# of 50, 31 and 15 bars (min_periods equals the window length).
for span in (50, 31, 15):
    x[f"volatility_{span}"] = (
        data["log_ret"]
        .rolling(window=span, min_periods=span, center=False)
        .std()
    )

# Autocorrelation: rolling autocorrelation of log returns at lags 1..5,
# each computed over a full window of `window_autocorr` bars.
window_autocorr = 50
for lag in range(1, 6):
    x[f"autocorr_{lag}"] = (
        data["log_ret"]
        .rolling(window=window_autocorr, min_periods=window_autocorr, center=False)
        # `lag=lag` binds the current lag to the callback at definition time.
        .apply(lambda window, lag=lag: window.autocorr(lag=lag), raw=False)
    )

# Log-return momentum: the log return shifted by 1..5 bars.
for lag in range(1, 6):
    x[f"log_t{lag}"] = data["log_ret"].shift(lag)

# Remove every row with an undefined feature. This drops the rolling warm-up
# rows at the start, but it can also drop rows in the middle of the sample
# (e.g. a window whose autocorrelation is NaN).
x.dropna(inplace=True)

# Restrict labels and barrier events to the time span covered by the features.
first_ts, last_ts = x.index.min(), x.index.max()
labels = labels.loc[first_ts:last_ts]
triple_barrier_events = triple_barrier_events.loc[first_ts:last_ts]

# Slicing by the span alone is not enough: a label whose timestamp was dropped
# from `x` above would make `x.loc[labels.index]` raise KeyError. Keep only
# labels that still have a feature row, then align the features to them.
labels = labels.loc[labels.index.isin(x.index)]
x = x.loc[labels.index]

x_train = x  # We'll use all examples in this particular case
y_train = labels.loc[x_train.index, "bin"]

# Use tools
# Base learner: a random forest holding a single entropy-criterion tree, with
# its own bootstrapping switched off and balanced-subsample class weights.
base_est = RandomForestClassifier(
    n_estimators=1,
    criterion="entropy",
    bootstrap=False,
    class_weight="balanced_subsample",
)

# The bagging wrapper receives the non-null "t1" values of the barrier events
# and the close prices, and has the out-of-bag score enabled.
clf = SequentiallyBootstrappedBaggingClassifier(
    base_estimator=base_est,
    samples_info_sets=triple_barrier_events["t1"].dropna(),
    price_bars=data["Close"],
    oob_score=True,
)
clf.fit(x_train, y_train)