In [2]:
import numpy as np
import pandas as pd
from binance.client import Client
from AdaptiveWindowCV import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from datetime import datetime
from tqdm import tqdm
import pytz
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
from scipy import stats
import traceback
import optuna
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# Fetch Data

In [3]:
def get_binance_data_batch(
    start=1514752200,
    stop=None,
    interval='1h',
    symbol="BTCUSDT"
):
    """Download historical klines from Binance and return them as an OHLCV frame.

    Parameters
    ----------
    start : passed unchanged to ``Client.get_historical_klines`` as the start of the range.
        NOTE(review): the default looks like a Unix timestamp in *seconds*; the captured
        output starts at 2017-08-17, which suggests the client reads integers as
        milliseconds — confirm against the python-binance documentation.
    stop : passed unchanged as the end of the range (``None`` by default).
    interval : kline interval string, e.g. ``'1h'``.
    symbol : trading pair, e.g. ``"BTCUSDT"``.

    Returns
    -------
    pd.DataFrame
        Float columns ``Open, High, Low, Close, Volume`` indexed by a timezone-naive
        ``Datetime`` index (UTC wall-clock time). Empty (but correctly shaped) when the
        API returns no klines.
    """
    columns = ['Datetime', 'Open', 'High', 'Low', 'Close', 'Volume']

    client = Client(requests_params={'timeout': 120})
    klines = client.get_historical_klines(symbol, interval, start, stop)

    # An empty response would make the 2-D slice below raise IndexError,
    # so return an empty frame with the expected layout instead.
    if len(klines) == 0:
        return pd.DataFrame(
            index=pd.DatetimeIndex([], name='Datetime'),
            columns=columns[1:],
            dtype='float64',
        )

    # Only the first six fields of each kline are kept (open time + OHLCV)
    klines = pd.DataFrame(np.array(klines)[:, 0:6], columns=columns)

    # The open time is an epoch in milliseconds; converting the whole column at once
    # yields naive UTC timestamps without a per-row Python call.
    klines['Datetime'] = pd.to_datetime(klines['Datetime'].astype('int64'), unit='ms')
    klines = klines.set_index('Datetime').astype('float64')

    return klines

In [4]:
# Fetch klines with the function's defaults (interval '1h', symbol "BTCUSDT") and display the frame
data = get_binance_data_batch()
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-08-17 04:00:00,4261.48,4313.62,4261.32,4308.83,47.181009
2017-08-17 05:00:00,4308.83,4328.69,4291.37,4315.32,23.234916
2017-08-17 06:00:00,4330.29,4345.45,4309.37,4324.35,7.229691
2017-08-17 07:00:00,4316.62,4349.99,4287.41,4349.99,4.443249
2017-08-17 08:00:00,4333.32,4377.85,4333.32,4360.69,0.972807
...,...,...,...,...,...
2023-05-08 08:00:00,27975.47,27989.11,27577.01,27704.94,5210.602030
2023-05-08 09:00:00,27704.95,27937.50,27680.00,27914.94,3757.379190
2023-05-08 10:00:00,27914.94,27951.75,27868.82,27936.14,1899.424930
2023-05-08 11:00:00,27936.14,27973.44,27870.27,27934.09,1669.583480


# Sample Data with CUSUM Events

In [7]:
"""----------------------------------------------------------------------
    function:  Implementation of the symmetric CUSUM filter
    reference: De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons.
    methodology: Page 39
----------------------------------------------------------------------"""
def cusum_events(
    prices, # series of prices indexed by date
    threshold # series of thresholds, looked up by the same dates
):
    """Symmetric CUSUM filter: return the timestamps at which cumulative drift exceeds the threshold.

    Two running sums of the first differences of ``prices`` are kept, one floored at zero
    (upward drift) and one capped at zero (downward drift). Whenever one of them crosses
    the threshold in force at that timestamp, the timestamp is recorded and that sum is reset.

    Parameters
    ----------
    prices : pd.Series
        Price series (the notebook passes log prices). The first observation is skipped
        because its difference is undefined.
    threshold : pd.Series
        Threshold per timestamp; must contain every label of ``prices.index[1:]``.

    Returns
    -------
    pd.DatetimeIndex
        Event timestamps in chronological order (empty when nothing is triggered).
    """
    # A Python list is appended to in O(1); growing a NumPy array with np.append
    # recopies the whole array on every event.
    timeEvents, shiftPositive, shiftNegative = [], 0, 0
    priceDelta = prices.diff()

    for i in priceDelta.index[1:]:
        delta = priceDelta.loc[i]
        limit = threshold.loc[i]

        # accumulate drift, never letting the upward sum go below 0 or the downward sum above 0
        shiftPositive = np.max([0, shiftPositive + delta])
        shiftNegative = np.min([0, shiftNegative + delta])

        if shiftNegative < -limit:
            shiftNegative = 0 # reset after a downward event
            timeEvents.append(i)

        elif shiftPositive > limit:
            shiftPositive = 0 # reset after an upward event
            timeEvents.append(i)

    return pd.DatetimeIndex(timeEvents)

"""----------------------------------------------------------------------
    function: computes the daily volatility at intraday estimation points
    reference: De Prado, M. (2018) Advances in financial machine learning. John Wiley & Sons.
    methodology: Page 44
----------------------------------------------------------------------"""
def daily_volatility(
    close, # series of close prices indexed by date
    span
): 
    """Return one-day log returns and their exponentially weighted standard deviation.

    For each timestamp ``t`` the reference bar is located with ``searchsorted`` on
    ``t - 1 day`` and then stepped back by one position, i.e. it is the bar just before
    the first bar at or after ``t - 1 day``. Timestamps without such a bar are skipped.

    Parameters
    ----------
    close : pd.Series
        Close prices with a sorted datetime index.
    span : float
        ``span`` argument forwarded to ``Series.ewm``.

    Returns
    -------
    pd.DataFrame
        Columns ``Daily Returns`` and ``Volatility``; rows containing NaN are dropped.
    """
    # position of the first bar at or after "one day earlier" for every timestamp
    shifted_times = close.index - pd.Timedelta(days=1)
    positions = close.index.searchsorted(shifted_times)
    positions = positions[positions > 0] # position 0 has no earlier bar to step back to

    # map each usable timestamp to its reference timestamp
    current_times = close.index[close.shape[0] - positions.shape[0]:]
    reference_times = close.index[positions - 1]
    lagged = pd.Series(reference_times, index=current_times)

    price_ratio = close.loc[lagged.index] / close.loc[lagged.values].values
    returns = np.log(price_ratio).rename("Daily Returns")
    stds = returns.ewm(span=span).std().rename("Volatility")

    return pd.concat([returns, stds], axis=1).dropna()

In [13]:
def normal_log_likelihood(returns, std, mean=0):
    """Sum of ``-log(std**2) - (returns - mean)**2 / std**2`` over all observations.

    Terms that evaluate to NaN or +/-inf (e.g. when ``std`` is zero) contribute 0
    to the total instead of poisoning it.
    """
    variance = std ** 2
    squared_error = (returns - mean) ** 2
    terms = pd.Series(-np.log(variance) - squared_error / variance)

    # neutralise undefined terms before summing
    cleaned = terms.replace([-np.inf, np.inf, np.nan, None], 0)

    return np.sum(cleaned)

# Score every candidate EWM span (1.0, 2.0, ..., 100.0) by the Gaussian log-likelihood
# of the daily returns under the volatility it produces, then keep the best one.
# The Series is created with an explicit float dtype: a bare pd.Series() has no
# defined dtype (object in recent pandas), which idxmax is not meant to work on.
spans = pd.Series(dtype='float64')

for span in tqdm(np.linspace(1, 100, 100)):
    vol = daily_volatility(data.Close, span)
    returns, std, mean = vol['Daily Returns'], vol['Volatility'], 0

    spans.loc[span] = normal_log_likelihood(returns, std, mean)

span = spans.idxmax()
span

100%|██████████| 100/100 [00:00<00:00, 101.59it/s]


56.0

In [14]:
# Recompute returns/volatility with the span selected above and plot the resulting frame
volatility = daily_volatility(data.Close, span)
px.line(volatility)

In [15]:
# Sample event timestamps: CUSUM filter on log closes (from the first timestamp that has a
# volatility estimate), with the threshold set to half of the volatility at each timestamp
molecules = cusum_events(np.log(data.Close)[volatility.index[0]:], 0.5 * volatility.Volatility)

# Overlay the sampled events (red markers named 'Samples') on the last 1000 close prices
fig1 = px.line(data.Close.tail(1000))
fig2 = px.scatter(data.Close.loc[molecules].loc[data.tail(1000).index[0]:])
fig2.update_traces(marker_color='red', name='Samples')
fig3 = go.Figure(data=fig1.data + fig2.data)
fig3.show()

# Labeling

In [16]:
"""
function: calculates the t-value of a linear trend
reference: De Prado, M (2020) Machine Learning for Asset Managers
methodology: page 68, snippet 5.1
"""
def t_value_linear_regression(price:pd.Series): # time series of prices
    """Return the t-statistic of the slope of an OLS line fitted to ``price`` against 0..n-1.

    The value is the fitted slope divided by its standard error; the sign gives the
    direction of the trend and the magnitude its statistical strength.
    """
    observations = price.values
    time_axis = np.arange(observations.shape[0]) # regressor: 0, 1, ..., n-1
    fit = stats.linregress(time_axis, observations)

    return fit.slope / fit.stderr

"""
function: implements the trend scanning method
reference: De Prado, M (2020) Machine Learning for Asset Managers
methodology: page 68, snippet 5.2
"""
def bins_from_trend(molecule, # index of observations we wish to label
                    close, # time series of prices
                    span): # the range arguments of span lengths that the algorithm will evaluate, in search for the maximum absolute t-value
    """Label each observation with the sign of its strongest look-forward linear trend.

    For every timestamp in ``molecule`` a line is fitted to the prices of each
    look-forward window length in ``range(*span)``; the window with the largest absolute
    t-value (non-finite values count as 0 for the comparison) decides the label.

    Parameters
    ----------
    molecule : iterable of timestamps present in ``close.index``.
    close : pd.Series of prices.
    span : sequence unpacked into ``range`` (e.g. ``[3, 10, 1]`` evaluates 3..9 bars).

    Returns
    -------
    pd.DataFrame
        Indexed by the labelled timestamps with columns
        ``End Time`` (end of the last window evaluated, not of the best one, so that
        downstream purging covers every bar that any candidate window touched),
        ``t-Value`` (float) and ``Trend`` (sign of the t-value).
        Observations whose longest window would run past the end of ``close`` are omitted.
    """
    outputs = pd.DataFrame(index=molecule, columns=['End Time', 't-Value', 'Trend']) # initialize outputs
    spans = range(*span) # candidate window lengths

    for index in tqdm(molecule):
        t_values = pd.Series(dtype='float64') # t-value per window, keyed by the window's last timestamp
        location = close.index.get_loc(index) # position of the observation in the price series

        if location + max(spans) > close.shape[0]: # longest window would go out of range
            continue

        # a distinct loop name keeps the `span` argument intact
        for window_length in spans:
            tail = close.index[location + window_length - 1] # last timestamp of the window
            window_prices = close.loc[index:tail] # label slice, inclusive on both ends
            t_values.loc[tail] = t_value_linear_regression(window_prices)

        # rank windows on sanitised magnitudes, but report the raw t-value of the winner
        best_tail = t_values.replace([-np.inf, np.inf, np.nan], 0).abs().idxmax()
        best_t_value = t_values.loc[best_tail]
        outputs.loc[index, ['End Time', 't-Value', 'Trend']] = t_values.index[-1], best_t_value, np.sign(best_t_value)

    outputs['End Time'] = pd.to_datetime(outputs['End Time']) # convert to datetime
    outputs['t-Value'] = pd.to_numeric(outputs['t-Value']) # object column -> float
    outputs['Trend'] = pd.to_numeric(outputs['Trend'], downcast='signed') # convert to numeric

    return outputs.dropna(subset=['Trend']) # skipped observations have no trend and are removed

In [17]:
# Label every sampled event by trend scanning; [3, 10, 1] is unpacked into range(3, 10, 1),
# i.e. look-forward windows of 3 to 9 bars
labels = bins_from_trend(molecules, data.Close, [3, 10, 1])
labels

100%|██████████| 8076/8076 [00:35<00:00, 226.86it/s]


Unnamed: 0,End Time,t-Value,Trend
2017-08-18 07:00:00,2017-08-18 15:00:00,2.294568,1.0
2017-08-18 08:00:00,2017-08-18 16:00:00,4.145878,1.0
2017-08-18 09:00:00,2017-08-18 17:00:00,-4.883613,-1.0
2017-08-18 10:00:00,2017-08-18 18:00:00,-10.447089,-1.0
2017-08-18 12:00:00,2017-08-18 20:00:00,-25.404583,-1.0
...,...,...,...
2023-05-06 18:00:00,2023-05-07 02:00:00,-8.900053,-1.0
2023-05-07 14:00:00,2023-05-07 22:00:00,-5.133976,-1.0
2023-05-07 21:00:00,2023-05-08 05:00:00,-6.164239,-1.0
2023-05-07 23:00:00,2023-05-08 07:00:00,-3.799084,-1.0


# Features

In [18]:
# Feature matrix built from the close price only:
#   - the current log return and five lagged copies of it,
#   - EWM volatility at three spans,
#   - rolling autocorrelation of the log return at five lags.
features = pd.DataFrame()

features['Log Return'] = np.log(data.Close).diff()
for lag in (1, 6, 12, 18, 24):
    features[f'Log Return Lag {lag}'] = features['Log Return'].shift(lag)

for volatility_span in (30, 60, 180):
    features[f'Volatility {volatility_span}'] = daily_volatility(data.Close, volatility_span).Volatility

auto_correlation_window = 60

for lag in (1, 6, 12, 18, 24):
    rolling_returns = features['Log Return'].rolling(window=auto_correlation_window)
    features[f'Autocorrelation Lag {lag}'] = rolling_returns.apply(lambda x: x.autocorr(lag=lag))

# rows at the start lack lagged / rolling values and are discarded
features = features.dropna()
features

Unnamed: 0_level_0,Log Return,Log Return Lag 1,Log Return Lag 6,Log Return Lag 12,Log Return Lag 18,Log Return Lag 24,Volatility 30,Volatility 60,Volatility 180,Autocorrelation Lag 1,Autocorrelation Lag 6,Autocorrelation Lag 12,Autocorrelation Lag 18,Autocorrelation Lag 24
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2017-08-19 16:00:00,0.007012,-0.017559,-0.016761,-0.017130,0.034137,0.003169,0.021351,0.023272,0.024217,-0.208921,-0.289347,0.110556,-0.168803,0.222934
2017-08-19 17:00:00,0.006819,0.007012,-0.010802,0.002146,-0.001309,-0.013499,0.020582,0.022732,0.023845,-0.201064,-0.295348,0.121184,-0.165058,0.186731
2017-08-19 18:00:00,0.014524,0.006819,0.034129,-0.022355,0.013327,-0.018276,0.021257,0.022808,0.023746,-0.186902,-0.200436,0.071167,-0.123690,0.120018
2017-08-19 19:00:00,-0.002492,0.014524,-0.013562,0.011621,-0.003600,0.004573,0.023344,0.023771,0.024213,-0.190135,-0.209963,0.050223,-0.114168,0.109452
2017-08-19 20:00:00,-0.000034,-0.002492,0.000636,-0.008583,0.001893,-0.028239,0.024311,0.024232,0.024411,-0.202937,-0.211868,0.045149,-0.117953,0.094295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-08 08:00:00,-0.009718,-0.006956,0.002288,0.000150,0.004337,-0.001847,0.015090,0.016596,0.019331,-0.052755,0.051132,0.105544,-0.004921,-0.023536
2023-05-08 09:00:00,0.007551,-0.009718,-0.003019,-0.004828,-0.003554,-0.001432,0.015297,0.016866,0.019487,-0.134659,0.027195,0.038347,-0.052143,-0.037543
2023-05-08 10:00:00,0.000759,0.007551,-0.002653,-0.001169,0.000379,0.001109,0.015260,0.017000,0.019599,-0.106177,0.020575,0.041326,-0.089288,-0.048238
2023-05-08 11:00:00,-0.000073,0.000759,0.004805,-0.010814,-0.001697,-0.000266,0.015239,0.017147,0.019722,-0.093511,0.028607,0.030186,-0.077628,-0.041710


# Model Data

In [19]:
# Keep only the timestamps that have both a complete feature row and a label
index = features.index.intersection(labels.index)
X = features.loc[index]  # model inputs
y = labels.loc[index].Trend  # target: sign of the strongest trend
times = labels['End Time'].loc[index]  # label end time per observation, passed to PurgedKFold below

# Model Development

In [20]:
"""
class and functions: splits the data and performs cross-validation when observations overlap
reference: De Prado, M. (2018) Advances in financial machine learning.
methodology: page 109, snippet 7.3
"""
class PurgedKFold(KFold):
    """K-fold splitter for labels that span time intervals.

    Each observation starts at its index timestamp and ends at the matching value of
    ``times``. Folds are contiguous (no shuffling). For every test fold the training set
    keeps (a) observations whose label ends no later than the first test timestamp and
    (b) observations located after the latest test label end plus an embargo, so that
    training labels do not overlap the test interval.

    NOTE(review): ``KFold`` is not imported explicitly in this notebook; presumably it
    comes in through the star import — confirm.
    """

    def __init__(
        self, # The PurgedKFold class containing observations and split information
        n_splits: int=3, # The number of KFold splits
        times: pd.Series=None, # Entire observation times
        percent_embargo: float=0.0 # Embargo size percentage divided by 100
    ):
        """Store the label end times and embargo fraction.

        Raises
        ------
        ValueError
            If ``times`` is not a ``pd.Series``.
        """
        if not isinstance(times, pd.Series): # check if times parameter is a pd.Series
            raise ValueError('Label Through Dates must be a pandas series') # raise error 

        super().__init__(n_splits, shuffle=False, random_state=None) # contiguous, unshuffled folds
        self.times = times # label end time per observation, indexed by label start time
        self.percent_embargo = percent_embargo # fraction of the sample skipped after each test fold

    def split(
        self, # The PurgedKFold class containing observations and split information
        data: pd.DataFrame, # The sample that is going be splited
        labels: pd.Series=None, # The labels that are going be splited
        groups=None # Group our labels
    ):
        """Yield ``(train_positions, test_positions)`` integer arrays for each fold.

        Raises
        ------
        ValueError
            If ``data`` and ``times`` do not share the same index.
        """
        # Index.equals also handles a length mismatch, which an element-wise `==` does not
        if not data.index.equals(self.times.index):
            raise ValueError('data and ThruDateValues must have the same index') # raise error

        indices = np.arange(data.shape[0]) # observation positions
        embargo = int(data.shape[0]*self.percent_embargo) # embargo size in observations

        # [start, end) position ranges of the contiguous test folds
        test_ranges = [(chunk[0], chunk[-1] + 1) for chunk in np.array_split(indices, self.n_splits)]

        for start, end in test_ranges:
            first_test_time = self.times.index[start] # start time of the current test fold
            test_indices = indices[start:end]

            # position of the first observation starting at/after the latest test label end;
            # .iloc is required because test_indices are positions, not index labels
            max_test_index = self.times.index.searchsorted(self.times.iloc[test_indices].max())

            # left side: observations whose label ends no later than the test fold starts
            train_indices = self.times.index.searchsorted(self.times[self.times <= first_test_time].index)

            # right side: everything after the test labels end, minus the embargo
            if max_test_index + embargo < data.shape[0]:
                train_indices = np.concatenate((train_indices, indices[max_test_index + embargo:]))

            yield train_indices, test_indices

In [21]:
# 4 purged folds; the embargo after each test fold is int(0.01 * number of observations)
cross_validator = PurgedKFold(4, times, 0.01)

In [38]:
def objective(trial):
    """Optuna objective: mean purged-CV F1 score of a random forest for the sampled hyperparameters.

    Reads the notebook-level ``X``, ``y`` and ``cross_validator``. Returns ``None`` (after
    printing the traceback) if fitting or scoring raises an ordinary exception.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 4, 50),
        # scikit-learn rejects an integer min_samples_split below 2, so the range starts at 2
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
    }

    try:
        clf = RandomForestClassifier(random_state=42, **params)
        score = cross_val_score(clf, X, y, cv=cross_validator, scoring='f1', n_jobs=8).mean()

        return score

    # Exception (not BaseException) so that KeyboardInterrupt / SystemExit still stop the run
    except Exception:
        traceback.print_exc()

        return None

# Hyperparameter search for the random forest (the `_svm` suffix is historical).
# The sampler is seeded so that Restart & Run All proposes the same sequence of trials.
study_svm = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_svm.optimize(objective, n_trials=60)

print("Number of finished trials: ", len(study_svm.trials))
print("Best trial:")
trial_svm = study_svm.best_trial

print("  Value: {}".format(trial_svm.value))
print("  Params: ")
for key, value in trial_svm.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2023-05-08 14:10:30,521][0m A new study created in memory with name: no-name-e57ee1ef-3718-4dac-9155-5c7fee77f25a[0m
[32m[I 2023-05-08 14:10:36,233][0m Trial 0 finished with value: 0.6118436774369558 and parameters: {'n_estimators': 922, 'max_depth': 6, 'min_samples_split': 47, 'min_samples_leaf': 32}. Best is trial 0 with value: 0.6118436774369558.[0m
[32m[I 2023-05-08 14:10:42,673][0m Trial 1 finished with value: 0.6179675863383847 and parameters: {'n_estimators': 798, 'max_depth': 29, 'min_samples_split': 132, 'min_samples_leaf': 33}. Best is trial 1 with value: 0.6179675863383847.[0m
[32m[I 2023-05-08 14:10:50,526][0m Trial 2 finished with value: 0.6161420302802243 and parameters: {'n_estimators': 818, 'max_depth': 48, 'min_samples_split': 46, 'min_samples_leaf': 21}. Best is trial 1 with value: 0.6179675863383847.[0m
[32m[I 2023-05-08 14:10:53,946][0m Trial 3 finished with value: 0.6151503280506416 and parameters: {'n_estimators': 521, 'max_depth': 7, 'min_sam

Number of finished trials:  60
Best trial:
  Value: 0.6223968447969936
  Params: 
    n_estimators: 81
    max_depth: 22
    min_samples_split: 67
    min_samples_leaf: 29


In [None]:
# Random forest configured with the best trial's hyperparameters (not fitted in this cell)
model = RandomForestClassifier(random_state=42, **trial_svm.params)
model

In [5]:
class EmpiricalDataModel:
    """End-to-end pipeline: CUSUM sampling, trend-scanning labels, features and a tuned random forest.

    The methods rely on helpers defined elsewhere in this notebook (``daily_volatility``,
    ``cusum_events``, ``bins_from_trend``, ``PurgedKFold``), which must be defined before
    the methods are called.
    """

    # Lags (in bars) used for both the lagged returns and the rolling autocorrelations
    _LAGS = (1, 6, 12, 18, 24)
    # EWM spans used for the volatility features
    _VOLATILITY_SPANS = (30, 60, 180)

    def __init__(
        self, 
        cusum_threshold=0.5, 
        auto_correlation_window=60, 
        n_splits=4, 
        percent_embargo=0.01,
        trend_span=(3, 10, 1),
        n_trials=60,
        n_jobs=8,
        random_state=42
    ):
        """
        Parameters
        ----------
        cusum_threshold : multiplier applied to the volatility to obtain the CUSUM threshold.
        auto_correlation_window : rolling window (bars) of the autocorrelation features.
        n_splits : number of purged CV folds.
        percent_embargo : embargo fraction passed to ``PurgedKFold``.
        trend_span : ``range`` arguments of the look-forward windows used for labelling.
        n_trials : number of Optuna trials.
        n_jobs : parallel jobs used by ``cross_val_score``.
        random_state : seed for the random forest and the Optuna sampler.
        """
        self.cusum_threshold = cusum_threshold
        self.auto_correlation_window = auto_correlation_window
        self.n_splits = n_splits
        self.percent_embargo = percent_embargo
        self.trend_span = trend_span
        self.n_trials = n_trials
        self.n_jobs = n_jobs
        self.random_state = random_state

    @staticmethod
    def _normal_log_likelihood(returns, std, mean=0):
        """Sum of ``-log(var) - (returns - mean)**2 / var``; non-finite terms count as 0."""
        var = std ** 2
        values = -np.log(var) - ((returns - mean) ** 2) / var

        return np.sum(pd.Series(values).replace([-np.inf, np.inf, np.nan, None], 0))

    @staticmethod
    def _suggest_params(trial):
        """Sample one random-forest configuration from the Optuna trial."""
        return {
            'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
            'max_depth': trial.suggest_int('max_depth', 4, 50),
            # scikit-learn rejects an integer min_samples_split below 2
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
        }

    def sample_data(
        self,
        data
    ):
        """Return the CUSUM event timestamps for ``data`` (a frame with a ``Close`` column).

        The EWM span of the volatility estimate is chosen among 1..100 by maximising the
        Gaussian log-likelihood of the daily returns.
        """
        # explicit dtype: a bare pd.Series() has no defined dtype (object in recent pandas)
        scores = pd.Series(dtype='float64')

        for span in tqdm(np.linspace(1, 100, 100)):
            vol = daily_volatility(data.Close, span)
            scores.loc[span] = self._normal_log_likelihood(vol['Daily Returns'], vol['Volatility'], 0)

        span = scores.idxmax()

        print(f"Optimal volatility span: {span}")

        volatility = daily_volatility(data.Close, span)
        molecules = cusum_events(np.log(data.Close)[volatility.index[0]:], self.cusum_threshold * volatility.Volatility)

        return molecules

    def _build_features(self, data):
        """Build the return / volatility / autocorrelation feature frame; incomplete rows are dropped."""
        features = pd.DataFrame()

        features['Log Return'] = np.log(data.Close).diff()
        for lag in self._LAGS:
            features[f'Log Return Lag {lag}'] = features['Log Return'].shift(lag)

        for vol_span in self._VOLATILITY_SPANS:
            features[f'Volatility {vol_span}'] = daily_volatility(data.Close, vol_span).Volatility

        for lag in self._LAGS:
            rolling_returns = features['Log Return'].rolling(window=self.auto_correlation_window)
            features[f'Autocorrelation Lag {lag}'] = rolling_returns.apply(lambda x: x.autocorr(lag=lag))

        return features.dropna()

    def develop_data_model(
        self,
        data
    ):
        """Run the whole pipeline and return ``(model, X, y)``.

        ``model`` is an unfitted ``RandomForestClassifier`` configured with the best
        hyperparameters found; ``X`` / ``y`` are the aligned features and trend labels.
        """
        molecules = self.sample_data(data)
        labels = bins_from_trend(molecules, data.Close, list(self.trend_span))
        features = self._build_features(data)

        # keep timestamps that have both a complete feature row and a label
        index = features.index.intersection(labels.index)
        X = features.loc[index]
        y = labels.loc[index].Trend
        times = labels['End Time'].loc[index]

        cross_validator = PurgedKFold(self.n_splits, times, self.percent_embargo)

        def objective(trial):
            params = self._suggest_params(trial)

            try:
                clf = RandomForestClassifier(random_state=self.random_state, **params)
                score = cross_val_score(clf, X, y, cv=cross_validator, scoring='f1', n_jobs=self.n_jobs).mean()

                return score

            # Exception (not BaseException) so that KeyboardInterrupt still stops the search
            except Exception:
                traceback.print_exc()

                return None

        # seeded sampler: the same data yields the same sequence of trials
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=self.random_state),
        )
        study.optimize(objective, n_trials=self.n_trials)

        print("Number of finished trials: ", len(study.trials))
        print("Best trial:")
        best_trial = study.best_trial

        print("  Value: {}".format(best_trial.value))
        print("  Params: ")
        for key, value in best_trial.params.items():
            print("    {}: {}".format(key, value))

        model = RandomForestClassifier(random_state=self.random_state, **best_trial.params)

        return model, X, y

In [10]:
# Run the whole pipeline with the class defaults on the downloaded klines
model, X, y = EmpiricalDataModel().develop_data_model(data)

100%|██████████| 100/100 [00:00<00:00, 104.92it/s]


Optimal volatility span: 56.0


100%|██████████| 8076/8076 [00:34<00:00, 231.72it/s]
[32m[I 2023-05-08 16:27:49,267][0m A new study created in memory with name: no-name-b12ada3d-d605-404e-a3fc-ad4b96e07718[0m
[32m[I 2023-05-08 16:28:01,631][0m Trial 0 finished with value: 0.6166331248276373 and parameters: {'n_estimators': 992, 'max_depth': 45, 'min_samples_split': 12, 'min_samples_leaf': 18}. Best is trial 0 with value: 0.6166331248276373.[0m
[32m[I 2023-05-08 16:28:07,259][0m Trial 1 finished with value: 0.61583688780683 and parameters: {'n_estimators': 666, 'max_depth': 36, 'min_samples_split': 49, 'min_samples_leaf': 43}. Best is trial 0 with value: 0.6166331248276373.[0m
[32m[I 2023-05-08 16:28:12,463][0m Trial 2 finished with value: 0.6175010535279329 and parameters: {'n_estimators': 626, 'max_depth': 38, 'min_samples_split': 44, 'min_samples_leaf': 36}. Best is trial 2 with value: 0.6175010535279329.[0m
[32m[I 2023-05-08 16:28:13,276][0m Trial 3 finished with value: 0.614438605800321 and paramete

Number of finished trials:  60
Best trial:
  Value: 0.6200067665073439
  Params: 
    n_estimators: 933
    max_depth: 33
    min_samples_split: 127
    min_samples_leaf: 23
