### Importing Packages and API Key

In [42]:
# Import packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv
from scipy import stats

# Load environment variables from 'config.env'. Because the call is the last
# expression in the cell, its return value is what the cell displays.
# NOTE(review): no cell visible in this notebook reads these variables — confirm
# the file is still required.
load_dotenv('config.env')


True

In [43]:
# Import MlFinLab package
import mlfinlab
from mlfinlab.data_structures.standard_data_structures import (get_dollar_bars,
                                                               get_tick_bars, get_volume_bars)

from mlfinlab.multi_product.etf_trick import get_futures_roll_series
from mlfinlab.data_structures.imbalance_data_structures import get_ema_dollar_imbalance_bars


### Importing and Formatting the Data

In [44]:
# Load the Google price/volume CSV from the finance_ml GitHub repository.
# (An earlier version of this cell read a local 'Data/ES_Trades.csv' file instead.)
path = 'https://raw.githubusercontent.com/jjakimoto/finance_ml/refs/heads/master/datasets/Google.csv'
data = pd.read_csv(path)

# Preview the first rows of the raw frame
data.head()


Unnamed: 0,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
0,2004-08-19,100.01,104.06,95.96,100.335,44659000.0,0.0,1.0,50.159839,52.191109,48.128568,50.322842,44659000.0
1,2004-08-20,101.01,109.08,100.5,108.31,22834300.0,0.0,1.0,50.661387,54.708881,50.405597,54.322689,22834300.0
2,2004-08-23,110.76,113.48,109.05,109.4,18256100.0,0.0,1.0,55.551482,56.915693,54.693835,54.869377,18256100.0
3,2004-08-24,111.24,111.6,103.57,104.87,15247300.0,0.0,1.0,55.792225,55.972783,51.94535,52.597363,15247300.0
4,2004-08-25,104.76,108.0,103.88,106.0,9188600.0,0.0,1.0,52.542193,54.167209,52.10083,53.164113,9188600.0


In [45]:
# Min, max, mean and median of the 'Volume' column, formatted with commas
# separating the thousands and no decimal points.
# The ',.0f' spec is needed because the column holds floats: a bare ',' spec
# keeps the fractional part (e.g. '521,141.0' instead of '521,141').
min_volume = f'{data["Volume"].min():,.0f}'
max_volume = f'{data["Volume"].max():,.0f}'
mean_volume = f'{data["Volume"].mean():,.0f}'
median_volume = f'{data["Volume"].median():,.0f}'

print('Min Volume:', min_volume)
print('Max Volume:', max_volume)
print('Mean Volume:', mean_volume)
print('Median Volume:', median_volume)

Min Volume: 521,141.0
Max Volume: 82,151,100.0
Mean Volume: 8,404,690.76
Median Volume: 5,620,700.0


In [46]:
# Format the Data

# Build the three-column frame ('date', 'price', 'volume') that is passed to the
# bar constructors below; 'price' is the adjusted close.
# The rename is done on a copy rather than in place, so `data` keeps its original
# column names and the cells above that read data["Volume"] can be re-run in any
# order without a KeyError.
new_data = (
    data
    .rename(columns={'Date': 'date', 'Adj. Close': 'price', 'Volume': 'volume'})
    .loc[:, ['date', 'price', 'volume']]
)
print(new_data.head())
print('\n')
print('Rows:', new_data.shape[0])

         date      price      volume
0  2004-08-19  50.322842  44659000.0
1  2004-08-20  54.322689  22834300.0
2  2004-08-23  54.869377  18256100.0
3  2004-08-24  52.597363  15247300.0
4  2004-08-25  53.164113   9188600.0


Rows: 3125


### Creating Dollar Bars and Dollar Imbalance Bars

In [47]:
# Sample dollar bars from the in-memory 'new_data' frame.
# Per the original note, 'new_data' can be replaced with the path of the saved
# raw data file if memory is an issue.
print('Creating Dollar Bars')
dollar = get_dollar_bars(
    new_data,
    threshold=2_000_000_000,  # dollar value per bar
    batch_size=1000,
    verbose=True,
)


Creating Dollar Bars
Reading data in batches:
Batch number: 0
Batch number: 1
Batch number: 2
Batch number: 3
Returning bars 



In [48]:
# Sample EMA dollar imbalance bars from the same 'new_data' frame
print('Creating Dollar Imbalance Bars')
dollar_imbalance = get_ema_dollar_imbalance_bars(
    new_data,
    num_prev_bars=3,
    expected_imbalance_window=10,
    exp_num_ticks_init=10,
    exp_num_ticks_constraints=[1, 10],
    batch_size=1000,
    verbose=True,
    to_csv=False,
    analyse_thresholds=False,
    output_path=None,
)

Creating Dollar Imbalance Bars
Reading data in batches:
Batch number: 0
Batch number: 1
Batch number: 2
Batch number: 3
Returning bars 



### Confirming Sampling

In [49]:
# Sanity-check the dollar sampling: add a 'value' column (bar close x bar volume)
# that can be eyeballed next to the other columns in the preview.
dollar['value'] = dollar['close'].mul(dollar['volume'])
dollar.head()


Unnamed: 0_level_0,tick_num,open,high,low,close,volume,cum_buy_volume,cum_ticks,cum_dollar_value,value
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2004-08-19,1,50.322842,50.322842,50.322842,50.322842,44659000.0,0.0,1,2247368000.0,2247368000.0
2004-08-23,3,54.322689,54.869377,54.322689,54.869377,41090400.0,41090400.0,2,2242121000.0,2254605000.0
2004-08-27,7,52.597363,54.12207,52.597363,53.239345,37742400.0,16283400.0,4,2005164000.0,2009381000.0
2004-09-03,12,51.162935,51.343492,50.159839,50.159839,39523700.0,20036400.0,5,2006010000.0,1982502000.0
2004-09-14,18,50.947269,55.917612,50.947269,55.917612,42266600.0,42266600.0,6,2250130000.0,2363447000.0


In [50]:
# Sanity-check the dollar imbalance sampling: add a 'value' column
# (bar close x bar volume) and display the resulting frame.
dollar_imbalance['value'] = dollar_imbalance['close'].mul(dollar_imbalance['volume'])
dollar_imbalance

Unnamed: 0_level_0,tick_num,open,high,low,close,volume,cum_buy_volume,cum_ticks,cum_dollar_value,value
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2004-09-01,10,50.322842,54.869377,50.28021,50.28021,142744500.0,62291600.0,10,7472499000.0,7177223000.0
2004-09-02,11,50.912161,50.912161,50.912161,50.912161,15118600.0,15118600.0,1,769720600.0,769720600.0
2004-09-10,16,50.159839,52.828075,50.159839,52.828075,28746000.0,23593600.0,5,1480121000.0,1518596000.0
2004-09-14,18,53.916435,55.917612,53.916435,55.917612,18673000.0,18673000.0,2,1028452000.0,1044150000.0
2004-09-16,20,56.173402,57.161452,56.173402,57.161452,19979300.0,19979300.0,2,1131461000.0,1142046000.0
2004-09-20,22,58.926902,59.864797,58.926902,59.864797,20101200.0,20101200.0,2,1194470000.0,1203354000.0
2004-09-29,29,59.102444,65.742942,59.102444,65.742942,86980400.0,63562200.0,7,5445390000.0,5718347000.0
2004-10-14,40,65.000651,71.219849,65.000651,71.219849,147790500.0,99109500.0,11,10143760000.0,10525620000.0
2004-10-22,46,72.278116,86.481962,70.462511,86.481962,170923200.0,130090800.0,6,13507100000.0,14781770000.0
2004-11-18,65,93.990139,98.3185,84.029391,84.029391,616087800.0,222319500.0,19,56088770000.0,51769480000.0


### Viewing the Dollar and Dollar Imbalance Bars

In [51]:
# Bind both bar frames to the names used by the rest of the notebook.
# These are aliases of the same objects, not copies.
continuous_dollar_relative_method, continuous_dollar_imbalance_relative_method = (
    dollar,
    dollar_imbalance,
)

# viewing the head of continuous_dollar_relative_method
continuous_dollar_relative_method.head()


Unnamed: 0_level_0,tick_num,open,high,low,close,volume,cum_buy_volume,cum_ticks,cum_dollar_value,value
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2004-08-19,1,50.322842,50.322842,50.322842,50.322842,44659000.0,0.0,1,2247368000.0,2247368000.0
2004-08-23,3,54.322689,54.869377,54.322689,54.869377,41090400.0,41090400.0,2,2242121000.0,2254605000.0
2004-08-27,7,52.597363,54.12207,52.597363,53.239345,37742400.0,16283400.0,4,2005164000.0,2009381000.0
2004-09-03,12,51.162935,51.343492,50.159839,50.159839,39523700.0,20036400.0,5,2006010000.0,1982502000.0
2004-09-14,18,50.947269,55.917612,50.947269,55.917612,42266600.0,42266600.0,6,2250130000.0,2363447000.0


In [52]:
# viewing the head of continuous_dollar_imbalance_relative_method (the dollar imbalance bars)
continuous_dollar_imbalance_relative_method.head()

Unnamed: 0_level_0,tick_num,open,high,low,close,volume,cum_buy_volume,cum_ticks,cum_dollar_value,value
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2004-09-01,10,50.322842,54.869377,50.28021,50.28021,142744500.0,62291600.0,10,7472499000.0,7177223000.0
2004-09-02,11,50.912161,50.912161,50.912161,50.912161,15118600.0,15118600.0,1,769720600.0,769720600.0
2004-09-10,16,50.159839,52.828075,50.159839,52.828075,28746000.0,23593600.0,5,1480121000.0,1518596000.0
2004-09-14,18,53.916435,55.917612,53.916435,55.917612,18673000.0,18673000.0,2,1028452000.0,1044150000.0
2004-09-16,20,56.173402,57.161452,56.173402,57.161452,19979300.0,19979300.0,2,1131461000.0,1142046000.0


### Computing Serial Correlation of the Dollar and Dollar Imbalance Bars

In [53]:
# Bar-to-bar simple returns of the close price for each bar type.
# The first bar has no predecessor, so its NaN return is dropped.
dollar_returns = (
    continuous_dollar_relative_method['close']
    .pct_change()
    .dropna()
)
dollar_imbalance_returns = (
    continuous_dollar_imbalance_relative_method['close']
    .pct_change()
    .dropna()
)

In [54]:
# Lag-1 autocorrelation (serial correlation) of each return series
dollar_serial_corr = dollar_returns.autocorr(lag=1)
dollar_imbalance_serial_corr = dollar_imbalance_returns.autocorr(lag=1)




In [55]:
# Report the serial correlation of both bar types, one line per bar type
for _label, _corr in (
    ('Serial Correlation for Dollar Bars:', dollar_serial_corr),
    ('Serial Correlation for Dollar Imbalance Bars:', dollar_imbalance_serial_corr),
):
    print(_label, _corr)

Serial Correlation for Dollar Bars: 0.023799572518800136
Serial Correlation for Dollar Imbalance Bars: -0.2563899257520614
