---
# **Financial Data Structures**

---

# Preliminaries

## Libraries

In [None]:
# Imports libraries

# Import functions from RiskLabAI
from RiskLabAI.controller import Controller
from RiskLabAI.data.structures.data_structures_lopez import *
from RiskLabAI.utils import *

from RiskLabAI.data.structures.imbalance_bars import ExpectedImbalanceBars, FixedImbalanceBars
from RiskLabAI.data.structures.run_bars import ExpectedRunBars, FixedRunBars
from RiskLabAI.data.structures.standard_bars import StandardBars
from RiskLabAI.data.structures.time_bars import TimeBars

from RiskLabAI.utils.constants import CUMULATIVE_DOLLAR, CUMULATIVE_VOLUME, CUMULATIVE_TICKS

from RiskLabAI.controller import Controller



import plotly.graph_objects as go
from openpyxl import load_workbook
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings("ignore")


# Set plotting backend and handle numpy errors.
# pandas and numpy are imported right here because this configuration runs
# before the cell's own `import numpy` / `import pandas` lines further down;
# without these imports the two statements below only work if a wildcard
# import happens to re-export `pd` and `np`.
import numpy as np
import pandas as pd

pd.options.plotting.backend = "plotly"
np.seterr(divide='ignore', invalid='ignore')


import datetime
import time
import sys
from statsmodels.stats import stattools
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

# !pip3 install dtale
import dtale

# Workbook file that receives the tables exported by this notebook
excel_file_path = "structure.xlsx"

# Open the workbook when it is already on disk; otherwise leave the handle unset
wb = None
if os.path.exists(excel_file_path):
    wb = load_workbook(excel_file_path)

# Financial Data Structures Using RiskLabAI


## Data Import and Initial Processing

In [None]:
# Controller instance that the cells below use to call the RiskLabAI bar methods
controller = Controller()

# Read the raw records and convert the "date" column to datetimes
data = pd.read_csv('stock_data.csv').assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
data.head()

## Bars

### Standard Bars

#### Time Bars

In [None]:
# Time bars from the controller's "time_bars" method with resolution 'MIN' x 10
# (presumably 10-minute bars -- confirm against the RiskLabAI documentation).
# NOTE(review): this assignment rebinds the name of the `time` module imported
# in the first cell.
time = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="time_bars",
    method_arguments=dict(resolution_type='MIN', resolution_units=10),
)

time.head()

#### Tick Bars

In [None]:
# Tick bars from the controller's "tick_standard_bars" method, threshold 250
# (presumably 250 ticks per bar -- confirm against the RiskLabAI documentation)
tick = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="tick_standard_bars",
    method_arguments=dict(threshold=250),
)

tick.head()

#### Volume Bars

In [None]:
# Volume bars from the controller's "volume_standard_bars" method,
# threshold 3,500,000 (presumably traded volume per bar -- confirm)
volume = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="volume_standard_bars",
    method_arguments=dict(threshold=3_500_000),
)

volume.head()

#### VPIN (Volume-Synchronized Probability of Informed Trading)

In [None]:
# Absolute difference between buy and sell volume of every volume bar
volume_imbalance = (volume["Cumulative Buy Volume"] - volume["Cumulative Sell Volume"]).abs()

# NOTE(review): with window=1 the rolling mean leaves the series unchanged, so
# VPIN here is the per-bar imbalance divided by the bar volume -- confirm
# whether a longer averaging window was intended.
vpin = volume_imbalance.rolling(window=1).mean() / volume["Cumulative Volume"]
volume["VPIN"] = vpin
volume.to_csv('stock_vpin.csv')

# Save the VPIN table to Excel without overwriting previous sheets.
# Append to the workbook when it already exists (replacing only the 'VPIN'
# sheet); create it otherwise. `if_sheet_exists` is only accepted in append
# mode, hence the conditional options.
excel_mode = "a" if os.path.exists(excel_file_path) else "w"
excel_options = {"if_sheet_exists": "replace"} if excel_mode == "a" else {}
with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode=excel_mode, **excel_options) as writer:
    volume.to_excel(writer, sheet_name='VPIN')

volume.head()

In [None]:
# Two stacked panels sharing one x-axis: price on top, VPIN below
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.1)

# The axes are titled "Date", so plot against the bar timestamps rather than
# the positional index. Fall back to the index when "Date Time" is no longer a
# column (e.g. after a later cell has moved it into the index).
bar_times = volume["Date Time"] if "Date Time" in volume.columns else volume.index

# Add the price plot to the first row
fig.add_trace(
    go.Scatter(
        x=bar_times,
        y=volume['Close'],
        name="Prices",
    ),
    row=1, col=1,
)

# Add the VPIN plot to the second row as a line chart
fig.add_trace(
    go.Scatter(
        x=bar_times,
        y=volume['VPIN'],
        name="VPIN",
        mode='lines'
    ),
    row=2, col=1,
)

# Update layout
fig.update_layout(
    title="Price and VPIN",
    xaxis_title="Date",
    hovermode="x unified",
    template="plotly_dark",
)

# Update x-axis title for the second subplot
fig.update_xaxes(title_text="Date", row=2, col=1)

# Show the figure
fig.show()

#### Dollar Bars

In [None]:
# Dollar bars from the controller's "dollar_standard_bars" method,
# threshold 60,000,000,000 (presumably traded value per bar -- confirm)
dollar = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="dollar_standard_bars",
    method_arguments=dict(threshold=60_000_000_000),
)

dollar.head()

#### Stability

In [73]:
# Re-index all four standard-bar tables by their 'Date Time' column
time, dollar, volume, tick = (
    bars.set_index(['Date Time']) for bars in (time, dollar, volume, tick)
)

In [None]:
# Number of bars that fall in each weekly ('1W') bucket, per bar type.
# NOTE(review): the time series counts 'Open' while the others count 'Close'
# -- confirm this difference is intended.
time_count = time['Open'].resample('1W').count()
tick_count = tick['Close'].resample('1W').count()
volume_count = volume['Close'].resample('1W').count()
dollar_count = dollar['Close'].resample('1W').count()

# Side-by-side table with one column per bar type
count_df = pd.concat(
    [time_count, tick_count, volume_count, dollar_count],
    axis=1,
).set_axis(['time', 'tick', 'volume', 'dollar'], axis=1)
count_df.head()

In [None]:
# One line per bar type, all drawn against the weekly index of count_df
fig = go.Figure()
for column_name, trace_name in (
    ('time', 'Time'),
    ('tick', 'Tick'),
    ('volume', 'Volume'),
    ('dollar', 'Dollar'),
):
    fig.add_trace(
        go.Scatter(x=count_df.index, y=count_df[column_name], name=trace_name)
    )

# Apply the shared layout helper (title and axis captions)
update_figure_layout(
    fig,
    title="Standard Bars Frequency",
    xaxis_title="Date",
    yaxis_title="Frequency",
)

# Append count_df to the existing workbook on its own sheet
with pd.ExcelWriter(
    excel_file_path,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='overlay',
) as writer:
    count_df.to_excel(writer, sheet_name='Standard Frequency - RiskLabAI')

# Render the chart
fig.show()

#### Normality

##### Jarque-Bera Test

In [76]:
# Log returns per bar type: first difference of the log price, leading NaN dropped.
# NOTE(review): time bars use the 'Open' price while the other series use
# 'Close' -- confirm this difference is intended.
time_returns = time['Open'].pipe(np.log).diff().dropna()
tick_returns = tick['Close'].pipe(np.log).diff().dropna()
volume_returns = volume['Close'].pipe(np.log).diff().dropna()
dollar_returns = dollar['Close'].pipe(np.log).diff().dropna()

In [None]:
# Print the Jarque-Bera statistic (first element of the result), truncated to
# an integer, for each return series
for series_name, series_returns in (
    ("time", time_returns),
    ("dollar", dollar_returns),
    ("volume", volume_returns),
    ("tick", tick_returns),
):
    jb_statistic = stats.jarque_bera(series_returns)[0]
    print(f"Jarque-Bera test statistic for {series_name} returns:", int(jb_statistic))

##### Shapiro-Wilk Test

In [None]:
# Print the full result object returned by stats.shapiro for each return series
for series_name, series_returns in (
    ("time", time_returns),
    ("dollar", dollar_returns),
    ("volume", volume_returns),
    ("tick", tick_returns),
):
    print(f"Shapiro-Wilk test statistic for {series_name} returns:", stats.shapiro(series_returns))

##### Results KDE plot 

###### Standardize Data

In [79]:
# Standardize every return series: subtract its mean, divide by its standard deviation
time_standard = time_returns.sub(time_returns.mean()).div(time_returns.std())
tick_standard = tick_returns.sub(tick_returns.mean()).div(tick_returns.std())
volume_standard = volume_returns.sub(volume_returns.mean()).div(volume_returns.std())
dollar_standard = dollar_returns.sub(dollar_returns.mean()).div(dollar_returns.std())

###### Distribution Plot

In [None]:
# Kernel density estimates of the standardized returns, with a standard-normal
# sample drawn as a dotted reference curve
plt.figure(figsize=(16, 12))
sns.kdeplot(time_standard, label="Time")
sns.kdeplot(tick_standard, label="Tick")
sns.kdeplot(volume_standard, label="Volume")
sns.kdeplot(dollar_standard, label="Dollar")
sns.kdeplot(np.random.normal(size=1000000), label="Normal", linestyle="dotted")
# range() excludes its stop value, so use 5 to get symmetric ticks from -4 to +4
plt.xticks(range(-4, 5))
plt.title(
    'Partial recovery of normality',
    loc='center',
)
plt.xlim(-5, 5)
# The labels passed above are only displayed once a legend is requested
plt.legend()
plt.show()

# Save the standardized returns data to Excel without overwriting previous sheets.
# Each series occupies two columns (index + values), hence the start-column step of 2.
with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
    for position, standardized in enumerate(
        (time_standard, tick_standard, volume_standard, dollar_standard)
    ):
        standardized.to_excel(
            writer,
            sheet_name='Standardized Returns',
            startcol=2 * position,
            index=True,
        )

### Information Driven Bars

#### Expected Imbalance Bars

##### Imbalance Tick Bars

In [None]:
# Bars from the controller's "expected_tick_imbalance_bars" method
expected_tick_imbalance_bars = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="expected_tick_imbalance_bars",
    method_arguments=dict(
        window_size_for_expected_n_ticks_estimation=5,
        window_size_for_expected_imbalance_estimation=10000,
        initial_estimate_of_expected_n_ticks_in_bar=2000,
    ),
)

expected_tick_imbalance_bars.head()

##### Imbalance Volume Bars

In [None]:
# Bars from the controller's "expected_volume_imbalance_bars" method
expected_volume_imbalance_bars = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="expected_volume_imbalance_bars",
    method_arguments=dict(
        window_size_for_expected_n_ticks_estimation=5,
        window_size_for_expected_imbalance_estimation=10000,
        initial_estimate_of_expected_n_ticks_in_bar=2000,
    ),
)

expected_volume_imbalance_bars.head()

##### Imbalance Dollar Bars

In [None]:
# Bars from the controller's "expected_dollar_imbalance_bars" method
expected_dollar_imbalance_bars = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="expected_dollar_imbalance_bars",
    method_arguments=dict(
        window_size_for_expected_n_ticks_estimation=5,
        window_size_for_expected_imbalance_estimation=10000,
        initial_estimate_of_expected_n_ticks_in_bar=2000,
    ),
)

expected_dollar_imbalance_bars.head()

#### Fixed Imbalance Bars

##### Fixed Imbalance Tick Bars

In [None]:
# Bars from the controller's "fixed_tick_imbalance_bars" method
fixed_tick_imbalance_bars = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="fixed_tick_imbalance_bars",
    method_arguments=dict(
        window_size_for_expected_imbalance_estimation=10000,
        initial_estimate_of_expected_n_ticks_in_bar=20000,
    ),
)

fixed_tick_imbalance_bars.head()

##### Fixed Imbalance Volume Bars

In [None]:
# Bars from the controller's "fixed_volume_imbalance_bars" method
fixed_volume_imbalance_bars = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="fixed_volume_imbalance_bars",
    method_arguments=dict(
        window_size_for_expected_imbalance_estimation=10000,
        initial_estimate_of_expected_n_ticks_in_bar=20000,
    ),
)

fixed_volume_imbalance_bars.head()

##### Fixed Imbalance Dollar Bars

In [None]:
# Bars from the controller's "fixed_dollar_imbalance_bars" method
fixed_dollar_imbalance_bars = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="fixed_dollar_imbalance_bars",
    method_arguments=dict(
        window_size_for_expected_imbalance_estimation=10000,
        initial_estimate_of_expected_n_ticks_in_bar=20000,
    ),
)

fixed_dollar_imbalance_bars.head()

#### Run Bars

##### Expected Run Bars

###### Expected Tick Run Bars

In [None]:
# Bars from the controller's "expected_tick_run_bars" method
expected_tick_run_bars = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="expected_tick_run_bars",
    method_arguments=dict(
        window_size_for_expected_n_ticks_estimation=5,
        window_size_for_expected_imbalance_estimation=10000,
        initial_estimate_of_expected_n_ticks_in_bar=20000,
    ),
)

expected_tick_run_bars.head()

###### Expected Volume Run Bars

In [None]:
# Bars from the controller's "expected_volume_run_bars" method
expected_volume_run_bars = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="expected_volume_run_bars",
    method_arguments=dict(
        window_size_for_expected_n_ticks_estimation=5,
        window_size_for_expected_imbalance_estimation=10000,
        initial_estimate_of_expected_n_ticks_in_bar=20000,
    ),
)

expected_volume_run_bars.head()

###### Expected Dollar Run Bars

In [None]:
# Bars from the controller's "expected_dollar_run_bars" method
expected_dollar_run_bars = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="expected_dollar_run_bars",
    method_arguments=dict(
        window_size_for_expected_n_ticks_estimation=5,
        window_size_for_expected_imbalance_estimation=10000,
        initial_estimate_of_expected_n_ticks_in_bar=20000,
    ),
)

expected_dollar_run_bars.head()

##### Fixed Run Bars

###### Fixed Tick Run Bars

In [None]:
# Bars from the controller's "fixed_tick_run_bars" method
fixed_tick_run_bars = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="fixed_tick_run_bars",
    method_arguments=dict(
        window_size_for_expected_imbalance_estimation=10000,
        initial_estimate_of_expected_n_ticks_in_bar=20000,
    ),
)

fixed_tick_run_bars.head()

###### Fixed Volume Run Bars

In [None]:
# Bars from the controller's "fixed_volume_run_bars" method
fixed_volume_run_bars = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="fixed_volume_run_bars",
    method_arguments=dict(
        window_size_for_expected_imbalance_estimation=10000,
        initial_estimate_of_expected_n_ticks_in_bar=20000,
    ),
)

fixed_volume_run_bars.head()

###### Fixed Dollar Run Bars

In [None]:
# Bars from the controller's "fixed_dollar_run_bars" method
fixed_dollar_run_bars = controller.handle_input_command(
    input_data=data,
    batch_size=1_000_000,
    method_name="fixed_dollar_run_bars",
    method_arguments=dict(
        window_size_for_expected_imbalance_estimation=10000,
        initial_estimate_of_expected_n_ticks_in_bar=20000,
    ),
)

fixed_dollar_run_bars.head()

# Financial Data Structures Using Lopez book

## Data Import and Initial Processing

In [None]:
# Location of the tick-data CSV (named base_url so the builtin `dir` is not shadowed)
base_url = "https://raw.githubusercontent.com/risk-labratory/data/main/"
url = base_url + "IVE_2020.csv"

dataframe = pd.read_csv(url, header=0)
dataframe['dates'] = pd.to_datetime(dataframe['dates'])
# Drop exact duplicate records while the timestamp is still a regular column:
# DataFrame.drop_duplicates ignores the index, so deduplicating after
# set_index would also discard distinct trades that merely share the same
# column values at different times.
dataframe.drop_duplicates(inplace=True)
dataframe.set_index('dates', inplace=True, drop=True)
# Keep only rows whose timestamp hour is in [9, 16)
dataframe = dataframe[(dataframe.index.hour >= 9) & (dataframe.index.hour < 16)]
dataframe.head()


## Bars

### Standard Bars

#### Time Bars

In [None]:
# Time bars from generate_time_bar with a "30Min" frequency
ohlcvTime = generate_time_bar(
    dataframe,
    frequency="30Min",
)
ohlcvTime.head(n=10)

#### Tick Bars

In [None]:
# Reuse the number of time bars as the requested bar count for the other bar types
timeBarLen = len(ohlcvTime)
ohlcvTick = generate_tick_bar(
    dataframe,
    ticks_per_bar=0,
    number_bars=timeBarLen,
)
ohlcvTick.head(n=10)

#### Volume Bars

In [None]:
# Volume bars, requesting as many bars as there are time bars
ohlcvVolume = generate_volume_bar(
    dataframe,
    volume_per_bar=0,
    number_bars=timeBarLen,
)
ohlcvVolume.head(n=10)

#### Dollar Bars

In [None]:
# Dollar bars, requesting as many bars as there are time bars
ohlcvDollar = generate_dollar_bar(
    dataframe,
    dollar_per_bar=0,
    number_bars=timeBarLen,
)
ohlcvDollar.head(n=10)

#### Stability

In [None]:
# Weekly ("1W") mean of 'tick_count' for each bar type, one column per type.
# Columns are added one at a time, in this order, to an initially empty frame.
countAverage = pd.DataFrame()
for bar_label, bar_table in (
    ('time', ohlcvTime),
    ('tick', ohlcvTick),
    ('volume', ohlcvVolume),
    ('dollar', ohlcvDollar),
):
    countAverage[bar_label] = bar_table.resample("1W")['tick_count'].mean()
countAverage.head(10)

In [None]:
# One line per bar type, all drawn against the weekly index of countAverage
fig = go.Figure()
for column_name, trace_name in (
    ('time', 'Time'),
    ('tick', 'Tick'),
    ('volume', 'Volume'),
    ('dollar', 'Dollar'),
):
    fig.add_trace(
        go.Scatter(x=countAverage.index, y=countAverage[column_name], name=trace_name)
    )

# Apply the shared layout helper (title and axis captions)
update_figure_layout(
    fig,
    title="Standard Bars Frequency",
    xaxis_title="Date",
    yaxis_title="Frequency",
)

# Append countAverage to the existing workbook on its own sheet
with pd.ExcelWriter(
    excel_file_path,
    engine='openpyxl',
    mode='a',
    if_sheet_exists='overlay',
) as writer:
    countAverage.to_excel(writer, sheet_name='Standard Bars Frequency')

# Render the chart
fig.show()

### Information-Driven Bars

#### Data Initial Processing

In [None]:
# Select a smaller range
startDate = datetime.datetime(2020, 3, 1)
endDate = datetime.datetime(2020, 3, 5)
new_dataframe = dataframe[((dataframe.index >= startDate) & (dataframe.index <= endDate))][['price', 'size']]
new_dataframe.drop_duplicates()
new_dataframe['ret'] = np.log(new_dataframe['price']) - np.log(new_dataframe['price'].shift(1))
new_dataframe['label'] = np.sign(new_dataframe['ret'])
new_dataframe['volume_labeled'] = new_dataframe['label']*new_dataframe['size']
new_dataframe['dollarslabeled'] = new_dataframe['volume_labeled']*new_dataframe['price']
new_dataframe.dropna(inplace = True)
new_dataframe.head()

#### Imbalance Tick Bar

In [None]:
# generate_information_driven_bars returns three values; unpack them by name
(
    imbalance_tick_bar,
    thetas_absolute_tick,
    thresholds_tick,
) = generate_information_driven_bars(
    new_dataframe,
    bar_type="tick",
    tick_expected_initial=0,
)
imbalance_tick_bar.head()

#### Imbalance Volume Bar

In [None]:
# generate_information_driven_bars returns three values; unpack them by name
(
    imbalance_volume_bar,
    thetas_absolute_volume,
    thresholds_volume,
) = generate_information_driven_bars(
    new_dataframe,
    bar_type="volume",
    tick_expected_initial=0,
)
imbalance_volume_bar.head()

#### Imbalance Dollar Bar

In [None]:
# generate_information_driven_bars returns three values; unpack them by name
(
    imbalance_dollar_bar,
    thetas_absolute_dollar,
    thresholds_dollar,
) = generate_information_driven_bars(
    new_dataframe,
    bar_type="dollar",
    tick_expected_initial=0,
)
imbalance_dollar_bar.head()