---
# **Sample Weight**

---

# Preliminaries

## Libraries

In [314]:
import pandas as pd
import numpy as np 
import os

import time
import sys
from scipy import stats
from statsmodels.stats import stattools
import random 
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set plotting backend and handle numpy errors
pd.options.plotting.backend = "plotly"
np.seterr(divide = 'ignore', invalid = 'ignore')

import multiprocessing as mp

from openpyxl import load_workbook

# Import functions from RiskLabAI
from RiskLabAI.data.labeling import *
from RiskLabAI.data.labeling.labeling import *
from RiskLabAI.data.weights import *
from RiskLabAI.utils import *

# Path to save the Excel file
excel_file_path = "sample_weight.xlsx"

# Initialize the Excel file
wb = load_workbook(excel_file_path) if os.path.exists(excel_file_path) else None

In [320]:
# Import Tick data
dir = "https://raw.githubusercontent.com/risk-labratory/data/main/"
url = dir + "IVE_2020.csv"
df = pd.read_csv(url, header=0)
df['dates'] = pd.to_datetime(df['dates'])
df.set_index('dates', inplace=True, drop=True)
df.drop_duplicates(inplace=True)
df = df[(df.index.hour >= 9) & (df.index.hour < 16)]
df.head()

ohlcv = generate_time_bars(df, frequency = "1B")
ohlcv.dropna(inplace = True)
ohlcv.index = ohlcv.index.strftime("%Y-%m-%d")
ohlcv.index = [dt.datetime.strptime(date, '%Y-%m-%d').date() for date in ohlcv.index]
ohlcv.index = pd.DatetimeIndex(ohlcv.index)
close = ohlcv.close
  
# Save OHLCV data to Excel
with pd.ExcelWriter(excel_file_path, engine='xlsxwriter') as writer:
    ohlcv.to_excel(writer, sheet_name='OHLCV_Data')

ohlcv.head()


Unnamed: 0,open,high,low,close,volume,value_of_trades,price_mean,tick_count,price_mean_log_return
2020-01-03,129.33,129.9874,129.2912,129.64,526340,129.751774,129.719157,922,-0.003845
2020-01-06,129.0,129.8952,128.93,129.8952,655431,129.548003,129.493223,770,-0.001743
2020-01-07,129.52,129.58,129.1405,129.38,413423,129.376731,129.357347,908,-0.00105
2020-01-08,129.38,130.2999,129.24,129.76,449383,129.881903,129.858126,1028,0.003864
2020-01-09,130.3,130.38,129.92,130.3168,376142,130.161216,130.161563,614,0.002334


# Analytics

## Volatility Computation

In [324]:
vols = compute_daily_volatility(close, span = 32)
stds = vols['std']
vols.head()
# Write the volatility data to a new sheet in the Excel file
with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a') as writer:
    vols.to_excel(writer, sheet_name='Volatility_Data')

#plot the volatility data
fig = go.Figure()
fig.add_trace(go.Scatter(x=vols.index, y=vols['rets'].values, mode="lines", marker=dict(color='firebrick'), name="rets"))
fig.add_trace(go.Scatter(x=vols.index, y=vols['std'].values, mode="lines", marker=dict(color='royalblue'), name="stds"))
fig.add_trace(go.Scatter(x=vols.index, y=-vols['std'].values, mode="lines", marker=dict(color='royalblue'), name="-stds"))

# Apply custom layout
update_figure_layout(fig, "Returns and Volatility", "Date", "Values")
fig.show()

## Event Detection and Triple Barrier Method

In [325]:
# Time events
timeevents = symmetric_cusum_filter(close, 5)

# Triple Barrier calculations
timestamp = pd.Series(pd.NaT, index = timeevents)
side = pd.Series(1., index = timeevents)
target = stds.loc[timeevents] 
# standard setup
ptsl = [1, 1]
events1 = pd.concat({'timestamp': timestamp, 'target': target, 'side': side}, axis = 1).dropna(subset = ['target'])
events1['End Time'] = timestamp
events1['Base Width'] = target
events1['Side'] = side

events1.head()



Unnamed: 0,timestamp,target,side,End Time,Base Width,Side
2020-01-31,NaT,0.007381,1.0,NaT,0.007381,1.0
2020-02-12,NaT,0.010774,1.0,NaT,0.010774,1.0
2020-02-24,NaT,0.012431,1.0,NaT,0.012431,1.0
2020-02-26,NaT,0.021313,1.0,NaT,0.021313,1.0
2020-02-27,NaT,0.02349,1.0,NaT,0.02349,1.0


In [326]:
# Run the triple barrier function
out = triple_barrier(close, events1, ptsl, molecule = timeevents)
out.head(10)

Unnamed: 0,End Time,stop_loss,profit_taking
2020-01-31,NaT,2020-02-25,2020-02-04
2020-02-12,NaT,2020-02-24,NaT
2020-02-24,NaT,2020-02-25,NaT
2020-02-26,NaT,2020-02-27,2020-11-16
2020-02-27,NaT,2020-03-09,2020-03-02
2020-03-02,NaT,2020-03-06,2020-11-16
2020-03-06,NaT,2020-03-09,2020-06-08
2020-03-09,NaT,2020-03-12,2020-03-10
2020-03-11,NaT,2020-03-12,2020-04-29
2020-03-12,NaT,2020-03-20,2020-03-13


In [327]:
events1.rename(columns = {'End Time': 'timestamp'}, inplace = True)

## Additional Events and Barriers


In [328]:
t1 = vertical_barrier(close, timeevents, 21)
events2 = meta_events(close, timeevents, ptsl = ptsl, target = stds, return_min = 0.04, num_threads=1, timestamp = t1)
events2.head()

Unnamed: 0,End Time,Base Width,target,timestamp
2020-03-12,2020-03-13,0.044984,0.044984,2020-04-02
2020-03-13,2020-03-16,0.043517,0.043517,2020-04-03
2020-03-16,2020-03-17,0.047672,0.047672,2020-04-06
2020-03-17,2020-03-18,0.046638,0.046638,2020-04-07
2020-03-18,2020-03-20,0.045596,0.045596,2020-04-08


In [329]:
out2 = get_labels(events2, close)
out2.head()

Unnamed: 0,ret,bin
2020-03-12,-0.00328,-1.0
2020-03-13,-0.099218,-1.0
2020-03-16,0.078135,1.0
2020-03-17,0.017907,1.0
2020-03-18,0.130506,1.0


## Number of Concurrent Labels

In [None]:
curr = expand_label_for_meta_labeling(close.index, events2.timestamp, events2.index)
curr.head()

with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a') as writer:
    curr.to_excel(writer, sheet_name='Curr_Data')
    
fig = go.Figure()

fig.add_trace(go.Scatter(x=curr.index, y=curr.values, mode='lines', name='Curr'))

update_figure_layout(fig, "Curr Values Over Time", "Date", "Curr Value")
fig.show()


## Sample Weight Using Tripple Barrier Method

In [None]:
sample_weight = calculate_sample_weight(events2.timestamp, curr, events2.index)
sample_weight.head()

#Write sample_weight to a new sheet in the Excel file
with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a') as writer:
    sample_weight.to_excel(writer, sheet_name='Sample_Weight')

# Plot the distribution of sample weights
fig = go.Figure()
fig.add_trace(go.Histogram(x=sample_weight.values, y=curr.values, histfunc='count', name='Sample Weight Distribution'))
update_figure_layout(fig, "Sample Weight Distribution", "Sample Weight", "Number of Curr")
fig.show()

## Sequential Bootstrap

In [332]:
# Function to generate the indicator matrix

def create_index_matrix(
    bar_ix: pd.Index, 
    t1: pd.Series
) -> pd.DataFrame:
    """
    Derives an indicator matrix from the given bar index and the time series.

    This function returns a binary matrix indicating which (price) bars
    influence the label for each observation. The indicator matrix has the 
    time series values from `t1` as columns, and the bar index `bar_ix` as rows.
    
    The matrix is constructed such that for each pair of bar indices 
    `(t0, t1)`, the matrix is set to 1 from `t0` to `t1` for the corresponding 
    index in `t1`.

    :param bar_ix: The index of bars (pd.Index).
    :param t1: The time series containing start and end times (pd.Series).
    
    :return: A binary DataFrame (pd.DataFrame) representing the indicator matrix.

    Formulae:
        For each pair `(t0, t1)` in `t1`:
            The value at `indM.loc[t0:t1, i] = 1` is set.
    """
    # Initialize the indicator matrix with zeros
    ind_matrix = pd.DataFrame(0, index=bar_ix, columns=range(t1.shape[0]))

    # Iterate through the series and set the indicator values
    for i, (t0, t1_value) in enumerate(t1.items()):
        ind_matrix.loc[t0:t1_value, i] = 1

    return ind_matrix

## Indicator Matrix

In [333]:
# indicator matrix
indM = create_index_matrix(close.index, timestamp)
indM.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
2020-01-03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-06,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-07,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-08,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-09,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Calculate the average uniqueness
avgU = calculate_average_uniqueness(indM)

# Write avgU to a new sheet in the Excel file
with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a') as writer:
    avgU.to_excel(writer, sheet_name='Average_Uniqueness')

avgU.head()

0    0.093176
1    0.059898
2    0.047442
3    0.044719
4    0.043737
dtype: float64

In [337]:
# Preform sequential bootstrapping
seqb = perform_sequential_bootstrap(indM, indM.shape[0])
seqb

[7,
 32,
 12,
 31,
 25,
 18,
 5,
 15,
 25,
 15,
 29,
 21,
 19,
 19,
 14,
 4,
 27,
 19,
 8,
 29,
 0,
 20,
 25,
 4,
 1,
 31,
 28,
 24,
 20,
 3,
 0,
 1,
 12,
 23,
 5,
 23,
 7,
 32,
 31,
 6,
 35,
 24,
 31,
 31,
 14,
 31,
 11,
 15,
 2,
 18,
 9,
 4,
 26,
 35,
 34,
 16,
 8,
 30,
 23,
 30,
 6,
 7,
 35,
 21,
 7,
 21,
 29,
 20,
 5,
 21,
 2,
 16,
 20,
 21,
 5,
 0,
 3,
 8,
 3,
 33,
 8,
 12,
 20,
 13,
 9,
 2,
 2,
 8,
 31,
 6,
 24,
 28,
 11,
 28,
 5,
 1,
 24,
 8,
 34,
 20,
 32,
 17,
 2,
 18,
 12,
 30,
 12,
 11,
 5,
 23,
 34,
 6,
 32,
 11,
 6,
 12,
 35,
 27,
 6,
 5,
 5,
 10,
 8,
 13,
 9,
 26,
 15,
 8,
 11,
 29,
 5,
 10,
 1,
 15,
 33,
 35,
 0,
 6,
 22,
 27,
 22,
 0,
 21,
 31,
 3,
 1,
 33,
 5,
 14,
 15,
 1,
 18,
 5,
 24,
 34,
 11,
 32,
 26,
 15,
 3,
 0,
 2,
 7,
 4,
 19,
 22,
 15,
 17,
 3,
 13,
 20,
 27,
 11,
 15,
 13,
 28,
 26,
 24,
 24,
 20,
 14,
 1,
 32,
 13,
 29,
 0,
 20,
 18,
 25,
 3,
 0,
 13,
 2,
 5,
 9,
 27,
 1,
 17,
 23,
 30,
 23,
 18,
 14,
 6,
 12,
 28,
 3,
 3,
 13,
 30,
 18,
 22,
 0,
 24,
 0,


## Return Attribution

In [None]:
# Sample return weight by absolute return
abs_weight = sample_weight_absolute_return_meta_labeling(timestamp, close, timeevents)
abs_weight.head()
# Write abs_weight to a new sheet in the Excel file
with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a') as writer:
    abs_weight.to_excel(writer, sheet_name='Abs_Weight')

# Plot the distribution of absolute weights
fig = go.Figure()
fig.add_trace(go.Histogram(x=abs_weight.values, histfunc='count', name='Absolute Weight Distribution'))
update_figure_layout(fig, "Absolute Weight Distribution", "Absolute Weight", "Count")
fig.show()


In [348]:
# Against time
fig = go.Figure()

fig.add_trace(go.Scatter(x=abs_weight.index, y=abs_weight.values, mode='lines+markers', name='Absolute Weights'))

# Apply custom layout
update_figure_layout(fig, "Absolute Weights Over Time", "Date", "Absolute Weight")

fig.show()


## Calculate Time Decay

In [None]:
# Calculate clf_weight
clf_weight = calculate_time_decay(abs_weight, clf_last_weight=0.005)
clf_weight.head()

# Write clf_weight to a new sheet in the Excel file
with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a') as writer:
    clf_weight.to_excel(writer, sheet_name='Clf_Weight')

#plot
# Different clf_last_weight values
clf_last_weights = [1, 0, 0.5, -0.5]

# Plot the clf_weight for different clf_last_weight values
fig = go.Figure()

for weight in clf_last_weights:
    clf_weight = calculate_time_decay(abs_weight, clf_last_weight=weight)
    fig.add_trace(go.Scatter(x=clf_weight.index, y=clf_weight.values, mode='lines+markers', name=f'Clf Weight (last={weight})'))

# Apply custom layout
update_figure_layout(fig, "Clf Weights Over Time for Different clf_last_weight", "Date", "Clf Weight")

fig.show()

## Monte Carlo Experiments

In [353]:
def random_time_stamp(n_observation , n_bars , maximum_holding):
    data = {'date': np.zeros(n_observation),
        'timestamp': np.zeros(n_observation)}
    timestampp = pd.DataFrame(data)
    for i in range(n_observation):
        date = random.randint(0,n_bars)
        t = date + random.randint(1,maximum_holding+1)
        timestampp.loc[i, 'date'] = date
        timestampp.loc[i, 'timestamp'] = t
    timestampp = timestampp.sort_values(by=['date'])
    return timestampp

def monte_carlo_simulation_for_sequentional_bootstraps(n_observation , n_bars , maximum_holding):
    timestampp = random_time_stamp(n_observation , n_bars , maximum_holding)
    
    bar_index  = np.array([i for i in range(int(np.max(timestampp.timestamp)) + 1)])
    # indM=pd.DataFrame(0,index=bar_index,columns=range(timestampp.shape[0]))
    
    index_matrix_ = create_index_matgrix(bar_index,timestampp)
    
    rho = random.sample([i for i in range(np.shape(index_matrix_)[1])] ,np.shape(index_matrix_)[1])
    rho = np.array([int(col) for col in  rho])
    standard_uniqueness = np.mean(calculate_average_uniqueness(index_matrix_.iloc[:,rho]))
    rho = perform_sequential_bootstrap(index_matrix_,np.shape(index_matrix_)[1])
    rho = np.array([int(col) for col in  rho])
    sequentioal_uniqueness = np.mean(calculate_average_uniqueness(index_matrix_.iloc[:,rho]))
    return standard_uniqueness ,sequentioal_uniqueness


In [303]:
def simulate_sequentional_vs_standard_bootstrap(iteration, n_observation, n_bars, maximum_holding):
    seq_u = np.zeros(iteration)
    std_u = np.zeros(iteration)
    for i in range(iteration):
        print(i)
        standard_uniqueness, sequentioal_uniqueness = monte_carlo_simulation_for_sequentional_bootstraps(n_observation, n_bars, maximum_holding)
        seq_u[i] = sequentioal_uniqueness
        std_u[i] = standard_uniqueness
    return seq_u, std_u

seqU, stdU = simulate_sequentional_vs_standard_bootstrap(1000, 10, 100, 5)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [354]:
# Group data together
hist_data = [seqU, stdU]
group_labels = ['seqU', 'stdU']

# Create a DataFrame to save the data
plot_data = pd.DataFrame({'seqU': seqU, 'stdU': stdU})

# Write the data to a new sheet in the Excel file
with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a') as writer:
    plot_data.to_excel(writer, sheet_name='Bootstrap_Data')

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, group_labels, bin_size=0.05)

# Apply custom layout using update_figure_layout
update_figure_layout(fig, "Sequential vs Standard Bootstrap Sample Weight Distribution", "Sample Weight", "Density")

fig.show()