In [76]:
from IPython.display import clear_output, HTML, display
#calibration.py
from collections import defaultdict

import numpy as np
import pandas as pd
import yaml
from frozendict import frozendict
from sklearn.cluster import KMeans
from tqdm import tqdm

from my_clean_event import clean_event_data

In [77]:


class SpreadCalibrator:
    def __init__(self, config_path: str):
        self.calibration_config = self._load_config(config_path)
        self.symbol = self.calibration_config['symbol']
        self.symbol_directory = {
            'EURUSD': 'EURUSD_5m_clean.parquet',
            'XAUUSD': 'XAUUSD_1y_clean.parquet'
        }
        self.feed_data = None
        self.event_data = None
        self.calibrated_results = defaultdict(dict)

    def _load_config(self, config_path: str) -> frozendict:
        with open(config_path, 'r') as file:
            config = yaml.safe_load(file)
        return frozendict(config)

    def load_feed_data(self) -> None:
        feed_data_directory = self.symbol_directory[self.symbol]
        self.feed_data = pd.read_parquet(feed_data_directory)
        self.feed_data = self.feed_data.rename(
            columns={'spread': 'raw_spread'})
        print('Feed data cleaned and loaded')

    def load_event_data(self, economics_event_data: str) -> None:
        self.event_data = clean_event_data(economics_event_data)
        self.event_data = self.event_data.reset_index(drop=True)
        print('Event data cleaned and loaded')

    def calibrate_seasonality(self, today_date: pd.Timestamp) -> None:
        start_calibration_date = today_date - pd.Timedelta(
            days=self.calibration_config['seasonality_lookback_period_in_days'])

        feed_calibration_data = self.feed_data[
            (self.feed_data.index < today_date) &
            (self.feed_data.index >= start_calibration_date)
        ]

        train_data = feed_calibration_data.copy()
        train_data['spread'] = train_data['Ask'].astype(
            float) - train_data['Bid'].astype(float)
        train_data['time'] = train_data.index.time
        train_spread_profile = train_data.groupby('time')['spread'].mean()
        display(train_data)
        display(train_spread_profile)
        train_spread_profile.to_csv('tsp.csv')
        # Apply K-means
        kmeans = KMeans(n_clusters=3, random_state=42)
        kmeans.fit_predict(train_spread_profile.values.reshape(-1, 1))
        centroids = np.sort(kmeans.cluster_centers_.flatten())
        print('centroids',centroids)
        thresholds = (centroids[1:] + centroids[:-1]) / 2
        print('thresholds',thresholds)
        thresholds = [float(val) for val in thresholds]
        categories = np.digitize(train_spread_profile, bins=thresholds)
        print('categories',categories)
        quantile_val = self.calibration_config['seasonality_quantile']
        spread_values = {
            int(cat): round(float(np.quantile(
                train_spread_profile[categories == cat],
                quantile_val
            )), 6)
            for cat in np.unique(categories)
        }
        print('spread_values',spread_values)
        self.calibrated_results['seasonality'] = {
            'thresholds': thresholds,
            'category_spread_values': spread_values
        }
        train_data.to_csv('feed_datam.csv')



    def calibrate_events(self, today_date: pd.Timestamp) -> None:
        start_calibration_date = today_date - pd.Timedelta(
            days=self.calibration_config['economic_events_lookback_period_in_days'])

        events_calibration_data = self.event_data[
            (self.event_data['release_date'] < today_date) &
            (self.event_data['release_date'] >= start_calibration_date)
        ].reset_index(drop=True)

        events_calibration_data = events_calibration_data.sort_values(
            by='release_date'
        ).reset_index(drop=True)

        train_data = self.feed_data[
            (self.feed_data.index < today_date) &
            (self.feed_data.index >= start_calibration_date)
        ].copy()

        train_data['spread'] = train_data['Ask'].astype(
            float) - train_data['Bid'].astype(float)
        train_data['epoch'] = train_data.index.astype(np.int64) // 10**9
        train_data.to_csv('tdeve.csv')
        display('train_data_evengts',train_data)
        df_resampled = train_data.resample('1s').first()
#         df_resampled = train_data.select_dtypes(include=['number'])
#         df_resampled = train_data.resample('1s').mean()



        df_resampled['epoch'] = df_resampled.index.astype(np.int64) // 10**9
        df_resampled.to_csv('dfr.csv')
        data_event = events_calibration_data[(events_calibration_data['release_date'] >= train_data.index[0]) & (
            events_calibration_data['release_date'] <= train_data.index[-1])]
        self._process_events(df_resampled, data_event)

    def _process_events(self, df_resampled: pd.DataFrame, data_event: pd.DataFrame) -> None:
        data_event = data_event.loc[
            data_event.groupby('release_date')['impact'].idxmax()
        ].reset_index(drop=True)

        N = 3
        for i, row in tqdm(data_event.iterrows(), total=len(data_event)):
            mask = (
                (df_resampled['epoch'] > row['release_epoch'] - N * 60) &
                (df_resampled['epoch'] < row['release_epoch'] + N * 60)
            )
            df_resampled.loc[mask, 'event_id'] = row['id']
            df_resampled.loc[mask, 'impact'] = row['impact']
            df_resampled.loc[mask, 'event time'] = row['release_epoch']

        df_resampled.dropna(inplace=True)
        df_resampled['time to event'] = df_resampled['event time'] - \
            df_resampled['epoch']
        df_resampled.to_csv('dfr2.csv')

        self._calculate_impact_spreads(df_resampled)

    def _calculate_impact_spreads(self, df_resampled: pd.DataFrame) -> None:
        quantile_val = self.calibration_config['economic_events_quantile']
        impact_based_spread = defaultdict(dict)

        for impact in df_resampled['impact'].unique():
            df = df_resampled[df_resampled['impact'] == impact]
            df_mean = df.groupby('time to event')['spread'].mean()
            impact_based_spread[int(impact)] = {
                'before_spread': float(df_mean.loc[: -60].quantile(quantile_val).round(6)),
                'event_spread': float(df_mean.loc[-30:30].quantile(quantile_val).round(6)),
                'after_spread': float(df_mean.loc[60:].quantile(quantile_val).round(6)),
            }

        self.calibrated_results['economic_events'] = {
            'impact_spread_values': dict(impact_based_spread)
        }

    def save_results(self, file_name: str) -> None:
        yaml_str = yaml.dump(dict(self.calibrated_results),
                             default_flow_style=False)
        with open(f"{file_name}.yaml", "w") as yaml_file:
            yaml_file.write(yaml_str)


def main():
    # Initialize calibrator
    calibrator = SpreadCalibrator("stable_spread_config.yaml")

    # Load data
    calibrator.load_feed_data()
    calibrator.load_event_data(
        'economic_events_jan_2023_july_2024.csv')

    # Set calibration date
    today_date = pd.to_datetime('2024-07-27')

    # Run calibrations
    calibrator.calibrate_seasonality(today_date)
    calibrator.calibrate_events(today_date)

    # Save results
    calibrator.save_results('calibrated_spread_values')


# if __name__ == "__main__":
#     main()


In [78]:
import yaml
yaml_file = "calibrated_spread_values.yaml"
with open(yaml_file, 'r') as file:
    spreads = yaml.safe_load(file)  # Use safe
display(spreads)

{'economic_events': {'impact_spread_values': {1: {'after_spread': 0.110808,
    'before_spread': 0.11709,
    'event_spread': 0.112917},
   2: {'after_spread': 0.125034,
    'before_spread': 0.118807,
    'event_spread': 0.120492},
   3: {'after_spread': 0.130557,
    'before_spread': 0.134783,
    'event_spread': 0.134719},
   4: {'after_spread': 0.116539,
    'before_spread': 0.125215,
    'event_spread': 0.136221},
   5: {'after_spread': 0.122674,
    'before_spread': 0.137151,
    'event_spread': 0.201755}}},
 'seasonality': {'category_spread_values': {0: 0.1, 1: 0.14, 2: 0.18},
  'thresholds': [0.1062268833400751, 0.15400587400759974]}}

In [87]:
event = clean_event_data('economic_events_jan_2023_july_2024.csv')
event = event.drop_duplicates(subset=['impact', 'release_date'])

In [85]:
spreads

{'economic_events': {'impact_spread_values': {1: {'after_spread': 0.110808,
    'before_spread': 0.11709,
    'event_spread': 0.112917},
   2: {'after_spread': 0.125034,
    'before_spread': 0.118807,
    'event_spread': 0.120492},
   3: {'after_spread': 0.130557,
    'before_spread': 0.134783,
    'event_spread': 0.134719},
   4: {'after_spread': 0.116539,
    'before_spread': 0.125215,
    'event_spread': 0.136221},
   5: {'after_spread': 0.122674,
    'before_spread': 0.137151,
    'event_spread': 0.201755}}},
 'seasonality': {'category_spread_values': {0: 0.1, 1: 0.14, 2: 0.18},
  'thresholds': [0.1062268833400751, 0.15400587400759974]}}

In [86]:
feed = pd.read_parquet('XAUUSD_1y_clean.parquet',engine = 'pyarrow')
feed['ts'] = pd.to_datetime(feed.index)
# fdf.head()

In [88]:
feed = feed[feed.ts >='2024-07-28 22:05:00.336']
feed.reset_index(inplace=True)

In [89]:
feed.head()

Unnamed: 0,DateTime,Bid,Ask,Last,spread,epoch,ts
0,2024-07-28 22:05:00.336,2388.11,2388.57,0.0,0.46,1722204300,2024-07-28 22:05:00.336
1,2024-07-28 22:05:00.404,2388.11,2388.57,0.0,0.46,1722204300,2024-07-28 22:05:00.404
2,2024-07-28 22:05:00.670,2388.15,2388.57,0.0,0.42,1722204300,2024-07-28 22:05:00.670
3,2024-07-28 22:05:00.701,2387.93,2388.35,0.0,0.42,1722204300,2024-07-28 22:05:00.701
4,2024-07-28 22:05:00.802,2387.93,2388.33,0.0,0.4,1722204300,2024-07-28 22:05:00.802


In [90]:
feed['stable']=0.0
for i in range(len(feed)):

    timestamp = feed.iloc[i].ts
    spread = feed.iloc[i].spread
    events = event[(event['release_date'] >= (timestamp - pd.Timedelta(seconds=180))) & 
                 (event['release_date'] <= (timestamp + pd.Timedelta(seconds=180)))].reset_index(drop=True)
    if len(events)>0:
        events['t_delta'] = (timestamp - events.release_date).dt.total_seconds()
        events['when'] = np.where(events['t_delta'] < -60, 'before_spread',\
                                  np.where(events['t_delta'] < 60, 'event_spread',\
                                           'after_spread'))
        events['spread'] = events.apply(
        lambda row: spreads['economic_events']['impact_spread_values']
                    .get(row['impact'], {})
                    .get(row['when'], None),
        axis=1
        )
        events_spread = events.spread.max()
    else:
        events_spread = 0

    ar = spreads['seasonality']['thresholds']
    if spread < ar[0]:  
        season_spread = spreads['seasonality']['category_spread_values'][0]
    elif spread > ar[1]:  
        season_spread = spreads['seasonality']['category_spread_values'][2]
    else:  
        season_spread = spreads['seasonality']['category_spread_values'][1]
    spread = max(events_spread,season_spread)
    feed.at[i,'stable']=spread


In [94]:
prod_spreads = pd.read_csv('prod_spreads.csv')
prod_spreads.head()

Unnamed: 0,ts,spread
0,2024-07-28 22:05:00.336,0.18
1,2024-07-28 22:05:00.404,0.18
2,2024-07-28 22:05:00.670,0.18
3,2024-07-28 22:05:00.701,0.18
4,2024-07-28 22:05:00.802,0.18


In [96]:
# feed[prod_spreads.spread!=feed.stable]

In [101]:
num = 284535
timestamp=feed.at[num,'ts']
events = event[(event['release_date'] >= (timestamp - pd.Timedelta(seconds=180))) & 
             (event['release_date'] <= (timestamp + pd.Timedelta(seconds=180)))].reset_index(drop=True)
print(timestamp)
display(events)
print('quant_spread:')
display(prod_spreads.loc[num:].head(1))
print('\nhighest stable spread at that time:')
display(feed.iloc[num:].head(1))

2024-07-29 07:57:00.232000


Unnamed: 0,impact,id,release_epoch,symbol,event_name,release_date
0,1.0,ca6915224b3141e6,1722240000,GBP,LME Aluminum Alloy Stocks,2024-07-29 08:00:00
1,2.0,b8e0cfbb64a204cb,1722240000,GBP,LME Lead Stocks,2024-07-29 08:00:00
2,3.0,d064f5c92c8ba3b3,1722240000,GBP,LME Zinc Stocks,2024-07-29 08:00:00
3,4.0,f728547ec963473a,1722240000,GBP,LME Copper Stocks,2024-07-29 08:00:00


quant_spread:


Unnamed: 0,ts,spread
284535,2024-07-29 07:57:00.232,0.125215



highest stable spread at that time:


Unnamed: 0,DateTime,Bid,Ask,Last,spread,epoch,ts,stable
284535,2024-07-29 07:57:00.232,2391.43,2391.52,0.0,0.09,1722239820,2024-07-29 07:57:00.232,0.134783


In [102]:
num=1164030
print('quant_spread:')
display(prod_spreads.loc[num:].head(1))
print('\nhighest stable spread at that time:')
display(feed.iloc[num:].head(1))

quant_spread:


Unnamed: 0,ts,spread
1164030,2024-07-31 14:32:26.807,0.122674



highest stable spread at that time:


Unnamed: 0,DateTime,Bid,Ask,Last,spread,epoch,ts,stable
1164030,2024-07-31 14:32:26.807,2424.19,2424.27,0.0,0.08,1722436346,2024-07-31 14:32:26.807,0.130557
