# Ziwei Huang zh1459
# Data Engineering Final Part 1 
# Real-time Regression & Classification

In [1]:
# import packages
import numpy as np
import pandas as pd

from polygon import RESTClient

import arcticdb as adb

import requests
import os
import threading
from threading import Lock


from pycaret.regression import (
    setup as reg_setup,
    compare_models as reg_compare,
    save_model as save_reg_model,
    load_model as load_reg_model,
    predict_model as reg_predict,
    pull as reg_pull,
    tune_model as reg_tune
)
from pycaret.classification import (
    setup as cls_setup,
    compare_models as cls_compare,
    save_model as save_cls_model,
    predict_model as cls_predict,
    pull as cls_pull,
    tune_model as cls_tune
)
from sklearn.model_selection import train_test_split

from datetime import datetime, timedelta, timezone
import time
from tqdm.auto import tqdm


In [2]:
# configuration
API_KEY = "beBybSi8daPgsTp5yx5cHtHpYcrjp5Jq"
client = RESTClient(API_KEY)

DB_PATH = '/Users/zway/Desktop/NYU/Data Engineering/Assignment/Final/data'
aux_lib = 'forex_live_auxiliary'
main_lib = 'forex_live_main'

Regression_CPs = ['USD_EUR', 'USD_GBP', 'USD_CAD', 'USD_CHF', 'USD_AUD']
Classification_CPs = ['EUR_CHF', 'GBP_EUR', 'GBP_CHF', 'USD_JPY', 'USD_INR', 'USD_CNY']
all_CPs = Regression_CPs + Classification_CPs
btc_ticker = "USD_BTC"

start_time = datetime.now()
end_time = start_time + timedelta(hours=5)


In [3]:
# set up ArcticDB
ac = adb.Arctic(f"lmdb://{DB_PATH}")

# Create libraries if not exist
for lib_name in [aux_lib, main_lib]:
    if not ac.has_library(lib_name):
        ac.create_library(lib_name)

## Data Fetching Methods

In [7]:
# fetch real-time data
def fetch_forex(cp, end_time):
    lib = ac['forex_live_auxiliary']
    from_currency, to_currency = cp.split('_')
    pbar = tqdm(desc=cp, position=threading.get_ident() % 10, leave=False)
    
    records = []
    
    while datetime.now() < end_time:
        try:
            quote = client.get_last_forex_quote(from_currency, to_currency)
            # for debug: print(f"Debug {cp} response:", quote)
            if not quote.last.timestamp:
                print(f"Data fetching error for {from_currency}:{to_currency}")
                continue
            
            ask = float(quote.last.ask)
            bid = float(quote.last.bid)
            mid = (ask + bid) / 2
            ts = int(quote.last.timestamp)
            
            record = np.array([ask, bid, mid, ts])
            records.append(record)
            
            pbar.update(1)
            pbar.set_postfix({'last': datetime.now().strftime("%H:%M:%S")})
        
        except Exception as e:
            print(f"Error fetching {cp}: {e}")
            
        finally:
            pbar.refresh()
            time.sleep(1)
            
    if records:
        lib.write(symbol=cp, data=np.stack(records))
        print(f"{cp} - Stored {len(records)} records to ArcticDB")
        
    pbar.close()

def fetch_btc(end_time):
    lib = ac['forex_live_auxiliary']
    pbar = tqdm(desc="BTC/USD", position=threading.get_ident() % 10, leave=False)
    
    records = []

    while datetime.now() < end_time:
        try:
            trade = client.get_last_crypto_trade('BTC', 'USD')
            if not trade.timestamp:
                print('No BTC trade or timestamp')
                
            price = float(trade.price)
            size = float(trade.size)
            ts = int(trade.timestamp)
            
            data = np.array([price, size, ts])
            records.append(data)
            
            pbar.update(1)
            pbar.set_postfix({
                'price': f"{trade.price:.2f}",
                'time': datetime.now().strftime("%H:%M:%S")
            })
            
        except Exception as e:
            print(f"BTC fetching error: {e}")
            
        finally:
            pbar.refresh()
            time.sleep(1)
        
    if records:
        lib.write(symbol='BTC', data=np.stack(records))
        print(f"BTC - Stored {len(records)} records to ArcticDB")
        
    pbar.close()

## Start Threading & Fetching Data

In [None]:
# start fetching with multi-threading
main_pbar = tqdm(total=(len(all_CPs) + 1), desc='Total Pairs')

threads = []

for cp in all_CPs:
    t = threading.Thread(target=fetch_forex, args=(cp, end_time), daemon=True)
    t.start()
    threads.append(t)
    main_pbar.update(1)

btc_thread = threading.Thread(target=fetch_btc, args=(end_time,), daemon=True)
btc_thread.start()
threads.append(btc_thread)
main_pbar.update(1)


while any(t.is_alive() for t in threads):
    time.sleep(5) # check if there is thread still running and wait

main_pbar.close()

Total Pairs:   0%|          | 0/12 [00:00<?, ?it/s]

USD_GBP: 0it [00:00, ?it/s]

USD_EUR: 0it [00:00, ?it/s]

USD_CHF: 0it [00:00, ?it/s]

USD_CAD: 0it [00:00, ?it/s]

GBP_EUR: 0it [00:00, ?it/s]

USD_CNY: 0it [00:00, ?it/s]

GBP_CHF: 0it [00:00, ?it/s]

EUR_CHF: 0it [00:00, ?it/s]

USD_AUD: 0it [00:00, ?it/s]

USD_JPY: 0it [00:00, ?it/s]

USD_INR: 0it [00:00, ?it/s]

BTC/USD: 0it [00:00, ?it/s]

USD_GBP - Stored 17375 records to ArcticDB
GBP_CHF - Stored 17362 records to ArcticDB
USD_EUR - Stored 17340 records to ArcticDB
USD_AUD - Stored 17370 records to ArcticDB
USD_CHF - Stored 17323 records to ArcticDB
GBP_EUR - Stored 17343 records to ArcticDB
BTC - Stored 17416 records to ArcticDB
USD_JPY - Stored 17370 records to ArcticDB
EUR_CHF - Stored 17349 records to ArcticDB
USD_INR - Stored 17374 records to ArcticDB
USD_CAD - Stored 17320 records to ArcticDB
USD_CNY - Stored 17345 records to ArcticDB


In [9]:
print(ac['forex_live_auxiliary'].list_symbols())

['GBP_CHF', 'EUR_CHF', 'USD_INR', 'USD_EUR', 'GBP_EUR', 'USD_AUD', 'BTC', 'USD_CHF', 'USD_JPY', 'USD_CAD', 'USD_GBP', 'USD_CNY']


In [11]:
# Preview fetched data & array shape
all_symbols = ['USD_EUR', 'EUR_CHF', 'GBP_EUR', 'USD_GBP', 'GBP_CHF', 'USD_CHF', 'USD_CAD', 'USD_JPY', 'USD_INR', 'USD_CNY', 'USD_AUD', 'BTC']

for symbol in all_symbols:
    data = ac['forex_live_auxiliary'].read(symbol).data
    print(f"{symbol} - shape: {data.shape}")
    print(data[:5])
    print("-" * 40)


USD_EUR - shape: (17340, 4)
[[8.7865000e-01 8.7857000e-01 8.7861000e-01 1.7459752e+12]
 [8.7865000e-01 8.7857000e-01 8.7861000e-01 1.7459752e+12]
 [8.7865000e-01 8.7857000e-01 8.7861000e-01 1.7459752e+12]
 [8.7865000e-01 8.7857000e-01 8.7861000e-01 1.7459752e+12]
 [8.7865000e-01 8.7857000e-01 8.7861000e-01 1.7459752e+12]]
----------------------------------------
EUR_CHF - shape: (17349, 4)
[[9.37480000e-01 9.37300000e-01 9.37390000e-01 1.74597520e+12]
 [9.37480000e-01 9.37300000e-01 9.37390000e-01 1.74597520e+12]
 [9.37410000e-01 9.37270000e-01 9.37340000e-01 1.74597520e+12]
 [9.37450000e-01 9.37250000e-01 9.37350000e-01 1.74597520e+12]
 [9.37450000e-01 9.37250000e-01 9.37350000e-01 1.74597521e+12]]
----------------------------------------
GBP_EUR - shape: (17343, 4)
[[1.17764000e+00 1.17749000e+00 1.17756500e+00 1.74597520e+12]
 [1.17764000e+00 1.17749000e+00 1.17756500e+00 1.74597520e+12]
 [1.17764000e+00 1.17749000e+00 1.17756500e+00 1.74597520e+12]
 [1.17764000e+00 1.17749000e+00 1

## Helper Function to Load Raw Data from Auxiliary as DataFrame

In [4]:
def load_raw_data_as_df(symbol, lib):
    # load raw data from auxiliary ArcticDB library and return as dataframe for more readable and maintainable calculation
    raw = lib.read(symbol).data
    
    if symbol == 'BTC':
        df = pd.DataFrame(raw, columns=['price', 'size', 'timestamp'])
    else:
        df = pd.DataFrame(raw, columns=['ask', 'bid', 'mid', 'timestamp'])
    
    # sort by timestamp and ensure they are numeric
    df = df.sort_values(by='timestamp').reset_index(drop=True)
    df['timestamp'] = df['timestamp'].astype(np.int64)
    
    return df

raw_lib = ac['forex_live_auxiliary']
df_fx = load_raw_data_as_df('USD_EUR', raw_lib)
df_btc = load_raw_data_as_df('BTC', raw_lib)

print(df_fx.head())
print(df_btc.head())

       ask      bid      mid      timestamp
0  0.87865  0.87857  0.87861  1745975199000
1  0.87865  0.87857  0.87861  1745975199000
2  0.87865  0.87857  0.87861  1745975199000
3  0.87865  0.87857  0.87861  1745975199000
4  0.87865  0.87857  0.87861  1745975199000
      price      size      timestamp
0  94459.35  0.067115  1745975202725
1  94459.74  0.000474  1745975203913
2  94468.44  0.001103  1745975205046
3  94459.54  0.008145  1745975206205
4  94449.30  0.001000  1745975206987


## Feature Calculation

In [None]:
def count_crossings_vectorized(prices, lower_bands, upper_bands):
    p1, p2 = prices[:-1], prices[1:] # price at t & price at t + 1
    
    # expand dimensions: make prices and bands into 2D matrices for vectorized broadcasting
    p1_matrix, p2_matrix = p1[None, :], p2[None, :] # Shape (1, T-1)
    lower_matrix, upper_matrix = lower_bands[:, None], upper_bands[:, None] # Shape (B, 1)
    
    cross_up_lower = (p1_matrix < lower_matrix) & (lower_matrix < p2_matrix) # Cross up lower band
    cross_down_lower = (p1_matrix > lower_matrix) & (lower_matrix > p2_matrix) # Cross down lower band
    cross_up_upper = (p1_matrix < upper_matrix) & (upper_matrix < p2_matrix) # Cross up upper band
    cross_down_upper = (p1_matrix > upper_matrix) & (upper_matrix > p2_matrix) # Cross down upper band
    # The result is a B x (T−1) boolean matrix that shows which bands were crossed at which times.
    
    crossing = cross_down_lower| cross_down_upper | cross_up_lower | cross_up_upper
    return crossing.sum()

def compute_keltner_fd(df_window):
    # Compute normalized fd using vectorized crossing count across 1000 keltner bands
    
    if df_window.shape[0] < 3:
        return np.nan

    prices = df_window['mid'].values
    mean_price = prices.mean()
    
    n_range = np.arange(1, 1001)
    deviations = n_range * 0.00001 * mean_price
    
    upper_bands = mean_price + deviations
    lower_bands = mean_price - deviations

    return count_crossings_vectorized(prices, lower_bands, upper_bands)  # Normalize later if needed


In [6]:
# calculate features for a currency pair based on auxiliary data
def compute_features_for_cp(cp_name, ac, main_lib_name='forex_live_main', duration_minutes=None):
    raw_lib = ac['forex_live_auxiliary']
    main_lib = ac[main_lib_name]

    # load raw data
    cp_df = load_raw_data_as_df(cp_name, raw_lib)
    indicator_df = load_raw_data_as_df('USD_EUR', raw_lib) # use USD_EUR as indicator CP
    btc_df = load_raw_data_as_df('BTC', raw_lib)

    start_ts = max(cp_df['timestamp'].min(), btc_df['timestamp'].min())

    if duration_minutes:
        end_ts = start_ts + duration_minutes * 60_000  # ms
        cp_df = cp_df[cp_df['timestamp'] <= end_ts]
        btc_df = btc_df[btc_df['timestamp'] <= end_ts]
        indicator_df = indicator_df[indicator_df['timestamp'] <= end_ts]
    else:
        end_ts = min(cp_df['timestamp'].max(), btc_df['timestamp'].max(), indicator_df['timestamp'].max())

    interval_ms = 6 * 60 * 1000
    intervals = np.arange(start_ts, end_ts, interval_ms)

    raw_fd_list = []
    feature_rows = []
    global_max_vol = 0

    for i in range(len(intervals) - 1):
        t0, t1 = intervals[i], intervals[i + 1]
        
        cp_window = cp_df[(cp_df['timestamp'] >= t0) & (cp_df['timestamp'] < t1)]
        btc_window = btc_df[(btc_df['timestamp'] >= t0) & (btc_df['timestamp'] < t1)]
        indicator_window = indicator_df[(indicator_df['timestamp'] >= t0) & (indicator_df['timestamp'] < t1)]

        if len(cp_window) < 3:
            feature_rows.append([np.nan] * 7)
            raw_fd_list.append(np.nan)
            continue

        mid_prices = cp_window['mid'].values
        mean_price = mid_prices.mean()
        
        fd_raw = compute_keltner_fd(cp_window)  # raw count, unnormalized
        raw_fd_list.append(fd_raw)

        max_p = mid_prices.max()
        min_p = mid_prices.min()
        range_p = max_p - min_p

        norm_vol = (range_p / mean_price) if mean_price else 0
        global_max_vol = max(global_max_vol, norm_vol)
        norm_vol = (norm_vol / global_max_vol) if global_max_vol else 0

        if len(btc_window) > 1 and len(indicator_window) > 1:
            ref_ts = cp_window['timestamp'].values
            btc_interp = np.interp(ref_ts, btc_window['timestamp'].values, btc_window['price'].values)
            indicator_interp = np.interp(ref_ts, indicator_window['timestamp'].values, indicator_window['mid'].values)
            btc_corr = np.corrcoef(indicator_interp, btc_interp)[0, 1] # use USD_EUR and BTC price correlation as macro-econ indicator
        else:
            btc_corr = np.nan

        feature_rows.append([mean_price, norm_vol, None, max_p, min_p, range_p, btc_corr])  # None is placeholder for norm_fd

    # Normalize FD
    max_fd = max([fd for fd in raw_fd_list if not np.isnan(fd)], default=1)
    norm_fd_list = [fd / max_fd if not np.isnan(fd) else np.nan for fd in raw_fd_list]

    # Insert normalized FD into feature rows
    for i in range(len(feature_rows)):
        feature_rows[i][2] = norm_fd_list[i]  # index 2 is FD

    features = np.array(feature_rows)
    main_lib.write(cp_name, data=features)
    print(f"{cp_name} -> {features.shape}")
    return features


## Helper Function to Load Features from Main as DataFrame

In [7]:
def load_features_as_df(cps, ac, lib_name = 'forex_live_main'):
    lib = ac[lib_name]
    df_list = []
    
    for cp in cps:
        try:
            data = lib.read(cp).data
            df_cp = pd.DataFrame(data, columns=[
                'mean_price',
                'norm_vol',
                'norm_fd',
                'max_p',
                'min_p',
                'range_p',
                'btc_corr'
            ])
            df_cp['cp'] = cp
            df_list.append(df_cp)
        except Exception as e:
            print(f"Failed to load features for {cp}: {e}")
    
    return pd.concat(df_list, ignore_index=True)

## 4-Hour Regression & Classification

In [16]:
# feature calculations
for cp in all_CPs:
    try:
        compute_features_for_cp(cp, ac, duration_minutes=240)
    except Exception as e:
        print(f"⚠️ Error processing {cp}: {e}")

USD_EUR -> (39, 7)
USD_GBP -> (39, 7)
USD_CAD -> (39, 7)
USD_CHF -> (39, 7)
USD_AUD -> (39, 7)
EUR_CHF -> (39, 7)
GBP_EUR -> (39, 7)
GBP_CHF -> (39, 7)
USD_JPY -> (39, 7)
USD_INR -> (39, 7)
USD_CNY -> (39, 7)


In [17]:
# load features from main library with helper function
df_base = load_features_as_df(Regression_CPs, ac)

# split train & test df
train_df, test_df = train_test_split(df_base, test_size=0.3, random_state=42)

print(df_base.head())
print(train_df.head())
print(test_df.head())

   mean_price  norm_vol   norm_fd     max_p    min_p   range_p  btc_corr  \
0    0.878692  1.000000  0.505017  0.878850  0.87845  0.000400  0.548763   
1    0.878381  0.775275  0.448161  0.878560  0.87825  0.000310  0.345905   
2    0.878408  0.812763  0.366221  0.878525  0.87820  0.000325  0.563562   
3    0.878511  1.000000  0.608696  0.878770  0.87830  0.000470  0.238937   
4    0.878522  0.468079  0.511706  0.878620  0.87840  0.000220 -0.101534   

        cp  
0  USD_EUR  
1  USD_EUR  
2  USD_EUR  
3  USD_EUR  
4  USD_EUR  
     mean_price  norm_vol   norm_fd     max_p     min_p   range_p  btc_corr  \
38     0.879309  0.264675  0.832776  0.879425  0.879150  0.000275  0.073698   
31     0.879342  0.529329  0.749164  0.879550  0.879000  0.000550  0.353415   
173    1.562838  0.278157  0.740552  1.563335  1.562500  0.000835  0.908146   
12     0.879748  0.307831  0.449833  0.879920  0.879600  0.000320  0.531508   
109    1.383818  0.475105  0.813704  1.384150  1.383625  0.000525  0.4

In [18]:
# train regression model
vol_reg_exp = reg_setup(
    data=train_df,
    target='norm_vol',
    ignore_features=['norm_fd', 'cp'],  # Exclude unrelated columns
    session_id=42,
    verbose=False
)

vol_model_4h = reg_compare(fold=2)
vol_model_4h = reg_tune(vol_model_4h, fold=2)
save_reg_model(vol_model_4h, 'vol_model_4h')
vol_results = reg_pull()


# Setup and train regression model for norm_fd
fd_reg_exp = reg_setup(
    data=train_df,
    target='norm_fd',
    ignore_features=['norm_vol', 'cp'],
    session_id=42,
    verbose=False
)

fd_model_4h = reg_compare(fold=2)
fd_model_4h = reg_tune(fd_model_4h, fold=2)
save_reg_model(fd_model_4h, 'fd_model_4h')
fd_results = reg_pull()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.1048,0.0241,0.1554,0.5619,0.0961,0.2192,0.02
rf,Random Forest Regressor,0.1259,0.0308,0.1753,0.4366,0.1074,0.2498,0.025
ada,AdaBoost Regressor,0.1468,0.037,0.1922,0.3239,0.1205,0.3168,0.425
dt,Decision Tree Regressor,0.1128,0.0376,0.1935,0.3073,0.1194,0.2282,0.005
et,Extra Trees Regressor,0.1401,0.0389,0.1973,0.2934,0.1251,0.3145,0.02
lar,Least Angle Regression,0.1694,0.048,0.2189,0.1361,0.1368,0.3567,0.395
lightgbm,Light Gradient Boosting Machine,0.1715,0.0493,0.2219,0.1096,0.1389,0.3689,0.025
lr,Linear Regression,0.1701,0.0497,0.2227,0.1074,0.1378,0.3521,1.3
huber,Huber Regressor,0.1701,0.0545,0.2328,0.027,0.1443,0.3363,0.4
ridge,Ridge Regression,0.1894,0.0555,0.2353,0.0032,0.1484,0.4231,0.405


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0967,0.023,0.1518,0.6261,0.0911,0.1903
1,0.1501,0.0394,0.1986,0.2092,0.1266,0.3529
Mean,0.1234,0.0312,0.1752,0.4176,0.1088,0.2716
Std,0.0267,0.0082,0.0234,0.2084,0.0178,0.0813


Fitting 2 folds for each of 10 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
omp,Orthogonal Matching Pursuit,0.1563,0.0407,0.2009,0.0369,0.1307,0.3942,0.005
ridge,Ridge Regression,0.1615,0.0418,0.2034,0.0157,0.1317,0.3936,0.005
br,Bayesian Ridge,0.1591,0.0421,0.2045,0.0024,0.1324,0.3937,0.005
huber,Huber Regressor,0.164,0.0421,0.2046,0.0009,0.1328,0.4046,0.005
llar,Lasso Least Angle Regression,0.1584,0.0425,0.2054,-0.0063,0.1335,0.4028,0.025
dummy,Dummy Regressor,0.1584,0.0425,0.2054,-0.0063,0.1335,0.4028,0.005
en,Elastic Net,0.1584,0.0425,0.2054,-0.0063,0.1335,0.4028,0.005
lasso,Lasso Regression,0.1584,0.0425,0.2054,-0.0063,0.1335,0.4028,0.005
lightgbm,Light Gradient Boosting Machine,0.1625,0.0447,0.2098,-0.0434,0.1353,0.3936,0.02
lr,Linear Regression,0.1749,0.0486,0.2194,-0.1464,0.1416,0.4172,0.03


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1666,0.0474,0.2177,0.0399,0.1376,0.3899
1,0.146,0.0339,0.1842,0.034,0.1238,0.3985
Mean,0.1563,0.0407,0.2009,0.0369,0.1307,0.3942
Std,0.0103,0.0067,0.0167,0.003,0.0069,0.0043


Fitting 2 folds for each of 10 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Transformation Pipeline and Model Successfully Saved


In [29]:
# Predicting norm_vol and norm_fd on the test set
# load models to ensure they are in memory
vol_model_4h = load_reg_model('vol_model_4h')
fd_model_4h = load_reg_model('fd_model_4h')

# prepare features and drop target columns
vol_test_features = test_df.drop(columns=['norm_vol'], errors='ignore')
fd_test_features = test_df.drop(columns=['norm_fd'], errors='ignore')

# predict norm_vol and norm_fd 
test_df['pred_vol'] = reg_predict(vol_model_4h, data=vol_test_features)['prediction_label']
test_df['pred_fd'] = reg_predict(fd_model_4h, data=fd_test_features)['prediction_label']

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


In [30]:
# define function for forecastability classification
def classify_forecastability(vol, fd):
    if vol < 0.5 and fd < 0.5:
        return 'FORECASTABLE'
    elif vol > 0.5 and fd > 0.5:
        return 'NON-FORECASTABLE'
    else:
        return 'UNDEFINED'

In [31]:
# Label forecastability from regression output
test_df['forecastability'] = test_df.apply(
    lambda row: classify_forecastability(row['pred_vol'], row['pred_fd']),
    axis=1
)

In [32]:
test_df

Unnamed: 0,mean_price,norm_vol,norm_fd,max_p,min_p,range_p,btc_corr,cp,pred_vol,pred_fd,forecastability
138,0.823268,0.246043,0.128319,0.82332,0.8231,0.00022,-0.235164,USD_CHF,0.288587,0.640547,UNDEFINED
16,0.879794,0.389579,0.560201,0.880005,0.8796,0.000405,-0.208309,USD_EUR,0.50254,0.637882,NON-FORECASTABLE
155,0.823773,0.447075,1.0,0.824,0.8236,0.0004,0.060033,USD_CHF,0.474579,0.611252,UNDEFINED
96,1.384404,0.538225,0.591006,1.38475,1.384155,0.000595,-0.1017,USD_CAD,0.427173,0.627302,UNDEFINED
68,0.747109,0.391844,0.568019,0.747217,0.74695,0.000267,0.578981,USD_GBP,0.332713,0.559751,UNDEFINED
153,0.823674,0.447129,0.9107,0.8239,0.8235,0.0004,0.381771,USD_CHF,0.496715,0.579322,UNDEFINED
55,0.746847,0.323438,0.615752,0.74697,0.74675,0.00022,-0.30533,USD_GBP,0.292996,0.64751,UNDEFINED
15,0.879744,0.307833,0.441472,0.87992,0.8796,0.00032,0.670421,USD_EUR,0.31434,0.550676,UNDEFINED
112,1.383537,0.362059,0.531049,1.38385,1.38345,0.0004,-0.72266,USD_CAD,0.398226,0.688926,UNDEFINED
111,1.383666,0.339398,0.749465,1.38395,1.383575,0.000375,0.064718,USD_CAD,0.328905,0.610787,UNDEFINED


In [33]:
# Train a classifier using base CPs' predicted labels
train_df['pred_vol'] = reg_predict(vol_model_4h, data=train_df)['prediction_label']
train_df['pred_fd'] = reg_predict(fd_model_4h, data=train_df)['prediction_label']
train_df['forecastability'] = train_df.apply(
    lambda row: classify_forecastability(row['pred_vol'], row['pred_fd']),
    axis=1
)

# filter only defined forecastability classes
cls_train_df = train_df[train_df['forecastability'].isin(['FORECASTABLE', 'NON-FORECASTABLE'])].copy()


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,0.0356,0.0055,0.0741,0.8997,0.0442,0.069


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Orthogonal Matching Pursuit,0.1535,0.0397,0.1993,0.0547,0.1317,0.4187


In [34]:
# PyCaret classification model setup
cls_exp = cls_setup(
    data=cls_train_df,
    target='forecastability',
    ignore_features=['norm_vol', 'norm_fd', 'cp'],
    session_id=42,
    verbose=False
)

cls_model_4h = cls_compare(fold=2)
cls_model_4h = cls_tune(cls_model_4h, fold=2)
save_cls_model(cls_model_4h, 'forecastability_classifier_4h')
cls_result = cls_pull()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,1.0,0.0,1.0,1.0,1.0,,0.0,0.45
nb,Naive Bayes,1.0,0.0,1.0,1.0,1.0,,0.0,0.405
dt,Decision Tree Classifier,1.0,0.0,1.0,1.0,1.0,,0.0,0.005
ridge,Ridge Classifier,1.0,0.0,1.0,1.0,1.0,,0.0,0.405
rf,Random Forest Classifier,1.0,0.0,1.0,1.0,1.0,,0.0,0.03
ada,Ada Boost Classifier,1.0,0.0,1.0,1.0,1.0,,0.0,0.01
lda,Linear Discriminant Analysis,1.0,0.0,1.0,1.0,1.0,,0.0,0.01
et,Extra Trees Classifier,1.0,0.0,1.0,1.0,1.0,,0.0,0.025
lightgbm,Light Gradient Boosting Machine,1.0,0.0,1.0,1.0,1.0,,0.0,0.01
dummy,Dummy Classifier,1.0,0.0,1.0,1.0,1.0,,0.0,0.005


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,0.0,1.0,1.0,1.0,,0.0
1,1.0,0.0,1.0,1.0,1.0,,0.0
Mean,1.0,0.0,1.0,1.0,1.0,,0.0
Std,0.0,0.0,0.0,0.0,0.0,,0.0


Fitting 2 folds for each of 10 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Transformation Pipeline and Model Successfully Saved


In [36]:
cls_train_df

Unnamed: 0,mean_price,norm_vol,norm_fd,max_p,min_p,range_p,btc_corr,cp,pred_vol,pred_fd,forecastability
31,0.879342,0.529329,0.749164,0.87955,0.879,0.00055,0.353415,USD_EUR,0.540128,0.582136,NON-FORECASTABLE
41,0.745988,0.775168,0.386635,0.74606,0.745905,0.000155,0.581607,USD_GBP,0.77609,0.55949,NON-FORECASTABLE
128,0.824061,0.681552,0.209976,0.824345,0.823735,0.00061,-0.517973,USD_CHF,0.683339,0.668613,NON-FORECASTABLE
2,0.878408,0.812763,0.366221,0.878525,0.8782,0.000325,0.563562,USD_EUR,0.667626,0.561281,NON-FORECASTABLE
46,0.746196,0.54737,0.797136,0.7463,0.74607,0.00023,0.812334,USD_GBP,0.548154,0.536593,NON-FORECASTABLE
137,0.823407,0.575866,0.168142,0.8237,0.823185,0.000515,0.500332,USD_CHF,0.575667,0.567556,NON-FORECASTABLE
145,0.823837,0.670561,0.844328,0.8241,0.8235,0.0006,-0.326899,USD_CHF,0.653405,0.649651,NON-FORECASTABLE
126,0.824295,0.681359,0.228077,0.82461,0.824,0.00061,-0.358474,USD_CHF,0.715822,0.652784,NON-FORECASTABLE
147,0.823868,0.55878,0.779163,0.8241,0.8236,0.0005,-0.32543,USD_CHF,0.556804,0.649505,NON-FORECASTABLE
11,0.879741,0.50504,0.523411,0.879925,0.8794,0.000525,-0.604039,USD_EUR,0.5293,0.677154,NON-FORECASTABLE


In [42]:
# Step 1: Load the classification CPs' features
df_class = load_features_as_df(Classification_CPs, ac)

# Step 2: Drop rows with missing values before predicting
df_class = df_class.dropna().reset_index(drop=True)

# Step 3: Predict pred_vol and pred_fd using the trained regression models
df_class['pred_vol'] = reg_predict(vol_model_4h, data=df_class.drop(columns=['norm_vol', 'cp'], errors='ignore'))['prediction_label']
df_class['pred_fd'] = reg_predict(fd_model_4h, data=df_class.drop(columns=['norm_fd', 'cp'], errors='ignore'))['prediction_label']

# Step 4: Use full set of features used in training
cls_features = df_class[['mean_price', 'max_p', 'min_p', 'range_p', 'btc_corr', 'pred_vol', 'pred_fd']]

# Step 5: Predict
prediction_results = cls_predict(cls_model_4h, data=cls_features)
df_class['forecastability'] = prediction_results['prediction_label']


In [43]:
summary = df_class.groupby('cp')['forecastability'].value_counts().unstack(fill_value=0)
summary['majority_class'] = df_class.groupby('cp')['forecastability'].agg(lambda x: x.value_counts().idxmax())
summary.reset_index(inplace=True)
print(summary)

forecastability       cp  NON-FORECASTABLE    majority_class
0                EUR_CHF                39  NON-FORECASTABLE
1                GBP_CHF                39  NON-FORECASTABLE
2                GBP_EUR                39  NON-FORECASTABLE
3                USD_INR                39  NON-FORECASTABLE
4                USD_JPY                39  NON-FORECASTABLE


## 5-Hour Regression & Classification

In [8]:
# feature calculations
for cp in all_CPs:
    try:
        compute_features_for_cp(cp, ac, duration_minutes=300)
    except Exception as e:
        print(f"⚠️ Error processing {cp}: {e}")

USD_EUR -> (49, 7)
USD_GBP -> (49, 7)
USD_CAD -> (49, 7)
USD_CHF -> (49, 7)
USD_AUD -> (49, 7)
EUR_CHF -> (49, 7)
GBP_EUR -> (49, 7)
GBP_CHF -> (49, 7)
USD_JPY -> (49, 7)
USD_INR -> (49, 7)
USD_CNY -> (49, 7)


In [9]:
# load features from main library with helper function
df_base = load_features_as_df(Regression_CPs, ac)

# split train & test df
train_df, test_df = train_test_split(df_base, test_size=0.3, random_state=42)

print(df_base.head())
print(train_df.head())
print(test_df.head())

   mean_price  norm_vol   norm_fd     max_p    min_p   range_p  btc_corr  \
0    0.878692  1.000000  0.468944  0.878850  0.87845  0.000400  0.548763   
1    0.878381  0.775275  0.416149  0.878560  0.87825  0.000310  0.345905   
2    0.878408  0.812763  0.340062  0.878525  0.87820  0.000325  0.563562   
3    0.878511  1.000000  0.565217  0.878770  0.87830  0.000470  0.238937   
4    0.878522  0.468079  0.475155  0.878620  0.87840  0.000220 -0.101534   

        cp  
0  USD_EUR  
1  USD_EUR  
2  USD_EUR  
3  USD_EUR  
4  USD_EUR  
     mean_price  norm_vol   norm_fd     max_p     min_p   range_p  btc_corr  \
235    1.559415  0.243570  0.535610  1.559819  1.559090  0.000730  0.300593   
228    1.560498  0.406174  0.627180  1.561037  1.559819  0.001217 -0.339099   
181    0.823434  0.585057  0.704674  0.823700  0.823177  0.000523 -0.731885   
5      0.878374  0.702237  0.559006  0.878530  0.878200  0.000330  0.406156   
56     0.746196  0.547370  0.797136  0.746300  0.746070  0.000230  0.8

In [10]:
# train regression model
vol_reg_exp = reg_setup(
    data=train_df,
    target='norm_vol',
    ignore_features=['norm_fd', 'cp'],  # Exclude unrelated columns
    session_id=42,
    verbose=False
)

vol_model_5h = reg_compare(fold=2)
vol_model_5h = reg_tune(vol_model_5h, fold=2)
save_reg_model(vol_model_5h, 'vol_model_5h')
vol_results = reg_pull()


# Setup and train regression model for norm_fd
fd_reg_exp = reg_setup(
    data=train_df,
    target='norm_fd',
    ignore_features=['norm_vol', 'cp'],
    session_id=42,
    verbose=False
)

fd_model_5h = reg_compare(fold=2)
fd_model_5h = reg_tune(fd_model_5h, fold=2)
save_reg_model(fd_model_5h, 'fd_model_5h')
fd_results = reg_pull()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.1018,0.0248,0.1572,0.5628,0.0957,0.2077,0.415
rf,Random Forest Regressor,0.127,0.0293,0.1703,0.4857,0.1052,0.2628,0.04
gbr,Gradient Boosting Regressor,0.1206,0.0295,0.1714,0.4806,0.1053,0.2426,0.015
ada,AdaBoost Regressor,0.1519,0.0331,0.181,0.4197,0.1138,0.336,0.015
lightgbm,Light Gradient Boosting Machine,0.1732,0.0485,0.2202,0.1414,0.1381,0.3818,0.02
lr,Linear Regression,0.1781,0.0507,0.225,0.105,0.1429,0.3998,0.995
dt,Decision Tree Regressor,0.1384,0.0526,0.2294,0.0678,0.1414,0.2892,0.005
huber,Huber Regressor,0.1715,0.0554,0.2347,0.0258,0.1464,0.3498,0.005
ridge,Ridge Regression,0.1929,0.0581,0.2409,-0.026,0.1533,0.4488,0.425
br,Bayesian Ridge,0.1955,0.0593,0.2435,-0.0492,0.1546,0.4521,0.015


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1828,0.0467,0.216,0.1261,0.1402,0.4698
1,0.1645,0.0508,0.2253,0.1499,0.1381,0.315
Mean,0.1737,0.0487,0.2207,0.138,0.1391,0.3924
Std,0.0091,0.002,0.0046,0.0119,0.001,0.0774


Fitting 2 folds for each of 10 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.1477,0.0362,0.1892,0.1491,0.1266,0.4448,0.015
rf,Random Forest Regressor,0.1409,0.0363,0.1894,0.1477,0.1266,0.4315,0.03
lightgbm,Light Gradient Boosting Machine,0.1479,0.0382,0.1945,0.0987,0.1303,0.4588,0.02
ada,AdaBoost Regressor,0.1482,0.0385,0.1951,0.0946,0.1289,0.4367,0.015
et,Extra Trees Regressor,0.143,0.0387,0.196,0.0842,0.1278,0.3959,0.05
knn,K Neighbors Regressor,0.1539,0.0421,0.2032,0.0208,0.1348,0.4529,0.005
huber,Huber Regressor,0.1556,0.0419,0.204,0.0072,0.1364,0.4881,0.005
omp,Orthogonal Matching Pursuit,0.1576,0.0419,0.2041,0.0052,0.1355,0.4726,0.005
br,Bayesian Ridge,0.1565,0.0425,0.2054,-0.006,0.1368,0.4825,0.005
lr,Linear Regression,0.1575,0.0423,0.2052,-0.0063,0.1366,0.4811,0.03


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1323,0.027,0.1642,0.2325,0.1109,0.388
1,0.1471,0.0359,0.1895,0.2717,0.1231,0.3831
Mean,0.1397,0.0314,0.1769,0.2521,0.117,0.3856
Std,0.0074,0.0045,0.0126,0.0196,0.0061,0.0024


Fitting 2 folds for each of 10 candidates, totalling 20 fits
Transformation Pipeline and Model Successfully Saved


In [11]:
# Predicting norm_vol and norm_fd on the test set
# load models to ensure they are in memory
vol_model_5h = load_reg_model('vol_model_5h')
fd_model_5h = load_reg_model('fd_model_5h')

# prepare features and drop target columns
vol_test_features = test_df.drop(columns=['norm_vol'], errors='ignore')
fd_test_features = test_df.drop(columns=['norm_fd'], errors='ignore')

# predict norm_vol and norm_fd 
test_df['pred_vol'] = reg_predict(vol_model_5h, data=vol_test_features)['prediction_label']
test_df['pred_fd'] = reg_predict(fd_model_5h, data=fd_test_features)['prediction_label']

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


In [12]:
# define function for forecastability classification
def classify_forecastability(vol, fd):
    if vol < 0.5 and fd < 0.5:
        return 'FORECASTABLE'
    elif vol > 0.5 and fd > 0.5:
        return 'NON-FORECASTABLE'
    else:
        return 'UNDEFINED'

In [13]:
# Label forecastability from regression output
test_df['forecastability'] = test_df.apply(
    lambda row: classify_forecastability(row['pred_vol'], row['pred_fd']),
    axis=1
)

In [14]:
# Train a classifier using base CPs' predicted labels
train_df['pred_vol'] = reg_predict(vol_model_5h, data=train_df)['prediction_label']
train_df['pred_fd'] = reg_predict(fd_model_5h, data=train_df)['prediction_label']
train_df['forecastability'] = train_df.apply(
    lambda row: classify_forecastability(row['pred_vol'], row['pred_fd']),
    axis=1
)

# filter only defined forecastability classes
cls_train_df = train_df[train_df['forecastability'].isin(['FORECASTABLE', 'NON-FORECASTABLE'])].copy()


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,0.0215,0.0053,0.0731,0.8951,0.0433,0.0392


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,0.0901,0.0189,0.1374,0.5419,0.0908,0.2585


In [15]:
# PyCaret classification model setup
cls_exp = cls_setup(
    data=cls_train_df,
    target='forecastability',
    ignore_features=['norm_vol', 'norm_fd', 'cp'],
    session_id=42,
    verbose=False
)

cls_model_5h = cls_compare(fold=2)
cls_model_5h = cls_tune(cls_model_5h, fold=2)
save_cls_model(cls_model_5h, 'forecastability_classifier_4h')
cls_result = cls_pull()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.035
gbc,Gradient Boosting Classifier,0.9815,1.0,0.9815,0.9825,0.9813,0.9595,0.961,0.015
dt,Decision Tree Classifier,0.9808,0.9853,0.9808,0.9827,0.981,0.9586,0.9602,0.005
ada,Ada Boost Classifier,0.9808,0.9853,0.9808,0.9827,0.981,0.9586,0.9602,0.01
et,Extra Trees Classifier,0.9808,1.0,0.9808,0.9827,0.981,0.9586,0.9602,0.025
lda,Linear Discriminant Analysis,0.9623,0.9739,0.9623,0.9659,0.9626,0.9197,0.9227,0.01
ridge,Ridge Classifier,0.9615,0.9967,0.9615,0.9615,0.9615,0.915,0.915,0.025
svm,SVM - Linear Kernel,0.9423,0.9673,0.9423,0.9567,0.9436,0.8818,0.893,0.005
nb,Naive Bayes,0.9252,1.0,0.9252,0.9355,0.9228,0.8316,0.8459,0.01
knn,K Neighbors Classifier,0.8291,0.8962,0.8291,0.8291,0.8284,0.6235,0.6247,0.01


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Mean,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Std,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Fitting 2 folds for each of 10 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Transformation Pipeline and Model Successfully Saved


In [17]:
# Step 1: Load the classification CPs' features
df_class = load_features_as_df(Classification_CPs, ac)

# Step 2: Drop rows with missing values before predicting
df_class = df_class.dropna().reset_index(drop=True)

# Step 3: Predict pred_vol and pred_fd using the trained regression models
df_class['pred_vol'] = reg_predict(vol_model_5h, data=df_class.drop(columns=['norm_vol', 'cp'], errors='ignore'))['prediction_label']
df_class['pred_fd'] = reg_predict(fd_model_5h, data=df_class.drop(columns=['norm_fd', 'cp'], errors='ignore'))['prediction_label']

# Step 4: Use full set of features used in training
cls_features = df_class[['mean_price', 'max_p', 'min_p', 'range_p', 'btc_corr', 'pred_vol', 'pred_fd']]

# Step 5: Predict
prediction_results = cls_predict(cls_model_5h, data=cls_features)
df_class['forecastability'] = prediction_results['prediction_label']


In [18]:
summary = df_class.groupby('cp')['forecastability'].value_counts().unstack(fill_value=0)
summary['majority_class'] = df_class.groupby('cp')['forecastability'].agg(lambda x: x.value_counts().idxmax())
summary.reset_index(inplace=True)
print(summary)

forecastability       cp  FORECASTABLE  NON-FORECASTABLE    majority_class
0                EUR_CHF             7                42  NON-FORECASTABLE
1                GBP_CHF             0                49  NON-FORECASTABLE
2                GBP_EUR             0                49  NON-FORECASTABLE
3                USD_INR             0                49  NON-FORECASTABLE
4                USD_JPY             0                49  NON-FORECASTABLE


## Export ArcticDB Libraries to .CSV files

In [20]:
# Export destination
EXPORT_DIR = "/Users/zway/Desktop/NYU/Data Engineering/Assignment/Final/exports"
os.makedirs(EXPORT_DIR, exist_ok=True)

# Final list of symbols to export
raw_symbols = all_CPs + ['BTC']
feature_symbols = all_CPs

# Export function
def export_library_to_csv(ac, library_name, symbol_list):
    lib = ac[library_name]

    for symbol in symbol_list:
        try:
            arr = lib.read(symbol).data
            df = pd.DataFrame(arr)

            # Assign column names
            if library_name == 'forex_live_auxiliary':
                if symbol == 'BTC':
                    df.columns = ['price', 'size', 'timestamp']
                else:
                    df.columns = ['ask', 'bid', 'mid', 'timestamp']
            elif library_name == 'forex_live_main':
                df.columns = ['mean_price', 'norm_vol', 'norm_fd', 'max_p', 'min_p', 'range_p', 'btc_corr']

            # Save CSV
            filename = f"{library_name}_{symbol}.csv"
            filepath = os.path.join(EXPORT_DIR, filename)
            df.to_csv(filepath, index=False)
            print(f"✅ Exported {symbol} → {filepath}")

        except Exception as e:
            print(f"⚠️ Error exporting {symbol} from {library_name}: {e}")

# Run exports for Part 1
export_library_to_csv(ac, aux_lib, raw_symbols)
export_library_to_csv(ac, main_lib, feature_symbols)

✅ Exported USD_EUR → /Users/zway/Desktop/NYU/Data Engineering/Assignment/Final/exports/forex_live_auxiliary_USD_EUR.csv
✅ Exported USD_GBP → /Users/zway/Desktop/NYU/Data Engineering/Assignment/Final/exports/forex_live_auxiliary_USD_GBP.csv
✅ Exported USD_CAD → /Users/zway/Desktop/NYU/Data Engineering/Assignment/Final/exports/forex_live_auxiliary_USD_CAD.csv
✅ Exported USD_CHF → /Users/zway/Desktop/NYU/Data Engineering/Assignment/Final/exports/forex_live_auxiliary_USD_CHF.csv
✅ Exported USD_AUD → /Users/zway/Desktop/NYU/Data Engineering/Assignment/Final/exports/forex_live_auxiliary_USD_AUD.csv
✅ Exported EUR_CHF → /Users/zway/Desktop/NYU/Data Engineering/Assignment/Final/exports/forex_live_auxiliary_EUR_CHF.csv
✅ Exported GBP_EUR → /Users/zway/Desktop/NYU/Data Engineering/Assignment/Final/exports/forex_live_auxiliary_GBP_EUR.csv
✅ Exported GBP_CHF → /Users/zway/Desktop/NYU/Data Engineering/Assignment/Final/exports/forex_live_auxiliary_GBP_CHF.csv
✅ Exported USD_JPY → /Users/zway/Desktop

## Part 2 L/S Strategy

In [1]:
# Part 2 Configuration
import numpy as np
import pandas as pd
from polygon import RESTClient
import arcticdb as adb
import threading
import time
from datetime import datetime, timedelta, timezone
from tqdm.auto import tqdm


In [2]:
# Configuration
API_KEY = "beBybSi8daPgsTp5yx5cHtHpYcrjp5Jq"
client = RESTClient(API_KEY)

PART2_DB_PATH = '/Users/zway/Desktop/NYU/Data Engineering/Assignment/Final/data'
PART2_LIB_NAME = 'forex_live_part2'
PART2_CPs = ['GBP_USD', 'USD_JPY']

In [3]:
# Set up ArcticDB
ac2 = adb.Arctic(f"lmdb://{PART2_DB_PATH}")
if not ac2.has_library(PART2_LIB_NAME):
    ac2.create_library(PART2_LIB_NAME)
lib2 = ac2[PART2_LIB_NAME]

In [100]:
def fetch_forex_conversion(cp, end_time):
    from_currency, to_currency = cp.split('_')
    records = []

    print(f"Fetching {cp} via currency conversion...")

    while datetime.now() < end_time:
        try:
            response = client.get_real_time_currency_conversion(from_currency, to_currency, amount=1, precision=6)
            quote = response.last

            ask = float(quote.ask)
            bid = float(quote.bid)
            mid = (ask + bid) / 2
            ts = int(quote.timestamp)

            # Scale USD/JPY
            if cp == 'USD_JPY':
                ask, bid, mid = ask / 100, bid / 100, mid / 100

            record = np.array([ask, bid, mid, ts])
            records.append(record)

            print(f"[{cp}] {datetime.now().strftime('%H:%M:%S')} tick recorded.")

        except Exception as e:
            print(f"Error fetching {cp}: {e}")

        time.sleep(1)

    if records:
        lib2.write(symbol=cp, data=np.stack(records))
        print(f"{cp} - Stored {len(records)} records to ArcticDB")


In [None]:
def fetch_one_hour_data_part2():
    end_time = datetime.now() + timedelta(minutes=10)
    threads = []

    for cp in PART2_CPs:
        thread = threading.Thread(target=fetch_forex_conversion, args=(cp, end_time), daemon=True)
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    print("✅ Completed 1-hr data collection.")


# 1st Hour

In [89]:
fetch_one_hour_data_part2()

🔁 Fetching GBP_USD via currency conversion...
🔁 Fetching USD_JPY via currency conversion...
[USD_JPY] 16:11:38 tick recorded.
[GBP_USD] 16:11:38 tick recorded.
[USD_JPY] 16:11:39 tick recorded.
[GBP_USD] 16:11:39 tick recorded.
[GBP_USD] 16:11:41 tick recorded.
[USD_JPY] 16:11:41 tick recorded.
[GBP_USD] 16:11:42 tick recorded.
[USD_JPY] 16:11:42 tick recorded.
[GBP_USD] 16:11:43 tick recorded.
[USD_JPY] 16:11:43 tick recorded.
[GBP_USD] 16:11:44 tick recorded.
[USD_JPY] 16:11:44 tick recorded.
[GBP_USD] 16:11:45 tick recorded.
[USD_JPY] 16:11:45 tick recorded.
[GBP_USD] 16:11:46 tick recorded.
[USD_JPY] 16:11:46 tick recorded.
[GBP_USD] 16:11:47 tick recorded.
[USD_JPY] 16:11:47 tick recorded.
[GBP_USD] 16:11:48 tick recorded.
[USD_JPY] 16:11:48 tick recorded.
[GBP_USD] 16:11:49 tick recorded.
[USD_JPY] 16:11:49 tick recorded.
[GBP_USD] 16:11:50 tick recorded.
[USD_JPY] 16:11:50 tick recorded.
[GBP_USD] 16:11:51 tick recorded.
[USD_JPY] 16:11:51 tick recorded.
[GBP_USD] 16:11:52 tick 

In [4]:
def load_and_adjust_data(cp_name, lib):
    # Load raw data from ArcticDB
    raw = lib.read(cp_name).data
    df = pd.DataFrame(raw, columns=['ask', 'bid', 'mid', 'timestamp'])

    # Sort by timestamp
    df = df.sort_values(by='timestamp').reset_index(drop=True)

    return df

In [5]:
ls_log = []  # Global list to track each L/S trade round

In [16]:
for cp in ['GBP_USD', 'USD_JPY']:
    df = load_and_adjust_data(cp, lib2)
    print(f"\n{cp} Data Summary:")
    print(df[['mid', 'timestamp']].head())
    print(df[['mid']].describe())
    print(f"Unique mid values: {df['mid'].nunique()}")
    print(f"Total rows: {len(df)}")


GBP_USD Data Summary:
        mid     timestamp
0  0.753525  1.746218e+12
1  0.753525  1.746218e+12
2  0.753525  1.746218e+12
3  0.753525  1.746218e+12
4  0.753550  1.746218e+12
              mid
count  581.000000
mean     0.753710
std      0.000112
min      0.753450
25%      0.753650
50%      0.753705
75%      0.753815
max      0.753950
Unique mid values: 70
Total rows: 581

USD_JPY Data Summary:
        mid  timestamp
0  0.577075   1.450075
1  1.000000   1.450080
            mid
count  2.000000
mean   0.788538
std    0.299053
min    0.577075
25%    0.682806
50%    0.788538
75%    0.894269
max    1.000000
Unique mid values: 2
Total rows: 2


In [7]:
df_gbp = load_and_adjust_data('GBP_USD', lib2)
print("GBP_USD timestamp range:")
print(pd.to_datetime(df_gbp['timestamp'], unit='ms').describe())

GBP_USD timestamp range:
count                              581
mean     2025-05-02 20:38:03.700516352
min                2025-05-02 20:33:06
25%                2025-05-02 20:35:31
50%                2025-05-02 20:37:57
75%                2025-05-02 20:40:32
max                2025-05-02 20:43:01
Name: timestamp, dtype: object


In [12]:
df_gbp = load_and_adjust_data('USD_JPY', lib2)
print("USD_JPY timestamp range:")
print(pd.to_datetime(df_gbp['timestamp'], unit='ms').describe())

USD_JPY timestamp range:
count                                2
mean     1970-01-01 00:00:00.001450077
min      1970-01-01 00:00:00.001450075
25%      1970-01-01 00:00:00.001450076
50%      1970-01-01 00:00:00.001450077
75%      1970-01-01 00:00:00.001450078
max      1970-01-01 00:00:00.001450079
Name: timestamp, dtype: object


In [97]:
from sklearn.linear_model import LinearRegression

# Step 1: Load the transformed features for 1-hour
df_gbp = load_and_adjust_data('GBP_USD', lib2)
df_jpy = load_and_adjust_data('USD_JPY', lib2)

# Step 2: Compute trend (slope) on mid prices using sklearn
def compute_slope_mid(series):
    X = np.arange(len(series)).reshape(-1, 1)
    y = series.values
    model = LinearRegression().fit(X, y)
    return model.coef_[0]

slope_gbp = compute_slope_mid(df_gbp['mid'])  
slope_jpy = compute_slope_mid(df_jpy['mid'])

print(f"Slope GBP/USD: {slope_gbp:.6f}")
print(f"Slope USD/JPY: {slope_jpy:.6f}")

# Step 3: Decide Long/Short
if slope_gbp > slope_jpy:
    print("Long GBP/USD, Short USD/JPY")
else:
    print("Long USD/JPY, Short GBP/USD")


Slope GBP/USD: 0.000000
Slope USD/JPY: -0.000001
Long GBP/USD, Short USD/JPY


In [98]:
# Step 4: Determine entry/exit prices and compute PnL
gbp_mid_prices = df_gbp['mid'].values
jpy_mid_prices = df_jpy['mid'].values

# Ensure equal length
n = min(len(gbp_mid_prices), len(jpy_mid_prices))
gbp_mid_prices = gbp_mid_prices[:n]
jpy_mid_prices = jpy_mid_prices[:n]

# Determine entry/exit based on slope direction
if slope_gbp > slope_jpy:
    long_cp, short_cp = 'GBP_USD', 'USD_JPY'
    long_entry, long_exit = gbp_mid_prices[0], gbp_mid_prices[-1]
    short_entry, short_exit = jpy_mid_prices[0], jpy_mid_prices[-1]
else:
    long_cp, short_cp = 'USD_JPY', 'GBP_USD'
    long_entry, long_exit = jpy_mid_prices[0], jpy_mid_prices[-1]
    short_entry, short_exit = gbp_mid_prices[0], gbp_mid_prices[-1]

# Assume $100 total capital per step, split equally
capital_per_leg = 100 / 2

# Compute PnL for both legs
pnl_long = (long_exit - long_entry) / long_entry * capital_per_leg
pnl_short = (short_entry - short_exit) / short_entry * capital_per_leg
net_pnl = pnl_long + pnl_short

# Log the trade
ls_log.append({
    'step': len(ls_log) + 1,
    'long_cp': long_cp,
    'short_cp': short_cp,
    'long_entry': long_entry,
    'long_exit': long_exit,
    'short_entry': short_entry,
    'short_exit': short_exit,
    'pnl_long': pnl_long,
    'pnl_short': pnl_short,
    'net_pnl': net_pnl
})

# Output the result
print(f"\n📈 Long: {long_cp} | Entry: {long_entry:.5f} → Exit: {long_exit:.5f} | PnL: ${pnl_long:.2f}")
print(f"📉 Short: {short_cp} | Entry: {short_entry:.5f} → Exit: {short_exit:.5f} | PnL: ${pnl_short:.2f}")
print(f"💰 Net PnL for Step {len(ls_log)}: ${net_pnl:.2f}")



📈 Long: GBP_USD | Entry: 0.75324 → Exit: 0.75344 | PnL: $0.01
📉 Short: USD_JPY | Entry: 1.45029 → Exit: 1.45005 | PnL: $0.01
💰 Net PnL for Step 1: $0.02


# 2nd Hour

In [99]:
# fetch #2 hour
fetch_one_hour_data_part2()

🔁 Fetching GBP_USD via currency conversion...
🔁 Fetching USD_JPY via currency conversion...
[GBP_USD] 16:22:46 tick recorded.
[USD_JPY] 16:22:46 tick recorded.
[GBP_USD] 16:22:47 tick recorded.
[USD_JPY] 16:22:47 tick recorded.
[GBP_USD] 16:22:48 tick recorded.
[USD_JPY] 16:22:48 tick recorded.
[GBP_USD] 16:22:49 tick recorded.
[USD_JPY] 16:22:49 tick recorded.
[GBP_USD] 16:22:50 tick recorded.
[USD_JPY] 16:22:50 tick recorded.
[GBP_USD] 16:22:51 tick recorded.
[USD_JPY] 16:22:51 tick recorded.
[GBP_USD] 16:22:52 tick recorded.
[USD_JPY] 16:22:52 tick recorded.
[GBP_USD] 16:22:53 tick recorded.
[USD_JPY] 16:22:53 tick recorded.
[GBP_USD] 16:22:54 tick recorded.
[USD_JPY] 16:22:54 tick recorded.
[GBP_USD] 16:22:55 tick recorded.
[USD_JPY] 16:22:55 tick recorded.
[GBP_USD] 16:22:56 tick recorded.
[USD_JPY] 16:22:56 tick recorded.
[GBP_USD] 16:22:57 tick recorded.
[USD_JPY] 16:22:57 tick recorded.
[GBP_USD] 16:22:58 tick recorded.
[USD_JPY] 16:22:58 tick recorded.
[GBP_USD] 16:22:59 tick 

In [101]:
# Step 1: Load the transformed features for 1-hour
df_gbp = load_and_adjust_data('GBP_USD', lib2)
df_jpy = load_and_adjust_data('USD_JPY', lib2)

# Step 2: Compute trend (slope) on mid prices using sklearn
def compute_slope_mid(series):
    X = np.arange(len(series)).reshape(-1, 1)
    y = series.values
    model = LinearRegression().fit(X, y)
    return model.coef_[0]

slope_gbp = compute_slope_mid(df_gbp['mid'])  
slope_jpy = compute_slope_mid(df_jpy['mid'])

print(f"Slope GBP/USD: {slope_gbp:.6f}")
print(f"Slope USD/JPY: {slope_jpy:.6f}")

# Step 3: Decide Long/Short
if slope_gbp > slope_jpy:
    print("Long GBP/USD, Short USD/JPY")
else:
    print("Long USD/JPY, Short GBP/USD")

Slope GBP/USD: -0.000000
Slope USD/JPY: -0.000000
Long GBP/USD, Short USD/JPY


In [102]:
# Step 4: Determine entry/exit prices and compute PnL
gbp_mid_prices = df_gbp['mid'].values
jpy_mid_prices = df_jpy['mid'].values

# Ensure equal length
n = min(len(gbp_mid_prices), len(jpy_mid_prices))
gbp_mid_prices = gbp_mid_prices[:n]
jpy_mid_prices = jpy_mid_prices[:n]

# Determine entry/exit based on slope direction
if slope_gbp > slope_jpy:
    long_cp, short_cp = 'GBP_USD', 'USD_JPY'
    long_entry, long_exit = gbp_mid_prices[0], gbp_mid_prices[-1]
    short_entry, short_exit = jpy_mid_prices[0], jpy_mid_prices[-1]
else:
    long_cp, short_cp = 'USD_JPY', 'GBP_USD'
    long_entry, long_exit = jpy_mid_prices[0], jpy_mid_prices[-1]
    short_entry, short_exit = gbp_mid_prices[0], gbp_mid_prices[-1]

# Assume $100 total capital per step, split equally
capital_per_leg = 100 / 2

# Compute PnL for both legs
pnl_long = (long_exit - long_entry) / long_entry * capital_per_leg
pnl_short = (short_entry - short_exit) / short_entry * capital_per_leg
net_pnl = pnl_long + pnl_short

# Log the trade
ls_log.append({
    'step': len(ls_log) + 1,
    'long_cp': long_cp,
    'short_cp': short_cp,
    'long_entry': long_entry,
    'long_exit': long_exit,
    'short_entry': short_entry,
    'short_exit': short_exit,
    'pnl_long': pnl_long,
    'pnl_short': pnl_short,
    'net_pnl': net_pnl
})

# Output the result
print(f"\n📈 Long: {long_cp} | Entry: {long_entry:.5f} → Exit: {long_exit:.5f} | PnL: ${pnl_long:.2f}")
print(f"📉 Short: {short_cp} | Entry: {short_entry:.5f} → Exit: {short_exit:.5f} | PnL: ${pnl_short:.2f}")
print(f"💰 Net PnL for Step {len(ls_log)}: ${net_pnl:.2f}")



📈 Long: GBP_USD | Entry: 0.75346 → Exit: 0.75347 | PnL: $0.00
📉 Short: USD_JPY | Entry: 1.45019 → Exit: 1.44996 | PnL: $0.01
💰 Net PnL for Step 2: $0.01


# 3rd Hour

In [103]:
# fetch #3 hour
fetch_one_hour_data_part2()

Fetching GBP_USD via currency conversion...
Fetching USD_JPY via currency conversion...
[GBP_USD] 16:33:06 tick recorded.
[USD_JPY] 16:33:07 tick recorded.
[GBP_USD] 16:33:07 tick recorded.
[USD_JPY] 16:33:08 tick recorded.
[GBP_USD] 16:33:09 tick recorded.
[USD_JPY] 16:33:09 tick recorded.
[GBP_USD] 16:33:10 tick recorded.
[USD_JPY] 16:33:10 tick recorded.
[GBP_USD] 16:33:11 tick recorded.
[USD_JPY] 16:33:11 tick recorded.
[GBP_USD] 16:33:12 tick recorded.
[USD_JPY] 16:33:12 tick recorded.
[GBP_USD] 16:33:13 tick recorded.
[USD_JPY] 16:33:13 tick recorded.
[GBP_USD] 16:33:14 tick recorded.
[USD_JPY] 16:33:14 tick recorded.
[GBP_USD] 16:33:15 tick recorded.
[USD_JPY] 16:33:15 tick recorded.
[GBP_USD] 16:33:16 tick recorded.
[USD_JPY] 16:33:16 tick recorded.
[GBP_USD] 16:33:17 tick recorded.
[USD_JPY] 16:33:17 tick recorded.
[GBP_USD] 16:33:18 tick recorded.
[USD_JPY] 16:33:18 tick recorded.
[GBP_USD] 16:33:19 tick recorded.
[USD_JPY] 16:33:19 tick recorded.
[GBP_USD] 16:33:20 tick reco

In [15]:
# Step 1: Load the transformed features for 1-hour
df_gbp = load_and_adjust_data('GBP_USD', lib2)
df_jpy = load_and_adjust_data('USD_JPY', lib2)

# Step 2: Compute trend (slope) on mid prices using sklearn
def compute_slope_mid(series):
    X = np.arange(len(series)).reshape(-1, 1)
    y = series.values
    model = LinearRegression().fit(X, y)
    return model.coef_[0]

slope_gbp = compute_slope_mid(df_gbp['mid'])  
slope_jpy = compute_slope_mid(df_jpy['mid'])

print(f"Slope GBP/USD: {slope_gbp:.6f}")
print(f"Slope USD/JPY: {slope_jpy:.6f}")

# Step 3: Decide Long/Short
if slope_gbp > slope_jpy:
    print("Long GBP/USD, Short USD/JPY")
else:
    print("Long USD/JPY, Short GBP/USD")

NameError: name 'LinearRegression' is not defined

In [105]:
# Step 4: Determine entry/exit prices and compute PnL
gbp_mid_prices = df_gbp['mid'].values
jpy_mid_prices = df_jpy['mid'].values

# Ensure equal length
n = min(len(gbp_mid_prices), len(jpy_mid_prices))
gbp_mid_prices = gbp_mid_prices[:n]
jpy_mid_prices = jpy_mid_prices[:n]

# Determine entry/exit based on slope direction
if slope_gbp > slope_jpy:
    long_cp, short_cp = 'GBP_USD', 'USD_JPY'
    long_entry, long_exit = gbp_mid_prices[0], gbp_mid_prices[-1]
    short_entry, short_exit = jpy_mid_prices[0], jpy_mid_prices[-1]
else:
    long_cp, short_cp = 'USD_JPY', 'GBP_USD'
    long_entry, long_exit = jpy_mid_prices[0], jpy_mid_prices[-1]
    short_entry, short_exit = gbp_mid_prices[0], gbp_mid_prices[-1]

# Assume $100 total capital per step, split equally
capital_per_leg = 100 / 2

# Compute PnL for both legs
pnl_long = (long_exit - long_entry) / long_entry * capital_per_leg
pnl_short = (short_entry - short_exit) / short_entry * capital_per_leg
net_pnl = pnl_long + pnl_short

# Log the trade
ls_log.append({
    'step': len(ls_log) + 1,
    'long_cp': long_cp,
    'short_cp': short_cp,
    'long_entry': long_entry,
    'long_exit': long_exit,
    'short_entry': short_entry,
    'short_exit': short_exit,
    'pnl_long': pnl_long,
    'pnl_short': pnl_short,
    'net_pnl': net_pnl
})

# Output the result
print(f"\n📈 Long: {long_cp} | Entry: {long_entry:.5f} → Exit: {long_exit:.5f} | PnL: ${pnl_long:.2f}")
print(f"📉 Short: {short_cp} | Entry: {short_entry:.5f} → Exit: {short_exit:.5f} | PnL: ${pnl_short:.2f}")
print(f"💰 Net PnL for Step {len(ls_log)}: ${net_pnl:.2f}")



📈 Long: GBP_USD | Entry: 0.75353 → Exit: 0.75388 | PnL: $0.02
📉 Short: USD_JPY | Entry: 1.44997 → Exit: 1.44973 | PnL: $0.01
💰 Net PnL for Step 3: $0.03


In [13]:
def add_slope_target(df, horizon=30):
    df = df.copy()
    X = np.arange(horizon).reshape(-1, 1)
    slopes = []
    for i in range(len(df) - horizon):
        y = df['mid'].values[i:i+horizon]
        slope = LinearRegression().fit(X, y).coef_[0]
        slopes.append(slope)
    # Align slope target with last point in each window
    df = df.iloc[horizon:].reset_index(drop=True)
    df['target_slope'] = slopes
    return df

In [None]:
from pycaret.regression import *

# Prepare features
training_df = add_slope_target(load_and_adjust_data('GBP_USD', lib2))
training_df = training_df.dropna().reset_index(drop=True)

# Setup and train
exp = setup(training_df, target='target_slope', session_id=42, silent=True, verbose=False)
model = compare_models()

In [None]:
# Latest data window
latest_df = load_and_adjust_data('GBP_USD', lib2).tail(30).dropna().reset_index(drop=True)

# Predict slope using model
predicted = predict_model(model, data=latest_df)
predicted_slope = predicted['prediction_label'].mean() 

print(f"📊 Predicted slope for GBP/USD: {predicted_slope:.6f}")

In [None]:
# Step 4: Determine entry/exit prices and compute PnL
gbp_mid_prices = df_gbp['predicted_slope'].values
jpy_mid_prices = df_jpy['predicted_slope'].values

# Ensure equal length
n = min(len(gbp_mid_prices), len(jpy_mid_prices))
gbp_mid_prices = gbp_mid_prices[:n]
jpy_mid_prices = jpy_mid_prices[:n]

# Determine entry/exit based on slope direction
if slope_gbp > slope_jpy:
    long_cp, short_cp = 'GBP_USD', 'USD_JPY'
    long_entry, long_exit = gbp_mid_prices[0], gbp_mid_prices[-1]
    short_entry, short_exit = jpy_mid_prices[0], jpy_mid_prices[-1]
else:
    long_cp, short_cp = 'USD_JPY', 'GBP_USD'
    long_entry, long_exit = jpy_mid_prices[0], jpy_mid_prices[-1]
    short_entry, short_exit = gbp_mid_prices[0], gbp_mid_prices[-1]

# Assume $100 total capital per step, split equally
capital_per_leg = 100 / 2

# Compute PnL for both legs
pnl_long = (long_exit - long_entry) / long_entry * capital_per_leg
pnl_short = (short_entry - short_exit) / short_entry * capital_per_leg
net_pnl = pnl_long + pnl_short

# Log the trade
ls_log.append({
    'step': len(ls_log) + 1,
    'long_cp': long_cp,
    'short_cp': short_cp,
    'long_entry': long_entry,
    'long_exit': long_exit,
    'short_entry': short_entry,
    'short_exit': short_exit,
    'pnl_long': pnl_long,
    'pnl_short': pnl_short,
    'net_pnl': net_pnl
})

# Output the result
print(f"\n📈 Long: {long_cp} | Entry: {long_entry:.5f} → Exit: {long_exit:.5f} | PnL: ${pnl_long:.2f}")
print(f"📉 Short: {short_cp} | Entry: {short_entry:.5f} → Exit: {short_exit:.5f} | PnL: ${pnl_short:.2f}")
print(f"💰 Net PnL for Step {len(ls_log)}: ${net_pnl:.2f}")
