In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.stats import skew, kurtosis, levy_stable
import statsmodels.api as sm
from arch import arch_model
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import multiprocessing
from hurst import compute_Hc
import warnings
import time
from datetime import datetime
from arch.univariate.base import ConvergenceWarning

# R bridge via rpy2. `numpy2ri.activate()` switches on rpy2's global
# numpy <-> R conversion; `fBasics` is the handle to the R package of the
# same name (it must be installed in the R library rpy2 is linked against).
# NOTE(review): `ro` and `fBasics` are not referenced in the cells visible
# here — confirm they are still needed.
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import numpy2ri
numpy2ri.activate()
fBasics = importr('fBasics')

# Silence warnings for the whole kernel session: optimiser convergence
# warnings from `arch`, plus every UserWarning and RuntimeWarning. The last
# two filters are blanket filters and will also hide warnings from numpy,
# pandas and scipy.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

: 

In [2]:
# Load every "BB *.csv" price file from `data_dir`. The first CSV column is
# used as a parsed date index and columns that contain no data at all are
# discarded. Each resulting frame is published as a notebook-level variable
# named after its key in `files` (bb_bond, bb_commodity, ...).
data_dir = 'Data/'
files = dict(
    bb_bond='BB Bond.csv',
    bb_commodity='BB Commodity.csv',
    bb_crypto='BB Crypto.csv',
    bb_exchange_rate='BB Exchange Rate.csv',
    bb_real_estate='BB Real Estate.csv',
    bb_stock='BB Stock.csv',
)

for frame_name, csv_name in files.items():
    raw_frame = pd.read_csv(os.path.join(data_dir, csv_name), index_col=0, parse_dates=True)
    # Later cells refer to these frames by bare name, hence the globals() write.
    globals()[frame_name] = raw_frame.dropna(axis="columns", how="all")

In [3]:
# --- Raw inputs -------------------------------------------------------------
# Traditional-asset panel: the five BB frames side by side (bb_crypto is not
# part of it), plus the crypto price, market-cap and listing files.
traditional_frames = [bb_stock, bb_exchange_rate, bb_commodity, bb_bond, bb_real_estate]
bb = pd.concat(traditional_frames, axis=1)
cc = pd.read_csv('Data/CC Prices.csv', index_col=0, parse_dates=True)
cc_cap = pd.read_csv('Data/CC Market Cap.csv', index_col=0, parse_dates=True)
cc_list = pd.read_csv('Data/CC List.csv')

# --- Combined price panel ---------------------------------------------------
# Keep only dates present in both sources, blank out non-positive prices
# (their logarithm is undefined) and retain assets with more than 2000
# valid observations.
dd = pd.concat([cc, bb], axis=1, join='inner')
dd = dd.mask(dd <= 0)
has_long_history = dd.notna().sum() > 2000
dd = dd[dd.columns[has_long_history]]
dd.to_csv('Data/DD price.csv', index=True)
cc = cc.reindex(dd.index)

# --- Log returns ------------------------------------------------------------
# The first row has no predecessor and is removed; assets with more than 200
# exactly-zero returns are dropped.
dd_return = np.log(dd / dd.shift(1)).iloc[1:]
too_many_zero_returns = (dd_return == 0).sum() > 200
dd_return = dd_return.drop(columns=dd_return.columns[too_many_zero_returns])

In [4]:
# Asset -> asset-class lookup. Sources are visited in the order listed; when
# the same column name appears in more than one source, the later source's
# label is the one that is kept.
column_source = {
    asset: asset_class
    for asset_class, source_frame in (
        ('Stock', bb_stock),
        ('Exchange Rate', bb_exchange_rate),
        ('Commodity', bb_commodity),
        ('Bond', bb_bond),
        ('Real Estate', bb_real_estate),
        ('Crypto', cc),
    )
    for asset in source_frame.columns
}

# Two-column table restricted to assets that made it into the price panel
# `dd`, persisted next to the other data files.
asset_type_pairs = list(column_source.items())
dd_index = pd.DataFrame(asset_type_pairs, columns=['Asset', 'Type'])
in_price_panel = dd_index['Asset'].isin(dd.columns)
dd_index = dd_index[in_price_panel]

dd_index.to_csv('Data/DD Index.csv', index=False)

In [None]:
# Expanding-window factor extraction.
#
# Every window starts at the first row of `dd_return`. The shortest window
# covers the first third of the sample and each iteration extends it by
# `step_size` rows. For every window a per-asset factor table is built
# (empirical quantiles, conditional tail expectations, FIGARCH `d`,
# stable-law fit, variance, skewness, kurtosis, lag-1 autocorrelation and
# Hurst exponent), written to "Factors/<last date> Factors.csv" and appended
# to `results`.
start_window = int(len(dd_return) / 3)

start_time = time.time()
print(datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S'))

window_size = 250  # immediately superseded by the loop variable below
step_size = 21
results = []

pause_step = 0  # not read anywhere in this cell

# Loop invariants, set up once.
start = 0  # row offset at which every window begins
quantile_levels = [0.005, 0.01, 0.025, 0.05, 0.95, 0.975, 0.99, 0.995]
num_cores = multiprocessing.cpu_count()
os.makedirs("Factors", exist_ok=True)  # to_csv does not create directories

for window_size in tqdm(range(start_window, len(dd_return) + 1, step_size), desc="Extending windows", ncols=100):
    dd_window = dd_return.iloc[start:start + window_size]
    if len(dd_window) == 0:
        # No rows means there is no "last date" to label the output with.
        continue
    dd_date = dd_window.index[-1].strftime('%Y-%m-%d')

    # Keep assets that are fully observed in the window, are not dominated
    # by zero returns (more than 10% of the rows) and are not constant.
    dd_window = dd_window.loc[:, dd_window.notnull().all()]
    zero_heavy = (dd_window == 0).sum() > (window_size / 10)
    dd_window = dd_window.drop(columns=dd_window.columns[zero_heavy])
    dd_window = dd_window.loc[:, dd_window.std() != 0]
    if dd_window.shape[1] == 0:
        continue

    # One row per surviving asset, in the column order of `dd_window`. The
    # left merge keeps that order, so the positional `.values` assignments
    # below line up with the right asset.
    dd_factors = pd.DataFrame({"Date": [dd_window.index[-1]] * len(dd_window.columns), "Asset": dd_window.columns})
    dd_factors = dd_factors.merge(dd_index, on="Asset", how="left")

    # Empirical quantiles, then conditional tail expectations: the mean of
    # the returns at or below the quantile for the four lower levels, and
    # at or above it for the four upper levels.
    for q in quantile_levels:
        dd_factors["Q_" + str(q).replace(".", "_")] = dd_window.quantile(q).values
    for q in quantile_levels[:4]:
        dd_factors["CTE_" + str(q).replace(".", "_")] = dd_window[dd_window <= dd_window.quantile(q)].mean().values
    for q in quantile_levels[4:]:
        dd_factors["CTE_" + str(q).replace(".", "_")] = dd_window[dd_window >= dd_window.quantile(q)].mean().values

    # Fractional-integration parameter `d` of a FIGARCH(1, d, 1) fit per
    # asset; NaN when the fitted parameter vector has no entry named "d".
    figarch_list = []
    for col in dd_window.columns:
        model = arch_model(dd_window[col], vol="FIGARCH", p=1, q=1, rescale=False)
        result = model.fit(disp="off")
        d_param = result.params.get("d", np.nan)
        figarch_list.append(d_param)

    dd_factors["FIGARCH_d"] = figarch_list

    # Stable-law fit per asset, run in parallel. Elements 0 and 3 of the
    # fitted parameter tuple are kept as "Stable_alpha" and "Stable_gamma".
    stable_res = pd.DataFrame(
        Parallel(n_jobs=num_cores)(
            delayed(lambda d: (lambda r: (r[0], r[3]))(levy_stable.fit(d)))(dd_window[col].dropna().values)
            for col in dd_window.columns
        ),
        index=dd_window.columns, columns=["Stable_alpha", "Stable_gamma"]
    )

    dd_factors = dd_factors.merge(stable_res, left_on="Asset", right_index=True, how="left")
    dd_factors["Variance"] = dd_window.var().values
    dd_factors["Skewness"] = dd_window.apply(skew).values
    dd_factors["Kurtosis"] = dd_window.apply(kurtosis).values
    dd_factors["ACF_Lag1"] = dd_window.apply(lambda x: sm.tsa.acf(x.dropna(), nlags=1, fft=False)[1] if len(x.dropna()) > 1 else 0).values
    dd_factors["Hurst"] = dd_window.apply(lambda x: compute_Hc(x, kind="change", simplified=False)[0] if len(x.dropna()) >= 20 else np.nan).values

    output_path = os.path.join("Factors", f"{dd_date} Factors.csv")
    dd_factors.to_csv(output_path, index=False)
    results.append(dd_factors)

end_time = time.time()
total_time = end_time - start_time
# Report the wall-clock start (`start_time`, not the row offset `start`),
# the end, and the elapsed time.
print(datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S'))
print(datetime.fromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S'))
print(f"Execution completed in {total_time:.2f} seconds.")

## Test

In [29]:
# Single-window dry run: rebuild the state the expanding-window loop has at
# its first iteration (shortest window = first third of the sample) so the
# individual factor computations can be tried out cell by cell.
start_time = time.time()
print(datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S'))

step_size = 21
results = []
pause_step = 0

start_window = int(len(dd_return) / 3)
# window_size = 2587
window_size = start_window

start = 0
# Same three column filters as in the loop, applied in the same order:
# fully observed, at most 10% zero returns, non-constant.
dd_window = (
    dd_return.iloc[start:start + window_size]
    .loc[:, lambda w: w.notnull().all()]
    .pipe(lambda w: w.drop(columns=w.columns[(w == 0).sum() > (window_size / 10)]))
    .loc[:, lambda w: w.std() != 0]
)
dd_date = dd_window.index[-1].strftime('%Y-%m-%d')

# One row per surviving asset, tagged with the window's last date and the
# asset type looked up in `dd_index`.
dd_factors = pd.DataFrame(
    {
        "Date": [dd_window.index[-1]] * len(dd_window.columns),
        "Asset": dd_window.columns,
    }
).merge(dd_index, on="Asset", how="left")

2025-03-27 11:30:38


In [None]:
def _stable_alpha_gamma(sample):
    """Fit `levy_stable` to `sample` and return elements 0 and 3 of the fit.

    Under SciPy's `fit` convention (shape parameters, then loc, then scale)
    these are the stability index alpha and the scale; the result table
    stores them as "Stable_alpha" and "Stable_gamma".
    """
    fitted = levy_stable.fit(sample)
    return fitted[0], fitted[3]


# One fit per asset in the current window, spread over all available cores.
num_cores = multiprocessing.cpu_count()
fit_jobs = (
    delayed(_stable_alpha_gamma)(dd_window[col].dropna().values)
    for col in dd_window.columns
)
stable_res = pd.DataFrame(
    Parallel(n_jobs=num_cores)(fit_jobs),
    index=dd_window.columns,
    columns=["Stable_alpha", "Stable_gamma"],
)

In [33]:
# Non-missing returns of a single asset as a plain array, for trying the
# fits by hand.
test = dd_window.loc[:, 'bitcoin-plus'].dropna().to_numpy()