In [1]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt
import wget
import dask
import os
from tqdm import tqdm
import glob
import os

from numpy import linalg as LA
import math

%matplotlib inline

## Download hourly (1h) k-line spot data from Binance

In [2]:
# Remote location of one daily archive of 1h klines.
# {0} is the trading pair and {1} the date string (filled in by download_all_dates).
url_template = r"https://data.binance.vision/data/spot/daily/klines/{0}/1h/{0}-1h-{1}.zip"
# Local path layout mirroring the URL; takes the same two placeholders.
dir_template = r"data/spot/daily/klines/{0}/1h/{0}-1h-{1}.zip"


# @dask.delayed
def download_one_date(url, path):
    """Download ``url`` with wget and report whether it succeeded.

    Args:
        url: Address of the file to fetch.
        path: Forwarded to ``wget.download`` as its ``out`` argument.

    Returns:
        True if ``wget.download`` returned without raising, False otherwise.
    """
    try:
        wget.download(url, out=path)
    except Exception:
        # Best-effort by design: the caller uses the False to detect missing
        # dates. Catch Exception (not a bare except) so that Ctrl-C and
        # SystemExit can still interrupt a long download loop.
        return False
    return True

def download_all_dates(token_pair: str, dates, path):
    """Download the archive of every date for one pair, stopping at the first gap.

    Failures before the first successful download are tolerated; once one
    download has succeeded, the first subsequent failure ends the loop.

    Args:
        token_pair: Pair name substituted into ``url_template``.
        dates: Iterable of timestamps exposing ``.date()``.
        path: Output location handed to ``download_one_date``.
    """
    seen_success = False
    for day in dates:
        url = url_template.format(token_pair, str(day.date()))
        ok = download_one_date(url, path)
        if ok:
            seen_success = True
        elif seen_success:
            # First failure after at least one success: stop here.
            break
    
def get_asset_pairs(x):
    """Return the names listed in ``asset_pairs.txt`` that end with ``x``.

    The file is read from the current working directory, tabs and newlines
    are removed, and the remaining text is split on "/".

    Args:
        x: Suffix to keep (e.g. "USDT").

    Returns:
        List of matching names, in file order. The count is also printed.
    """
    with open("asset_pairs.txt", "r") as handle:
        raw = handle.read()
    cleaned = raw.replace("\t", "").replace("\n", "")
    x_names = [name for name in cleaned.split("/") if name.endswith(x)]
    print("x: {} results length: {}".format(x, len(x_names)))
    return x_names

@dask.delayed
def main_download(pair, dates):
    """Delayed task: download every 1h kline archive of ``pair`` for ``dates``.

    Args:
        pair: Pair name; also the name of the per-pair output directory.
        dates: Dates forwarded to ``download_all_dates``.
    """
    path = "data/spot/daily/klines/{0}/1h".format(pair)
    # exist_ok replaces the exists()-then-makedirs() sequence, which can raise
    # FileExistsError if another task creates the directory in between.
    os.makedirs(path, exist_ok=True)
    download_all_dates(pair, dates, path)
    

In [4]:
# Pair names from asset_pairs.txt ending in "USDT" and in "BUSD" respectively.
USDT_pairs = get_asset_pairs("USDT")
BUSD_pairs = get_asset_pairs("BUSD")

x: USDT results length: 394
x: BUSD results length: 350


In [5]:
# NOTE: this replaces the full USDT list built in the previous cell with a single pair.
USDT_pairs = ["BTCUSDT"]
dates = pd.date_range(start="2021-03-01",end="2023-01-15")
# main_download is wrapped in dask.delayed, so this line only builds the tasks ...
promises = [main_download(each, dates) for each in USDT_pairs]
# ... and compute() is what actually runs the downloads.
dask.compute(promises)
# for i in tqdm(range(len(USDT_pairs))):
#     main_download(USDT_pairs[i], dates)

([None],)

In [5]:
# Number of calendar days in the download window (both endpoints included).
dates = pd.date_range(start="2021-03-01", end="2023-01-15")
print(len(dates))

686


## Data loading & pre-processing

In [12]:
@dask.delayed
def process_raw(pair, path):
    """Delayed task: load one kline file and keep only its close series.

    The file is read as a header-less CSV and given the twelve column names
    listed below. "Open time" is parsed as milliseconds since the epoch and
    becomes the ``time`` index; "Close" is copied into a column named after
    ``pair``; every original column is then dropped.

    Args:
        pair: Name given to the resulting close column.
        path: File passed to ``pd.read_csv``.

    Returns:
        DataFrame indexed by ``time`` whose only remaining column is ``pair``.
    """
    kline_columns = [
        "Open time",
        "Open",
        "High",
        "Low",
        "Close",
        "Volume",
        "Close time",
        "Quote asset volume",
        "Number of trades",
        "Taker buy base asset volume",
        "Taker buy quote asset volume",
        "Ignore",
    ]
    raw = pd.read_csv(path, names=kline_columns, header=None)
    raw["time"] = pd.to_datetime(raw["Open time"], unit='ms')
    raw[pair] = raw["Close"]
    # Index by time, then discard all raw kline columns in one go.
    return raw.set_index("time").drop(columns=kline_columns)


def load_one_pair(pair):
    """Load and concatenate every downloaded 1h kline file of ``pair``.

    Args:
        pair: Pair name, used as directory name and as the column name.

    Returns:
        ``(True, frame)`` with the concatenated output of ``process_raw``
        for each file, or ``(False, None)`` when no file is found.
    """
    pattern = "data/spot/daily/klines/{}/1h/*".format(pair)
    # Names containing "(" are skipped -- presumably duplicate downloads such
    # as "name (1).zip"; TODO confirm.
    # glob returns files in arbitrary order, so sort them: the concatenation
    # is then deterministic, and chronological when names embed an ISO date
    # as in dir_template.
    files = sorted(each for each in glob.glob(pattern) if "(" not in each)
    if not files:
        print(f"{pair} is empty, no files found")
        return False, None
    tasks = [process_raw(pair, each) for each in files]
    # dask.compute returns a 1-tuple holding the list of computed frames.
    (frames,) = dask.compute(tasks)
    return True, pd.concat(frames)

def merge_assets(pd_arr, col: str, max_rows: int = 16464):
    """Outer-merge a sequence of DataFrames on ``col``, one after another.

    Args:
        pd_arr: Non-empty sequence of DataFrames; indexing an empty list
            raises IndexError.
        col: Column or index-level name to merge on.
        max_rows: Upper bound on the row count after each merge. The default
            16464 equals 686 * 24, i.e. one row per hour over the 686-day
            download window used in this notebook.

    Returns:
        The merged DataFrame (``pd_arr[0]`` itself if it is the only element).

    Raises:
        AssertionError: If a merge produces more than ``max_rows`` rows,
            which indicates keys that do not line up across frames.
    """
    assets_close_matrix = pd_arr[0]
    for each in pd_arr[1:]:
        assets_close_matrix = assets_close_matrix.merge(each, how="outer", on=col)
        n_rows = assets_close_matrix.shape[0]
        if n_rows > max_rows:
            # Raise explicitly instead of `assert True==False`: asserts are
            # stripped under `python -O` and carried no message.
            raise AssertionError(f"{each.columns} wrong rows: {n_rows}")
    return assets_close_matrix

In [13]:
def main_load_and_merge_all_assets():
    """Merge the close series of all downloaded pairs, ten pairs per pickle.

    Every sub-directory of ``data/spot/daily/klines`` is treated as a pair.
    Pairs are processed in chunks of 10; each chunk is merged on ``time`` and
    written to ``data/clean/mk_{i}.pkl``, where ``i`` is the chunk index.
    A chunk in which no pair could be loaded is skipped, so its pickle is
    not written and the numbering has a gap.

    Returns:
        None. The result is the set of pickle files written to disk.
    """
    existing_pairs = os.listdir("data/spot/daily/klines")
    print("Number of pairs: ", len(existing_pairs))
    chunk_sz = 10
    subsets = [existing_pairs[i:i + chunk_sz] for i in range(0, len(existing_pairs), chunk_sz)]
    # to_pickle does not create missing directories.
    os.makedirs("data/clean", exist_ok=True)
    for i in tqdm(range(len(subsets))):
        tmp_assets_arr = list()
        for each in subsets[i]:
            success, tmp_asset = load_one_pair(each)
            if not success:
                continue
            # 16464 == 686 * 24: hourly rows over the 686-day download window.
            if tmp_asset.shape[0] > 16464:
                print(f"{each} wrong rows: {tmp_asset.shape[0]}")
            tmp_assets_arr.append(tmp_asset)
        if not tmp_assets_arr:
            # merge_assets indexes element 0 and would raise IndexError,
            # aborting all remaining chunks.
            print(f"chunk {i} has no loadable pairs, skipping")
            continue
        tmp_merge_result = merge_assets(tmp_assets_arr, "time")
        tmp_merge_result.to_pickle(f"data/clean/mk_{i}.pkl")
        
    
def inspect_assets_shape():
    """Assert that no downloaded pair has more than 16464 rows.

    Pairs for which ``load_one_pair`` finds no files are skipped (it already
    prints a message for them).

    Raises:
        AssertionError: Naming the first pair whose row count exceeds 16464
            (686 days * 24 hourly rows).
    """
    existing_pairs = os.listdir("data/spot/daily/klines")
    print("Number of pairs: ", len(existing_pairs))
    for pair in existing_pairs:
        success, tmp_asset = load_one_pair(pair)
        if not success:
            # tmp_asset is None here; reading .shape would raise AttributeError.
            continue
        assert tmp_asset.shape[0] <= 16464, f"{pair} wrong rows: {tmp_asset.shape[0]}"

# main_load_and_merge_all_assets has no return statement, so `res` is None here;
# the call matters for its side effect of writing data/clean/mk_{i}.pkl.
res = main_load_and_merge_all_assets()
# inspect_assets_shape()

  0%|          | 0/40 [00:00<?, ?it/s]

Number of pairs:  394


 38%|███▊      | 15/40 [17:17<38:00, 91.22s/it]

GALUSDT is empty, no files found


100%|██████████| 40/40 [43:10<00:00, 64.77s/it]


In [8]:
def merge_all_chunks(arr_id):
    """Read the chunk pickles ``data/clean/mk_{i}.pkl`` and merge them on ``time``.

    Args:
        arr_id: Iterable of chunk indices to load.

    Returns:
        The result of ``merge_assets`` over the loaded frames.
    """
    chunk_frames = []
    for idx in arr_id:
        chunk_frames.append(pd.read_pickle(f"data/clean/mk_{idx}.pkl"))
    return merge_assets(chunk_frames, "time")

In [14]:
# Merge chunk pickles 0..39 into one table and persist it.
res = merge_all_chunks(list(range(40)))
res.to_pickle("data/clean/whole_usdt_merge.pkl")
print(res.shape)

## MVP functions

In [103]:
def eigenvalue_clipping(lambdas,v,lambda_plus):
    """Rebuild a matrix from its spectrum after flattening the bulk eigenvalues.

    Every eigenvalue ``<= lambda_plus`` is replaced by the mean of those
    eigenvalues (which keeps their sum unchanged); larger eigenvalues are
    kept. The matrix is then rebuilt as ``sum_i lambda_i * v_i^T v_i`` over
    ALL eigen-pairs and its diagonal is set to 1.

    Args:
        lambdas: 1-D array-like of real eigenvalues. It is copied, so the
            caller's array is left untouched.
        v: Square 2-D array-like whose ROW ``i`` is the eigenvector paired
            with ``lambdas[i]``. NOTE(review): ``np.linalg.eig``/``eigh``
            return eigenvectors as columns -- confirm callers pass the
            transpose.
        lambda_plus: Threshold separating bulk eigenvalues from the rest.

    Returns:
        ``np.matrix`` of shape (N, N) with unit diagonal. The matrix type is
        kept for backward compatibility: the previous implementation
        accumulated ``np.matrix`` products and therefore returned one.
    """
    # Work on a float copy so the caller's eigenvalues are not overwritten.
    lambdas_clean = np.array(lambdas, dtype=float)
    eigvecs = np.asarray(v)

    # Eigenvalues at or below the threshold come from the seemingly random bulk.
    sel_bulk = lambdas_clean <= lambda_plus
    if sel_bulk.any():
        # Guarded so an empty bulk does not evaluate 0/0.
        lambdas_clean[sel_bulk] = lambdas_clean[sel_bulk].mean()

    # V^T diag(lambda) V == sum_i lambda_i * v_i^T v_i with v_i the rows of V.
    # All N terms are included; the earlier `range(N-1)` loop dropped the last.
    C_clean = (eigvecs.T * lambdas_clean) @ eigvecs

    np.fill_diagonal(C_clean, 1)

    return np.asmatrix(C_clean)

## Data preprocessing

In [26]:
# Load the merged table and replace its `time` index with a plain 0..n-1 row index.
all_data = (
    pd.read_pickle("data/clean/whole_usdt_merge.pkl")
    .reset_index()
    .drop(["time"], axis=1)
)

In [40]:
# Display the merged close-price table (one column per pair).
all_data

Unnamed: 0,1INCHDOWNUSDT,1INCHUPUSDT,1INCHUSDT,AAVEDOWNUSDT,AAVEUPUSDT,AAVEUSDT,ACAUSDT,ACHUSDT,ACMUSDT,ADADOWNUSDT,...,XVSUSDT,YFIDOWNUSDT,YFIIUSDT,YFIUPUSDT,YFIUSDT,YGGUSDT,ZECUSDT,ZENUSDT,ZILUSDT,ZRXUSDT
0,10.07,9.87,5.8799,1.327380,110.463,438.825,,,12.062,2.250567,...,85.699,0.001495,3044.99,8.890,50826.78,,246.49,96.655,0.20656,2.1644
1,10.21,9.73,5.8589,1.367000,108.459,434.066,,,11.980,2.230800,...,84.578,0.001558,2989.17,8.530,50035.84,,247.08,97.583,0.20509,2.1828
2,9.92,10.00,5.9137,1.320024,111.310,439.611,,,12.067,2.127842,...,83.166,0.001554,3000.41,8.550,50046.70,,248.45,97.487,0.20619,2.1800
3,9.97,9.93,5.9084,1.301529,112.411,443.585,,,12.025,2.241671,...,81.401,0.001595,2954.89,8.301,49476.40,,244.61,95.743,0.20330,2.1540
4,9.10,10.63,6.1123,1.232953,117.000,452.599,,,11.988,2.224815,...,82.750,0.001548,3000.85,8.642,50269.64,,246.76,97.111,0.20582,2.1780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16447,,,,,,80.100,0.1340,0.00926,2.947,0.003657,...,5.030,,1355.00,,7092.00,0.2532,45.30,10.480,0.02575,0.2025
16448,,,,,,79.900,0.1324,0.00926,2.994,0.003627,...,5.100,,1357.30,,7106.00,0.2592,45.40,10.470,0.02584,0.2033
16449,,,,,,79.700,0.1315,0.00920,3.115,0.003646,...,5.050,,1357.10,,7068.00,0.2581,45.30,10.480,0.02579,0.2037
16450,,,,,,79.900,0.1312,0.00914,3.090,0.003607,...,5.050,,1352.20,,7086.00,0.2573,45.20,10.530,0.02595,0.2037


In [38]:
# Row-to-row log returns: log(p_t) - log(p_{t-1}). The first row is NaN by construction.
log_ret_all_data = np.log(all_data).diff()

In [39]:
# Display the log-return table.
log_ret_all_data

Unnamed: 0,1INCHDOWNUSDT,1INCHUPUSDT,1INCHUSDT,AAVEDOWNUSDT,AAVEUPUSDT,AAVEUSDT,ACAUSDT,ACHUSDT,ACMUSDT,ADADOWNUSDT,...,XVSUSDT,YFIDOWNUSDT,YFIIUSDT,YFIUPUSDT,YFIUSDT,YGGUSDT,ZECUSDT,ZENUSDT,ZILUSDT,ZRXUSDT
0,,,,,,,,,,,...,,,,,,,,,,
1,0.013807,-0.014286,-0.003578,0.029411,-0.018308,-0.010904,,,-0.006821,-0.008822,...,-0.013167,0.041277,-0.018502,-0.041338,-0.015684,,0.002391,0.009555,-0.007142,0.008465
2,-0.028815,0.027371,0.009310,-0.034969,0.025947,0.012694,,,0.007236,-0.047252,...,-0.016836,-0.002571,0.003753,0.002342,0.000217,,0.005529,-0.000984,0.005349,-0.001284
3,0.005028,-0.007025,-0.000897,-0.014110,0.009843,0.008999,,,-0.003487,0.052113,...,-0.021451,0.026041,-0.015288,-0.029555,-0.011461,,-0.015577,-0.018052,-0.014115,-0.011998
4,-0.091306,0.068120,0.033928,-0.054128,0.040012,0.020117,,,-0.003082,-0.007548,...,0.016436,-0.029910,0.015434,0.040258,0.015906,,0.008751,0.014187,0.012319,0.011080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16447,,,,,,-0.014870,0.016554,-0.002157,0.041218,0.004934,...,0.001990,,0.001182,,0.008070,-0.010998,0.002210,-0.011385,-0.002715,-0.002466
16448,,,,,,-0.002500,-0.012012,0.000000,0.015823,-0.008237,...,0.013821,,0.001696,,0.001972,0.023420,0.002205,-0.000955,0.003489,0.003943
16449,,,,,,-0.002506,-0.006821,-0.006501,0.039619,0.005225,...,-0.009852,,-0.000147,,-0.005362,-0.004253,-0.002205,0.000955,-0.001937,0.001966
16450,,,,,,0.002506,-0.002284,-0.006543,-0.008058,-0.010754,...,0.000000,,-0.003617,,0.002543,-0.003104,-0.002210,0.004760,0.006185,0.000000


In [27]:
# Positional row window [t0, t1) taken from the log-return table.
t0 = 10000
t1 = 16000
# Keep only the columns that have no NaN anywhere inside the window.
X_raw = log_ret_all_data.iloc[t0:t1].dropna(axis=1)
X_raw.shape

(6000, 263)

In [28]:
# NOTE(review): the saved output below shows price-sized values (e.g. 179.1)
# rather than returns, and this cell's execution count (28) predates the
# log-return cell (38) -- the output looks stale; re-run top to bottom to confirm.
X_raw

Unnamed: 0,AAVEUSDT,ACAUSDT,ACHUSDT,ACMUSDT,ADADOWNUSDT,ADAUPUSDT,ADAUSDT,ADXUSDT,AIONUSDT,AKROUSDT,...,XTZUSDT,XVGUSDT,XVSUSDT,YFIIUSDT,YFIUSDT,YGGUSDT,ZECUSDT,ZENUSDT,ZILUSDT,ZRXUSDT
10000,179.1,1.4710,0.03426,5.940,0.005241,7.713,0.9190,0.3842,0.09100,0.01136,...,3.089,0.01053,10.49,1829.0,18977.39,2.0000,169.2,37.02,0.11825,0.9205
10001,178.7,1.4770,0.03404,5.958,0.005191,7.753,0.9220,0.3847,0.09170,0.01140,...,3.097,0.01053,10.52,1837.0,19048.49,1.9890,166.9,36.99,0.11811,0.9168
10002,178.4,1.5170,0.03366,5.917,0.005243,7.712,0.9190,0.4051,0.09030,0.01135,...,3.124,0.01069,10.55,1838.0,18966.90,1.9890,165.3,36.86,0.11646,0.9119
10003,176.9,1.5100,0.03362,5.949,0.005268,7.692,0.9190,0.4004,0.09060,0.01126,...,3.130,0.01068,10.55,1836.0,19033.39,1.9810,165.4,36.90,0.11662,0.9022
10004,176.6,1.4810,0.03350,5.913,0.005319,7.636,0.9140,0.4017,0.09040,0.01121,...,3.112,0.01070,10.56,1829.0,18929.18,1.9670,164.3,36.67,0.11489,0.8910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15995,57.5,0.1217,0.00805,2.345,0.008296,0.212,0.2603,0.1182,0.02691,0.00331,...,0.773,0.00283,3.82,1219.8,5511.00,0.1676,38.8,9.15,0.01723,0.1611
15996,57.0,0.1214,0.00803,2.337,0.008362,0.209,0.2591,0.1180,0.02677,0.00339,...,0.770,0.00281,3.81,1216.7,5485.00,0.1667,38.7,9.17,0.01714,0.1606
15997,56.8,0.1209,0.00798,2.333,0.008403,0.209,0.2580,0.1180,0.02700,0.00336,...,0.766,0.00283,3.81,1217.1,5473.00,0.1650,38.7,9.13,0.01708,0.1608
15998,56.5,0.1204,0.00809,2.340,0.008443,0.206,0.2572,0.1182,0.02698,0.00338,...,0.761,0.00284,3.83,1218.1,5466.00,0.1644,38.6,9.04,0.01697,0.1609
