In [1]:
import numpy as np
import pandas as pd
import os
import pickle
from scipy.signal import butter, filtfilt
from sklearn.linear_model import LinearRegression

In [2]:


def smooth_data(data):
    N, Wn = 1, 0.05
    b, a = butter(N, Wn, btype='low')
    return filtfilt(b, a, data)

def get_slope(smoothed_data):
    X = np.arange(len(smoothed_data)).reshape(-1, 1)
    model = LinearRegression().fit(X, smoothed_data)
    return model.coef_[0]

def get_slope_window(window):
    N, Wn = 1, 0.05
    b, a = butter(N, Wn, btype='low')
    y = filtfilt(b, a, window.values)
    X = np.arange(len(y)).reshape(-1, 1)
    model = LinearRegression().fit(X, y)
    return model.coef_[0]

def chunk(df_train, df_val, df_test):
    chunk_size = 4320
    for i in range(int(len(df_train) / chunk_size)):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        df_chunk = df_train[start:end].reset_index(drop=True)
        df_chunk.to_feather('./data/ETHUSDT/train/df_{}.feather'.format(i))

    for i in range(int(len(df_val) / chunk_size)):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        df_chunk = df_val[start:end].reset_index(drop=True)
        df_chunk.to_feather('./data/ETHUSDT/val/df_{}.feather'.format(i))

    for i in range(int(len(df_test) / chunk_size)):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        df_chunk = df_test[start:end].reset_index(drop=True)
        df_chunk.to_feather('./data/ETHUSDT/test/df_{}.feather'.format(i))



In [3]:
def label_slope(df_train, df_val, df_test):
    chunk_size = 4320
    slopes_train = []
    for i in range(0, int(len(df_train) / chunk_size)):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        chunk = df_train['close'][start:end].values
        smoothed_chunk = smooth_data(chunk)
        slope = get_slope(smoothed_chunk)
        slopes_train.append(slope)

    slopes_val = []
    for i in range(0, int(len(df_val) / chunk_size)):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        chunk = df_val['close'][start:end].values
        smoothed_chunk = smooth_data(chunk)
        slope = get_slope(smoothed_chunk)
        slopes_val.append(slope)

    slopes_test = []
    for i in range(0, int(len(df_test) / chunk_size)):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        chunk = df_test['close'][start:end].values
        smoothed_chunk = smooth_data(chunk)
        slope = get_slope(smoothed_chunk)
        slopes_test.append(slope)

    quantiles = [0, 0.05, 0.35, 0.65, 0.95, 1]
    slope_labels_train, bins = pd.qcut(slopes_train, q=quantiles, retbins=True, labels=False)

    train_indices = [[] for _ in range(5)]
    val_indices = [[] for _ in range(5)]
    test_indices = [[] for _ in range(5)]
    for index, label in enumerate(slope_labels_train):
        train_indices[label].append(index)
    with open('./data/ETHUSDT/train/slope_labels.pkl', 'wb') as file:
        pickle.dump(train_indices, file)

    bins[0] = -100
    bins[-1] = 100
    slope_labels_val = pd.cut(slopes_val, bins=bins, labels=False, include_lowest=True)
    slope_labels_val = [1 if element == 0 else element for element in slope_labels_val]
    slope_labels_val = [3 if element == 4 else element for element in slope_labels_val]
    slope_labels_test = pd.cut(slopes_test, bins=bins, labels=False, include_lowest=True)
    slope_labels_test = [1 if element == 0 else element for element in slope_labels_test]
    slope_labels_test = [3 if element == 4 else element for element in slope_labels_test]

    for index, label in enumerate(slope_labels_val):
        val_indices[label].append(index)
    with open('./data/ETHUSDT/val/slope_labels.pkl', 'wb') as file:
        pickle.dump(val_indices, file)

    for index, label in enumerate(slope_labels_test):
        test_indices[label].append(index)
    with open('./data/ETHUSDT/test/slope_labels.pkl', 'wb') as file:
        pickle.dump(test_indices, file)


def label_volatility(df_train, df_val, df_test):
    chunk_size = 4320
    volatilities_train = []
    for i in range(0, int(len(df_train) / chunk_size)):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        chunk = df_train[start:end].copy()
        chunk['return'] = chunk['close'].pct_change().fillna(0)
        volatility = chunk['return'].std()
        volatilities_train.append(volatility)

    volatilities_val = []
    for i in range(0, int(len(df_val) / chunk_size)):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        chunk = df_train[start:end].copy()
        chunk['return'] = chunk['close'].pct_change().fillna(0)
        volatility = chunk['return'].std()
        volatilities_val.append(volatility)

    volatilities_test = []
    for i in range(0, int(len(df_test) / chunk_size)):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        chunk = df_train[start:end].copy()
        chunk['return'] = chunk['close'].pct_change().fillna(0)
        volatility = chunk['return'].std()
        volatilities_test.append(volatility)

    quantiles = [0, 0.05, 0.35, 0.65, 0.95, 1]
    vol_labels_train, bins = pd.qcut(volatilities_train, q=quantiles, retbins=True, labels=False)

    train_indices = [[] for _ in range(5)]
    val_indices = [[] for _ in range(5)]
    test_indices = [[] for _ in range(5)]
    for index, label in enumerate(vol_labels_train):
        train_indices[label].append(index)
    with open('./data/ETHUSDT/train/vol_labels.pkl', 'wb') as file:
        pickle.dump(train_indices, file)

    bins[0] = 0
    bins[-1] = 1
    vol_labels_val = pd.cut(volatilities_val, bins=bins, labels=False, include_lowest=True)
    vol_labels_val = [1 if element == 0 else element for element in vol_labels_val]
    vol_labels_val = [3 if element == 4 else element for element in vol_labels_val]
    vol_labels_test = pd.cut(volatilities_test, bins=bins, labels=False, include_lowest=True)
    vol_labels_test = [1 if element == 0 else element for element in vol_labels_test]
    vol_labels_test = [3 if element == 4 else element for element in vol_labels_test]

    for index, label in enumerate(vol_labels_val):
        val_indices[label].append(index)
    with open('./data/ETHUSDT/val/vol_labels.pkl', 'wb') as file:
        pickle.dump(val_indices, file)
    for index, label in enumerate(vol_labels_test):
        test_indices[label].append(index)
    with open('./data/ETHUSDT/test/vol_labels.pkl', 'wb') as file:
        pickle.dump(test_indices, file)

def label_whole(df):
    window_size_list = [360]
    for i in range(len(window_size_list)):
        window_size = window_size_list[i]
        df['slope_{}'.format(window_size)] = df['close'].rolling(window=window_size).apply(get_slope_window)
        df['return'] = df['close'].pct_change().fillna(0)
        df['vol_{}'.format(window_size)] = df['return'].rolling(window=window_size).std()
    return df

In [4]:

df_train = pd.read_feather('./data/ETHUSDT/df_train.feather')
df_val = pd.read_feather('./data/ETHUSDT/df_val.feather')
df_test = pd.read_feather('./data/ETHUSDT/df_test.feather')
'''
#reduce dataset
df_all = pd.concat([df_train, df_val, df_test], axis=0)

# Calculate the new size (1/10 of the original data)
new_size = len(df_all) // 5

# Reduce the dataset to the last 10% (for example, if you want the last 10% of the data)
df_all_reduced = df_all.iloc[-new_size:]

# Reassign to train, test, and val
# Split the reduced dataset into train, test, and val (60%, 20%, 20%)
train_size = int(0.6 * len(df_all_reduced))
val_size = int(0.2 * len(df_all_reduced))

df_train = df_all_reduced[:train_size]
df_val = df_all_reduced[train_size:train_size + val_size]
df_test = df_all_reduced[train_size + val_size:]

os.makedirs('./data/ETHUSDT/train', exist_ok=True)
os.makedirs('./data/ETHUSDT/val', exist_ok=True)
os.makedirs('./data/ETHUSDT/test', exist_ok=True)
os.makedirs('./data/ETHUSDT/whole', exist_ok=True)
'''

"\n#reduce dataset\ndf_all = pd.concat([df_train, df_val, df_test], axis=0)\n\n# Calculate the new size (1/10 of the original data)\nnew_size = len(df_all) // 5\n\n# Reduce the dataset to the last 10% (for example, if you want the last 10% of the data)\ndf_all_reduced = df_all.iloc[-new_size:]\n\n# Reassign to train, test, and val\n# Split the reduced dataset into train, test, and val (60%, 20%, 20%)\ntrain_size = int(0.6 * len(df_all_reduced))\nval_size = int(0.2 * len(df_all_reduced))\n\ndf_train = df_all_reduced[:train_size]\ndf_val = df_all_reduced[train_size:train_size + val_size]\ndf_test = df_all_reduced[train_size + val_size:]\n\nos.makedirs('./data/ETHUSDT/train', exist_ok=True)\nos.makedirs('./data/ETHUSDT/val', exist_ok=True)\nos.makedirs('./data/ETHUSDT/test', exist_ok=True)\nos.makedirs('./data/ETHUSDT/whole', exist_ok=True)\n"

In [5]:
df_train

Unnamed: 0,timestamp,ask1_price,ask1_size,bid1_price,bid1_size,ask2_price,ask2_size,bid2_price,bid2_size,ask3_price,...,close,kmid,klen,kmid2,kup,kup2,klow,klow2,ksft,ksft2
0,2022-02-01 00:00:00,2684.308500,18.342238,2684.267833,9.997110,2684.394000,3.872907,2684.177833,1.472390,2684.448333,...,2682.53,-3.64,4.76,-0.723909,0.83,0.165067,0.29,0.057674,-4.18,-0.831303
1,2022-02-01 00:01:00,2679.292333,9.461557,2679.265000,6.811332,2679.385500,2.137287,2679.143833,1.373933,2679.461833,...,2675.79,-6.74,7.35,-0.884796,0.04,0.005251,0.57,0.074827,-6.21,-0.815220
2,2022-02-01 00:02:00,2677.534833,8.243908,2677.520833,7.700837,2677.657000,2.148665,2677.418000,2.771002,2677.738000,...,2682.38,6.58,9.39,0.681284,0.53,0.054875,2.28,0.236068,8.33,0.862476
3,2022-02-01 00:03:00,2685.595167,14.569968,2685.555333,5.451135,2685.679000,2.043410,2685.457167,2.169612,2685.749167,...,2684.25,1.87,6.69,0.268739,4.06,0.583465,0.76,0.109220,-1.43,-0.205506
4,2022-02-01 00:04:00,2684.818500,3.575345,2684.786167,7.082942,2684.922333,1.226283,2684.678667,1.813000,2685.006000,...,2685.56,1.31,3.23,0.374440,0.59,0.168641,1.33,0.380157,2.05,0.585956
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525595,2023-01-31 23:55:00,1587.304167,54.578250,1587.293667,59.056148,1587.314833,5.758927,1587.274000,19.561002,1587.327167,...,1587.43,0.07,0.56,0.097392,0.04,0.055653,0.45,0.626093,0.48,0.667833
525596,2023-01-31 23:56:00,1586.189000,73.496055,1586.179000,46.007407,1586.200667,13.921285,1586.165333,11.853943,1586.225500,...,1585.45,-1.98,2.34,-0.792461,0.00,0.000000,0.36,0.144084,-1.62,-0.648377
525597,2023-01-31 23:57:00,1585.926000,48.268305,1585.916000,58.029525,1585.947000,2.554725,1585.895167,11.638592,1585.967333,...,1586.31,0.87,1.12,0.680415,0.03,0.023463,0.22,0.172059,1.06,0.829012
525598,2023-01-31 23:58:00,1586.153833,59.995500,1586.143500,34.090373,1586.165333,8.393800,1586.119167,11.684610,1586.182833,...,1585.53,-0.79,0.86,-0.775610,0.07,0.068725,0.00,0.000000,-0.86,-0.844335


In [6]:
chunk(df_train, df_val, df_test)

In [7]:
label_slope(df_train, df_val, df_test)


In [8]:
label_volatility(df_train, df_val, df_test)

In [9]:

df_train = label_whole(df_train).dropna().reset_index(drop=True).iloc[1:].reset_index(drop=True)
df_val = label_whole(df_val).dropna().reset_index(drop=True).iloc[1:].reset_index(drop=True)
df_test = label_whole(df_test).dropna().reset_index(drop=True).iloc[1:].reset_index(drop=True)


In [10]:
df_train

Unnamed: 0,timestamp,ask1_price,ask1_size,bid1_price,bid1_size,ask2_price,ask2_size,bid2_price,bid2_size,ask3_price,...,kmid2,kup,kup2,klow,klow2,ksft,ksft2,slope_360,return,vol_360
0,2022-02-01 06:00:00,2731.168167,4.663482,2731.132167,3.692812,2731.256667,0.881293,2731.010333,1.615357,2731.311667,...,-0.655087,0.01,0.002991,0.87,0.260240,-1.33,-0.397839,0.215068,-0.000896,0.000869
1,2022-02-01 06:01:00,2731.614833,2.077523,2731.572833,3.549348,2731.719000,0.580695,2731.444667,1.327018,2731.769667,...,0.562130,0.58,0.177193,0.58,0.177193,1.84,0.562130,0.213314,0.000740,0.000860
2,2022-02-01 06:02:00,2733.045000,1.527443,2732.999167,4.157523,2733.158333,0.893785,2732.919333,1.537363,2733.218167,...,0.398796,0.47,0.168860,0.93,0.334126,1.57,0.564063,0.213100,0.000406,0.000850
3,2022-02-01 06:03:00,2734.079500,4.517698,2734.039167,4.851038,2734.178000,0.873323,2733.933500,1.776735,2734.229333,...,-0.236170,3.10,0.530528,1.09,0.186540,-3.39,-0.580158,0.211760,-0.000505,0.000850
4,2022-02-01 06:04:00,2732.076833,4.299515,2732.033333,2.955000,2732.165667,1.058407,2731.936667,1.416682,2732.235667,...,-0.429021,0.99,0.290912,0.68,0.199818,-1.77,-0.520115,0.211066,-0.000534,0.000850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525235,2023-01-31 23:55:00,1587.304167,54.578250,1587.293667,59.056148,1587.314833,5.758927,1587.274000,19.561002,1587.327167,...,0.097392,0.04,0.055653,0.45,0.626093,0.48,0.667833,-0.044423,0.000038,0.000885
525236,2023-01-31 23:56:00,1586.189000,73.496055,1586.179000,46.007407,1586.200667,13.921285,1586.165333,11.853943,1586.225500,...,-0.792461,0.00,0.000000,0.36,0.144084,-1.62,-0.648377,-0.044689,-0.001247,0.000888
525237,2023-01-31 23:57:00,1585.926000,48.268305,1585.916000,58.029525,1585.947000,2.554725,1585.895167,11.638592,1585.967333,...,0.680415,0.03,0.023463,0.22,0.172059,1.06,0.829012,-0.044050,0.000542,0.000888
525238,2023-01-31 23:58:00,1586.153833,59.995500,1586.143500,34.090373,1586.165333,8.393800,1586.119167,11.684610,1586.182833,...,-0.775610,0.07,0.068725,0.00,0.000000,-0.86,-0.844335,-0.044307,-0.000492,0.000888


In [11]:

df_train.to_feather('./data/ETHUSDT/whole/train.feather')
df_val.to_feather('./data/ETHUSDT/whole/val.feather')
df_test.to_feather('./data/ETHUSDT/whole/test.feather')