In [1]:
# Load IPython's autoreload extension in mode 2: modules are re-imported before
# each cell runs, so edits to the project's own modules take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [172]:
# Make the project-local `src` package importable: the notebook's working
# directory is one level below the directory that contains `src`, so that
# parent directory is added to sys.path (once) before the `src.*` imports run.
import os
import sys

module_path = os.path.abspath('..')
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)
    
import pickle
import subprocess
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

from src.data.binance_downloader import download_historical_daily_klines
from src.features.feature_generator import (
    feature_pipeline_v1
)

from src.models.metrics import (
    get_metrics
)

from src.visualization.plot_generator import (
    plot_actual_and_predicted_price
)

# Fixed seed, passed as `random_state` to the train/validation shuffle so the
# split is reproducible from run to run.
RANDOM_SEED = 420

In [None]:
# ---------------------------------------------------------------------------
# Download parameters
# ---------------------------------------------------------------------------
TRADING_TYPE = 'spot'
TICKER_SYMBOL = 'BTCUSDT'
INTERVAL = '1m'
# No available data before 2021-03-01
START_DATE = '2021-03-01'
# Yesterday's date in UTC, formatted YYYY-MM-DD. A timezone-aware "now" is used
# because datetime.utcnow() is deprecated (Python 3.12+); the resulting string
# is the same.
END_DATE = (datetime.now(timezone.utc) - timedelta(days=1)).strftime('%Y-%m-%d')
# Column names for the raw kline files.
# Reference: https://github.com/binance/binance-public-data/tree/master
RAW_DF_HEADERS = ['open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time', 'quote_asset_volume', 'num_trades', 'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume', 'ignore']

# ---------------------------------------------------------------------------
# Directories (all relative to the parent of the notebook's working directory)
# ---------------------------------------------------------------------------
DATA_DIR = Path.cwd().parent / 'data'
RAW_DATA_DIR = DATA_DIR / 'raw'
PROCESSED_DATA_DIR = DATA_DIR / 'processed'

MODEL_DIR = Path.cwd().parent / 'models'

BINANCE_HISTORICAL_DATA_DIR = RAW_DATA_DIR / 'binance_historical'
# Built from the download parameters above so the folder always tracks them
# instead of repeating 'spot' / 'BTCUSDT' / '1m' as literals.
BINANCE_HISTORICAL_FILES_DIR = (
    BINANCE_HISTORICAL_DATA_DIR / 'data' / TRADING_TYPE / 'daily' / 'klines' / TICKER_SYMBOL / INTERVAL
)

# ---------------------------------------------------------------------------
# Files read and written by this notebook
# ---------------------------------------------------------------------------
BINANCE_HISTORICAL_DF_PATH = PROCESSED_DATA_DIR / 'binance_historical_df.csv'
BINANCE_PROCESSED_DF_PATH = PROCESSED_DATA_DIR / 'binance_processed_df.csv'
TRAIN_DF_PATH = PROCESSED_DATA_DIR / 'binance_train_df.csv'
VAL_DF_PATH = PROCESSED_DATA_DIR / 'binance_val_df.csv'
TEST_DF_PATH = PROCESSED_DATA_DIR / 'binance_test_df.csv'

# Ensure directories are present. Every directory this notebook writes into is
# created up front: the CSV exports go to PROCESSED_DATA_DIR and the pickled
# encoder goes to MODEL_DIR, and neither to_csv nor open() creates missing
# parent folders.
BINANCE_HISTORICAL_DATA_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

## Generating features
- day_of_week: Mon-Sun (0-6)
- month_of_year: Jan to Dec (1-12)
- hr_of_day: 0-23
- quarter_of_hour: 1-4 corresponding to the quarters of an hour
- close_5m_ma: average of the closing prices in the previous 5 minutes
- close_30m_ma: average of the closing prices in the previous 30 minutes
- close_1h_ma: average of the closing prices in the previous 60 minutes (1 hour)
- close_4h_ma: average of the closing prices in the previous 240 minutes (4 hours)
- close_12h_ma: average of the closing prices in the previous 720 minutes (12 hours)
- close_1d_ma: average of the closing prices in the previous 1440 minutes (24 hours)
- close_15d_ma: average of the closing prices in the previous 21600 minutes (15 days)
- close_30d_ma: average of the closing prices in the previous 43200 minutes (30 days)
- close_t_minus_[x]: previous closing price at t-x minute
- volume_t_minus_[x]: previous volume at t-x minute

In [153]:
# Load the historical kline table from the processed-data folder. The CSV has to
# exist on disk already: no cell visible in this notebook writes it.
historical_df = pd.read_csv(BINANCE_HISTORICAL_DF_PATH)
# Bare expression: shows a preview of the loaded frame as the cell output.
historical_df

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,num_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
0,1614556800000,45134.11,45266.77,45130.34,45260.74,72.517978,1614556859999,3.277691e+06,2207,33.689150,1.522869e+06,0
1,1614556860000,45252.67,45362.07,45250.64,45356.00,65.371778,1614556919999,2.961835e+06,2028,32.499895,1.472609e+06,0
2,1614556920000,45356.00,45371.41,45104.36,45128.57,128.114624,1614556979999,5.795551e+06,2706,47.268294,2.138666e+06,0
3,1614556980000,45128.57,45194.65,45020.87,45037.36,59.964922,1614557039999,2.706678e+06,1502,25.519749,1.152231e+06,0
4,1614557040000,45036.62,45107.01,44977.82,45032.48,57.852895,1614557099999,2.605675e+06,1250,22.489983,1.013207e+06,0
...,...,...,...,...,...,...,...,...,...,...,...,...
976841,1673222100000,17071.38,17075.52,17066.03,17072.40,163.906720,1673222159999,2.798155e+06,4434,78.838300,1.345936e+06,0
976842,1673222160000,17071.92,17084.65,17070.80,17081.43,196.225080,1673222219999,3.351317e+06,5390,114.085820,1.948437e+06,0
976843,1673222220000,17081.75,17176.99,17081.31,17116.89,1629.484480,1673222279999,2.791647e+07,24701,1012.595750,1.734438e+07,0
976844,1673222280000,17117.64,17124.17,17102.96,17108.47,383.824150,1673222339999,6.568321e+06,7708,182.448170,3.122237e+06,0


In [154]:
# Moving-average feature name -> window length in minutes, matching the feature
# list in the markdown above. Longer windows are written as hours/days times
# minutes so the unit conversion is visible.
ma_window_sizes_dict = dict(
    close_5m_ma=5,
    close_30m_ma=30,
    close_1h_ma=60,
    close_4h_ma=4 * 60,
    close_12h_ma=12 * 60,
    close_1d_ma=24 * 60,
    close_15d_ma=15 * 24 * 60,
    close_30d_ma=30 * 24 * 60,
)

# Run the project's v1 feature pipeline on the raw klines. It returns two
# objects: the engineered feature table and an encoder object (pickled in a
# later cell).
# NOTE(review): lag_max_offset_period=120 presumably produces the
# close_t_minus_[x] / volume_t_minus_[x] columns for x = 1..120 -- confirm in
# src.features.feature_generator.
processed_df, ohe_encoder = feature_pipeline_v1(historical_df, ma_window_sizes_dict, lag_max_offset_period=120)

In [155]:
# Persist the encoder returned by the feature pipeline so it can be reloaded
# outside this notebook session.
# NOTE: only unpickle this file from a trusted location -- pickle.load can
# execute arbitrary code.
ohe_encoder_path = str(MODEL_DIR / 'ohe_encoder.pkl')

# open(..., 'wb') does not create missing parent folders, so make sure the
# models directory exists first; otherwise a fresh checkout without it fails
# with FileNotFoundError.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

with open(ohe_encoder_path, 'wb') as f:
    pickle.dump(ohe_encoder, f)

In [156]:
# Print a structural summary of the feature table: index range, number of
# columns, dtype counts and memory usage.
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 933646 entries, 0 to 933645
Columns: 297 entries, close_time to close
dtypes: float64(296), int64(1)
memory usage: 2.1 GB


In [157]:
# Bare expression: shows a preview of the engineered feature table.
processed_df

Unnamed: 0,close_time,close_5m_ma,close_30m_ma,close_1h_ma,close_4h_ma,close_12h_ma,close_1d_ma,close_15d_ma,close_30d_ma,close_t_minus_1,...,hr_of_day_19,hr_of_day_20,hr_of_day_21,hr_of_day_22,hr_of_day_23,quarter_of_hour_1,quarter_of_hour_2,quarter_of_hour_3,quarter_of_hour_4,close
0,1617154259999,58950.654,58913.326667,58914.140333,58767.203250,58775.950458,58430.623618,56182.553918,54509.127372,58961.63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,58985.97
1,1617154319999,58969.512,58913.661333,58917.638833,58768.478125,58775.796319,58431.788597,56182.775644,54509.445085,58985.97,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,58971.95
2,1617154379999,58971.342,58914.306000,58920.403500,58769.749208,58775.691319,58432.936583,56183.000437,54509.760269,58971.95,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,58958.25
3,1617154439999,58969.994,58914.973333,58922.715833,58771.004292,58775.578486,58434.046479,56183.224324,54510.080401,58958.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,58916.84
4,1617154499999,58958.928,58914.505000,58924.179833,58772.108125,58775.443931,58435.152646,56183.450447,54510.401685,58916.84,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,58914.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
933641,1673222159999,17064.976,17049.523333,17031.495833,16978.964542,16951.424653,16944.938403,16741.329269,16886.826203,17071.39,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,17072.40
933642,1673222219999,17067.978,17050.674000,17033.099833,16979.547917,16951.616042,16945.026799,16741.340323,16886.825075,17072.40,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,17081.43
933643,1673222279999,17070.396,17052.354333,17034.967333,16980.168333,16951.821208,16945.121187,16741.351661,16886.824224,17081.43,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,17116.89
933644,1673222339999,17081.788,17054.989333,17037.330333,16980.928417,16952.077528,16945.240479,16741.364624,16886.824141,17116.89,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,17108.47


In [9]:
# List every column name in full; the frame preview elides the middle columns.
list(processed_df.columns)

['close_time',
 'close_5m_ma',
 'close_30m_ma',
 'close_1h_ma',
 'close_4h_ma',
 'close_12h_ma',
 'close_1d_ma',
 'close_15d_ma',
 'close_30d_ma',
 'close_t_minus_1',
 'close_t_minus_2',
 'close_t_minus_3',
 'close_t_minus_4',
 'close_t_minus_5',
 'close_t_minus_6',
 'close_t_minus_7',
 'close_t_minus_8',
 'close_t_minus_9',
 'close_t_minus_10',
 'close_t_minus_11',
 'close_t_minus_12',
 'close_t_minus_13',
 'close_t_minus_14',
 'close_t_minus_15',
 'close_t_minus_16',
 'close_t_minus_17',
 'close_t_minus_18',
 'close_t_minus_19',
 'close_t_minus_20',
 'close_t_minus_21',
 'close_t_minus_22',
 'close_t_minus_23',
 'close_t_minus_24',
 'close_t_minus_25',
 'close_t_minus_26',
 'close_t_minus_27',
 'close_t_minus_28',
 'close_t_minus_29',
 'close_t_minus_30',
 'close_t_minus_31',
 'close_t_minus_32',
 'close_t_minus_33',
 'close_t_minus_34',
 'close_t_minus_35',
 'close_t_minus_36',
 'close_t_minus_37',
 'close_t_minus_38',
 'close_t_minus_39',
 'close_t_minus_40',
 'close_t_minus_41',
 

In [12]:
# Save the full feature table to disk. index=False leaves the row index out of
# the file, so reading it back does not add an extra index column.
processed_df.to_csv(BINANCE_PROCESSED_DF_PATH, index=False)

## Train-test-validation split
- Test set: the most recent day of data (the last 1440 one-minute rows).
- Train/validation sets: all remaining rows, randomly shuffled and split 80/20.

In [18]:
# Hold out the most recent `test_days` days as the test set.
# 1440 = minutes per day; the table is treated as one row per minute.
# NOTE(review): this counts rows, not timestamps -- if the source data has
# missing minutes, the last 1440 rows span slightly more than one calendar day.
test_days = 1
test_minutes = test_days * 1440

# Guard the slice bounds. With test_minutes == 0, iloc[-0:] would select EVERY
# row as test data (and iloc[:-0] would leave no training rows); a value
# >= len(processed_df) would likewise leave nothing to train on. Fail loudly
# instead of producing a silently wrong split.
if not 0 < test_minutes < len(processed_df):
    raise ValueError(
        f"test_minutes={test_minutes} must be between 1 and {len(processed_df) - 1} "
        "so that both the test set and the remaining train/validation rows are non-empty"
    )

# Last `test_minutes` rows, renumbered from 0.
test_df = processed_df.iloc[-test_minutes:].reset_index(drop=True)

In [19]:
# Bare expression: shows a preview of the held-out test frame.
test_df

Unnamed: 0,close_time,close_5m_ma,close_30m_ma,close_1h_ma,close_4h_ma,close_12h_ma,close_1d_ma,close_15d_ma,close_30d_ma,close_t_minus_1,...,hr_of_day_19,hr_of_day_20,hr_of_day_21,hr_of_day_22,hr_of_day_23,quarter_of_hour_1,quarter_of_hour_2,quarter_of_hour_3,quarter_of_hour_4,close
0,1673136059999,16944.756,16943.358000,16942.280500,16941.219958,16933.379153,16936.345368,16733.658138,16894.848814,16943.57,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,16947.53
1,1673136119999,16945.240,16943.592333,16942.481000,16941.235250,16933.428264,16936.341958,16733.666010,16894.842357,16947.53,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,16947.50
2,1673136179999,16945.638,16943.766333,16942.735667,16941.252917,16933.474597,16936.336868,16733.673637,16894.835841,16947.50,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,16948.03
3,1673136239999,16946.222,16944.005667,16942.888667,16941.266917,16933.522250,16936.336660,16733.681368,16894.829305,16948.03,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,16951.60
4,1673136299999,16947.646,16944.344000,16943.092667,16941.285875,16933.575167,16936.340931,16733.689262,16894.822890,16951.60,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,16951.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,1673222159999,17064.976,17049.523333,17031.495833,16978.964542,16951.424653,16944.938403,16741.329269,16886.826203,17071.39,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,17072.40
1436,1673222219999,17067.978,17050.674000,17033.099833,16979.547917,16951.616042,16945.026799,16741.340323,16886.825075,17072.40,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,17081.43
1437,1673222279999,17070.396,17052.354333,17034.967333,16980.168333,16951.821208,16945.121187,16741.351661,16886.824224,17081.43,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,17116.89
1438,1673222339999,17081.788,17054.989333,17037.330333,16980.928417,16952.077528,16945.240479,16741.364624,16886.824141,17116.89,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,17108.47


In [25]:
# Every row before the held-out test window is shuffled and divided 80/20 into
# training and validation frames; the fixed seed keeps the split reproducible.
# NOTE(review): neighbouring minutes share most of their lag and moving-average
# inputs, so a shuffled split puts near-identical rows on both sides and the
# validation score is likely optimistic -- consider a chronological split.
train_df, val_df = train_test_split(
    processed_df.iloc[:-test_minutes],
    train_size=0.8,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [30]:
# After shuffling, each frame still carries its original row labels in
# scrambled order; renumber both from 0 and discard the old labels.
train_df, val_df = (
    split.reset_index(drop=True) for split in (train_df, val_df)
)

In [31]:
# Bare expression: shows a preview of the shuffled training frame.
train_df

Unnamed: 0,close_time,close_5m_ma,close_30m_ma,close_1h_ma,close_4h_ma,close_12h_ma,close_1d_ma,close_15d_ma,close_30d_ma,close_t_minus_1,...,hr_of_day_19,hr_of_day_20,hr_of_day_21,hr_of_day_22,hr_of_day_23,quarter_of_hour_1,quarter_of_hour_2,quarter_of_hour_3,quarter_of_hour_4,close
0,1619902559999,57738.066,57787.865333,57764.579500,57646.974333,57585.659458,57682.424236,54690.527794,57132.834629,57709.98,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,57697.65
1,1653368339999,29379.760,29379.540667,29343.117167,29257.758125,29400.114306,29898.020319,30027.664135,34084.464284,29365.95,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,29363.85
2,1631231579999,46366.350,46471.072667,46498.214667,46423.921333,46669.420431,46407.188479,48658.647237,47880.710192,46364.05,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,46319.95
3,1665827399999,19113.620,19106.874000,19096.394333,19147.642875,19167.987042,19300.646028,19481.858864,19401.003137,19111.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,19109.07
4,1635839879999,61588.184,61596.754333,61657.118667,61530.076125,61163.820528,61372.253139,61838.177807,58637.584824,61604.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,61570.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745759,1652395319999,28562.996,28504.405000,28464.220167,28516.035708,28744.667139,28565.457000,35979.524476,38092.413775,28567.15,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,28544.81
745760,1657788599999,19861.720,19934.020667,19963.584833,20037.829750,20092.804028,19858.068049,20158.805229,20458.637807,19866.51,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,19853.58
745761,1655811419999,21115.018,21210.096000,21243.766667,21230.749250,20873.025264,20650.466757,24811.143018,27374.952543,21076.16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,21062.79
745762,1628796779999,44402.038,44360.229333,44348.765833,44200.891208,44673.904542,45249.908424,41903.569166,37586.906360,44411.72,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,44372.84


In [32]:
# Bare expression: shows a preview of the shuffled validation frame.
val_df

Unnamed: 0,close_time,close_5m_ma,close_30m_ma,close_1h_ma,close_4h_ma,close_12h_ma,close_1d_ma,close_15d_ma,close_30d_ma,close_t_minus_1,...,hr_of_day_19,hr_of_day_20,hr_of_day_21,hr_of_day_22,hr_of_day_23,quarter_of_hour_1,quarter_of_hour_2,quarter_of_hour_3,quarter_of_hour_4,close
0,1626176819999,33157.988,33072.082333,33067.163000,33169.253875,33096.353486,33147.477514,34055.082187,34764.518758,33130.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,33149.99
1,1620605219999,58233.168,58237.226000,58194.618833,57994.566750,57536.874556,57935.820486,55656.995050,56826.319938,58153.25,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,58110.44
2,1625413559999,35473.538,35458.796333,35454.528667,35420.683542,35317.502361,34965.452292,33810.062260,35344.805151,35481.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,35476.41
3,1644536159999,43597.262,43775.614333,43835.435000,44045.789708,44541.723750,44327.239007,39790.904372,39989.903410,43522.53,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,43517.37
4,1669880639999,17115.278,17110.108000,17112.578167,17125.992792,17119.777333,16993.027986,16504.123075,17651.727190,17115.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,17116.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186437,1659170639999,23875.606,23882.269667,23877.675167,23815.832708,23843.058431,23849.246764,22335.034915,21257.658361,23902.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,23895.30
186438,1672280099999,16528.492,16544.341000,16552.506667,16545.597167,16595.386944,16623.686569,16912.939343,16970.417597,16526.89,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,16531.30
186439,1636120259999,61672.088,61652.842333,61582.959333,61522.844125,61884.829194,61578.360215,61567.530157,60049.574798,61652.16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,61661.07
186440,1618480379999,62583.306,62687.582000,62673.229000,62880.699375,62913.134250,63052.099840,59303.392965,57787.765515,62555.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,62552.39


In [37]:
# Show the held-out test frame again (same frame as the earlier preview), here
# alongside the train and validation previews.
test_df

Unnamed: 0,close_time,close_5m_ma,close_30m_ma,close_1h_ma,close_4h_ma,close_12h_ma,close_1d_ma,close_15d_ma,close_30d_ma,close_t_minus_1,...,hr_of_day_19,hr_of_day_20,hr_of_day_21,hr_of_day_22,hr_of_day_23,quarter_of_hour_1,quarter_of_hour_2,quarter_of_hour_3,quarter_of_hour_4,close
0,1673136059999,16944.756,16943.358000,16942.280500,16941.219958,16933.379153,16936.345368,16733.658138,16894.848814,16943.57,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,16947.53
1,1673136119999,16945.240,16943.592333,16942.481000,16941.235250,16933.428264,16936.341958,16733.666010,16894.842357,16947.53,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,16947.50
2,1673136179999,16945.638,16943.766333,16942.735667,16941.252917,16933.474597,16936.336868,16733.673637,16894.835841,16947.50,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,16948.03
3,1673136239999,16946.222,16944.005667,16942.888667,16941.266917,16933.522250,16936.336660,16733.681368,16894.829305,16948.03,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,16951.60
4,1673136299999,16947.646,16944.344000,16943.092667,16941.285875,16933.575167,16936.340931,16733.689262,16894.822890,16951.60,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,16951.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,1673222159999,17064.976,17049.523333,17031.495833,16978.964542,16951.424653,16944.938403,16741.329269,16886.826203,17071.39,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,17072.40
1436,1673222219999,17067.978,17050.674000,17033.099833,16979.547917,16951.616042,16945.026799,16741.340323,16886.825075,17072.40,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,17081.43
1437,1673222279999,17070.396,17052.354333,17034.967333,16980.168333,16951.821208,16945.121187,16741.351661,16886.824224,17081.43,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,17116.89
1438,1673222339999,17081.788,17054.989333,17037.330333,16980.928417,16952.077528,16945.240479,16741.364624,16886.824141,17116.89,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,17108.47


In [34]:
# Write the training and validation frames to their CSV files, leaving the row
# index out of each file.
for split_frame, split_path in ((train_df, TRAIN_DF_PATH), (val_df, VAL_DF_PATH)):
    split_frame.to_csv(split_path, index=False)

In [38]:
# Write the held-out test frame to its CSV file, leaving the row index out.
test_df.to_csv(TEST_DF_PATH, index=False)