In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [14]:
# To ensure our src module can be found and imported
import os
import sys
# Absolute path of the working directory's parent; it is registered on
# sys.path at most once so the `src` package below can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
import subprocess
import time
import pickle
import pandas as pd

from datetime import datetime, timedelta, timezone
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder

from src.data.binance_downloader import (
    download_historical_daily_klines,
    generate_latest_historical_df,
    get_realtime_klines,
)

from src.features.feature_generator import (
    feature_pipeline_v1
)

In [15]:
# --- Project layout ---------------------------------------------------------
# Every path is resolved relative to the parent of the notebook's working
# directory.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / 'data'
RAW_DATA_DIR = DATA_DIR / 'raw'
PROCESSED_DATA_DIR = DATA_DIR / 'processed'

MODEL_DIR = PROJECT_ROOT / 'models'

# --- Download parameters ----------------------------------------------------
TRADING_TYPE = 'spot'
TICKER_SYMBOL = 'BTCUSDT'
INTERVAL = '1m'
# No available data before 2021-03-01
START_DATE = '2021-03-01'
# Yesterday's date in UTC. A timezone-aware "now" replaces datetime.utcnow(),
# which is deprecated since Python 3.12 and returns a naive datetime.
# NOTE(review): presumably yesterday is used because the current day's daily
# archive is not published yet -- confirm.
END_DATE = (datetime.now(timezone.utc) - timedelta(days=1)).strftime('%Y-%m-%d')
# Reference: https://github.com/binance/binance-public-data/tree/master
RAW_DF_HEADERS = [
    'open_time',
    'open',
    'high',
    'low',
    'close',
    'volume',
    'close_time',
    'quote_asset_volume',
    'num_trades',
    'taker_buy_base_asset_volume',
    'taker_buy_quote_asset_volume',
    'ignore',
]

# --- Binance file locations -------------------------------------------------
BINANCE_HISTORICAL_DATA_DIR = RAW_DATA_DIR / 'binance_historical'
# Derived from the download parameters above so the sub-path cannot drift out
# of sync with them (resolves to data/spot/daily/klines/BTCUSDT/1m).
BINANCE_HISTORICAL_FILES_DIR = (
    BINANCE_HISTORICAL_DATA_DIR / 'data' / TRADING_TYPE / 'daily' / 'klines' / TICKER_SYMBOL / INTERVAL
)
BINANCE_HISTORICAL_DF_PATH = PROCESSED_DATA_DIR / 'binance_historical_df.csv'
BINANCE_PROCESSED_DF_PATH = PROCESSED_DATA_DIR / 'binance_processed_df.csv'
TRAIN_DF_PATH = PROCESSED_DATA_DIR / 'binance_train_df.csv'
VAL_DF_PATH = PROCESSED_DATA_DIR / 'binance_val_df.csv'
TEST_DF_PATH = PROCESSED_DATA_DIR / 'binance_test_df.csv'

# Ensure directories are present. The processed directory is created as well
# because the CSV paths above live inside it.
BINANCE_HISTORICAL_DATA_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

In [23]:
# Fetch the historical klines through the project helper and keep the
# resulting DataFrame. Paths are handed over as plain strings.
# NOTE(review): the CSV path passed here is BINANCE_PROCESSED_DF_PATH, while
# BINANCE_HISTORICAL_DF_PATH is defined in the config cell and not used in this
# part of the notebook -- confirm which file is intended.
historical_df = generate_latest_historical_df(
    TRADING_TYPE,
    TICKER_SYMBOL,
    INTERVAL,
    START_DATE,
    END_DATE,
    str(BINANCE_HISTORICAL_DATA_DIR),
    str(BINANCE_HISTORICAL_FILES_DIR),
    str(BINANCE_PROCESSED_DF_PATH),
    RAW_DF_HEADERS,
    write_csv=True,  # presumably persists the frame to the CSV path above -- confirm
)

{'serverTime': 1673361875450}
Found 1 symbols
[1/1] - start download daily BTCUSDT klines 


In [24]:
# Preview the historical frame (the rendered output elides the middle rows).
historical_df

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,num_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
0,1614556800000,45134.11,45266.77,45130.34,45260.74,72.517978,1614556859999,3.277691e+06,2207,33.689150,1.522869e+06,0
1,1614556860000,45252.67,45362.07,45250.64,45356.00,65.371778,1614556919999,2.961835e+06,2028,32.499895,1.472609e+06,0
2,1614556920000,45356.00,45371.41,45104.36,45128.57,128.114624,1614556979999,5.795551e+06,2706,47.268294,2.138666e+06,0
3,1614556980000,45128.57,45194.65,45020.87,45037.36,59.964922,1614557039999,2.706678e+06,1502,25.519749,1.152231e+06,0
4,1614557040000,45036.62,45107.01,44977.82,45032.48,57.852895,1614557099999,2.605675e+06,1250,22.489983,1.013207e+06,0
...,...,...,...,...,...,...,...,...,...,...,...,...
978281,1673308500000,17195.01,17195.62,17188.40,17190.49,73.435490,1673308559999,1.262463e+06,2453,37.196090,6.394515e+05,0
978282,1673308560000,17189.86,17193.71,17189.86,17191.33,60.592610,1673308619999,1.041708e+06,1993,30.530710,5.248899e+05,0
978283,1673308620000,17191.33,17192.57,17190.03,17190.39,44.755440,1673308679999,7.694088e+05,1548,22.543370,3.875600e+05,0
978284,1673308680000,17190.39,17191.50,17184.94,17187.42,96.226770,1673308739999,1.653931e+06,2473,51.140710,8.790027e+05,0


In [25]:
# Summarise column dtypes, non-null counts and memory usage.
historical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 978286 entries, 0 to 978285
Data columns (total 12 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   open_time                     978286 non-null  int64  
 1   open                          978286 non-null  float64
 2   high                          978286 non-null  float64
 3   low                           978286 non-null  float64
 4   close                         978286 non-null  float64
 5   volume                        978286 non-null  float64
 6   close_time                    978286 non-null  int64  
 7   quote_asset_volume            978286 non-null  float64
 8   num_trades                    978286 non-null  int64  
 9   taker_buy_base_asset_volume   978286 non-null  float64
 10  taker_buy_quote_asset_volume  978286 non-null  float64
 11  ignore                        978286 non-null  int64  
dtypes: float64(8), int64(4)
memory usage: 89.6 M

## Get today's klines using the real-time API

In [28]:
# Let the latest historical close time be the start time for querying the real-time API.
# The column is selected before taking the last element: `iloc[-1]` on the whole
# frame would build a mixed int/float row that pandas upcasts to float64, so the
# timestamp would pass through a float before `int()`.
historical_end_time = int(historical_df['close_time'].iloc[-1])

# Start one millisecond after the last historical close.
realtime_klines = get_realtime_klines(start_time=historical_end_time + 1, ticker=TICKER_SYMBOL, interval=INTERVAL)
# Reuse the historical column names and convert every column to a numeric dtype.
realtime_df = pd.DataFrame(realtime_klines, columns=RAW_DF_HEADERS).apply(pd.to_numeric)

In [29]:
# Preview the real-time frame (the rendered output elides the middle rows).
realtime_df

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,num_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
0,1673308800000,17179.04,17184.12,17178.19,17182.31,131.36468,1673308859999,2.257004e+06,3521,74.00751,1.271546e+06,0
1,1673308860000,17182.31,17189.33,17181.41,17185.84,153.28000,1673308919999,2.634216e+06,3929,85.57492,1.470653e+06,0
2,1673308920000,17185.45,17191.90,17182.51,17190.72,178.41648,1673308979999,3.066535e+06,3833,99.24887,1.705825e+06,0
3,1673308980000,17190.72,17196.29,17190.00,17196.28,150.97590,1673309039999,2.595736e+06,4435,87.33872,1.501633e+06,0
4,1673309040000,17195.83,17200.81,17195.43,17199.69,147.13360,1673309099999,2.530435e+06,3281,88.10233,1.515208e+06,0
...,...,...,...,...,...,...,...,...,...,...,...,...
883,1673361780000,17259.00,17276.16,17256.32,17263.20,330.28720,1673361839999,5.702682e+06,8389,162.17186,2.800053e+06,0
884,1673361840000,17263.20,17265.99,17259.53,17260.18,161.81834,1673361899999,2.793351e+06,5063,76.74275,1.324786e+06,0
885,1673361900000,17260.18,17276.00,17256.00,17269.93,435.92700,1673361959999,7.526697e+06,8032,200.86198,3.467965e+06,0
886,1673361960000,17270.57,17271.74,17253.62,17254.17,342.03439,1673362019999,5.904022e+06,7789,130.39259,2.250798e+06,0
