In [13]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 15)
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
%matplotlib inline

In [96]:
import talib
from ta import add_all_ta_features

### Read dataset

In [5]:
# Directory and file name of the pickled dataset; FULL_PATH is what the load cell reads.
DATA_PATH = r'../data/'
FILE_NAME = '20220313_034756_111.pkl'
FULL_PATH = Path(DATA_PATH) / FILE_NAME

In [35]:
%time
raw = pd.read_pickle(FULL_PATH)
raw.shape

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 8.82 µs


(5675397, 14)

In [37]:
# Show the dtype of every column of the freshly loaded frame.
raw.dtypes

symbol                              object
openTimets                           int64
closeTimets                          int64
openTime                    datetime64[ns]
closeTime                   datetime64[ns]
open                                object
high                                object
low                                 object
close                               object
volume                              object
numTrades                            int64
quoteAssetVolume                    object
takerBuyBaseAssetVolume             object
takerBuyQuoteAssetVolume            object
dtype: object

In [38]:
# Row count per symbol, sorted from most to fewest rows.
raw['symbol'].value_counts()

BTCUSDT     87964
ETHUSDT     80325
BCHUSDT     78208
XRPUSDT     76486
EOSUSDT     76292
            ...  
BAKEUSDT    28582
BTTUSDT     28339
GTCUSDT     26390
TLMUSDT     23032
KEEPUSDT    22843
Name: symbol, Length: 111, dtype: int64

### Prepare and subset dataset

In [265]:
# Restrict the dataset to the ETHUSDT rows and make sure nothing is missing.
df = raw[raw['symbol'] == 'ETHUSDT']
assert not df.isnull().any().any()
df.shape

(80325, 14)

In [266]:
# Every column from 'open' onwards becomes float32, except numTrades,
# which is stored as int32.
value_columns = df.loc[:, 'open':].columns
datatypes = {
    col: ('int32' if col == 'numTrades' else 'float32')
    for col in value_columns
}
# Apply the conversion and re-check that it did not introduce missing values.
df = df.astype(datatypes)
assert not df.isnull().any().any()
df.dtypes

symbol                              object
openTimets                           int64
closeTimets                          int64
openTime                    datetime64[ns]
closeTime                   datetime64[ns]
open                               float32
high                               float32
low                                float32
close                              float32
volume                             float32
numTrades                            int32
quoteAssetVolume                   float32
takerBuyBaseAssetVolume            float32
takerBuyQuoteAssetVolume           float32
dtype: object

In [268]:
# Persist the typed ETHUSDT subset into the configured data directory
# (built from DATA_PATH instead of repeating the hard-coded folder).
df.to_pickle(Path(DATA_PATH, 'eth.pkl'))

### Target Related Features

In [232]:
# Lower-case the column names, then add the row-over-row relative change of
# `close` (change_pct) and its sign (change_chr). Built as a single chain so
# nothing is mutated in place.
# NOTE: pct_change() always yields NaN for the first row, so re-running this
# cell on an already processed `df` drops one more leading row each time.
df = (
    df.rename(columns=str.lower)
      .assign(
          change_pct=lambda x: x.close.pct_change(),
          # vectorised sign (-1.0 / 0.0 / 1.0) instead of an element-wise apply
          change_chr=lambda x: np.sign(x.change_pct),
      )
      .dropna()
)

In [224]:
# Summary statistics of the per-row relative change.
df['change_pct'].describe()

count    80324.000000
mean         0.000009
std          0.009614
min         -0.514409
25%         -0.002202
50%          0.000050
75%          0.002320
max          1.063550
Name: change_pct, dtype: float64

### Time Grouper - Resample

In [262]:
# Index by open time, cut out a fixed window and show its first ten rows.
(
    df.set_index('opentime')
      .loc['2022-03-12 20:00:00':'2022-03-12 23:30:00']
      .head(10)
)

Unnamed: 0_level_0,symbol,opentimets,closetimets,closetime,open,high,low,close,volume,numtrades,quoteassetvolume,takerbuybaseassetvolume,takerbuyquoteassetvolume,change_pct,change_chr
opentime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-03-12 20:00:00,ETHUSDT,1647115200000,1647116099999,2022-03-12 20:14:59,2576.76001,2580.379883,2573.870117,2579.51001,8516.182617,9737.0,21950342.0,4004.438965,10319064.0,0.001063,1.0
2022-03-12 20:15:00,ETHUSDT,1647116100000,1647116999999,2022-03-12 20:29:59,2579.51001,2587.629883,2579.449951,2587.060059,7683.106934,10116.0,19855858.0,3845.318115,9937714.0,0.002927,1.0
2022-03-12 20:30:00,ETHUSDT,1647117000000,1647117899999,2022-03-12 20:44:59,2587.050049,2588.810059,2584.469971,2588.070068,3602.998047,7579.0,9321289.0,1559.209961,4034204.75,0.00039,1.0
2022-03-12 20:45:00,ETHUSDT,1647117900000,1647118799999,2022-03-12 20:59:59,2588.080078,2591.659912,2584.870117,2586.800049,5347.899902,8491.0,13840698.0,3395.300049,8788191.0,-0.000491,-1.0
2022-03-12 21:00:00,ETHUSDT,1647118800000,1647119699999,2022-03-12 21:14:59,2586.800049,2591.719971,2586.01001,2589.600098,4861.140137,6872.0,12587590.0,2508.48291,6494942.5,0.001082,1.0
2022-03-12 21:15:00,ETHUSDT,1647119700000,1647120599999,2022-03-12 21:29:59,2589.600098,2591.169922,2587.870117,2588.419922,2795.876953,5603.0,7239653.5,1135.925049,2941497.25,-0.000456,-1.0
2022-03-12 21:30:00,ETHUSDT,1647120600000,1647121499999,2022-03-12 21:44:59,2588.429932,2591.02002,2588.429932,2590.300049,2369.573975,5759.0,6136689.0,1155.935059,2993456.0,0.000726,1.0
2022-03-12 21:45:00,ETHUSDT,1647121500000,1647122399999,2022-03-12 21:59:59,2590.300049,2590.469971,2583.060059,2585.649902,2991.048096,6033.0,7737060.5,1319.43396,3412909.75,-0.001795,-1.0
2022-03-12 22:00:00,ETHUSDT,1647122400000,1647123299999,2022-03-12 22:14:59,2585.649902,2588.409912,2584.429932,2584.639893,3061.704102,6739.0,7919266.0,1248.446045,3229312.75,-0.000391,-1.0
2022-03-12 22:15:00,ETHUSDT,1647123300000,1647124199999,2022-03-12 22:29:59,2584.629883,2588.070068,2583.26001,2587.76001,3016.220947,7105.0,7798097.5,2110.601074,5456568.0,0.001207,1.0


In [261]:
# Aggregate the rows into 30-minute buckets keyed on `opentime`: last close and
# highest high per bucket. (df.resample('30min', on='opentime') is an
# alternative spelling of the same grouping.)
(
    df.groupby(pd.Grouper(key="opentime", freq="30min"))
      .agg(close=('close', 'last'), high=('high', 'max'))
      .tail(10)
)

Unnamed: 0_level_0,close,high
opentime,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-03-12 20:00:00,2587.060059,2587.629883
2022-03-12 20:30:00,2586.800049,2591.659912
2022-03-12 21:00:00,2588.419922,2591.719971
2022-03-12 21:30:00,2585.649902,2591.02002
2022-03-12 22:00:00,2587.76001,2588.409912
2022-03-12 22:30:00,2585.830078,2591.379883
2022-03-12 23:00:00,2587.5,2590.199951
2022-03-12 23:30:00,2567.639893,2587.5
2022-03-13 00:00:00,2572.189941,2573.780029
2022-03-13 00:30:00,2565.97998,2572.820068


### Manually Create Features

In [264]:
# Hand-picked indicator features (prefixed `fx_`), previewed on the last two rows.
# Bollinger bands are computed once and unpacked, rather than calling BBANDS
# three times and indexing the result each time.
bb_upper, bb_middle, bb_lower = talib.BBANDS(df.close)
df.assign(
    fx_sma_fast = lambda x : talib.SMA(x.close,8),
    # slow SMA uses a pandas rolling mean; the fast one uses talib
    fx_sma_slow = lambda x : x.close.rolling(24).mean(),
    fx_ema_fast = lambda x : talib.EMA(x.close,8),
    fx_ema_slow = lambda x : talib.EMA(x.close,24),
    fx_atr = lambda x : talib.ATR(x.high,x.low,x.close,12),
    fx_rsi = lambda x : talib.RSI(x.close,12),
    fx_bb_upper = bb_upper,
    fx_bb_middle = bb_middle,
    fx_bb_lower = bb_lower,
    # every keyword argument needs its trailing comma; a missing one is a SyntaxError
    fx_adx = lambda x : talib.ADX(x.high,x.low,x.close,20),
).tail(2)

Unnamed: 0,symbol,opentimets,closetimets,opentime,closetime,open,high,...,fx_ema_slow,fx_atr,fx_rsi,fx_bb_upper,fx_bb_middle,fx_bb_lower,fx_adx
2153164,ETHUSDT,1575202500000,1575203399999,2019-12-01 12:15:00,2019-12-01 12:29:59,147.600006,147.800003,...,147.482921,0.693618,51.553855,147.846085,147.467999,147.089913,35.121647
2153165,ETHUSDT,1575203400000,1575204299999,2019-12-01 12:30:00,2019-12-01 12:44:59,147.559998,148.149994,...,147.524287,0.685816,58.078764,148.09086,147.529999,146.969138,34.31499


### Add all features at once

In [147]:
# qq = add_all_ta_features(
#     df, open="open", high="high", low="low", close="close", volume="volume", fillna=True)
# qq.columns

### Enable Pattern Recognition Features

In [198]:
# List the talib function groups, then evaluate every function of the
# 'Pattern Recognition' group on the open/high/low/close columns.
function_groups = talib.get_function_groups()
print(function_groups.keys())
pattern_recognition_functions = function_groups.get('Pattern Recognition')
ohlc = (df['open'], df['high'], df['low'], df['close'])
# One entry per pattern function, keyed as pt_<lower-cased function name>.
pattern_features = {
    f'pt_{name.lower()}': getattr(talib, name)(*ohlc)
    for name in pattern_recognition_functions
}

dict_keys(['Cycle Indicators', 'Math Operators', 'Math Transform', 'Momentum Indicators', 'Overlap Studies', 'Pattern Recognition', 'Price Transform', 'Statistic Functions', 'Volatility Indicators', 'Volume Indicators'])


In [199]:
# Attach all pattern columns to a copy of the frame and preview two rows.
df_with_patterns = df.assign(**pattern_features)
df_with_patterns.head(2)

Unnamed: 0_level_0,symbol,opentimets,closetimets,closetime,open,high,low,...,pt_cdltakuri,pt_cdltasukigap,pt_cdlthrusting,pt_cdltristar,pt_cdlunique3river,pt_cdlupsidegap2crows,pt_cdlxsidegap3methods
openTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-03-02 15:15:00,ETHUSDT,1646234100000,1646234999999,2022-03-02 15:29:59,2954.810059,2985.76001,2945.139893,...,0,0,0,0,0,0,0
2022-03-02 15:30:00,ETHUSDT,1646235000000,1646235899999,2022-03-02 15:44:59,2978.209961,2994.97998,2950.810059,...,0,0,0,0,0,0,0


### Enable Pivot Levels

In [190]:
def PPS(high,low,close):
    """Compute a pivot point and three resistance/support levels per element.

    Parameters
    ----------
    high, low, close : list, pandas.Series or numpy.ndarray
        Equally shaped sequences of prices. Each is converted to a float32
        numpy array before the arithmetic.

    Returns
    -------
    dict of str -> numpy.ndarray
        Keys 'pp', 'r1', 's1', 'r2', 's2', 'r3', 's3', computed element-wise:
        pp = (high + low + close) / 3
        r1 = 2*pp - low           s1 = 2*pp - high
        r2 = pp + (high - low)    s2 = pp - (high - low)
        r3 = high + 2*(pp - low)  s3 = low - 2*(high - pp)

    Raises
    ------
    TypeError
        If any argument is not a list, pandas Series or numpy array.
    """
    # The inputs are validated together; `items` must exist before the check.
    items = (high, low, close)
    if not all(isinstance(item, (list, pd.Series, np.ndarray)) for item in items):
        raise TypeError(
            'Wrong input data type: high, low and close must each be a list, '
            'pandas Series or numpy array'
        )
    high = np.array(high,dtype='float32')
    low = np.array(low,dtype='float32')
    close = np.array(close,dtype='float32')
    pp = (high + low + close)/3
    r1 = (2 * pp - low)
    s1 = (2 * pp - high)
    r2 = pp + high - low
    s2 = pp - high + low
    r3 = high + 2 * (pp - low)
    s3 = low - 2 * (high - pp)
    return {'pp':pp,'r1':r1,'s1':s1,'r2':r2,'s2':s2,'r3':r3,'s3':s3}

In [197]:
# Compute the pivot levels from the frame's own high/low/close columns and
# preview them next to the original data.
pivot_levels = PPS(df['high'], df['low'], df['close'])
df.assign(**pivot_levels).head(2)

Unnamed: 0_level_0,symbol,opentimets,closetimets,closetime,open,high,low,...,pp,r1,s1,r2,s2,r3,s3
openTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-03-02 15:15:00,ETHUSDT,1646234100000,1646234999999,2022-03-02 15:29:59,2954.810059,2985.76001,2945.139893,...,2969.703125,2994.266357,2953.64624,3010.322998,2929.083008,3034.886475,2913.026123
2022-03-02 15:30:00,ETHUSDT,1646235000000,1646235899999,2022-03-02 15:44:59,2978.209961,2994.97998,2950.810059,...,2967.900146,2984.990234,2940.820312,3012.069824,2923.730225,3029.160156,2896.650391
