In [1]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 15)
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
%matplotlib inline

In [2]:
import talib
from ta import add_all_ta_features

### Read full database

In [5]:
DATA_PATH = r'../data/'
FILE_NAME = '20220313_034756_111.pkl'
FULL_PATH = Path(DATA_PATH,FILE_NAME)

In [35]:
%time
raw = pd.read_pickle(FULL_PATH)
raw.shape

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 8.82 µs


(5675397, 14)

In [37]:
raw.dtypes

symbol                              object
openTimets                           int64
closeTimets                          int64
openTime                    datetime64[ns]
closeTime                   datetime64[ns]
open                                object
high                                object
low                                 object
close                               object
volume                              object
numTrades                            int64
quoteAssetVolume                    object
takerBuyBaseAssetVolume             object
takerBuyQuoteAssetVolume            object
dtype: object

In [38]:
raw.symbol.value_counts()

BTCUSDT     87964
ETHUSDT     80325
BCHUSDT     78208
XRPUSDT     76486
EOSUSDT     76292
            ...  
BAKEUSDT    28582
BTTUSDT     28339
GTCUSDT     26390
TLMUSDT     23032
KEEPUSDT    22843
Name: symbol, Length: 111, dtype: int64

### Prepare and subset dataset

In [265]:
df = raw.query('symbol=="ETHUSDT"')
assert df.isnull().sum().sum() == 0
df.shape

(80325, 14)

In [266]:
# create datatypes dictionary
datatypes = {k:'float32' if k!='numTrades' else 'int32' for k in df.loc[:,'open':].columns}
# assign datatypes
df = df.astype(datatypes)
assert df.isnull().sum().sum() == 0
df.dtypes

symbol                              object
openTimets                           int64
closeTimets                          int64
openTime                    datetime64[ns]
closeTime                   datetime64[ns]
open                               float32
high                               float32
low                                float32
close                              float32
volume                             float32
numTrades                            int32
quoteAssetVolume                   float32
takerBuyBaseAssetVolume            float32
takerBuyQuoteAssetVolume           float32
dtype: object

In [268]:
df.to_pickle('../data/eth.pkl')

### Read dataset from disk

In [36]:
raw = pd.read_pickle('../data/eth.pkl')

### Group on desired frequency

In [49]:
df = \
    raw.groupby(pd.Grouper(key="openTime", freq="1H"),as_index=False)\
    .agg(
    cnt=pd.NamedAgg(column="symbol", aggfunc="size"),
    opentime=pd.NamedAgg(column="openTime", aggfunc="min"),
    closetime=pd.NamedAgg(column="openTime", aggfunc="min"),
    open=pd.NamedAgg(column="open", aggfunc="first"),
    high=pd.NamedAgg(column="high", aggfunc="max"),
    low=pd.NamedAgg(column="low", aggfunc="min"),
    close=pd.NamedAgg(column="close", aggfunc="last"),
    volume=pd.NamedAgg(column="volume", aggfunc="sum"),
    numtrade=pd.NamedAgg(column="numTrades", aggfunc="sum"),
    )\
    .query('cnt >= 4')\
    .drop('cnt',axis=1)\
    .assign(closetime = lambda x : x.closetime + pd.Timedelta(minutes=59))

# alternative way to resample and regroup data
# gdf = df.resample('30min',on='opentime').agg({'close':'last','high':'max'}).tail(10)
df.tail()

Unnamed: 0,opentime,closetime,open,high,low,close,volume,numtrade
20077,2022-03-12 20:00:00,2022-03-12 20:59:00,2576.76001,2591.659912,2573.870117,2586.800049,25150.1875,35923
20078,2022-03-12 21:00:00,2022-03-12 21:59:00,2586.800049,2591.719971,2583.060059,2585.649902,13017.638672,24267
20079,2022-03-12 22:00:00,2022-03-12 22:59:00,2585.649902,2591.379883,2583.26001,2585.830078,10603.583008,26367
20080,2022-03-12 23:00:00,2022-03-12 23:59:00,2585.840088,2590.199951,2560.0,2567.639893,35792.011719,49081
20081,2022-03-13 00:00:00,2022-03-13 00:59:00,2567.629883,2573.780029,2555.810059,2565.97998,41614.472656,50466


### Examine Target Related Features

In [57]:
df.close.pct_change().describe()
df.close.pct_change().hist(bins=50)
df.close.pct_change().plot.box()

count    20080.000000
mean         0.000204
std          0.010596
min         -0.216952
25%         -0.004041
50%          0.000210
75%          0.004539
max          0.157422
Name: close, dtype: float64

In [232]:
# df.set_index('openTime',inplace=True)
# df.columns = [x.lower() for x in df.columns]
df['change_pct'] = df.close.pct_change()
df['change_chr'] = df.change_pct.apply(np.sign)
df.dropna(inplace=True)

In [224]:
df.change_pct.describe()

count    80324.000000
mean         0.000009
std          0.009614
min         -0.514409
25%         -0.002202
50%          0.000050
75%          0.002320
max          1.063550
Name: change_pct, dtype: float64

### Manually Create Features

In [65]:
df = \
    df.assign(
        fx_sma_fast = lambda x : talib.SMA(x.close,8),
        fx_sma_slow = lambda x : x.close.rolling(24).mean(),
        fx_ema_fast = lambda x : talib.EMA(x.close,8),
        fx_ema_slow = lambda x : talib.EMA(x.close,24),
        fx_atr = lambda x : talib.ATR(x.high,x.low,x.close,12),
        fx_adx = lambda x : talib.ADX(x.high,x.low,x.close,20),
        fx_rsi = lambda x : talib.RSI(x.close,12),
        fx_bb_upper = lambda x : talib.BBANDS(x.close)[0],
        fx_bb_middle = lambda x : talib.BBANDS(x.close)[1],
        fx_bb_lower = lambda x : talib.BBANDS(x.close)[2],
    )

df.tail(2)

Unnamed: 0,opentime,closetime,open,high,low,close,volume,...,fx_ema_slow,fx_atr,fx_adx,fx_rsi,fx_bb_upper,fx_bb_middle,fx_bb_lower
20080,2022-03-12 23:00:00,2022-03-12 23:59:00,2585.840088,2590.199951,2560.0,2567.639893,35792.011719,...,2582.337709,19.186999,6.528699,42.159494,2595.340954,2580.537988,2565.735023
20081,2022-03-13 00:00:00,2022-03-13 00:59:00,2567.629883,2573.780029,2555.810059,2565.97998,41614.472656,...,2581.029091,19.08558,6.748433,41.420362,2597.319099,2578.37998,2559.440862


In [66]:
df.columns

Index(['opentime', 'closetime', 'open', 'high', 'low', 'close', 'volume',
       'numtrade', 'fx_sma_fast', 'fx_sma_slow', 'fx_ema_fast', 'fx_ema_slow',
       'fx_atr', 'fx_adx', 'fx_rsi', 'fx_bb_upper', 'fx_bb_middle',
       'fx_bb_lower'],
      dtype='object')

In [91]:
df.shape

(20081, 79)

### Add all features at once

In [147]:
# qq = add_all_ta_features(
#     df, open="open", high="high", low="low", close="close", volume="volume", fillna=True)
# qq.columns

### Enable Pattern Recognition Features

In [58]:
talib.get_function_groups().keys()

dict_keys(['Cycle Indicators', 'Math Operators', 'Math Transform', 'Momentum Indicators', 'Overlap Studies', 'Pattern Recognition', 'Price Transform', 'Statistic Functions', 'Volatility Indicators', 'Volume Indicators'])

In [60]:
# pattern_recognition_functions = [x for x in dir(talib) if x.startswith('CDL')]
pattern_recognition_functions = talib.get_function_groups().get('Pattern Recognition')
pattern_recognition_functions[1::10]

['CDL3BLACKCROWS',
 'CDLCLOSINGMARUBOZU',
 'CDLGAPSIDESIDEWHITE',
 'CDLIDENTICAL3CROWS',
 'CDLMATHOLD',
 'CDLSPINNINGTOP']

In [62]:
# create pattern recognition features
pattern_features = {
    f'pt_{x.lower()}': getattr(talib,x)(df['open'],df['high'],df['low'],df['close'])
                    for x in pattern_recognition_functions
    }

In [67]:
df = df.assign(**pattern_features)
df.head(2)

Unnamed: 0,opentime,closetime,open,high,low,close,volume,...,pt_cdltakuri,pt_cdltasukigap,pt_cdlthrusting,pt_cdltristar,pt_cdlunique3river,pt_cdlupsidegap2crows,pt_cdlxsidegap3methods
1,2019-11-27 08:00:00,2019-11-27 08:59:00,146.0,146.0,125.029999,133.0,0.08,...,0,0,0,0,0,0,0
2,2019-11-27 09:00:00,2019-11-27 09:59:00,133.0,133.0,133.0,133.0,0.0,...,0,0,0,0,0,0,0


In [86]:
# df[[x for x in df.columns if x.startswith('pt_')]].describe().T['mean'].plot(kind='barh',figsize=(10,15))
df[[x for x in df.columns if x.startswith('pt_')]].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pt_cdl2crows,20081.0,-0.009960,0.997956,-100.0,0.0,0.0,0.0,0.0
pt_cdl3blackcrows,20081.0,-0.004980,0.705679,-100.0,0.0,0.0,0.0,0.0
pt_cdl3inside,20081.0,0.074697,8.319707,-100.0,0.0,0.0,0.0,100.0
pt_cdl3linestrike,20081.0,0.054778,6.029227,-100.0,0.0,0.0,0.0,100.0
pt_cdl3outside,20081.0,0.104576,19.492797,-100.0,0.0,0.0,0.0,100.0
...,...,...,...,...,...,...,...,...
pt_cdlthrusting,20081.0,-0.009960,0.997956,-100.0,0.0,0.0,0.0,0.0
pt_cdltristar,20081.0,-0.009960,0.997956,-100.0,0.0,0.0,0.0,0.0
pt_cdlunique3river,20081.0,0.004980,0.705679,0.0,0.0,0.0,0.0,100.0
pt_cdlupsidegap2crows,20081.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0


### Enable Pivot Levels

In [89]:
def PPS(high,low,close):
    if all([isinstance(item,(list,pd.core.series.Series,np.ndarray)) for item in items]):
        high = np.array(high,dtype='float32')
        low = np.array(low,dtype='float32')
        close = np.array(close,dtype='float32')
    else:
        raise Exception('Wrong type input data type')
    pp = (high + low + close)/3
    r1 = (2 * pp - low)
    s1 = (2 * pp - high)
    r2 = pp + high - low
    s2 = pp - high + low
    r3 = high + 2 * (pp - low)
    s3 = low - 2 * (high - pp)
    return {'pp':pp,'r1':r1,'s1':s1,'r2':r2,'s2':s2,'r3':r3,'s3':s3}

In [90]:
df = df.assign(**PPS(df.high,df.low,df.close))


NameError: name 'items' is not defined