In [248]:
from common.mysql import get_close_prices
from datetime import datetime
import pandas as pd

# Load the "1h" price history for "btcusdtperp".
# NOTE(review): the third and fourth positional arguments are presumably the
# start/end bounds of the query — confirm against common.mysql.get_close_prices.
cutoff = datetime(2022, 10, 30)
raw_df = get_close_prices("btcusdtperp", "1h", None, cutoff)

# Sanity check: number of rows, then the first ten rows.
row_count = raw_df.shape[0]
print('simple size: %d' % row_count)
print(raw_df.head(10))



simple size: 24689
                              open     high      low    close    volume
open_time                                                              
2020-01-01 08:00:00+08:00  7189.43  7190.52  7170.15  7171.55  2449.050
2020-01-01 09:00:00+08:00  7171.43  7225.00  7171.10  7210.24  3865.040
2020-01-01 10:00:00+08:00  7210.38  7239.30  7206.46  7237.99  3228.360
2020-01-01 11:00:00+08:00  7237.41  7239.74  7215.00  7221.65  2513.310
2020-01-01 12:00:00+08:00  7221.80  7225.41  7211.22  7213.86  1176.670
2020-01-01 13:00:00+08:00  7213.86  7224.09  7213.86  7220.31   928.221
2020-01-01 14:00:00+08:00  7220.31  7230.00  7217.60  7221.00  1283.560
2020-01-01 15:00:00+08:00  7220.99  7227.20  7193.52  7205.26  2349.910
2020-01-01 16:00:00+08:00  7205.26  7206.29  7174.00  7195.24  3385.350
2020-01-01 17:00:00+08:00  7195.23  7204.99  7182.76  7184.45  1687.290


In [249]:
import numpy as np
import pandas_ta as ta

class TargetBuilder:
    """Add return features and a binary target column to an OHLCV frame.

    A row is labelled 1 when its forward log return over ``timeperiod`` rows is
    strictly greater than ``target_value``; otherwise it is labelled 0.
    """

    timeperiod: int

    def __init__(self, timeperiod: int = 1, pnl_pct: float = 0.015, fee_pct: float = 0.08):
        """Store the horizon and derive the label threshold.

        Args:
            timeperiod: Number of rows used both as the look-back of the return
                features and as the look-ahead of the target columns.
            pnl_pct: Divided by 100 before use (presumably the required profit,
                expressed in percent).
            fee_pct: Divided by 100 before use (presumably the trading fee,
                expressed in percent).
        """
        self.timeperiod = timeperiod
        # Threshold compared against the forward log return in `target`.
        self.target_value = fee_pct/100 + pnl_pct/100

    def predo(self, ohlcv: pd.DataFrame) -> pd.DataFrame:
        """Add the return columns to ``ohlcv`` in place and return the same frame.

        Columns written: ``log_close``, ``log_return``, ``lr_diff``,
        ``close_roc``, ``target_lr_diff`` and ``target_close_roc``. The two
        ``target_*`` columns are the backward-looking columns shifted up by
        ``timeperiod`` rows, so the last ``timeperiod`` rows are NaN.

        Args:
            ohlcv: Frame with at least a ``close`` column.

        Returns:
            The same ``ohlcv`` object, with the columns above added.
        """
        period = self.timeperiod
        close = ohlcv['close']

        ohlcv['log_close'] = np.log(close)
        ohlcv['log_return'] = ta.log_return(close, cumulative=True)
        ohlcv['lr_diff'] = ohlcv['log_return'].diff(period)
        ohlcv['close_roc'] = ta.roc(close, length=period)

        # The forward-looking columns are the features above seen `period` rows
        # later; reuse them instead of computing each indicator a second time.
        ohlcv['target_lr_diff'] = ohlcv['lr_diff'].shift(-period)
        ohlcv['target_close_roc'] = ohlcv['close_roc'].shift(-period)
        return ohlcv

    def target(self, ohlcv: pd.DataFrame) -> pd.DataFrame:
        """Run ``predo`` and add the binary ``target`` column.

        Args:
            ohlcv: Frame with at least a ``close`` column; modified in place.

        Returns:
            The same frame with the ``predo`` columns plus ``target`` (int64,
            1 where ``target_lr_diff > target_value``, else 0).
        """
        df = self.predo(ohlcv)

        # FIXME: this is a hack, need to fix the logic
        # Vectorised comparison instead of a per-row lambda. A NaN forward return
        # (the last `timeperiod` rows) compares False and is therefore labelled 0,
        # exactly as before; callers are expected to drop those rows.
        df['target'] = (df['target_lr_diff'] > self.target_value).astype('int64')
        return df
        
# feature building
class FeatureBuilder:
    """Keep a private copy of the raw frame and attach target columns to it."""

    data: pd.DataFrame

    def __init__(self, raw: pd.DataFrame, target_builder: "TargetBuilder | None" = None):
        """Copy the input frame and remember which target builder to use.

        Args:
            raw: Source frame. It is copied, so the caller's frame is not
                modified by this object.
            target_builder: Object whose ``target(frame)`` method is called by
                ``build_target``. When omitted, a new
                ``TargetBuilder(timeperiod=4)`` is created for this instance.
        """
        self.data = raw.copy()
        # Create the default here rather than in the signature: a default
        # argument is evaluated once, so every FeatureBuilder would otherwise
        # share (and could mutate) the same TargetBuilder instance.
        if target_builder is None:
            target_builder = TargetBuilder(timeperiod=4)
        self.target_builder = target_builder

    def build_target(self) -> pd.DataFrame:
        """Apply the target builder to the stored frame and return the result.

        The result also replaces ``self.data``.
        """
        self.data = self.target_builder.target(self.data)
        return self.data
    
    
    
# Labelled working frame: return features plus target, built on a copy of raw_df.
df = (
    FeatureBuilder(raw_df)
    .build_target()
)

# df.head(50)

In [250]:
import talib
from mlq.features.correlation import CorrelationAnalyer
from mlq.features.adf import ADFChecker


# corr_checker = CorrChecker(tmp)
# print(corr_checker.corr_check('close', 'slop'))
# corr_checker.plot(['close', 'slop'], method='pearson')


class SlopeFactor:
    """Slope factor of one column, computed with ``talib.LINEARREG_SLOPE``.

    The factor is stored under the column name ``<source>_slope_<timeperiod>``.
    """

    timeperiod: int
    factor: pd.Series
    source: str

    def __init__(self, timeperiod: int, source: str = 'close'):
        """Remember the window length and the column the slope is taken from."""
        self.source = source
        self.timeperiod = timeperiod

    def name(self) -> str:
        """Return the column name under which ``extract`` stores the factor."""
        suffix = '_slope_%d' % self.timeperiod
        return self.source + suffix

    def extract(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compute the factor, keep it on ``self.factor`` and add it to ``df``.

        ``df`` is modified in place and the same object is returned.
        """
        values = df[self.source].values
        slopes = talib.LINEARREG_SLOPE(values, timeperiod=self.timeperiod)
        self.factor = pd.Series(slopes, index=df.index)

        # The result of the stationarity check is not used here.
        ADFChecker(self.factor).is_stationary()

        df[self.name()] = self.factor
        return df


# Add the 4-row close slope to the labelled frame and drop every row with a NaN.
slope_4 = SlopeFactor(4)
df = slope_4.extract(df)
df = df.dropna()

# Correlation between the realised log-return change and the slope factor.
ca = CorrelationAnalyer()
slope_4_corr = ca.corr(df['lr_diff'], df[slope_4.name()])
print(slope_4_corr)

slope_4_plot_cols = ['close', 'lr_diff', 'close_roc', slope_4.name(),]
ca.plot(df[slope_4_plot_cols], method='pearson')

df


0.7404605969871658


Unnamed: 0_level_0,open,high,low,close,volume,log_close,log_return,lr_diff,close_roc,target_lr_diff,target_close_roc,target,close_slope_4
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-01-01 12:00:00+08:00,7221.80,7225.41,7211.22,7213.86,1176.670,8.883759,0.005882,0.005882,0.589970,-0.002584,-0.258114,0,-0.548
2020-01-01 13:00:00+08:00,7213.86,7224.09,7213.86,7220.31,928.221,8.884653,0.006776,0.001396,0.139662,-0.004979,-0.496655,0,-6.083
2020-01-01 14:00:00+08:00,7220.31,7230.00,7217.60,7221.00,1283.560,8.884749,0.006872,-0.002350,-0.234734,-0.003294,-0.328902,0,0.450
2020-01-01 15:00:00+08:00,7220.99,7227.20,7193.52,7205.26,2349.910,8.882567,0.004690,-0.002272,-0.226956,-0.001752,-0.175011,0,-2.511
2020-01-01 16:00:00+08:00,7205.26,7206.29,7174.00,7195.24,3385.350,8.881175,0.003298,-0.002584,-0.258114,0.003535,0.354123,1,-9.095
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-29 16:00:00+08:00,20765.80,20977.90,20716.60,20912.70,52811.000,9.948112,1.070235,0.011668,1.173676,-0.007449,-0.742133,0,88.700
2022-10-29 17:00:00+08:00,20912.60,20960.00,20866.00,20944.40,18370.900,9.949627,1.071749,0.014272,1.437455,-0.008261,-0.822654,0,95.820
2022-10-29 18:00:00+08:00,20944.40,21088.00,20708.90,20765.90,58478.100,9.941067,1.063190,0.004435,0.444520,0.001593,0.159396,1,3.230
2022-10-29 19:00:00+08:00,20766.00,20795.70,20637.00,20688.70,29009.600,9.937343,1.059466,-0.003715,-0.370804,0.010793,1.085133,1,-85.050


In [251]:
# Scan slope windows 3..9 and print those whose |corr| with lr_diff exceeds 0.70.
df = FeatureBuilder(raw_df).build_target()

for i in range(3, 10):
    slope = SlopeFactor(i)
    # Work on a copy so every window starts from the same labelled frame.
    tmp = slope.extract(df.copy())
    tmp = tmp.dropna()
    corr = ca.corr(tmp['lr_diff'], tmp[slope.name()])
    if 0.70 < abs(corr):
        print(i, corr)

###
# range 3-100: 
#   4 0.7404605969871658
#   5 0.8490465368552432
#   6 0.8047044788906014
#   7 0.7253844844051531
###

4 0.7404605969871658
5 0.8490465368552432
6 0.8047044788906014
7 0.7253844844051531


In [252]:
# 5-row close slope on a fresh labelled frame. `feature` keeps its NaN rows;
# only the plotted view `tmp` has them removed.
slope = SlopeFactor(5)
feature = slope.extract(FeatureBuilder(raw_df).build_target())
tmp = feature.dropna()

slope_plot_cols = ['close', 'lr_diff', 'close_roc', slope.name(),]
ca.plot(tmp[slope_plot_cols], method='all')

In [253]:
# Compare the 4-row slope of `low` with the 4-row slope of `close`.
low_slope = SlopeFactor(4, source='low')
close_slope = SlopeFactor(4)

df = FeatureBuilder(raw_df).build_target()
for slope_factor in (low_slope, close_slope):
    df = slope_factor.extract(df)
df = df.dropna()

low_slope_corr = ca.corr(df['lr_diff'], df[low_slope.name()])
print(low_slope_corr)

slope_compare_cols = ['close', 'lr_diff', 'close_roc', low_slope.name(), close_slope.name()]
ca.plot(df[slope_compare_cols], method='pearson')

0.7153077741045916


In [254]:
class RSIFactor:
    """RSI factor of one column, computed with ``ta.rsi``.

    With the default source (``'close'``) the factor is stored as
    ``rsi_<timeperiod>``; for any other source the column is
    ``<source>_rsi_<timeperiod>`` so factors on different sources do not
    overwrite each other.
    """

    timeperiod: int
    factor: pd.Series
    source: str

    def __init__(self, timeperiod: int, source: str = 'close'):
        """Remember the RSI length and the input column.

        Args:
            timeperiod: Passed to ``ta.rsi`` as ``length``.
            source: Column of the frame the RSI is computed from. Defaults to
                ``'close'``, which was previously hard-coded.
        """
        self.timeperiod = timeperiod
        # `source` is declared on the class, so it is always assigned here.
        self.source = source

    def name(self) -> str:
        """Return the column name under which ``extract`` stores the factor."""
        base = 'rsi_%d' %self.timeperiod
        # Keep the historical name for the default source so existing column
        # references (e.g. 'rsi_14') continue to work.
        if self.source == 'close':
            return base
        return self.source + '_' + base

    def extract(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compute the RSI, keep it on ``self.factor`` and add it to ``df``.

        ``df`` is modified in place and the same object is returned.
        """
        self.factor = ta.rsi(df[self.source], length=self.timeperiod)
        # The result of the stationarity check is not used here.
        ADFChecker(self.factor).is_stationary()
        df[self.name()] = self.factor
        return df

# RSI(14) on a fresh labelled frame, then drop every row with a NaN.
rsi = RSIFactor(14)
df = rsi.extract(FeatureBuilder(raw_df).build_target())
df = df.dropna()

rsi_corr = ca.corr(df['lr_diff'], df[rsi.name()])
print(rsi_corr)

rsi_plot_cols = ['close', 'lr_diff', 'close_roc', rsi.name(),]
ca.plot(df[rsi_plot_cols], method='pearson')

0.5688727015765142


In [255]:
# Scan RSI lengths 3..23 and print those whose |corr| with lr_diff exceeds 0.60.
df = FeatureBuilder(raw_df).build_target()

for i in range(3, 24):
    factor = RSIFactor(i)
    # Work on a copy so every length starts from the same labelled frame.
    tmp = factor.extract(df.copy())
    tmp = tmp.dropna()
    corr = ca.corr(tmp['lr_diff'], tmp[factor.name()])
    if 0.60 < abs(corr):
        print(i, corr)

3 0.7157577000982975
4 0.726083951304441
5 0.716089431483751
6 0.6992440032931271
7 0.6804528281644244
8 0.6616292597700635
9 0.6435564507623047
10 0.6265185555718413
11 0.6105795955549073


In [256]:
# rsi = RSIFactor(4)
# feature = rsi.extract(feature)
# tmp = feature.dropna()

# ca.plot(tmp[['close', 'lr_diff', 'close_roc', slope.name(), rsi.name()]], method='all')

# feature

In [257]:

# RSI(14) plus its distance from fixed levels and the 4-row change of each.
rsi = RSIFactor(14)
df = rsi.extract(FeatureBuilder(raw_df).build_target())

# Distance of the RSI from each level (column order kept as 80, 90, 20, 60).
for level_col, level in (('rsi_80', 80), ('rsi_90', 90), ('rsi_20', 20), ('rsi_60', 60)):
    df[level_col] = df[rsi.name()] - level

# 4-row change of the RSI and of each offset series.
# NOTE: subtracting a constant does not change a difference, so the four
# offset diffs carry the same values as 'rsi_diff' (up to float rounding).
df['rsi_diff'] = df[rsi.name()].diff(4)
for diff_col, level_col in (
    ('rsi_60_diff', 'rsi_60'),
    ('rsi_80_diff', 'rsi_80'),
    ('rsi_20_diff', 'rsi_20'),
    ('rsi_90_diff', 'rsi_90'),
):
    df[diff_col] = df[level_col].diff(4)

df = df.dropna()

print(ca.corr(df['lr_diff'], df[rsi.name()]))

rsi_level_plot_cols = [
    'close', 'lr_diff', 'close_roc', rsi.name(),
    'rsi_diff', 'rsi_20_diff', 'rsi_60_diff', 'rsi_80_diff', 'rsi_90_diff',
]
ca.plot(df[rsi_level_plot_cols], method='pearson')

df

0.5688957311828076


Unnamed: 0_level_0,open,high,low,close,volume,log_close,log_return,lr_diff,close_roc,target_lr_diff,...,rsi_14,rsi_80,rsi_90,rsi_20,rsi_60,rsi_diff,rsi_60_diff,rsi_80_diff,rsi_20_diff,rsi_90_diff
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02 02:00:00+08:00,7233.87,7241.95,7222.00,7237.47,1827.67,8.887027,0.009150,0.002860,0.286415,-0.005884,...,64.855166,-15.144834,-25.144834,44.855166,4.855166,2.702295,2.702295,2.702295,2.702295,2.702295
2020-01-02 03:00:00+08:00,7237.83,7237.83,7223.15,7226.35,1028.58,8.885489,0.007612,-0.000434,-0.043433,-0.003991,...,60.598452,-19.401548,-29.401548,40.598452,0.598452,-4.142663,-4.142663,-4.142663,-4.142663,-4.142663
2020-01-02 04:00:00+08:00,7226.56,7242.00,7223.09,7233.30,1476.53,8.886451,0.008574,-0.001190,-0.118891,-0.003464,...,62.265445,-17.734555,-27.734555,42.265445,2.265445,-4.846269,-4.846269,-4.846269,-4.846269,-4.846269
2020-01-02 05:00:00+08:00,7233.32,7238.79,7225.00,7229.71,1210.88,8.885954,0.008077,-0.000566,-0.056540,-0.005684,...,60.833722,-19.166278,-29.166278,40.833722,0.833722,-3.248226,-3.248226,-3.248226,-3.248226,-3.248226
2020-01-02 06:00:00+08:00,7229.71,7229.71,7183.26,7195.01,4361.31,8.881143,0.003266,-0.005884,-0.586669,-0.003962,...,49.085218,-30.914782,-40.914782,29.085218,-10.914782,-15.769948,-15.769948,-15.769948,-15.769948,-15.769948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-29 16:00:00+08:00,20765.80,20977.90,20716.60,20912.70,52811.00,9.948112,1.070235,0.011668,1.173676,-0.007449,...,70.389351,-9.610649,-19.610649,50.389351,10.389351,10.963428,10.963428,10.963428,10.963428,10.963428
2022-10-29 17:00:00+08:00,20912.60,20960.00,20866.00,20944.40,18370.90,9.949627,1.071749,0.014272,1.437455,-0.008261,...,71.538203,-8.461797,-18.461797,51.538203,11.538203,13.924189,13.924189,13.924189,13.924189,13.924189
2022-10-29 18:00:00+08:00,20944.40,21088.00,20708.90,20765.90,58478.10,9.941067,1.063190,0.004435,0.444520,0.001593,...,57.912677,-22.087323,-32.087323,37.912677,-2.087323,-1.273766,-1.273766,-1.273766,-1.273766,-1.273766
2022-10-29 19:00:00+08:00,20766.00,20795.70,20637.00,20688.70,29009.60,9.937343,1.059466,-0.003715,-0.370804,0.010793,...,53.193786,-26.806214,-36.806214,33.193786,-6.806214,-10.966211,-10.966211,-10.966211,-10.966211,-10.966211


In [258]:
class RSIDiffFactor:
    """Change of the close-price RSI over ``diff`` rows.

    The factor is stored under ``rsi_diff_<timeperiod>``. The column name does
    not include ``diff``, so two factors that share a ``timeperiod`` write to
    the same column of a frame.
    """

    timeperiod: int
    factor: pd.Series
    # Declared for parity with the other factor classes; never assigned here.
    source: str

    def __init__(self, timeperiod: int, diff: int):
        """Remember the RSI length and the differencing lag.

        Args:
            timeperiod: Passed to ``ta.rsi`` as ``length``.
            diff: Number of rows passed to ``Series.diff``.
        """
        self.timeperiod = timeperiod
        self.diff = diff

    def name(self) -> str:
        """Return the column name under which ``extract`` stores the factor."""
        return 'rsi_diff_%d' %self.timeperiod

    def extract(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compute the differenced RSI, keep it on ``self.factor`` and add it to ``df``.

        ``df`` is modified in place and the same object is returned.
        """
        rsi = ta.rsi(df['close'], length=self.timeperiod)
        # The factor is the differenced RSI, so that is the series exposed on
        # `self.factor` and checked for stationarity. Previously both used the
        # raw RSI while a different series was written to the frame.
        self.factor = rsi.diff(self.diff)
        # The result of the stationarity check is not used here.
        ADFChecker(self.factor).is_stationary()
        df[self.name()] = self.factor
        return df

# 4-row change of RSI(14) on a fresh labelled frame, then drop every row with a NaN.
rsi = RSIDiffFactor(14, 4)
df = rsi.extract(FeatureBuilder(raw_df).build_target())
df = df.dropna()

rsi_diff_corr = ca.corr(df['lr_diff'], df[rsi.name()])
print(rsi_diff_corr)

rsi_diff_plot_cols = ['close', 'lr_diff', 'close_roc', rsi.name(),]
ca.plot(df[rsi_diff_plot_cols], method='pearson')


0.7813455805246339


In [259]:
# Check RSI lengths 250 and 366 with diff lags 4 and 5; print every
# combination whose |corr| with lr_diff exceeds 0.80.
df = FeatureBuilder(raw_df).build_target()

for i in (250, 366):
    for j in range(4, 6):
        factor = RSIDiffFactor(i, j)
        # Work on a copy so every combination starts from the same labelled frame.
        tmp = factor.extract(df.copy())
        tmp = tmp.dropna()
        corr = ca.corr(tmp['lr_diff'], tmp[factor.name()])
        if 0.80 < abs(corr):
            print(i,j, corr)

250 4 0.9133501605733154
250 5 0.8136335507309784
366 4 0.9236737207144331
366 5 0.8218245553901113


In [260]:
# Add the 4-row change of RSI(366) to `feature`.
# This cell relies on `feature` and `slope` created in an earlier cell.
rsi = RSIDiffFactor(366, 4)
feature = rsi.extract(feature)

# Plot from a NaN-free view; `feature` itself keeps all rows.
tmp = feature.dropna()
combined_plot_cols = ['close', 'lr_diff', 'close_roc', slope.name(), rsi.name()]
ca.plot(tmp[combined_plot_cols], method='all')

feature

Unnamed: 0_level_0,open,high,low,close,volume,log_close,log_return,lr_diff,close_roc,target_lr_diff,target_close_roc,target,close_slope_5,rsi_diff_366
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-01-01 08:00:00+08:00,7189.43,7190.52,7170.15,7171.55,2449.05,8.877877,0.000000,,,0.005882,0.589970,1,,
2020-01-01 09:00:00+08:00,7171.43,7225.00,7171.10,7210.24,3865.04,8.883258,0.005380,,,0.001396,0.139662,1,,
2020-01-01 10:00:00+08:00,7210.38,7239.30,7206.46,7237.99,3228.36,8.887099,0.009222,,,-0.002350,-0.234734,0,,
2020-01-01 11:00:00+08:00,7237.41,7239.74,7215.00,7221.65,2513.31,8.884839,0.006962,,,-0.002272,-0.226956,0,,
2020-01-01 12:00:00+08:00,7221.80,7225.41,7211.22,7213.86,1176.67,8.883759,0.005882,0.005882,0.589970,-0.002584,-0.258114,0,9.603,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-29 20:00:00+08:00,20688.60,20767.10,20669.30,20757.50,20205.00,9.940663,1.062786,-0.007449,-0.742133,0.004528,0.453812,1,-56.610,-0.432350
2022-10-29 21:00:00+08:00,20757.40,20775.00,20680.40,20772.10,17113.00,9.941366,1.063489,-0.008261,-0.822654,,,0,-35.300,-0.471894
2022-10-29 22:00:00+08:00,20772.20,20929.00,20746.00,20799.00,37011.30,9.942660,1.064783,0.001593,0.159396,,,0,14.960,0.057195
2022-10-29 23:00:00+08:00,20798.90,20939.00,20784.40,20913.20,29157.80,9.948136,1.070259,0.010793,1.085133,,,0,49.050,0.521776
