In [10]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import numpy as np


class CorrelationAnalyer:
    """Correlation helper: a pairwise coefficient plus correlation-matrix heatmaps.

    ``corr`` returns the coefficient of two series computed with ``np.corrcoef``;
    ``plot`` draws one heatmap per requested method from ``DataFrame.corr``.
    """

    # Subplot title for every supported method. The insertion order is also the
    # left-to-right panel order used when ``method="all"``.
    _SUBPLOT_TITLES = {
        "pearson": "皮尔逊积矩相关系数 (Pearson correlation coefficient)",
        "kendall": "肯德尔等级相关系数 (Kendall rank correlation coefficient)",
        "spearman": "斯皮尔曼等级相关系数 (Spearman's rank correlation coefficient)",
    }

    def corr(self, k1: pd.Series, k2: pd.Series) -> float:
        """Return the correlation coefficient between ``k1`` and ``k2``.

        The two series are compared positionally through their ``.values``,
        so index labels play no role in the pairing.
        """
        return np.corrcoef(k1.values, k2.values)[0, 1]

    def plot(self, df: pd.DataFrame, method: str = "all") -> None:
        """Show the correlation matrix of ``df`` as one or three heatmaps.

        Args:
            df: frame whose columns are correlated with ``df.corr``.
            method: ``"pearson"``, ``"kendall"``, ``"spearman"``, or ``"all"``
                to draw the three matrices side by side.

        Raises:
            ValueError: if ``method`` is not one of the values listed above.
        """
        # Validate before any figure is built: an unknown method previously
        # produced an empty figure carrying the Spearman title.
        if method == "all":
            methods = tuple(self._SUBPLOT_TITLES)
        elif method in self._SUBPLOT_TITLES:
            methods = (method,)
        else:
            allowed = ", ".join(repr(m) for m in ("all", *self._SUBPLOT_TITLES))
            raise ValueError(
                "unknown correlation method %r; expected one of: %s" % (method, allowed)
            )

        fig = make_subplots(
            rows=1,
            cols=len(methods),
            subplot_titles=tuple(self._SUBPLOT_TITLES[m] for m in methods),
        )

        for col, name in enumerate(methods, start=1):
            matrix = df.corr(method=name)
            fig.add_trace(
                go.Heatmap(
                    x=matrix.columns,
                    y=matrix.index,
                    z=np.array(matrix),
                    # The same numbers are used as cell labels, rounded to 2 decimals.
                    text=matrix.values,
                    texttemplate="%{text:.2f}",
                    colorscale=px.colors.diverging.RdBu,
                ),
                row=1,
                col=col,
            )

        fig.update_layout(
            title="相关系数矩阵 (Correlation matrix)",
        )

        fig.show()


In [11]:
from common.mysql import get_close_prices
from datetime import datetime
import pandas as pd

# Load the 1h candles for btcusdtperp; the third argument is left as None and
# the fourth is midnight of 2022-10-30.
raw_df = get_close_prices("btcusdtperp", "1h", None, datetime(2022, 10, 30))

# Report the row count, then preview the first ten rows as plain text.
# NOTE(review): "simple" in the message is presumably meant to be "sample".
print('simple size: %d' % len(raw_df))
print(raw_df.iloc[:10])



simple size: 24689
                              open     high      low    close    volume
open_time                                                              
2020-01-01 08:00:00+08:00  7189.43  7190.52  7170.15  7171.55  2449.050
2020-01-01 09:00:00+08:00  7171.43  7225.00  7171.10  7210.24  3865.040
2020-01-01 10:00:00+08:00  7210.38  7239.30  7206.46  7237.99  3228.360
2020-01-01 11:00:00+08:00  7237.41  7239.74  7215.00  7221.65  2513.310
2020-01-01 12:00:00+08:00  7221.80  7225.41  7211.22  7213.86  1176.670
2020-01-01 13:00:00+08:00  7213.86  7224.09  7213.86  7220.31   928.221
2020-01-01 14:00:00+08:00  7220.31  7230.00  7217.60  7221.00  1283.560
2020-01-01 15:00:00+08:00  7220.99  7227.20  7193.52  7205.26  2349.910
2020-01-01 16:00:00+08:00  7205.26  7206.29  7174.00  7195.24  3385.350
2020-01-01 17:00:00+08:00  7195.23  7204.99  7182.76  7184.45  1687.290


In [12]:
import numpy as np
import pandas_ta as ta

class TargetBuilder:
    """Add return columns and a binary ``target`` label to an OHLCV frame.

    ``timeperiod`` is used both as the look-back of the return columns and as
    the look-ahead of the ``target_*`` columns. The label is 1 where the
    forward log-return difference exceeds ``target_value``.
    """
    timeperiod: int

    def __init__(self, timeperiod: int = 1, pnl_pct: float = 0.015, fee_pct: float = 0.08):
        """Store the horizon and the label threshold.

        Args:
            timeperiod: number of rows to look back / ahead; must be >= 1.
            pnl_pct: percentage added to the threshold (named as a profit
                target -- confirm intent with the author).
            fee_pct: percentage added to the threshold (named as a fee).

        Raises:
            ValueError: if ``timeperiod`` is smaller than 1.
        """
        if timeperiod < 1:
            raise ValueError("timeperiod must be >= 1, got %r" % (timeperiod,))
        self.timeperiod = timeperiod
        # Both inputs are percentages; the threshold is kept as a fraction.
        self.target_value = fee_pct/100 + pnl_pct/100

    def predo(self, ohlcv: pd.DataFrame) -> pd.DataFrame:
        """Add the derived return columns to ``ohlcv`` and return it.

        The frame is modified in place (the returned object is the argument);
        pass a copy if the caller's frame must stay untouched.

        Columns written: ``log_close``, ``log_return``, ``lr_diff``,
        ``close_roc``, ``target_lr_diff``, ``target_close_roc``.
        """
        period = self.timeperiod
        close = ohlcv['close']

        ohlcv['log_close'] = np.log(close)
        ohlcv['log_return'] = ta.log_return(close, cumulative=True)
        ohlcv['lr_diff'] = ohlcv['log_return'].diff(period)
        ohlcv['close_roc'] = ta.roc(close, length=period)
        # The targets are the same two series moved `period` rows earlier, so
        # each row sees the value realised `period` rows later. The last
        # `period` rows have no future value and become NaN.
        ohlcv['target_lr_diff'] = ohlcv['lr_diff'].shift(-period)
        ohlcv['target_close_roc'] = ohlcv['close_roc'].shift(-period)
        return ohlcv

    def target(self, ohlcv: pd.DataFrame) -> pd.DataFrame:
        """Run ``predo`` and add the binary ``target`` column.

        ``target`` is 1 where ``target_lr_diff > target_value`` and 0
        otherwise. Rows whose ``target_lr_diff`` is NaN compare as False and
        are therefore labelled 0.
        """
        df = self.predo(ohlcv)

        # FIXME: this is a hack, need to fix the logic
        # Vectorised comparison; NaN > threshold is False, so NaN rows get 0.
        df['target'] = (df['target_lr_diff'] > self.target_value).astype('int64')
        return df
        
# feature building
class FeatureBuilder:
    """Hold a private copy of a raw frame and run a target builder over it."""
    data: pd.DataFrame

    def __init__(self, raw: pd.DataFrame, target_builder: "TargetBuilder | None" = None):
        """Copy ``raw`` and remember the target builder to use.

        Args:
            raw: source frame; it is copied, so the caller's frame is not
                modified by this object.
            target_builder: object exposing ``target(df)``. When omitted, a
                new ``TargetBuilder(timeperiod=4)`` is created for this
                instance.
        """
        self.data = raw.copy()
        # Build the default here rather than in the signature: a default
        # argument is evaluated once, so every FeatureBuilder would otherwise
        # share the same TargetBuilder object.
        if target_builder is None:
            target_builder = TargetBuilder(timeperiod=4)
        self.target_builder = target_builder

    def build_target(self) -> pd.DataFrame:
        """Run the target builder on the stored frame; store and return the result."""
        self.data = self.target_builder.target(self.data)
        return self.data
    
    
    
# Label the raw candles. FeatureBuilder works on its own copy of raw_df and,
# with no builder passed, uses its default TargetBuilder(timeperiod=4).
df = FeatureBuilder(raw_df).build_target()

# Preview the first 50 labelled rows.
df.head(50)

Unnamed: 0_level_0,open,high,low,close,volume,log_close,log_return,lr_diff,close_roc,target_lr_diff,target_close_roc,target
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-01-01 08:00:00+08:00,7189.43,7190.52,7170.15,7171.55,2449.05,8.877877,0.0,,,0.005882,0.58997,1
2020-01-01 09:00:00+08:00,7171.43,7225.0,7171.1,7210.24,3865.04,8.883258,0.00538,,,0.001396,0.139662,1
2020-01-01 10:00:00+08:00,7210.38,7239.3,7206.46,7237.99,3228.36,8.887099,0.009222,,,-0.00235,-0.234734,0
2020-01-01 11:00:00+08:00,7237.41,7239.74,7215.0,7221.65,2513.31,8.884839,0.006962,,,-0.002272,-0.226956,0
2020-01-01 12:00:00+08:00,7221.8,7225.41,7211.22,7213.86,1176.67,8.883759,0.005882,0.005882,0.58997,-0.002584,-0.258114,0
2020-01-01 13:00:00+08:00,7213.86,7224.09,7213.86,7220.31,928.221,8.884653,0.006776,0.001396,0.139662,-0.004979,-0.496655,0
2020-01-01 14:00:00+08:00,7220.31,7230.0,7217.6,7221.0,1283.56,8.884749,0.006872,-0.00235,-0.234734,-0.003294,-0.328902,0
2020-01-01 15:00:00+08:00,7220.99,7227.2,7193.52,7205.26,2349.91,8.882567,0.00469,-0.002272,-0.226956,-0.001752,-0.175011,0
2020-01-01 16:00:00+08:00,7205.26,7206.29,7174.0,7195.24,3385.35,8.881175,0.003298,-0.002584,-0.258114,0.003535,0.354123,1
2020-01-01 17:00:00+08:00,7195.23,7204.99,7182.76,7184.45,1687.29,8.879674,0.001797,-0.004979,-0.496655,0.004318,0.43274,1


In [13]:
import talib
from mlq.features.adf import ADFChecker


# corr_checker = CorrChecker(tmp)
# print(corr_checker.corr_check('close', 'slop'))
# corr_checker.plot(['close', 'slop'], method='pearson')


class SlopeFactor:
    """Linear-regression slope of ``close`` over a rolling window of ``timeperiod`` rows."""
    timeperiod: int
    factor: pd.Series

    def __init__(self, timeperiod: int):
        """Remember the window length passed to ``talib.LINEARREG_SLOPE``."""
        self.timeperiod = timeperiod

    def name(self) -> str:
        """Column name of this factor: ``slope_<timeperiod>``."""
        return 'slope_%d' % (self.timeperiod,)

    def extract(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compute the factor, keep it on ``self.factor`` and write it into ``df``.

        ``df`` is modified in place (a column named by ``name()`` is added)
        and the same object is returned.
        """
        close_values = df['close'].values
        slopes = talib.LINEARREG_SLOPE(close_values, timeperiod=self.timeperiod)
        # Re-attach the frame's index so the factor aligns row by row.
        self.factor = pd.Series(slopes, index=df.index)
        # The stationarity check is invoked, but its result is not used here.
        ADFChecker(self.factor).is_stationary()
        df[self.name()] = self.factor
        return df


ca = CorrelationAnalyer()

# Attach the 4-row slope factor, then drop every row that still contains a NaN.
slope_4 = SlopeFactor(4)
df = slope_4.extract(df).dropna()

# Pairwise coefficient between the look-back log-return difference and the slope.
print(ca.corr(df['lr_diff'], df[slope_4.name()]))

# Pearson matrix of price, both return measures and the slope factor.
ca.plot(df[['close', 'lr_diff', 'close_roc', slope_4.name()]], method='pearson')

df


0.7404605969871658


Unnamed: 0_level_0,open,high,low,close,volume,log_close,log_return,lr_diff,close_roc,target_lr_diff,target_close_roc,target,slope_4
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-01-01 12:00:00+08:00,7221.80,7225.41,7211.22,7213.86,1176.670,8.883759,0.005882,0.005882,0.589970,-0.002584,-0.258114,0,-0.548
2020-01-01 13:00:00+08:00,7213.86,7224.09,7213.86,7220.31,928.221,8.884653,0.006776,0.001396,0.139662,-0.004979,-0.496655,0,-6.083
2020-01-01 14:00:00+08:00,7220.31,7230.00,7217.60,7221.00,1283.560,8.884749,0.006872,-0.002350,-0.234734,-0.003294,-0.328902,0,0.450
2020-01-01 15:00:00+08:00,7220.99,7227.20,7193.52,7205.26,2349.910,8.882567,0.004690,-0.002272,-0.226956,-0.001752,-0.175011,0,-2.511
2020-01-01 16:00:00+08:00,7205.26,7206.29,7174.00,7195.24,3385.350,8.881175,0.003298,-0.002584,-0.258114,0.003535,0.354123,1,-9.095
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-29 16:00:00+08:00,20765.80,20977.90,20716.60,20912.70,52811.000,9.948112,1.070235,0.011668,1.173676,-0.007449,-0.742133,0,88.700
2022-10-29 17:00:00+08:00,20912.60,20960.00,20866.00,20944.40,18370.900,9.949627,1.071749,0.014272,1.437455,-0.008261,-0.822654,0,95.820
2022-10-29 18:00:00+08:00,20944.40,21088.00,20708.90,20765.90,58478.100,9.941067,1.063190,0.004435,0.444520,0.001593,0.159396,1,3.230
2022-10-29 19:00:00+08:00,20766.00,20795.70,20637.00,20688.70,29009.600,9.937343,1.059466,-0.003715,-0.370804,0.010793,1.085133,1,-85.050


In [14]:
# Start again from a freshly labelled frame built from the raw candles.
df = FeatureBuilder(raw_df).build_target()

# Try every slope window from 3 to 99 and print those whose factor has an
# absolute correlation above 0.70 with lr_diff.
for i in range(3, 100):
    slope = SlopeFactor(i)
    # extract() writes into the frame it receives, so give each window a copy.
    tmp = slope.extract(df.copy()).dropna()
    corr = ca.corr(tmp['lr_diff'], tmp[slope.name()])
    if abs(corr) > 0.70:
        print(i, corr)

4 0.7404605969871658
5 0.8490465368552432
6 0.8047044788906014
7 0.7253844844051531


In [15]:
# Rebuild the labelled frame from the raw candles so this cell does not depend
# on columns added by earlier cells.
df = FeatureBuilder(raw_df).build_target()

# Use the 5-row slope window and attach it to df (extract() adds the column in place).
slope = SlopeFactor(5)
df = slope.extract(df)
# Keep df intact; plot from a NaN-free copy instead.
tmp = df.dropna()

# Draw the Pearson, Kendall and Spearman matrices side by side.
ca.plot(tmp[['close', 'lr_diff', 'close_roc', slope.name(),]], method='all')