In [208]:
from common.mysql import get_close_prices
from datetime import datetime
import pandas as pd

# Load the "btcusdtperp" series at the "1h" interval from the project's MySQL helper.
# NOTE(review): the third argument (None) is presumably an open-ended start bound and the
# datetime an end bound — confirm against common.mysql.get_close_prices.
raw_df = get_close_prices("btcusdtperp", "1h", None, datetime(2022, 10, 30, 0, 0, 0))

# Print the number of rows, then a plain-text preview of the first ten rows.
print('simple size: %d' %raw_df.shape[0])
print(raw_df.head(10))



simple size: 24689
                              open     high      low    close    volume
open_time                                                              
2020-01-01 08:00:00+08:00  7189.43  7190.52  7170.15  7171.55  2449.050
2020-01-01 09:00:00+08:00  7171.43  7225.00  7171.10  7210.24  3865.040
2020-01-01 10:00:00+08:00  7210.38  7239.30  7206.46  7237.99  3228.360
2020-01-01 11:00:00+08:00  7237.41  7239.74  7215.00  7221.65  2513.310
2020-01-01 12:00:00+08:00  7221.80  7225.41  7211.22  7213.86  1176.670
2020-01-01 13:00:00+08:00  7213.86  7224.09  7213.86  7220.31   928.221
2020-01-01 14:00:00+08:00  7220.31  7230.00  7217.60  7221.00  1283.560
2020-01-01 15:00:00+08:00  7220.99  7227.20  7193.52  7205.26  2349.910
2020-01-01 16:00:00+08:00  7205.26  7206.29  7174.00  7195.24  3385.350
2020-01-01 17:00:00+08:00  7195.23  7204.99  7182.76  7184.45  1687.290


In [209]:
import numpy as np
import pandas_ta as ta

# Feature-engineering container
class FeatureBuilder:
    """Hold an independent working copy of a raw frame for feature engineering."""

    # Working frame; set in __init__ to a copy of the frame passed in.
    data: pd.DataFrame

    def __init__(self, raw: pd.DataFrame):
        """Copy ``raw`` so that edits to ``self.data`` do not reach the caller's frame.

        Args:
            raw: Source frame; it is copied, not stored by reference.
        """
        working_copy = raw.copy(deep=True)
        self.data = working_copy

# Working frame used by the following cells. FeatureBuilder stores a copy of raw_df,
# so columns added to df do not modify raw_df.
df = FeatureBuilder(raw_df).data

In [210]:
# Log-price and log-return feature columns.
# log_close: natural log of the close price.
df['log_close'] = np.log(df['close'])
# log_return: pandas_ta log return with cumulative=True
# (presumably the log return relative to the first close — confirm against pandas_ta docs).
df['log_return'] = ta.log_return(df['close'], cumulative=True)
# log_return_diff_1: first difference of the log_return column (first row is NaN).
df['log_return_diff_1'] = df['log_return'].diff()

def target_lr_diff(df: pd.DataFrame, p: int) -> pd.DataFrame:
    """Add two forward-looking columns for a horizon of ``p`` rows.

    The frame is modified in place and also returned. It must already contain
    the ``log_close`` and ``close`` columns.

    Args:
        df: Frame to extend.
        p: Horizon in rows; also used as the suffix of the new column names.

    Returns:
        The same ``df`` object with ``target_lr_diff_<p>`` and ``close_roc_<p>`` added.
    """
    lr_column = 'target_lr_diff_%d' % p
    roc_column = 'close_roc_%d' % p

    # diff(p) is the change over the previous p rows; shift(-p) moves it back onto the
    # row where that window starts, so each row holds log_close[t + p] - log_close[t].
    # The last p rows have no future value and become NaN.
    backward_log_change = df['log_close'].diff(p)
    df[lr_column] = backward_log_change.shift(-p)

    # Same realignment for pandas_ta's rate of change over ``p`` rows.
    backward_roc = ta.roc(df['close'], length=p)
    df[roc_column] = backward_roc.shift(-p)

    return df

# Add the forward-looking target columns for horizons 4, 8, ..., 28 rows, in that order.
for _lookahead in range(4, 29, 4):
    df = target_lr_diff(df, _lookahead)


In [211]:
# adf check
from arch.unitroot import ADF

def adf_check(data: pd.Series, trend: str = "c", pv_check: float = 0.05, show: bool = False):
    """Run an Augmented Dickey-Fuller test and report the verdict on stdout.

    A unit-root verdict (p-value above ``pv_check``) is always printed together
    with the test summary. The opposite verdict is printed only when ``show`` is true.

    Args:
        data: Series to test; its ``name`` is printed in front of the summary.
        trend: Trend specification forwarded to ``ADF``.
        pv_check: p-value threshold above which a unit root is reported.
        show: Also print the report when no unit root is found.
    """
    result = ADF(data, trend=trend)
    has_unit_root = result.pvalue > pv_check

    # Nothing to report: the series passed and the caller did not ask for details.
    if not has_unit_root and not show:
        return

    if has_unit_root:
        verdict = "The process contains a unit root"
    else:
        verdict = "The process does not contain a unit root"
    print(verdict)
    print(data.name, result.summary().as_text())


In [212]:
# corr check
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots


def corr(df: pd.DataFrame) -> None:
    """Show Pearson, Kendall and Spearman correlation matrices as three heatmaps.

    Args:
        df (pd.DataFrame): Frame whose columns are correlated pairwise with
            ``DataFrame.corr`` (the original note describes it as Price-Balance data).
    """
    panel_titles = (
        "皮尔逊积矩相关系数 (Pearson correlation coefficient)",
        "肯德尔等级相关系数 (Kendall rank correlation coefficient)",
        "斯皮尔曼等级相关系数 (Spearman's rank correlation coefficient)",
    )
    fig = make_subplots(rows=1, cols=3, subplot_titles=panel_titles)

    # One heatmap per method, placed left to right in the same order as the titles.
    for position, method in enumerate(("pearson", "kendall", "spearman"), start=1):
        matrix = df.corr(method=method)
        heatmap = go.Heatmap(
            x=matrix.columns,
            y=matrix.index,
            z=np.array(matrix),
            text=matrix.values,
            # Print each coefficient inside its cell with two decimals.
            texttemplate="%{text:.2f}",
            colorscale=px.colors.diverging.RdBu,
        )
        fig.add_trace(heatmap, row=1, col=position)

    fig.update_layout(
        title="相关系数矩阵 (Correlation matrix)",
        font={"family": "Courier New, monospace", "size": 18, "color": "RebeccaPurple"},
    )

    fig.show()



In [213]:
import talib

# Slope feature: TA-Lib LINEARREG_SLOPE of the close with timeperiod=4.
# The column key 'slop' (sic) is kept as-is because the lines below read it by that name.
df['slop'] = talib.LINEARREG_SLOPE(df['close'].values, timeperiod=4)

# NaN-free frame for the checks below; df itself keeps all of its rows here.
tmp = df.dropna()
# ADF test on the slope; show=True prints the summary whichever verdict is reached.
adf_check(tmp['slop'], trend="c", pv_check=0.05, show=True)

print(tmp['slop'].head(30))

# Correlation heatmaps for the close, the slope and the 4-row forward target column.
check = tmp[['close', 'slop', 'target_lr_diff_4']]
corr(check)

The process does not contain a unit root
slop    Augmented Dickey-Fuller Results   
Test Statistic                -23.131
P-value                         0.000
Lags                               48
-------------------------------------

Trend: Constant
Critical Values: -3.43 (1%), -2.86 (5%), -2.57 (10%)
Null Hypothesis: The process contains a unit root.
Alternative Hypothesis: The process is weakly stationary.
open_time
2020-01-01 11:00:00+08:00    17.805
2020-01-01 12:00:00+08:00    -0.548
2020-01-01 13:00:00+08:00    -6.083
2020-01-01 14:00:00+08:00     0.450
2020-01-01 15:00:00+08:00    -2.511
2020-01-01 16:00:00+08:00    -9.095
2020-01-01 17:00:00+08:00   -11.967
2020-01-01 18:00:00+08:00    -3.482
2020-01-01 19:00:00+08:00     0.503
2020-01-01 20:00:00+08:00    10.421
2020-01-01 21:00:00+08:00     8.294
2020-01-01 22:00:00+08:00     6.727
2020-01-01 23:00:00+08:00     2.757
2020-01-02 00:00:00+08:00     9.180
2020-01-02 01:00:00+08:00     6.342
2020-01-02 02:00:00+08:00     1.583

In [214]:
def target_value(pnl: float) -> float:
    """Convert a percentage figure to a fraction and add a fixed 0.08% offset.

    Args:
        pnl: Value expressed in percent; it is divided by 100.

    Returns:
        ``0.08 / 100 + pnl / 100``.
    """
    # NOTE(review): presumably a fee rate (the original local was named `free`) — confirm.
    fixed_offset = 0.08 / 100
    return fixed_offset + pnl / 100

# Drop, in place, every row that has a NaN in any column.
df.dropna(inplace=True)

# Binary label: 1 where target_lr_diff_4 is strictly positive, otherwise 0.
# A vectorised comparison replaces the per-row Python lambda and yields the same 0/1 values.
# (A bare `df` expression that used to follow this line was removed: only the
# last expression of a cell is displayed, so it had no effect.)
df['target'] = (df['target_lr_diff_4'] > 0).astype('int64')

# Single-column frame holding only the label, indexed like df.
feature = pd.DataFrame(df["target"], columns=['target'], index=df.index)

feature

Unnamed: 0_level_0,target
open_time,Unnamed: 1_level_1
2020-01-01 11:00:00+08:00,0
2020-01-01 12:00:00+08:00,0
2020-01-01 13:00:00+08:00,0
2020-01-01 14:00:00+08:00,0
2020-01-01 15:00:00+08:00,0
...,...
2022-10-28 16:00:00+08:00,1
2022-10-28 17:00:00+08:00,1
2022-10-28 18:00:00+08:00,1
2022-10-28 19:00:00+08:00,1
