In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [12]:
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import config

from utils import DataReader, Validator
from tree import Helper as TreeHelper

# Load data

In [4]:
df = DataReader.read_trade_from_directory(f"{config.trade_logs_id_binance_data_dir}DOGEUSDT/")
df

Unnamed: 0_level_0,id,price,qty,quoteQty,isBuyerMaker
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-04-17 11:07:04.135,171131000,0.308076,30.0,9.242280,False
2021-04-17 11:07:04.145,171131001,0.308060,480.0,147.868800,True
2021-04-17 11:07:04.145,171131002,0.308059,438.0,134.929842,True
2021-04-17 11:07:04.145,171131003,0.307953,19709.0,6069.445677,True
2021-04-17 11:07:04.145,171131004,0.307952,20.0,6.159040,True
...,...,...,...,...,...
2021-05-09 17:27:57.481,365633995,0.474102,16000.0,7585.632000,False
2021-05-09 17:27:57.481,365633996,0.474109,240.0,113.786160,False
2021-05-09 17:27:57.481,365633997,0.474120,26.0,12.327120,False
2021-05-09 17:27:57.481,365633998,0.474121,13333.0,6321.455293,False


# ARIMA

In [16]:
from statsmodels.tsa.arima.model import ARIMA
from timeseries import Var

In [11]:
train_df = df[ df["id"] < 300000000].groupby(pd.Grouper(freq="1s")).last()
test_df = df[ df["id"] >= 300000000].groupby(pd.Grouper(freq="1s")).last()

In [12]:
model = ARIMA(train_df["price"], order=(5,1,0))
model_fit = model.fit()

In [14]:
model_fit.predict(test_df)

TypeError: Cannot convert input [                            id     price     qty     quoteQty  isBuyerMaker
time                                                                       
2021-05-09 17:00:48  364797039  0.492767  2108.0  1038.752836         False
2021-05-09 17:00:49  364797416  0.492652  1115.0   549.306980          True
2021-05-09 17:00:50  364797653  0.492550   198.0    97.524900         False
2021-05-09 17:00:51  364797915  0.492097    81.0    39.859857         False
2021-05-09 17:00:52  364798135  0.492009  1054.0   518.577486          True
...                        ...       ...     ...          ...           ...
2021-05-09 17:27:53  365632792  0.472868   632.0   298.852576          True
2021-05-09 17:27:54  365633049  0.473796   215.0   101.866140          True
2021-05-09 17:27:55  365633578  0.474177    43.0    20.389611          True
2021-05-09 17:27:56  365633940  0.473853    33.0    15.637149         False
2021-05-09 17:27:57  365633999  0.474065    17.0     8.059105          True

[1630 rows x 5 columns]] of type <class 'pandas.core.frame.DataFrame'> to Timestamp

# Decision tree

In [32]:
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree

In [33]:
n_look_back = 5
skip_look_back = 5
skip_look_forword = 10

In [34]:
df

Unnamed: 0_level_0,id,price,qty,quoteQty,isBuyerMaker
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-04-17 11:07:04.135,171131000,0.308076,30.0,9.242280,False
2021-04-17 11:07:04.145,171131001,0.308060,480.0,147.868800,True
2021-04-17 11:07:04.145,171131002,0.308059,438.0,134.929842,True
2021-04-17 11:07:04.145,171131003,0.307953,19709.0,6069.445677,True
2021-04-17 11:07:04.145,171131004,0.307952,20.0,6.159040,True
...,...,...,...,...,...
2021-05-09 17:27:57.481,365633995,0.474102,16000.0,7585.632000,False
2021-05-09 17:27:57.481,365633996,0.474109,240.0,113.786160,False
2021-05-09 17:27:57.481,365633997,0.474120,26.0,12.327120,False
2021-05-09 17:27:57.481,365633998,0.474121,13333.0,6321.455293,False


In [35]:
def clean_decision_tree_data(df, is_class):
    decision_tree_df = pd.DataFrame(df["price"].groupby(pd.Grouper(freq="1s")).last())
    decision_tree_df["quoteQty"] = df["quoteQty"].groupby(pd.Grouper(freq="1s")).sum()
    
    decision_tree_df[f"percent_price"] = decision_tree_df["price"].pct_change(skip_look_back)
    for n in range(1, n_look_back + 1):
        decision_tree_df[f"percent_price_t-{n * skip_look_back}"] = decision_tree_df["percent_price"].shift(n * skip_look_back)
        decision_tree_df[f"quoteQty_t-{n * skip_look_back}"] = decision_tree_df["quoteQty"].shift(n * skip_look_back)
        
    decision_tree_df[f"percent_price_{skip_look_forword}"] = decision_tree_df["price"].pct_change(skip_look_forword)
    decision_tree_df[f"percent_price_t+{skip_look_forword}"] = decision_tree_df[f"percent_price_{skip_look_forword}"].shift(-skip_look_forword)

    if is_class:
        decision_tree_df[f"percent_price_t+{skip_look_forword}"] = decision_tree_df[f"percent_price_t+{skip_look_forword}"].apply(lambda v: 1 if v > 0 else 0)
        
    decision_tree_df = decision_tree_df.drop(["price", f"percent_price_{skip_look_forword}"], axis=1)
    decision_tree_df = decision_tree_df.dropna()
    
    return decision_tree_df.drop([f"percent_price_t+{skip_look_forword}"], axis=1), decision_tree_df[[f"percent_price_t+{skip_look_forword}"]]

In [36]:
train_x_df, train_y_df = clean_decision_tree_data(df[ df["id"] < 300000000], is_class=True)
test_x_df, test_y_df = clean_decision_tree_data(df[ df["id"] >= 300000000], is_class=True)
# train_x_df[:21]
train_y_df

Unnamed: 0_level_0,percent_price_t+10
time,Unnamed: 1_level_1
2021-04-17 11:07:34,1
2021-04-17 11:07:35,1
2021-04-17 11:07:36,1
2021-04-17 11:07:37,1
2021-04-17 11:07:38,1
...,...
2021-04-17 14:34:02,0
2021-04-17 14:34:03,0
2021-04-17 14:34:04,0
2021-04-17 14:34:05,0


In [37]:
model = DecisionTreeClassifier(max_depth=6)
model.fit(train_x_df, train_y_df)

DecisionTreeClassifier(max_depth=6)

In [39]:
predict_list = model.predict(test_x_df)
Validator.calculate_class_accuracy(test_y_df[f"percent_price_t+{skip_look_forword}"], predict_list)

0.5091318949257028

In [None]:
TreeHelper.export_decision_tree_graph(model, feature_list=test_x_df.columns, output_filename="tree.png")