In [64]:
import pandas as pd
import numpy as np
from packages import wrappers, models, ta
from sklearn.linear_model import LinearRegression

In [65]:
# Load the ETH/USD price history through the project's Cryptowatch wrapper.
# The frame displayed below is indexed by Date and carries the
# Open/High/Low/Close/Volume/QuoteVolume columns.
asset_df = wrappers.get_df_from_cryptowatch(pair='ethusd')
asset_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,QuoteVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-08-08,3.00000,3.0000,0.15000,1.19999,1942.888147,0.000000e+00
2015-08-11,1.19999,1.1500,0.65038,0.99000,9234.568705,0.000000e+00
2015-08-12,0.99000,1.2880,0.90500,1.28800,1736.114983,0.000000e+00
2015-08-13,1.28800,1.8848,1.26280,1.88480,2815.450762,0.000000e+00
2015-08-14,1.88480,2.1000,1.79489,1.79489,5775.809099,0.000000e+00
...,...,...,...,...,...,...
2021-08-21,3286.66000,3310.0000,3210.21000,3226.33000,29906.733236,9.759180e+07
2021-08-22,3226.51000,3274.7100,3131.00000,3240.90000,19105.965266,6.121164e+07
2021-08-23,3241.16000,3381.4700,3234.59000,3322.69000,54892.600367,1.825813e+08
2021-08-24,3323.08000,3360.0000,3150.00000,3173.55000,36608.576253,1.186916e+08


In [66]:
# Parameters for the single-model experiment further down.
# NOTE(review): the held-out row shown in the split output is 2021-08-23 (its
# Target is the 2021-08-24 close) — confirm how models.prepare_for_model
# interprets this date.
day_predicting = '2021-08-25'
# Window length handed to models.prepare_for_model.
num_days = 30
# Columns of the TA-enriched frame used as model inputs.
features = ['Close', 'Volume', 'ATR_14', 'OBV']
# Alternative feature set without the raw closing price:
# features = ['Volume', 'ATR_14', 'OBV']

In [67]:
# Add the technical-analysis columns (e.g. ATR_14, OBV) that are used as
# features below. The return value is deliberately not bound to a name: the
# next cell reads `asset_df` itself and assigns `prepped_df` from scratch, so
# binding it here only left a misleading, immediately-overwritten variable.
# NOTE(review): this relies on add_TA enriching `asset_df` in place — confirm.
models.add_TA(asset_df);

In [68]:
# Build the model-ready frame: the selected feature columns plus a `Target`
# column derived from 'Close'.
prepped_df = models.prepare_for_model(
    asset_df,
    day_predicting,
    num_days,
    features=features,
    target='Close',
)

In [69]:
# Inspect the prepared frame (Target followed by the selected feature columns).
prepped_df

Unnamed: 0_level_0,Target,Close,Volume,ATR_14,OBV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-07-24,2192.33,2187.0,40409.504502,152.865318,4585668.0
2021-07-25,2229.46,2192.33,24942.156385,148.31851,4610610.0
2021-07-26,2301.73,2229.46,94672.968291,155.969331,4705283.0
2021-07-27,2300.1,2301.73,55647.131254,156.747236,4760931.0
2021-07-28,2381.58,2300.1,44623.9205,152.691005,4716307.0
2021-07-29,2463.82,2381.58,32646.550292,151.53879,4748953.0
2021-07-30,2530.53,2463.82,48801.772976,151.590305,4797755.0
2021-07-31,2557.45,2530.53,33361.97334,150.084569,4831117.0
2021-08-01,2609.6,2557.45,70557.168652,152.444243,4901674.0
2021-08-02,2507.34,2609.6,41724.944457,152.802511,4943399.0


# Analysis

https://www.machinelearningplus.com/time-series/time-series-analysis-python/ <br>

This is a guide to time-series analysis for machine learning; it could be worth exploring further and is explored a little in the cell below.

Here we're checking the autocorrelation (https://en.wikipedia.org/wiki/Autocorrelation) and partial autocorrelation (https://en.wikipedia.org/wiki/Partial_correlation) of the data, where each row represents one day (this can change if we decide to use hourly candles). Let's assume that the last lag whose value lies above the blue region is 30. To my understanding, this means that to predict the price of BTC I should use the last 30 days (one month) of data.

In [70]:
# import matplotlib.pyplot as plt

# from statsmodels.tsa.stattools import acf, pacf
# from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# # acf_50 = acf(prepped_df.Close, nlags=50)

# fig, axes = plt.subplots(1, 2, figsize=(16, 3), dpi=100)

# plot_acf(prepped_df.Close, lags=50, ax=axes[0])
# plot_pacf(prepped_df.Close, lags=50, ax=axes[1])

### Approximate and Sample Entropy

https://en.wikipedia.org/wiki/Approximate_entropy <br>
https://en.wikipedia.org/wiki/Sample_entropy

These are used to see how forecastable the data is. `rand_small` and `rand_big` are random series of two different sizes; their entropy values serve as a baseline to compare the price series against.

In [71]:
def ApEn(U, m, r):
    """Compute the approximate entropy of a one-dimensional series.

    Parameters
    ----------
    U : sequence of numbers (list, NumPy array or pandas Series)
        The series to analyse. It is converted to a float NumPy array, so
        samples are always read by position, whatever index a Series carries.
    m : int
        Length of the runs (templates) that are compared.
    r : float
        Tolerance: two templates match when the largest absolute difference
        between their corresponding samples is <= r.

    Returns
    -------
    float
        ``abs(phi(m + 1) - phi(m))``, where ``phi(k)`` is the mean log of the
        fraction of length-``k`` templates matching each template (a template
        always matches itself).
    """
    # Indexing a pandas Series with integers is label-based: a date index only
    # works through a deprecated positional fallback and an integer index that
    # does not start at 0 raises KeyError. A plain array avoids both.
    samples = np.asarray(U, dtype=float)
    N = len(samples)

    def _maxdist(x_i, x_j):
        # Largest element-wise absolute difference between two templates.
        return max(abs(ua - va) for ua, va in zip(x_i, x_j))

    def _phi(m):
        n_templates = N - m + 1
        x = [samples[i:i + m] for i in range(n_templates)]
        C = [
            len([1 for x_j in x if _maxdist(x_i, x_j) <= r]) / (N - m + 1.0)
            for x_i in x
        ]
        return (N - m + 1.0) ** (-1) * sum(np.log(C))

    return abs(_phi(m + 1) - _phi(m))

In [72]:
def SampEn(U, m, r):
    """Compute the sample entropy of a one-dimensional series.

    Parameters
    ----------
    U : sequence of numbers (list, NumPy array or pandas Series)
        The series to analyse. It is converted to a float NumPy array, so
        samples are always read by position, whatever index a Series carries.
    m : int
        Length of the shorter templates; they are compared against templates
        of length ``m + 1``.
    r : float
        Tolerance: two templates match when the largest absolute difference
        between their corresponding samples is <= r.

    Returns
    -------
    float
        ``-log(A / B)`` where ``B`` and ``A`` are the numbers of ordered pairs
        of distinct templates of length ``m`` and ``m + 1`` that match.

    Raises
    ------
    ZeroDivisionError
        If no pair of length-``m`` templates matches (``B == 0``).

    Notes
    -----
    All ``N - m + 1`` templates of length ``m`` are counted here.
    NOTE(review): some definitions of sample entropy use only the first
    ``N - m`` of them — confirm which variant is wanted.
    """
    # Indexing a pandas Series with integers is label-based: a date index only
    # works through a deprecated positional fallback and an integer index that
    # does not start at 0 raises KeyError. A plain array avoids both.
    samples = np.asarray(U, dtype=float)
    N = len(samples)

    def _maxdist(x_i, x_j):
        # Largest element-wise absolute difference between two templates.
        return max(abs(ua - va) for ua, va in zip(x_i, x_j))

    def _phi(m):
        x = [samples[i:i + m] for i in range(N - m + 1)]
        # Count ordered pairs (i, j), i != j, whose templates match. The
        # counts are kept as Python ints so an empty denominator still raises.
        return sum(
            1
            for i in range(len(x))
            for j in range(len(x))
            if i != j and _maxdist(x[i], x[j]) <= r
        )

    return -np.log(_phi(m + 1) / _phi(m))

In [73]:
# rand_small = np.random.randint(0, 100, size=36)
# rand_big = np.random.randint(0, 100, size=136)

# print(ApEn(prepped_df.Close, m=2, r=0.2*np.std(prepped_df.Close)))
# print(ApEn(rand_small, m=2, r=0.2*np.std(rand_small)))
# print(ApEn(rand_big, m=2, r=0.2*np.std(rand_big)))
# print()
# print(SampEn(prepped_df.Close, m=2, r=0.2*np.std(prepped_df.Close)))
# print(SampEn(rand_small, m=2, r=0.2*np.std(rand_small)))
# print(SampEn(rand_big, m=2, r=0.2*np.std(rand_big)))

# Regression

In [74]:
# Split the prepared frame into feature/target train and test parts and show
# all four pieces. In the output below the test part is a single row.
Xtrain, Xtest, ytrain, ytest = models.get_train_test(prepped_df)
display(Xtrain, ytrain, Xtest, ytest)

Unnamed: 0_level_0,Close,Volume,ATR_14,OBV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-07-24,2187.0,40409.504502,152.865318,4585668.0
2021-07-25,2192.33,24942.156385,148.31851,4610610.0
2021-07-26,2229.46,94672.968291,155.969331,4705283.0
2021-07-27,2301.73,55647.131254,156.747236,4760931.0
2021-07-28,2300.1,44623.9205,152.691005,4716307.0
2021-07-29,2381.58,32646.550292,151.53879,4748953.0
2021-07-30,2463.82,48801.772976,151.590305,4797755.0
2021-07-31,2530.53,33361.97334,150.084569,4831117.0
2021-08-01,2557.45,70557.168652,152.444243,4901674.0
2021-08-02,2609.6,41724.944457,152.802511,4943399.0


Unnamed: 0_level_0,Target
Date,Unnamed: 1_level_1
2021-07-24,2192.33
2021-07-25,2229.46
2021-07-26,2301.73
2021-07-27,2300.1
2021-07-28,2381.58
2021-07-29,2463.82
2021-07-30,2530.53
2021-07-31,2557.45
2021-08-01,2609.6
2021-08-02,2507.34


Unnamed: 0_level_0,Close,Volume,ATR_14,OBV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-08-23,3322.69,54892.600367,187.308239,5268678.0


Unnamed: 0_level_0,Target
Date,Unnamed: 1_level_1
2021-08-23,3173.55


# Backtester class

**TODO:** eventually add different types of metrics to evaluate the backtester.

In [75]:
class Backtester:
    """Fit one LinearRegression per day over a range of past days.

    For each of the ``num_days_backtest`` days ending at ``end_date`` a frame
    is prepared with ``models.prepare_for_model``, split with
    ``models.get_train_test``, a model is fitted on the train part and its
    prediction for the test row is compared with the realised target.

    Parameters
    ----------
    df : DataFrame
        Price history handed to ``models.add_TA`` and
        ``models.prepare_for_model``.
    end_date : str or datetime-like
        Most recent day to predict; parsed with ``pd.to_datetime``.
    num_days_backtest : int, default 365
        Number of consecutive days (going backwards from ``end_date``) to run.
    data_num_days : int, default 365
        Window length passed to ``models.prepare_for_model`` for every day.
    features : sequence of str, optional
        Feature columns passed to ``models.prepare_for_model``. Defaults to
        ``DEFAULT_FEATURES``.

    Results are stored in four parallel lists, one entry per backtested day,
    ordered from ``end_date`` backwards: ``dataframes``, ``trained_models``,
    ``predictions`` and ``model_performance``.
    """

    # Feature columns used when the caller does not supply any.
    DEFAULT_FEATURES = ('Close', 'Volume', 'ATR_14', 'OBV')

    def __init__(self, df, end_date, num_days_backtest=365, data_num_days=365, features=None):
        self.end_date = pd.to_datetime(end_date)
        self.num_days_backtest = num_days_backtest
        self.data_num_days = data_num_days
        self.df = df
        # Copy into a fresh list so instances never share or alias the default.
        self.features = list(self.DEFAULT_FEATURES) if features is None else list(features)
        self.trained_models = []
        self.dataframes = []
        self.predictions = []
        self.model_performance = []

    def backtest_pipeline(self):
        """Placeholder for the full backtest pipeline; not implemented yet."""
        pass

    def train_models(self):
        """Run the backtest and (re)populate the result lists.

        ``model_performance`` holds, per day, the actual percentage move minus
        the predicted percentage move, both measured from the last 'Close' of
        that day's prepared frame (so the frame must contain a 'Close' column).
        """
        # Start from fresh lists so that re-running this method (e.g.
        # re-executing the notebook cell) does not append duplicate results.
        self.trained_models = []
        self.dataframes = []
        self.predictions = []
        self.model_performance = []

        # Does not depend on the loop variable, so it is done once up front.
        # The return value is ignored, as before: the call is relied on for its
        # effect on self.df — NOTE(review): assumes add_TA is idempotent.
        models.add_TA(self.df)

        for i in range(self.num_days_backtest):
            day_predicting = self.end_date - pd.to_timedelta(i, 'D')

            prepped_df = models.prepare_for_model(
                self.df,
                day_predicting,
                self.data_num_days,
                features=self.features,
                target='Close',
            )
            self.dataframes.append(prepped_df)

            Xtrain, Xtest, ytrain, ytest = models.get_train_test(prepped_df)
            lr = LinearRegression().fit(Xtrain, ytrain)
            self.trained_models.append(lr)

            prediction = lr.predict(Xtest)[0][0]
            self.predictions.append(prediction)

            # Positional scalar access. The previous `.iloc[[-1]][0]` looked up
            # the integer key 0 on a date-indexed Series, which only works via
            # pandas' deprecated positional fallback.
            last_close = prepped_df['Close'].iloc[-1]
            # get_pct_change is a module-level helper resolved at call time.
            actual_pct = get_pct_change(last_close, ytest.iloc[0, 0])
            predicted_pct = get_pct_change(last_close, prediction)
            self.model_performance.append(actual_pct - predicted_pct)  # Actual - Predicted

In [76]:
def get_pct_change(initial, final):
    """Return the percentage change when going from ``initial`` to ``final``.

    The result is expressed in percent, e.g. 4 -> 5 gives 25.0 and
    8 -> 6 gives -25.0.
    """
    delta = final - initial
    fraction = delta / initial
    return fraction * 100

In [77]:
# Backtest the 365 days ending on 2021-08-21, passing a 30-day data window to
# the preparation step for every day, then fit all the per-day models.
bt = Backtester(asset_df, end_date='2021-08-21', num_days_backtest=365, data_num_days=30)
bt.train_models()

In [78]:
# Standard deviation of the per-day (actual % move - predicted % move) errors,
# in percentage points.
np.std(bt.model_performance)

6.759556802578524

In [79]:
# Frames are appended from end_date backwards, so the last one belongs to the
# earliest backtested day.
bt.dataframes[-1]

Unnamed: 0_level_0,Target,Close,Volume,ATR_14,OBV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-07-22,275.59,264.53,63931.610631,9.210214,1097409.0
2020-07-23,279.42,275.59,110724.228391,10.056627,1208134.0
2020-07-24,305.89,279.42,85495.71058,10.719725,1293629.0
2020-07-25,311.52,305.89,97505.640281,12.134031,1391135.0
2020-07-26,322.44,311.52,101680.138722,12.649457,1492815.0
2020-07-27,317.43,322.44,183362.194921,13.644496,1676177.0
2020-07-28,318.18,317.43,96578.728282,14.161318,1579599.0
2020-07-29,335.25,318.18,58271.031875,14.020509,1637870.0
2020-07-30,347.0,335.25,83409.870958,15.041901,1721279.0
2020-07-31,387.57,347.0,74692.385767,15.465337,1795972.0


# Training model

In [80]:
# Fit a single model on the split created above and express both its
# prediction and the realised target as a % move from the last close.
lr = LinearRegression().fit(Xtrain, ytrain)
prediction = lr.predict(Xtest)[0][0]
actual = ytest.iloc[0, 0]
# Positional scalar access, computed once. The previous `.iloc[[-1]][0]`
# looked up the integer key 0 on a date-indexed Series, which only works via
# pandas' deprecated positional fallback.
last_close = prepped_df['Close'].iloc[-1]
predicted_pct = round(get_pct_change(last_close, prediction), 2)
actual_pct = round(get_pct_change(last_close, actual), 2)

In [81]:
# Report the predicted and realised values next to their % move from the
# last close.
print('Prediction:', prediction, 'or', predicted_pct, '%')
print('Actual:', actual, 'or', actual_pct, '%')

Prediction: 3250.821766832203 or -2.16 %
Actual: 3173.55 or -4.49 %


In [82]:
# display(prediction)
# display(actual)

# Model performance

In [83]:
from sklearn.metrics import mean_squared_error

# print(mean_squared_error(ytest, prediction))

# Testing Documentation

In [84]:
# ?models.get_train_test