In [1]:
import pandas as pd
import numpy as np
from packages import wrappers, models, ta
from sklearn.linear_model import LinearRegression

In [2]:
# Load ETH/USD candles through the project's cryptowatch wrapper. The output below shows one
# row per date with Open/High/Low/Close/Volume/QuoteVolume columns.
asset_df = wrappers.get_df_from_cryptowatch(pair='ethusd')
asset_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,QuoteVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-08-08,3.00000,3.0000,0.15000,1.19999,1942.888147,0.000000e+00
2015-08-11,1.19999,1.1500,0.65038,0.99000,9234.568705,0.000000e+00
2015-08-12,0.99000,1.2880,0.90500,1.28800,1736.114983,0.000000e+00
2015-08-13,1.28800,1.8848,1.26280,1.88480,2815.450762,0.000000e+00
2015-08-14,1.88480,2.1000,1.79489,1.79489,5775.809099,0.000000e+00
...,...,...,...,...,...,...
2021-08-17,3147.65000,3292.1000,2995.00000,3011.71000,54077.404853,1.692588e+08
2021-08-18,3011.71000,3128.2900,2952.12000,3014.36000,42430.666413,1.290144e+08
2021-08-19,3014.61000,3189.4900,2961.41000,3184.67000,30748.288424,9.406486e+07
2021-08-20,3184.66000,3300.0000,3178.49000,3286.67000,37076.189352,1.205339e+08


In [3]:
# Day passed to models.prepare_for_model as the day being predicted.
day_predicting = '2021-08-20'
# Third positional argument of models.prepare_for_model; presumably the look-back window
# in days — TODO confirm against the helper.
num_days = 30
# Columns requested as model inputs. 'ATR_14' and 'OBV' are not in the raw candle frame,
# so they are presumably added by models.add_TA — verify.
features = ['Close', 'Volume', 'ATR_14', 'OBV']
# Alternative feature set without the closing price:
# features = ['Volume', 'ATR_14', 'OBV']

In [4]:
# NOTE(review): `prepped_df` is reassigned by the models.prepare_for_model call in the next
# cell before it is read, so the value bound here is never used. That later call requests
# 'ATR_14'/'OBV' from `asset_df`, so add_TA presumably adds the indicator columns to
# `asset_df` in place — confirm against models.add_TA.
prepped_df = models.add_TA(asset_df)

In [5]:
# Build the modelling frame for `day_predicting`. In the output below it has a 'Target'
# column followed by the requested features, and each row's Target equals the next row's
# Close (i.e. it looks like a one-day-ahead target — confirm in models.prepare_for_model).
prepped_df = models.prepare_for_model(asset_df, day_predicting, num_days, features=features, target='Close')

In [6]:
prepped_df

Unnamed: 0_level_0,Target,Close,Volume,ATR_14,OBV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-07-20,1995.0,1785.01,95628.588945,155.463736,4391783.0
2021-07-21,2018.2,1995.0,82066.500928,164.142041,4473850.0
2021-07-22,2124.03,2018.2,41886.085828,159.534038,4515736.0
2021-07-23,2187.0,2124.03,29522.94935,157.598035,4545259.0
2021-07-24,2192.33,2187.0,40409.504502,152.865318,4585668.0
2021-07-25,2229.46,2192.33,24942.156385,148.31851,4610610.0
2021-07-26,2301.73,2229.46,94672.968291,155.969331,4705283.0
2021-07-27,2300.1,2301.73,55647.131254,156.747236,4760931.0
2021-07-28,2381.58,2300.1,44623.9205,152.691005,4716307.0
2021-07-29,2463.82,2381.58,32646.550292,151.53879,4748953.0


# Analysis

https://www.machinelearningplus.com/time-series/time-series-analysis-python/ <br>

This is a guide to time-series analysis in Python. It could be worth exploring further, and it is explored a little in this section.

Here we're checking the autocorrelation (https://en.wikipedia.org/wiki/Autocorrelation) and partial autocorrelation (https://en.wikipedia.org/wiki/Partial_autocorrelation_function) of the data, where each row represents one day (this can change if we decide to use hourly candles). Let's assume that the last lag whose value lies above the blue region is 30. To my understanding, this means that when predicting the price of the asset (ETH/USD in this notebook), I should use the last 30 days (about a month) of data to do so.

In [7]:
# import matplotlib.pyplot as plt

# from statsmodels.tsa.stattools import acf, pacf
# from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# # acf_50 = acf(prepped_df.Close, nlags=50)

# fig, axes = plt.subplots(1, 2, figsize=(16, 3), dpi=100)

# plot_acf(prepped_df.Close, lags=50, ax=axes[0])
# plot_pacf(prepped_df.Close, lags=50, ax=axes[1])

### Approximate and Sample Entropy

https://en.wikipedia.org/wiki/Approximate_entropy <br>
https://en.wikipedia.org/wiki/Sample_entropy

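These are used to gauge how forecastable the data is. `rand_small` and `rand_big` are random series of two different sizes; their entropy values show what the measures look like for random data of each size, for comparison with the price series.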

In [8]:
def ApEn(U, m, r):
    """Compute the approximate entropy (ApEn) of a one-dimensional series.

    Every run of ``m`` consecutive values is compared with every other run
    (self-match included) using the Chebyshev (max-abs) distance; two runs
    match when that distance is at most ``r``. The same is done for runs of
    length ``m + 1`` and the result is ``|phi(m + 1) - phi(m)|``, where
    ``phi`` is the mean log match frequency.

    Parameters
    ----------
    U : array-like
        The series. Lists, numpy arrays and pandas Series are accepted; the
        values are always read positionally, whatever the Series index is.
    m : int
        Template (window) length, at least 1.
    r : float
        Match tolerance, in the same units as ``U``.

    Returns
    -------
    numpy.float64
        The approximate entropy (non-negative).

    Raises
    ------
    ValueError
        If ``m < 1`` or the series has fewer than ``m + 1`` values.
    """
    # Work on a plain float array: integer keys on a date-indexed Series are
    # label lookups (or a deprecated positional fallback), and unsigned
    # integer inputs would wrap around when subtracted.
    series = np.asarray(U, dtype=float).ravel()
    N = series.size
    if m < 1:
        raise ValueError("m must be a positive integer")
    if N <= m:
        raise ValueError(
            "series of length %d is too short for templates of length m + 1 = %d" % (N, m + 1)
        )

    def _phi(width):
        n_windows = N - width + 1
        windows = np.array([series[start:start + width] for start in range(n_windows)])
        # One template at a time keeps memory at O(N * width) instead of O(N^2 * width).
        counts = np.array([
            np.count_nonzero(np.abs(windows - template).max(axis=1) <= r)
            for template in windows
        ])
        return np.log(counts / n_windows).sum() / n_windows

    return abs(_phi(m + 1) - _phi(m))

In [9]:
def SampEn(U, m, r):
    """Compute the sample entropy (SampEn) of a one-dimensional series.

    Counts ordered pairs of distinct templates (runs of consecutive values)
    whose Chebyshev (max-abs) distance is at most ``r``: ``B`` for templates
    of length ``m`` and ``A`` for templates of length ``m + 1``. The result is
    ``-log(A / B)``.

    Parameters
    ----------
    U : array-like
        The series. Lists, numpy arrays and pandas Series are accepted; the
        values are always read positionally, whatever the Series index is.
    m : int
        Template (window) length, at least 1.
    r : float
        Match tolerance, in the same units as ``U``.

    Returns
    -------
    numpy.float64
        The sample entropy; ``inf`` when no templates of length ``m + 1`` match.

    Raises
    ------
    ValueError
        If ``m < 1`` or the series has fewer than ``m + 1`` values.
    ZeroDivisionError
        If no two templates of length ``m`` match, so the ratio is undefined.
    """
    # Work on a plain float array: integer keys on a date-indexed Series are
    # label lookups (or a deprecated positional fallback), and unsigned
    # integer inputs would wrap around when subtracted.
    series = np.asarray(U, dtype=float).ravel()
    N = series.size
    if m < 1:
        raise ValueError("m must be a positive integer")
    if N <= m:
        raise ValueError(
            "series of length %d is too short for templates of length m + 1 = %d" % (N, m + 1)
        )

    def _count_matches(width):
        n_windows = N - width + 1
        windows = np.array([series[start:start + width] for start in range(n_windows)])
        total = 0
        for i, template in enumerate(windows):
            close = np.abs(windows - template).max(axis=1) <= r
            close[i] = False  # a template never counts as a match for itself
            total += int(np.count_nonzero(close))
        return total

    B = _count_matches(m)
    if B == 0:
        raise ZeroDivisionError(
            "no matching templates of length m within tolerance r; sample entropy is undefined"
        )
    A = _count_matches(m + 1)
    # A == 0 legitimately yields +inf; silence numpy's divide-by-zero warning for log(0).
    with np.errstate(divide='ignore'):
        return -np.log(A / B)

In [10]:
# rand_small = np.random.randint(0, 100, size=36)
# rand_big = np.random.randint(0, 100, size=136)

# print(ApEn(prepped_df.Close, m=2, r=0.2*np.std(prepped_df.Close)))
# print(ApEn(rand_small, m=2, r=0.2*np.std(rand_small)))
# print(ApEn(rand_big, m=2, r=0.2*np.std(rand_big)))
# print()
# print(SampEn(prepped_df.Close, m=2, r=0.2*np.std(prepped_df.Close)))
# print(SampEn(rand_small, m=2, r=0.2*np.std(rand_small)))
# print(SampEn(rand_big, m=2, r=0.2*np.std(rand_big)))

# Regression

In [11]:
# Split the prepared frame into train/test features and targets. In the output below the
# test set is a single row (2021-08-19) — presumably always the last row; confirm in
# models.get_train_test.
Xtrain, Xtest, ytrain, ytest = models.get_train_test(prepped_df)
display(Xtrain, ytrain, Xtest, ytest)

Unnamed: 0_level_0,Close,Volume,ATR_14,OBV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-07-20,1785.01,95628.588945,155.463736,4391783.0
2021-07-21,1995.0,82066.500928,164.142041,4473850.0
2021-07-22,2018.2,41886.085828,159.534038,4515736.0
2021-07-23,2124.03,29522.94935,157.598035,4545259.0
2021-07-24,2187.0,40409.504502,152.865318,4585668.0
2021-07-25,2192.33,24942.156385,148.31851,4610610.0
2021-07-26,2229.46,94672.968291,155.969331,4705283.0
2021-07-27,2301.73,55647.131254,156.747236,4760931.0
2021-07-28,2300.1,44623.9205,152.691005,4716307.0
2021-07-29,2381.58,32646.550292,151.53879,4748953.0


Unnamed: 0_level_0,Target
Date,Unnamed: 1_level_1
2021-07-20,1995.0
2021-07-21,2018.2
2021-07-22,2124.03
2021-07-23,2187.0
2021-07-24,2192.33
2021-07-25,2229.46
2021-07-26,2301.73
2021-07-27,2300.1
2021-07-28,2381.58
2021-07-29,2463.82


Unnamed: 0_level_0,Close,Volume,ATR_14,OBV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-08-19,3184.67,30748.288424,207.393331,5187510.0


Unnamed: 0_level_0,Target
Date,Unnamed: 1_level_1
2021-08-19,3286.67


# Backtester class

TODO: eventually add different types of metrics to evaluate the backtester.

In [92]:
class Backtester:
    """Backtest a one-step-ahead linear regression over a run of consecutive days.

    For each of the ``num_days_backtest`` days ending at ``end_date`` (walking
    backwards one day at a time), a modelling frame is built with
    ``models.prepare_for_model``, a ``LinearRegression`` is fitted on the
    training split and the held-out row is predicted.

    Parameters
    ----------
    df : pandas.DataFrame
        Candle data handed to ``models.add_TA`` and ``models.prepare_for_model``.
    end_date : str or datetime-like
        Last day to predict; parsed with ``pd.to_datetime``.
    num_days_backtest : int
        Number of consecutive days (and therefore models) to evaluate.
    data_num_days : int
        Passed to ``models.prepare_for_model`` as its ``num_days`` argument.
    features : sequence of str, optional
        Feature columns for the model. Defaults to ``DEFAULT_FEATURES``. The
        prepared frame must contain a 'Close' column, which is used as the
        baseline price for the percentage changes.

    Attributes (filled by ``train_models``, one entry per backtested day)
    ----------
    trained_models : fitted regressors
    dataframes : prepared modelling frames
    predictions : predicted target values
    model_performance : actual % change minus predicted % change, both
        measured from the last 'Close' of the prepared frame
    """

    DEFAULT_FEATURES = ('Close', 'Volume', 'ATR_14', 'OBV')

    def __init__(self, df, end_date, num_days_backtest=365, data_num_days=365, features=None):
        self.end_date = pd.to_datetime(end_date)
        self.num_days_backtest = num_days_backtest
        self.data_num_days = data_num_days
        self.df = df
        self.features = list(features) if features is not None else list(self.DEFAULT_FEATURES)
        self.trained_models = []
        self.dataframes = []
        self.predictions = []
        self.model_performance = []

    def train_models(self):
        """Fit one model per backtested day and record predictions and errors.

        Results from any previous call are discarded first, so re-running the
        cell does not append duplicate entries.
        """
        self.trained_models = []
        self.dataframes = []
        self.predictions = []
        self.model_performance = []

        # The indicators do not depend on the day being predicted, so compute them once.
        # NOTE(review): assumes add_TA enriches self.df in place (its return value was
        # already being discarded here) — confirm against models.add_TA.
        models.add_TA(self.df)

        for i in range(self.num_days_backtest):
            day_predicting = self.end_date - pd.to_timedelta(i, 'D')
            prepped_df = models.prepare_for_model(self.df, day_predicting, self.data_num_days, features=self.features, target='Close')
            self.dataframes.append(prepped_df)
            Xtrain, Xtest, ytrain, ytest = models.get_train_test(prepped_df)
            lr = LinearRegression().fit(Xtrain, ytrain)
            self.trained_models.append(lr)
            # ravel() handles both a 2-D (DataFrame target) and a 1-D (Series target) prediction.
            prediction = np.ravel(lr.predict(Xtest))[0]
            self.predictions.append(prediction)
            # Purely positional access; an integer key on a date-indexed Series is deprecated.
            last_close = prepped_df['Close'].iloc[-1]
            actual = ytest.iloc[0, 0]
            # Actual - Predicted, both as % change from the last known close.
            self.model_performance.append(get_pct_change(last_close, actual) - get_pct_change(last_close, prediction))

In [93]:
def get_pct_change(initial, final):
    """Return the change from ``initial`` to ``final`` as a percentage of ``initial``."""
    delta = final - initial
    ratio = delta / initial
    return ratio * 100

In [94]:
# Backtest the 100 days ending at end_date, preparing each model's data with data_num_days=30.
# NOTE(review): end_date is one day after the last row shown for asset_df (2021-08-20) —
# confirm that models.prepare_for_model handles a day with no candle as intended.
bt = Backtester(asset_df, end_date='2021-08-21', num_days_backtest=100, data_num_days=30)
bt.train_models()

In [100]:
# Standard deviation of (actual % change - predicted % change) across the backtested days.
# A spread of about 8 percentage points on a one-day-ahead prediction is terrible.
np.std(bt.model_performance)

8.295950329874033

# Training model

In [20]:
# Fit on the training split and predict the single held-out row.
lr = LinearRegression().fit(Xtrain, ytrain)
prediction = lr.predict(Xtest)[0][0]
actual = ytest.iloc[0, 0]
# Baseline for both percentage changes: the last Close in the prepared frame.
# `.iloc[-1]` is purely positional; the earlier `.iloc[[-1]][0]` used an integer key on a
# date-indexed Series, which pandas deprecates.
last_close = prepped_df['Close'].iloc[-1]
predicted_pct = round(get_pct_change(last_close, prediction), 2)
actual_pct = round(get_pct_change(last_close, actual), 2)

In [21]:
# Report the predicted and actual target values, each with its % change from the last Close.
print('Prediction:', prediction, 'or', predicted_pct, '%')
print('Actual:', actual, 'or', actual_pct, '%')

Prediction: 3210.3809124460804 or 0.81 %
Actual: 3286.67 or 3.2 %


In [22]:
# display(prediction)
# display(actual)

# Model performance

In [23]:
from sklearn.metrics import mean_squared_error

# print(mean_squared_error(ytest, prediction))

# Testing Documentation

In [24]:
# ?models.get_train_test