In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from datetime import date
import statsmodels.api as sm
from scipy import stats
from scipy.special import inv_boxcox
from math import fabs

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data loading

In [None]:

# Read the price table and discard the "Unnamed: 0" column.
csv_path = '../input/bitcoin-prices-in-usd-from-20120201to20210411/Bitcoin_USD_2012-02-01_2021-04-11.csv'
df = pd.read_csv(csv_path).drop(columns="Unnamed: 0")

# Parse the dates and promote them to the index, so that the frame can be
# sliced and resampled by time in the cells below.
df = df.assign(Date=pd.to_datetime(df["Date"])).set_index("Date")

df.head()

# Some Plots, and data interpretation

There have been three halvings in the history of Bitcoin. Let's mark them on the price plot.

In [None]:
# Keep the axes returned by pandas so the halving markers are drawn on the same plot.
ax = df['Price USD'].plot()

# Use Timestamps rather than datetime.date objects: they are the native label type of
# the DatetimeIndex, so the .loc lookup does not depend on date-to-Timestamp casting.
halving_dates = pd.to_datetime(["2012-11-28", "2016-07-09", "2020-06-11"])
halving_prices = df.loc[halving_dates, "Price USD"]

# Markers only (no connecting line), drawn through pandas so both series share
# the same date axis handling.
halving_prices.plot(ax=ax, marker="o", linewidth=0);

About one year after each halving there is a price spike that creates a local high for the coming years. Let's consider the post-halving periods in more detail.

In [None]:
fig = plt.figure(figsize=[10, 24])

# (subplot position, window start, window end, panel title).
# Both bounds are exclusive; an end of None leaves the window open-ended.
panels = [
    (311, "2012-11-28", "2014-11-28", "after first halving"),
    (312, "2016-07-09", "2018-07-09", "after second halving"),
    (313, "2020-06-11", None, "after third halving"),
]

for position, start, end, title in panels:
    plt.subplot(position)
    in_window = df.index > start
    if end is not None:
        in_window = in_window & (df.index < end)
    df.loc[in_window, "Price USD"].plot()
    a = plt.title(title)


# Statistics tests and data transformations

Resample the daily data to monthly means and run the ADF (augmented Dickey-Fuller) test.

In [None]:
# Collapse the daily series to one mean value per calendar month.
month_df = df.resample('M').mean()
print(month_df.shape)

# Element 1 of the adfuller result tuple is the p-value.
adf_pvalue = sm.tsa.stattools.adfuller(month_df["Price USD"])[1]
print("ADF test: p=%f" % adf_pvalue)

The time series is not stationary because of the trend and the non-constant variance. To check whether the variance really changes over time, we use the Bartlett test.

In [None]:
# Use the stretches between consecutive halvings as the comparable samples.
# Both window bounds are exclusive; the third sample runs to the end of the data.
monthly_price = month_df["Price USD"]
months = month_df.index

first_arr = monthly_price[(months > "2012-11-28") & (months < "2016-07-09")].to_numpy()
second_arr = monthly_price[(months > "2016-07-09") & (months < "2020-06-11")].to_numpy()
third_arr = monthly_price[months > "2020-06-11"].to_numpy()

# Bartlett test for equal variances across the three samples
stat, p = stats.bartlett(first_arr, second_arr, third_arr)
print("Bartlett P-value: " + str(p))

# Sample variance (ddof=1) of each section
print('Variances:')
variances = [np.var(sample, ddof=1) for sample in (first_arr, second_arr, third_arr)]
print(variances)

So the variance is statistically non-constant, and the variance estimates suggest that it increases monotonically. To correct for this, we can use the Box-Cox transform.

In [None]:
# Box-Cox transform of the monthly prices; lmbda is the fitted transform parameter
# and is needed later to invert the transform.
transformed, lmbda = stats.boxcox(month_df['Price USD'])
month_df['Box-Cox transformed'] = transformed

fig = plt.figure(figsize = [15, 10])

# Original series on top, transformed series below.
for position, column in ((211, 'Price USD'), (212, 'Box-Cox transformed')):
    plt.subplot(position)
    month_df[column].plot()

# Repeat the between-halvings comparison on the transformed series.
monthly_bc = month_df["Box-Cox transformed"]
months = month_df.index

first_arr = monthly_bc[(months > "2012-11-28") & (months < "2016-07-09")].to_numpy()
second_arr = monthly_bc[(months > "2016-07-09") & (months < "2020-06-11")].to_numpy()
third_arr = monthly_bc[months > "2020-06-11"].to_numpy()

stat, p = stats.bartlett(first_arr, second_arr, third_arr)
print("Bartlett P-value: " + str(p))

print('Variances:')
variances = [np.var(sample, ddof=1) for sample in (first_arr, second_arr, third_arr)]
print(variances)

# Element 1 of the adfuller result tuple is the p-value.
bc_adf_pvalue = sm.tsa.stattools.adfuller(monthly_bc)[1]
print("ADF test: p=%f" % bc_adf_pvalue)

Now it is much better. But the mean (mathematical expectation) is still not constant. Let's fix this with first-order differencing.

In [None]:
# First-order difference of the transformed series. The first value is NaN
# (it has no predecessor), so it is skipped when running the ADF test.
month_df['diffs1'] = month_df['Box-Cox transformed'].diff()
diffs_adf_pvalue = sm.tsa.stattools.adfuller(month_df["diffs1"].iloc[1:])[1]
print("ADF test: p=%f" % diffs_adf_pvalue)

fig = plt.figure(figsize = [15, 5])
a = month_df['diffs1'].plot()

It can now be assumed that the data is stationary.

# Model identification

Let's check the ACF and PACF plots.

In [None]:

fig = plt.figure(figsize = [15, 10])

# Skip the first value of the differenced series: it is NaN.
stationary_diffs = month_df['diffs1'][1:]

# plt.subplot takes the three-digit position as an integer; the string form
# ('211') is rejected by current matplotlib releases.
ax = plt.subplot(211)
a = sm.graphics.tsa.plot_acf(stationary_diffs, lags=48, ax = ax)

ax = plt.subplot(212)
a = sm.graphics.tsa.plot_pacf(stationary_diffs, lags=48, ax = ax)

There is a notable PACF spike at lag 47, but there is not enough data to build a model with 4-year seasonality. I have no idea about the orders, so let's check the AIC values of different models.

In [None]:
# this line will spend some time, the best order is (4,8) = 141.21
#test = sm.tsa.arma_order_select_ic(month_df['diffs1'][1:],max_ar=10, max_ma=10, ic='aic')
#test

I see no reason to suspect seasonality in the data, so let's stay with a simple ARIMA model.

# Model building and quality check

A helper function for calculating the approximation error (the mean absolute error relative to the actual values); `MLEResults.mae` does not work for me.

In [None]:
def mae_recompute(y, y_est):
    """Return the mean absolute relative error of ``y_est`` with respect to ``y``.

    Despite the name, each absolute error is divided by the actual value, so
    the result is a mean absolute percentage error expressed as a fraction
    (0.1 means 10 %), not a plain mean absolute error.

    Parameters
    ----------
    y : pandas.Series
        Actual values. Its index defines which observations are scored.
    y_est : pandas.Series
        Estimated values. It must contain every label of ``y.index``; extra
        labels are ignored.

    Returns
    -------
    float
        ``mean(|(y - y_est) / y|)`` over the labels of ``y``. A NaN in either
        series makes the result NaN, and a zero in ``y`` makes it inf or NaN.

    Raises
    ------
    ValueError
        If ``y`` is empty.
    KeyError
        If a label of ``y.index`` is missing from ``y_est``.
    """
    if len(y.index) == 0:
        raise ValueError("mae_recompute needs at least one observation in y")

    actual = y.to_numpy(dtype=float)
    # Align the estimates to y by label; .loc raises KeyError on a missing label.
    estimate = y_est.loc[y.index].to_numpy(dtype=float)

    # Plain numpy mean on purpose: unlike Series.mean it does not skip NaN values.
    return float(np.mean(np.abs((actual - estimate) / actual)))

I have consistently excluded weakly significant coefficients from the model until only significant ones remain.

In [None]:
# ARIMA(0, 1, q) on the Box-Cox series with MA terms at lags 1 and 7 only.
model=sm.tsa.statespace.SARIMAX(month_df['Box-Cox transformed'], order=(0, 1, [1,7])).fit(disp=-1)

# Predict over exactly the in-sample range. The end position is derived from the
# data instead of being hard-coded, so it stays correct if the dataset length changes.
last_obs = len(month_df) - 1
result_box_cox = model.predict(start=0, end=last_obs)
print('mean absolute error by myself: ' + str(mae_recompute(month_df['Box-Cox transformed'] ,result_box_cox)))

Let's look at the correlogram of the residuals, as it may suggest which additional coefficients are worth including.

In [None]:
fig = plt.figure(figsize = [15, 10])

# plt.subplot takes the three-digit position as an integer; the string form
# ('211') is rejected by current matplotlib releases.
ax = plt.subplot(211)
a = sm.graphics.tsa.plot_acf(model.resid, lags=48, ax = ax)

ax = plt.subplot(212)
a = sm.graphics.tsa.plot_pacf(model.resid, lags=48, ax = ax)

#q_test = sm.tsa.stattools.acf(model.resid, qstat=True)

You can see quite a few lags that stand out. The PACF helps to select additional AR coefficients, and the ACF helps to select new MA coefficients. I tried adding AR terms at lags 21, 26, 28 and 40, but none of them turned out to be significant, while lag 47 was significant and improved the performance of the model. In addition, I tried a number of MA coefficients; only lag 47 turned out to be significant.

In [None]:
# Final specification: AR term at lag 47, first differencing, MA terms at lags 1, 7 and 47.
model=sm.tsa.statespace.SARIMAX(month_df['Box-Cox transformed'], order=([47], 1, [1,7,47])).fit(disp=-1)

print(model.summary())

# In-sample predictions, computed once and reused. The end position is derived
# from the data rather than hard-coded.
last_obs = len(month_df) - 1
result_box_cox = model.predict(start=0, end=last_obs)
# Undo the Box-Cox transform to get predictions on the original price scale.
result = inv_boxcox(result_box_cox, lmbda)

print('mean absolute error box-cox: ' + str(mae_recompute(month_df['Box-Cox transformed'] ,result_box_cox)))
print('mean absolute error: ' + str(mae_recompute(month_df['Price USD'] ,result)))

figure = plt.figure(figsize = [20,20])

# plt.subplot takes the three-digit position as an integer; the string form
# ('211') is rejected by current matplotlib releases.
plt.subplot(211)

month_df['Box-Cox transformed'].plot()
result_box_cox.plot(color = 'r' , ls = '--')
plt.title("box-cox data")

plt.subplot(212)
month_df['Price USD'].plot()
result.plot(color = 'r' , ls = '--')

Generally speaking, this result confirms a seasonality of about 4 years (48 months). The halvings simply occurred slightly less than 4 years apart, which is why the dependence shows up at a lag of 47 months.

Let's check the residuals of the resulting model

In [None]:
# With qstat=True, acf returns the autocorrelations together with the
# Ljung-Box Q statistics and their p-values.
q_test = sm.tsa.stattools.acf(model.resid, qstat=True)
acf_values, q_stats, p_values = q_test

# Drop the lag-0 autocorrelation so the three columns have the same length.
q_table = pd.DataFrame({'ACF': acf_values[1:], 'Q-stat': q_stats, 'p-value': p_values})
print(q_table)

Nice result - the residuals are likely to be white noise. We have our final model.

# Prediction

Let's confirm the four-year seasonality once more with a long-term point forecast.

In [None]:
# Forecast from the last observed month up to position 250 of the extended monthly index.
pred_end = 250
# Derived from the data rather than hard-coded, so it always points at the last observation.
forecast_start = len(month_df) - 1

pred_box_cox = model.get_prediction(start = forecast_start, end = pred_end)
# Map the predicted mean back to the original price scale.
poind_pred = inv_boxcox(pred_box_cox.predicted_mean, lmbda)

figure = plt.figure(figsize = [15,15])

# plt.subplot takes the three-digit position as an integer; the string form
# ('212') is rejected by current matplotlib releases.
plt.subplot(212)
month_df['Price USD'].plot()
poind_pred.plot(color = 'r' , ls = '--')

The point forecast suggests that the next price boom will start in 2024, so we will make a more detailed forecast for the period before then.

In [None]:
# Forecast from the last observed month up to position 143 of the extended monthly index.
pred_end = 143
# Derived from the data rather than hard-coded, so it always points at the last observation.
forecast_start = len(month_df) - 1

pred_box_cox = model.get_prediction(start = forecast_start, end = pred_end)
poind_pred = inv_boxcox(pred_box_cox.predicted_mean, lmbda)

# 95 % interval on the Box-Cox scale, computed once and then mapped back to prices.
conf_int_box_cox = pred_box_cox.conf_int(alpha=0.05)
upper_pred = inv_boxcox(conf_int_box_cox['upper Box-Cox transformed'], lmbda)
lower_pred = inv_boxcox(conf_int_box_cox['lower Box-Cox transformed'], lmbda)

figure = plt.figure(figsize = [15,10])

# plt.subplot takes the three-digit position as an integer; the string form
# ('111') is rejected by current matplotlib releases.
ax = plt.subplot(111)

# Observed prices since the third halving, the point forecast, and the interval band.
plt.plot(month_df.loc[month_df.index > "2020-06-11"]["Price USD"])
plt.plot(poind_pred, ls = '--', color = 'red')

plt.fill_between(upper_pred.index, upper_pred, lower_pred, alpha = 0.5)
plt.show()

print(pd.concat([poind_pred, upper_pred, lower_pred], axis = 1))

According to the model, the current cycle will peak in December 2021 at an expected price of about 171.08 thousand USD, with a 95 percent probability of lying between 66.16 thousand and 405.4 thousand. Obviously, it is around this point that the cryptocurrency will be most profitable to sell.
The next bottom should be expected in December 2022. On average, the price should then be around 144.63 thousand USD per bitcoin, and with a 95 percent probability it will lie between 16.06 thousand and 581.89 thousand. This downturn is the best time to invest in bitcoin.