Predictive modeling for BTC closing price (We'll explore both a multiple linear regression and a logistic linear regression model)

In [1]:
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
%matplotlib inline
import keras
print(tf.__version__)

2.4.1


Data obtained from Yahoo Finance (https://finance.yahoo.com/quote/BTC-USD/history?p=BTC-USD)

In [7]:
df = pd.read_csv("BTC-USD.csv")
df.head(3)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,ClosedHigherThanOpen,CTHO_Label
0,2/2/2021,35025,35858,33556,35641,35641,62920052736,Yes,1
1,2/1/2021,33115,34638,32384,33537,33537,61400400660,Yes,1
2,1/31/2021,34271,34288,32270,33114,33114,52754542671,No,0


In [8]:
df.columns

Index(['Date', ' Open ', ' High ', ' Low ', ' Close ', ' Adj Close ',
       ' Volume ', 'ClosedHigherThanOpen', ' CTHO_Label '],
      dtype='object')

In [None]:
Set the Close price as our dependent variable (y), Independent variables (X) = (Open, High, Low). Note for BTC, Close price is pretty much same as Adjusted close price

In [9]:
X = df[[' Open ', ' High ', ' Low ' ]]
y = df[' Adj Close ']

Multiple Linear Regression to predict BTC closing price

In [10]:
reg_model = linear_model.LinearRegression()
reg_model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
reg_model.coef_

array([-0.34669217,  0.77447896,  0.57469282])

In [12]:
reg_model.intercept_

-41.83925612980056

In [13]:
reg_model.predict([[37174, 37409, 35448]])

array([36414.42036689])

Logistic Regression to predict if Closing BTC Price is Higher than Open price (1 for "YES", 0 for "NO")

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

In [15]:
X1 = df[[' Open ', ' High ', ' Low ' ]]
X2 = df[[' Open ']]
y1 = df[' CTHO_Label ']
X1, X2, y1 = shuffle(X1, X2, y1)
X_train, X_test, y_train, y_test = train_test_split(X2, y1, test_size = 0.1)
#X_train, y_train = shuffle(X_train, y_train)
#X_test, y_test = shuffle(X_test, y_test)

In [16]:
len(y_train)
len(y_test)

37

In [17]:
log_model = LogisticRegression()

In [18]:
log_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
log_model.predict(X_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [20]:
log_model.score(X_test, y_test)

0.5945945945945946

In [21]:
dummies = pd.get_dummies(df['ClosedHigherThanOpen'])
merged = pd.concat([df, dummies], axis='columns')
merged

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,ClosedHigherThanOpen,CTHO_Label,No,Yes
0,2/2/2021,35025,35858,33556,35641,35641,62920052736,Yes,1,0,1
1,2/1/2021,33115,34638,32384,33537,33537,61400400660,Yes,1,0,1
2,1/31/2021,34271,34288,32270,33114,33114,52754542671,No,0,1,0
3,1/30/2021,34296,34835,32940,34270,34270,65141828798,No,0,1,0
4,1/29/2021,34319,38406,32065,34316,34316,117894572511,No,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
358,2/6/2020,9618,9825,9540,9730,9730,37628823716,Yes,1,0,1
359,2/5/2020,9183,9701,9164,9613,9613,35222060874,Yes,1,0,1
360,2/4/2020,9293,9331,9113,9181,9181,29893183716,No,0,1,0
361,2/3/2020,9345,9540,9249,9294,9294,30934096509,No,0,1,0


In [22]:
finals = merged.drop(['ClosedHigherThanOpen'], axis = 'columns')
finals

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,CTHO_Label,No,Yes
0,2/2/2021,35025,35858,33556,35641,35641,62920052736,1,0,1
1,2/1/2021,33115,34638,32384,33537,33537,61400400660,1,0,1
2,1/31/2021,34271,34288,32270,33114,33114,52754542671,0,1,0
3,1/30/2021,34296,34835,32940,34270,34270,65141828798,0,1,0
4,1/29/2021,34319,38406,32065,34316,34316,117894572511,0,1,0
...,...,...,...,...,...,...,...,...,...,...
358,2/6/2020,9618,9825,9540,9730,9730,37628823716,1,0,1
359,2/5/2020,9183,9701,9164,9613,9613,35222060874,1,0,1
360,2/4/2020,9293,9331,9113,9181,9181,29893183716,0,1,0
361,2/3/2020,9345,9540,9249,9294,9294,30934096509,0,1,0


In [23]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
df2 = df
df2['ClosedHigherThanOpen'] = le.fit_transform(df['ClosedHigherThanOpen'])
df2

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,ClosedHigherThanOpen,CTHO_Label
0,2/2/2021,35025.0,35858.0,33556.0,35641.0,35641.0,6.292005e+10,1,1
1,2/1/2021,33115.0,34638.0,32384.0,33537.0,33537.0,6.140040e+10,1,1
2,1/31/2021,34271.0,34288.0,32270.0,33114.0,33114.0,5.275454e+10,0,0
3,1/30/2021,34296.0,34835.0,32940.0,34270.0,34270.0,6.514183e+10,0,0
4,1/29/2021,34319.0,38406.0,32065.0,34316.0,34316.0,1.178946e+11,0,0
...,...,...,...,...,...,...,...,...,...
358,2/6/2020,9618.0,9825.0,9540.0,9730.0,9730.0,3.762882e+10,1,1
359,2/5/2020,9183.0,9701.0,9164.0,9613.0,9613.0,3.522206e+10,1,1
360,2/4/2020,9293.0,9331.0,9113.0,9181.0,9181.0,2.989318e+10,0,0
361,2/3/2020,9345.0,9540.0,9249.0,9294.0,9294.0,3.093410e+10,0,0


# **Build a Neural network**

In [24]:
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy

In [25]:
X1 = df[[' Open ', ' High ', ' Low ' ]]
y1 = df[' CTHO_Label ']
X1, y1 = shuffle(X1, y1)
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size = 0.1)

In [26]:
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.fit_transform(X_test)

In [27]:
len(scaled_X_train)

326

In [28]:
model = Sequential([
    Dense(units=16, input_shape=(3,), activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=2, activation='softmax')
])

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                64        
_________________________________________________________________
dense_1 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
Total params: 674
Trainable params: 674
Non-trainable params: 0
_________________________________________________________________


In [29]:
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [30]:
model.fit(x=scaled_X_train, y=y_train, validation_split=0.1, epochs=30, verbose=2)

Epoch 1/30
10/10 - 1s - loss: 0.6951 - accuracy: 0.4846 - val_loss: 0.6958 - val_accuracy: 0.4848
Epoch 2/30
10/10 - 0s - loss: 0.6911 - accuracy: 0.5597 - val_loss: 0.6992 - val_accuracy: 0.4848
Epoch 3/30
10/10 - 0s - loss: 0.6891 - accuracy: 0.5597 - val_loss: 0.7030 - val_accuracy: 0.4848
Epoch 4/30
10/10 - 0s - loss: 0.6878 - accuracy: 0.5597 - val_loss: 0.7060 - val_accuracy: 0.4848
Epoch 5/30
10/10 - 0s - loss: 0.6878 - accuracy: 0.5597 - val_loss: 0.7061 - val_accuracy: 0.4848
Epoch 6/30
10/10 - 0s - loss: 0.6870 - accuracy: 0.5597 - val_loss: 0.7075 - val_accuracy: 0.4848
Epoch 7/30
10/10 - 0s - loss: 0.6871 - accuracy: 0.5597 - val_loss: 0.7046 - val_accuracy: 0.4848
Epoch 8/30
10/10 - 0s - loss: 0.6868 - accuracy: 0.5597 - val_loss: 0.7037 - val_accuracy: 0.4848
Epoch 9/30
10/10 - 0s - loss: 0.6867 - accuracy: 0.5597 - val_loss: 0.7050 - val_accuracy: 0.4848
Epoch 10/30
10/10 - 0s - loss: 0.6864 - accuracy: 0.5597 - val_loss: 0.7048 - val_accuracy: 0.4848
Epoch 11/30
10/10 -

<tensorflow.python.keras.callbacks.History at 0x7fe423ae9690>

In [31]:
predictions = model.predict(x=scaled_X_test, batch_size=10, verbose=0) 

In [32]:
len(scaled_X_test)

37

In [33]:
for i in predictions:
  print(i)

[0.43593282 0.5640672 ]
[0.45011866 0.5498814 ]
[0.44967556 0.5503244 ]
[0.4344739  0.56552607]
[0.43733567 0.56266433]
[0.44244102 0.557559  ]
[0.44386593 0.55613405]
[0.4494698  0.55053025]
[0.44631773 0.5536822 ]
[0.44714263 0.55285734]
[0.44937643 0.5506236 ]
[0.44982255 0.55017745]
[0.44794437 0.55205566]
[0.44880098 0.5511991 ]
[0.43968076 0.56031924]
[0.4434792 0.5565209]
[0.44541648 0.5545836 ]
[0.45050448 0.54949546]
[0.4531628  0.54683715]
[0.44818282 0.5518172 ]
[0.44952327 0.55047673]
[0.43401212 0.5659879 ]
[0.44401124 0.5559888 ]
[0.44809586 0.55190414]
[0.4377689  0.56223106]
[0.4471219  0.55287814]
[0.44488052 0.55511945]
[0.44220498 0.55779505]
[0.45054957 0.5494504 ]
[0.44194338 0.55805665]
[0.4485149 0.5514851]
[0.45055044 0.54944956]
[0.4445131  0.55548686]
[0.4446231 0.5553769]
[0.44767904 0.5523209 ]
[0.45146036 0.54853964]
[0.44554812 0.5544518 ]
