# G-Research Cryptocurrencies: Modelling

- Utils.

In [None]:
# system libraries
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
import gc

# data manipulation libraries
import pandas as pd
import numpy as np

# graphical libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# modelisation libraries
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [None]:
def regression_metrics(y_test, y_pred):
    """Print the standard regression error metrics for a set of predictions.

    Input:
        y_test: ground-truth target values.
        y_pred: predicted values, aligned with ``y_test``.

    Output: nothing is returned; MAE, MSE, RMSE, MAPE and the R² score are
    printed, each rounded to 5 decimals.
    """
    # Imported locally so the function does not depend on an edit to the
    # notebook's import cell.
    from sklearn.metrics import r2_score

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # float() + built-in round() works whether the metric comes back as a
    # NumPy scalar or a plain Python float (the latter has no .round method).
    print("MAE: ", round(float(mae), 5))
    print("MSE: ", round(float(mse), 5))
    print("RMSE: ", round(float(rmse), 5))
    print("MAPE: ", round(float(mape), 5))
    print("R2: ", round(float(r2), 5))

In [None]:
# Directory prefix for the per-asset parquet files; it is string-concatenated
# with the file names when the data is loaded, hence the trailing slash.
path = 'Datasets/new_df/'

## 1) Data Preprocessing

In [None]:
# Load the two per-asset frames.
btc = pd.read_parquet(path + "bitcoin.parquet.gzip")
lite = pd.read_parquet(path + "litecoin.parquet.gzip")

# DataFrame has no .concat method; stacking rows is done with pd.concat.
# The original index is kept because the train/test split filters on it.
merged_df = pd.concat([btc, lite])
print(merged_df)

In [None]:
# The per-asset frames are no longer needed once merged; drop the references
# and ask the garbage collector to reclaim the memory.
del btc
del lite
gc.collect()

In [None]:
# Replace every missing value with 0 before feature selection.
merged_df = merged_df.fillna(0)

In [None]:
# Remove the raw price/volume columns and the long-window indicators that the
# machine-learning models do not use.
# "asset_name" is deliberately NOT dropped: it is the categorical feature
# listed in cat_var and one-hot encoded by the preprocessing pipeline, which
# would fail on a missing column otherwise.
merged_df = merged_df.drop(
    columns=[
        "vwap", "count",
        "open", "close", "high",
        "low", "volume", "log_open",
        "FRAMA", "RSI", "log_close",
        "log_low", "log_high",
        "MA_7d", "MA_14d", "MA_21d",
        "STD_7d",
    ]
)

In [None]:
# Chronological split on the index: everything up to 2021-06-20 23:59 is used
# for training, everything from 2021-06-21 00:00 onwards for testing.
# The index is moved to a "timestamp" column and discarded so it is not a feature.
train_set = (
    merged_df.loc[merged_df.index <= "2021-06-20 23:59:00"]
    .reset_index()
    .drop(columns="timestamp")
)
test_set = (
    merged_df.loc[merged_df.index >= "2021-06-21 00:00:00"]
    .reset_index()
    .drop(columns="timestamp")
)

# Separate the features from the "target" column for each split.
X_train, y_train = train_set.drop(columns="target"), train_set["target"]
X_test, y_test = test_set.drop(columns="target"), test_set["target"]

In [None]:
# Only X_*/y_* are needed from here on; release the intermediate frames.
del train_set, test_set, merged_df
gc.collect()

In [None]:
# Column routed to the categorical branch of the preprocessing.
cat_var = ["asset_name"]

# Columns routed to the numeric branch of the preprocessing.
num_var = [
    "H-L",
    "O-C",
    "MA_7min.",
    "MA_14min.",
    "MA_21min.",
    "STD_7min.",
]

In [None]:
# Categorical branch: one-hot encode the categorical columns.
cat_pipe = Pipeline(steps=[("encoder", OneHotEncoder())])

# Numeric branch: mean-impute missing values, then standardise.
num_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Route each column group to its branch.
preprocessing_pipe = ColumnTransformer(
    transformers=[
        ("cat", cat_pipe, cat_var),
        ("num", num_pipe, num_var),
    ]
)

In [None]:
# Fit the preprocessing on the training features and overwrite X_train with the
# transformed output. Not idempotent: re-running this cell would feed the
# already-transformed data back into the pipeline.
X_train = preprocessing_pipe.fit_transform(X_train)

## 2) Modelling with Machine Learning methods

### a) Linear Regression

In [None]:
# Ordinary least squares on the preprocessed training data.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

In [None]:
# Apply the already-fitted preprocessing to the test features (transform only,
# no refit) and overwrite X_test with the result.
X_test = preprocessing_pipe.transform(X_test)

In [None]:
# Predict the target on the test features with the fitted linear model.
y_pred = lin_reg.predict(X_test)

In [None]:
# Print the error metrics of the linear regression on the test set.
regression_metrics(y_test, y_pred)

In [None]:
# The linear model and its predictions are no longer needed.
del y_pred
del lin_reg
gc.collect()

### b) Random Forest

In [None]:
# Random forest with default hyper-parameters; the fixed random_state keeps the
# fitted model repeatable.
rf_reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_reg

In [None]:
# Predict on the test features with the fitted forest and print its error metrics.
y_pred = rf_reg.predict(X_test)
regression_metrics(y_test, y_pred)

In [None]:
# End of the machine-learning section: release the model, its predictions and
# the train/test matrices before the deep-learning part reloads the data.
del y_pred, rf_reg, X_train, X_test, y_train, y_test
gc.collect()

## 3) Modelling with Deep Learning methods

### a) ANN according to Vijh et al. (2020)

In [None]:
from tensorflow import keras
import tensorflow as tf
import datetime

In [None]:
# Reload the Bitcoin file only; the name merged_df is reused here even though a
# single asset is loaded.
merged_df = pd.read_parquet(path+"bitcoin.parquet.gzip")

In [None]:
# Replace every missing value with 0 before building the network inputs.
merged_df = merged_df.fillna(0)

In [None]:
# Remove the asset label and the raw price/volume/log columns; every remaining
# column except "target" becomes an input of the network.
merged_df = merged_df.drop(
    columns=[
        "asset_name", "vwap", "count",
        "open", "close", "high",
        "low", "volume", "log_open",
        "FRAMA", "RSI", "log_close",
        "log_low", "log_high",
    ]
)

In [None]:
# Training window: every row up to and including 2021-06-20 23:59.
# The index is moved to a "timestamp" column and discarded so it is not a feature.
train_features = (
    merged_df.loc[merged_df.index <= "2021-06-20 23:59:00"]
    .reset_index()
    .drop(columns="timestamp")
)

# Split off the label, then hand Keras a float32 array of the remaining columns.
train_labels = train_features["target"]
train_features = np.asarray(train_features.drop(columns="target")).astype("float32")

In [None]:
# Test window: every row from 2021-06-21 00:00 onwards, i.e. strictly after the
# training window. (Using the training mask here would score the network on the
# very rows it was fitted on.)
test_features = merged_df.loc[merged_df.index >= "2021-06-21 00:00:00"]
test_features = test_features.reset_index()
test_features = test_features.drop("timestamp", axis=1)

# Split off the label, then convert the remaining columns to a float32 array.
test_labels = test_features["target"]
test_features = test_features.drop("target", axis=1)
test_features = np.asarray(test_features).astype("float32")

In [None]:
# Feature-wise normalisation layer (last axis); its statistics are computed
# from the training features only.
normalizer = keras.layers.Normalization(axis=-1)
normalizer.adapt(train_features)

In [None]:
def build_and_compile_model(data):
    """Build and compile a small fully-connected regression network.

    Input: data, the layer placed first in the network (the notebook passes
    the adapted normalisation layer).

    Output: a compiled keras.Sequential model made of `data`, a 3-unit ReLU
    hidden layer and a single linear output unit, trained with mean absolute
    error and Adam (learning rate 0.001).
    """
    network = keras.Sequential()
    network.add(data)
    network.add(keras.layers.Dense(3, activation="relu"))
    network.add(keras.layers.Dense(1))

    adam = tf.keras.optimizers.Adam(0.001)
    network.compile(loss="mean_absolute_error", optimizer=adam)
    return network

In [None]:
# Seed TensorFlow's global random generator before the layers are created so
# that repeated runs start from the same state, in line with the fixed
# random_state=42 used for the random forest.
tf.random.set_seed(42)

dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()

In [None]:
%%time

history = dnn_model.fit(
    train_features,
    train_labels,
    validation_split = 0.2,
    verbose = 1, epochs = 100)

In [None]:
# Flatten the network output into one prediction per test row.
test_predictions = dnn_model.predict(test_features).flatten()

# Predicted vs. true values on a square plot; points on the diagonal are
# perfect predictions.
# NOTE(review): the axis labels say USD; confirm that this is the unit of "target".
lims = [0, 0.5]
fig, ax = plt.subplots()
ax.set_aspect('equal')
ax.scatter(test_labels, test_predictions)
ax.set_xlabel('True Values [USD]')
ax.set_ylabel('Predictions [USD]')
ax.set_xlim(lims)
ax.set_ylim(lims)
_ = ax.plot(lims, lims)

In [None]:
# Signed prediction error (prediction minus truth) and its distribution.
error = test_predictions - test_labels

fig, ax = plt.subplots()
ax.hist(error, bins=25)
ax.set_xlabel('Prediction Error [USD]')
_ = ax.set_ylabel('Count')

In [None]:
# regression_metrics expects (true values, predictions). The order matters for
# MAPE, which divides by the true values.
regression_metrics(test_labels, test_predictions)