In [1]:
# system libraries
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
import gc

# data manipulation libraries
import pandas as pd
import dask.dataframe as dd
import numpy as np

# graphical libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# modelisation libraries
from dask_ml.preprocessing import Categorizer, OneHotEncoder, StandardScaler
from dask_ml.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from dask_ml.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor
from dask_ml.linear_model import LinearRegression
from dask_ml.xgboost import XGBRegressor
from dask_ml.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [2]:
# source : https://stackoverflow.com/questions/53165807/how-to-calculate-rmspe-in-python-using-numpy
def rmpse(y_true, y_pred):
    '''
    Compute Root Mean Square Percentage Error between two arrays.
    '''
    loss = np.sqrt(np.mean(np.square(((y_true - y_pred) / y_true))))

    return loss

def regression_metrics(y_test, y_pred):
    """Function which contains differents metrics about regression
    Input: y_test, prediction
    
    Output: MAE, MSE, RMSE, MAPE & R² score  
    """
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    rmpse_cal = rmpse(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print("MAE: ",mae.round(5))
    print("MSE: ", mse.round(5))
    print("RMSE: ", rmse)
    print("RMPSE: ", rmpse_cal.round(3))
    print("MAPE: ", mape.round(5))

In [3]:
dtype_dict = {"H-L" : "float16",
              "O-C" : "float16",
              "MA_7d" : "float16", 
              "MA_14d" : "float16",
              "MA_21d" : "float16",
              "STD_7d" : "float16"}

In [4]:
df = dd.read_csv('Datasets/new_df/*.csv',
                 dtype=dtype_dict)

In [5]:
df = df.drop(["open", "close", "high", "low", "volume",
         "log_open", "log_close", "log_low", "log_high",
         "FRAMA", "RSI"], axis=1)

In [6]:
df = df.set_index("timestamp")

In [7]:
df = df.categorize("asset_name")

In [8]:
train_set = df.loc[:"2021-06-20 23:59:00"]
test_set = df.loc["2021-06-21 00:00:00":]

In [9]:
train_set = train_set.reset_index()
train_set = train_set.drop("timestamp", axis=1)

test_set = test_set.reset_index()
test_set = test_set.drop("timestamp", axis=1)

In [10]:
y_train = train_set["target"]
X_train = train_set.drop("target", axis=1)

y_test = test_set["target"]
X_test = test_set.drop("target", axis=1)

In [11]:
del df, train_set, test_set
gc.collect()

0

In [None]:
type(train_set)

In [12]:
cat_var = ["asset_name"]
num_var = ["H-L", "O-C", "MA_7d", "MA_14d", "MA_21d", "STD_7d"]

In [13]:
cat_pipe = Pipeline([
    ('categorizer', Categorizer()),
    ('encoder', OneHotEncoder(sparse=False))
])

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('scaler', StandardScaler())
])

preprocessing_pipe = ColumnTransformer(
    transformers=[
    ('cat',  cat_pipe, cat_var),
    ('num', num_pipe, num_var)
])

In [14]:
X_train = preprocessing_pipe.fit_transform(X_train)

In [15]:
X_train = X_train.to_dask_array()
y_train = y_train.to_dask_array()

In [16]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

ValueError: failed to initialize intent(inout) array -- input 'O' not compatible to 'd'

In [None]:
X_test = preprocessing_pipe.transform(X_test)
X_test = X_test.to_dask_array()
y_test = y_test.to_dask_array()
y_pred = lin_reg.predict(X_test)
regression_metrics(np.array(y_test), y_pred)

In [None]:
del y_pred, lin_reg

In [None]:
xgboost = XGBRegressor(random_state=42)
xgboost.fit(X_train, y_train)

In [None]:
y_pred = xgboost.predict(X_test)
regression_metrics(np.array(y_test), y_pred)

In [None]:
del y_pred, xgboost