# Projet data - alternative au stage de fin de formation

## G-Research Crypto Forecasting

https://www.kaggle.com/c/g-research-crypto-forecasting

<img src="crypto.png"/>

### Modélisation et prédiction

**Important** : Exécutez le code de la partie 2 `crypto_preparation` avant de continuer.

In [49]:
import numpy as np
import pandas as pd
import pickle as pk
import matplotlib.pyplot as plt
import matplotlib.dates as md
import seaborn as sns
import datetime
import time
from sklearn.metrics import *
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
%matplotlib inline

In [50]:
def load_data(filename):
    """Load every object pickled back-to-back in ``./<filename>.pkl``.

    Args:
        filename: Base name of the pickle file, without directory or the
            ``.pkl`` extension. The file is looked up in the current
            working directory.

    Returns:
        A tuple with the objects in the order they were written to the
        file. An empty file yields an empty tuple.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file content is not a valid pickle
            stream (previously this was silently swallowed and partial
            data was returned).
    """
    # NOTE(security): unpickling can execute arbitrary code; only use this
    # on files produced by a trusted source.
    objects = []
    with open("./" + filename + ".pkl", 'rb') as f:
        unpickler = pk.Unpickler(f)
        while True:
            try:
                objects.append(unpickler.load())
            except EOFError:
                # Normal termination: the stream has no more objects.
                break
    return tuple(objects)

In [51]:
# Each pickle file is expected to hold exactly one object: the one-element
# tuple unpacking fails with ValueError if load_data returns any other count.
(df,) = load_data("crypto_df")
(d_assets_1_year,) = load_data("crypto_d_assets_1_year")
# Preview the first rows of the "Asset_Data" table stored under key 0.
d_assets_1_year[0]["Asset_Data"].head()

Unnamed: 0_level_0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Upper_Shadow,Lower_Shadow,Log_Return_1min
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1600552800,0.0,319.0,27.3176,27.3346,27.29,27.3084,3401.2153,27.317935,-0.002849,0.017,0.0184,2e-05
1600552860,0.0,169.0,27.308,27.3399,27.2958,27.32445,1756.3163,27.318485,-0.001419,0.01545,0.0122,2e-05
1600552920,0.0,170.0,27.3175,27.3367,27.2489,27.2773,1031.057,27.293355,-8.4e-05,0.0192,0.0284,-0.00092
1600552980,0.0,153.0,27.2705,27.281,27.2151,27.2511,1623.7244,27.270969,0.001622,0.0105,0.036,-0.000821
1600553040,0.0,112.0,27.2333,27.262,27.2106,27.2587,914.269,27.231585,0.002324,0.0033,0.0227,-0.001445


#### Séparation en données de train et de test

Prenons une période de 4 mois pour le train et de 1 mois pour le test (la date de fin de chaque période est exclue).<br>
`train : du 01-02-2021 au 01-06-2021`<br>
`test : du 01-06-2021 au 01-07-2021`

In [69]:
def get_train_data(df, start='2021-02-01', end='2021-06-01'):
    """Return the rows of ``df`` whose timestamp falls in ``[start, end)``.

    The index of ``df`` is interpreted as Unix timestamps in seconds.

    Side effect: a ``"datetime"`` column holding the converted index is
    added to (or overwritten on) ``df`` itself; the returned frame carries
    that column too.

    Args:
        df: DataFrame indexed by epoch seconds.
        start: Inclusive lower bound of the window; anything comparable
            with a datetime column (e.g. a date string or Timestamp).
        end: Exclusive upper bound of the window.

    Returns:
        The subset of ``df`` inside the window (training period by default).
    """
    df["datetime"] = pd.to_datetime(df.index, unit='s')
    in_window = (df["datetime"] >= start) & (df["datetime"] < end)
    return df[in_window]

In [85]:
def get_test_data(df, start='2021-06-01', end='2021-07-01'):
    """Return the rows of ``df`` whose timestamp falls in ``[start, end)``.

    The index of ``df`` is interpreted as Unix timestamps in seconds.

    Side effect: a ``"datetime"`` column holding the converted index is
    added to (or overwritten on) ``df`` itself; the returned frame carries
    that column too.

    Args:
        df: DataFrame indexed by epoch seconds.
        start: Inclusive lower bound of the window; anything comparable
            with a datetime column (e.g. a date string or Timestamp).
        end: Exclusive upper bound of the window.

    Returns:
        The subset of ``df`` inside the window (test period by default).
    """
    df["datetime"] = pd.to_datetime(df.index, unit='s')
    in_window = (df["datetime"] >= start) & (df["datetime"] < end)
    return df[in_window]

In [86]:
# Take the table stored under key 1 (presumably Bitcoin, going by the
# variable name — confirm against the asset dictionary) and split it.
df_bitcoin = d_assets_1_year[1]["Asset_Data"]
df_bitcoin_train = get_train_data(df_bitcoin)
df_bitcoin_test = get_test_data(df_bitcoin)
# Sanity check: show the first and last timestamp of each split.
print("train:", df_bitcoin_train.iloc[0]["datetime"], "~", df_bitcoin_train.iloc[-1]["datetime"])
print("test:", df_bitcoin_test.iloc[0]["datetime"], "~", df_bitcoin_test.iloc[-1]["datetime"])

train: 2021-02-01 00:00:00 ~ 2021-05-31 23:59:00
test: 2021-06-01 00:00:00 ~ 2021-06-30 23:59:00


S'agissant d'un problème de régression, essayons les modèles suivants et comparons leurs performances :
- `LinearRegression`
- `CatBoostRegressor`
- `LGBMRegressor`
- `XGBRegressor`

In [87]:
# Registry of candidate regressors: display name -> {"type": estimator class,
# "params": keyword arguments passed to its constructor}.
# In each of these libraries -1 requests all available CPU cores.
MODELS = {
    "LinearRegression": { "type": LinearRegression, "params": { "n_jobs": -1 } },
    # CatBoost's documented threading argument is `thread_count`;
    # `num_threads` is LightGBM naming and is not a documented CatBoost
    # constructor argument.
    "CatBoostRegressor": { "type": CatBoostRegressor, "params": { "thread_count": -1 } },
    "LGBMRegressor": { "type": LGBMRegressor, "params": { "n_jobs": -1 } },
    "XGBRegressor": { "type": XGBRegressor, "params": { "n_jobs": -1 } },
}
# Columns fed to the models as explanatory variables, in training order.
# "Target" is deliberately absent: it is the value being predicted.
FEATURES = [
    # Per-minute candle columns.
    "Count", "Open", "High", "Low", "Close", "Volume", "VWAP",
    # Additional columns (presumably derived in the preparation notebook).
    "Upper_Shadow", "Lower_Shadow", "Log_Return_1min",
]

In [88]:
def create_model(df_train, df_test, model_type):
    """Fit one estimator on the training frame and bundle it with its data.

    Args:
        df_train: Frame providing the FEATURES columns and "Target" for fitting.
        df_test: Frame providing the same columns for later evaluation.
        model_type: Mapping with a "type" entry (estimator class) and a
            "params" entry (keyword arguments for its constructor).

    Returns:
        A dict with keys "X_train", "y_train", "X_test", "y_test" and
        "model" (the fitted estimator).
    """
    # Slice features/target out of both frames before building the estimator.
    bundle = {
        "X_train": df_train[FEATURES],
        "y_train": df_train["Target"],
        "X_test": df_test[FEATURES],
        "y_test": df_test["Target"],
    }
    estimator_cls = model_type["type"]
    estimator = estimator_cls(**model_type["params"])
    estimator.fit(bundle["X_train"], bundle["y_train"])
    bundle["model"] = estimator
    return bundle

In [89]:
def weighted_correlation(expected, predicted, weights, absolute=True):
    """Compute the weighted Pearson correlation between two series.

    All three inputs are flattened with ``np.ravel`` and must have the same
    number of elements.

    Args:
        expected: Ground-truth values.
        predicted: Model predictions.
        weights: Per-sample weights.
        absolute: When True (default, the historical behaviour) return the
            magnitude of the correlation; pass False to keep its sign.

    Returns:
        The weighted correlation coefficient. If either series has zero
        weighted variance the division is 0/0 and NumPy yields ``nan``.
    """
    weights = np.ravel(weights)
    expected = np.ravel(expected)
    predicted = np.ravel(predicted)
    sum_weights = np.sum(weights)
    # Deviations from the weighted means.
    dev_expected = expected - np.sum(expected * weights) / sum_weights
    dev_predicted = predicted - np.sum(predicted * weights) / sum_weights
    var_expected = np.sum(weights * np.square(dev_expected)) / sum_weights
    var_predicted = np.sum(weights * np.square(dev_predicted)) / sum_weights
    # Centered covariance: avoids the cancellation error of E[xy] - E[x]E[y]
    # when the values are large compared with their spread.
    cov = np.sum(weights * dev_expected * dev_predicted) / sum_weights
    corr = cov / np.sqrt(var_expected * var_predicted)
    return abs(corr) if absolute else corr

In [90]:
# Baseline: fit one LinearRegression per asset and report its score on the
# test window.
for asset in d_assets_1_year.values():
    df_train = get_train_data(asset["Asset_Data"])
    df_test = get_test_data(asset["Asset_Data"])
    linreg_model = create_model(df_train, df_test, MODELS["LinearRegression"])
    y_pred = linreg_model["model"].predict(linreg_model["X_test"])
    # NOTE(review): the weight vector is the asset's weight repeated for every
    # sample, so (assuming Asset_Weight is a scalar) the weighting cancels out
    # and this is the plain correlation for that asset.
    wcorr = weighted_correlation(linreg_model["y_test"], y_pred, np.repeat(asset["Asset_Weight"], len(y_pred)))
    # Output columns: asset id, asset name, correlation as returned by
    # weighted_correlation.
    print("%2d - %-16s : %f" % (asset["Asset_ID"], asset["Asset_Name"], wcorr))

 2 - Bitcoin Cash     : 0.017738
 0 - Binance Coin     : 0.019331
 1 - Bitcoin          : 0.015502
 5 - EOS.IO           : 0.005757
 7 - Ethereum Classic : 0.004655
 6 - Ethereum         : 0.016576
 9 - Litecoin         : 0.030998
11 - Monero           : 0.007975
13 - TRON             : 0.087173
12 - Stellar          : 0.077853
 3 - Cardano          : 0.040639
 8 - IOTA             : 0.010734
10 - Maker            : 0.043932
 4 - Dogecoin         : 0.025617
