## G-Research Crypto Forecasting

https://www.kaggle.com/c/g-research-crypto-forecasting

In [1]:
import numpy as np
import pandas as pd
import pickle as pk
import datetime
import time
from sklearn.metrics import *
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

### Exploration

In [2]:
TRAIN_CSV = "../input/g-research-crypto-forecasting/train.csv"
ASSET_CSV = "../input/g-research-crypto-forecasting/asset_details.csv"

In [3]:
df_assets = pd.read_csv(ASSET_CSV)

In [4]:
types = {
    "row_id"   : 'int32',
    "Asset_ID" : 'int8',
    "Count"    : 'int32',
    "Open"     : 'float64',
    "High"     : 'float64',
    "Low"      : 'float64',
    "Close"    : 'float64',
    "Volume"   : 'float64',
    "VWAP"     : 'float64',
}
df = pd.read_csv(TRAIN_CSV, dtype=types)
df = df.replace([np.inf, -np.inf], np.nan).ffill().bfill()

In [5]:
def extract_asset_and_fill_gaps(df, asset_id):
    asset_df = df[df["Asset_ID"] == asset_id].set_index("timestamp")
    asset_df = asset_df.reindex(np.array(range(asset_df.index[0], asset_df.index[-1] + 60, 60))).ffill()
    return asset_df

assets = {}
for i, row in df_assets.iterrows():
    asset_id = row["Asset_ID"]
    asset_name = row["Asset_Name"]
    asset_weight = row["Weight"]
    asset_df = extract_asset_and_fill_gaps(df, asset_id)
    assets[asset_id] = {
        "Asset_ID": asset_id,
        "Asset_Name": asset_name,
        "Asset_Weight": asset_weight,
        "Asset_Data": asset_df,
    }

In [7]:
def date_to_timestamp(date):
    return np.int32(time.mktime(datetime.datetime.strptime(date, "%Y-%m-%d").timetuple()))

assets_1_year = {}
for asset in assets.values():
    assets_1_year[asset["Asset_ID"]] = {
        "Asset_ID": asset["Asset_ID"],
        "Asset_Name": asset["Asset_Name"],
        "Asset_Weight": asset["Asset_Weight"],
        "Asset_Data": asset["Asset_Data"].loc[date_to_timestamp("2020-09-20"):date_to_timestamp("2021-09-20")],
    }

### Preparation

In [8]:
def get_upper_shadow(df):
    return df.High - df[["Open", "Close"]].max(axis=1)

def get_lower_shadow(df):
    return df[["Open", "Close"]].min(axis=1) - df.Low

for asset_id in assets_1_year.keys():
    assets_1_year[asset_id]["Asset_Data"]["Upper_Shadow"] = get_upper_shadow(asset["Asset_Data"])
    assets_1_year[asset_id]["Asset_Data"]["Lower_Shadow"] = get_lower_shadow(asset["Asset_Data"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assets_1_year[asset_id]["Asset_Data"]["Upper_Shadow"] = get_upper_shadow(asset["Asset_Data"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assets_1_year[asset_id]["Asset_Data"]["Lower_Shadow"] = get_lower_shadow(asset["Asset_Data"])


### Prediction

In [9]:
MODELS = {
    "LinearRegression": { "type": LinearRegression, "params": { "n_jobs": -1 } },
    "CatBoostRegressor": { "type": CatBoostRegressor, "params": { "thread_count": -1, "verbose": 0 } },
    "LGBMRegressor": { "type": LGBMRegressor, "params": { "n_jobs": -1 } },
    "XGBRegressor": { "type": XGBRegressor, "params": { "n_jobs": -1 } },
}
FEATURES = [
    "Count",
    "Open",
    "High",
    "Low",
    "Close",
    "Volume",
    "VWAP",
    "Upper_Shadow",
    "Lower_Shadow",
]

In [10]:
def get_train_data(df):
    return df[(pd.to_datetime(df.index, unit='s') >= '2021-02-01') & (pd.to_datetime(df.index, unit='s') < '2021-06-01')]

def get_test_data(df):
    return df[(pd.to_datetime(df.index, unit='s') >= '2021-06-01') & (pd.to_datetime(df.index, unit='s') < '2021-07-01')]

def create_model(df_train, df_test, model_type):
    X_train = df_train[FEATURES]
    y_train = df_train["Target"]
    X_test = df_test[FEATURES]
    y_test = df_test["Target"]
    model = model_type["type"](**model_type["params"])
    model.fit(X_train, y_train)
    return { "X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test, "model": model }

def weighted_correlation(expected, predicted, weights):
    weights = np.ravel(weights)
    expected = np.ravel(expected)
    predicted = np.ravel(predicted)
    sum_weights = np.sum(weights)
    mean_expected = np.sum(expected * weights) / sum_weights
    mean_predicted = np.sum(predicted * weights) / sum_weights
    var_expected = np.sum(weights * np.square(expected - mean_expected)) / sum_weights
    var_predicted = np.sum(weights * np.square(predicted - mean_predicted)) / sum_weights
    cov = np.sum((expected * predicted * weights)) / np.sum(weights) - mean_expected * mean_predicted
    corr = cov / np.sqrt(var_expected * var_predicted)
    return corr

In [11]:
results = {}
for asset in assets_1_year.values():
    print(asset["Asset_Name"])
    df_train = get_train_data(asset["Asset_Data"])
    df_test = get_test_data(asset["Asset_Data"])
    results[asset["Asset_ID"]] = { "Asset_Name": asset["Asset_Name"] }
    for model_name, model in MODELS.items():
        reg_model = create_model(df_train, df_test, model)
        y_pred = reg_model["model"].predict(reg_model["X_test"])
        wcorr = weighted_correlation(reg_model["y_test"], y_pred, np.repeat(asset["Asset_Weight"], len(y_pred)))
        print("  %-18s: %f" % (model_name, wcorr))
        results[asset["Asset_ID"]][model_name] = {
            "model": reg_model["model"],
            "wcorr": wcorr,
        }

Bitcoin Cash
  LinearRegression  : -0.003640
  CatBoostRegressor : 0.027794
  LGBMRegressor     : 0.007601
  XGBRegressor      : 0.012998
Binance Coin
  LinearRegression  : 0.016654
  CatBoostRegressor : 0.014153
  LGBMRegressor     : 0.015378
  XGBRegressor      : 0.000155
Bitcoin
  LinearRegression  : 0.017717
  CatBoostRegressor : -0.010041
  LGBMRegressor     : -0.031355
  XGBRegressor      : 0.000729
EOS.IO
  LinearRegression  : -0.015020
  CatBoostRegressor : 0.029644
  LGBMRegressor     : 0.023615
  XGBRegressor      : 0.015222
Ethereum Classic
  LinearRegression  : 0.008256
  CatBoostRegressor : 0.020103
  LGBMRegressor     : 0.025386
  XGBRegressor      : 0.023657
Ethereum
  LinearRegression  : -0.021257
  CatBoostRegressor : 0.016021
  LGBMRegressor     : -0.006060
  XGBRegressor      : 0.010950
Litecoin
  LinearRegression  : 0.025322
  CatBoostRegressor : 0.030862
  LGBMRegressor     : 0.012929
  XGBRegressor      : 0.017480
Monero
  LinearRegression  : -0.006568
  CatBoostR

In [12]:
for asset_id in results.keys():
    best_index = np.argmax([results[asset_id][model_name]["wcorr"] for model_name in MODELS.keys()])
    results[asset_id]["best_model"] = list(results[asset_id].keys())[best_index + 1]
    print("%-16s | %-18s| %f" % (results[asset_id]["Asset_Name"], results[asset_id]["best_model"], results[asset_id][results[asset_id]["best_model"]]["wcorr"]))

Bitcoin Cash     | CatBoostRegressor | 0.027794
Binance Coin     | LinearRegression  | 0.016654
Bitcoin          | LinearRegression  | 0.017717
EOS.IO           | CatBoostRegressor | 0.029644
Ethereum Classic | LGBMRegressor     | 0.025386
Ethereum         | CatBoostRegressor | 0.016021
Litecoin         | CatBoostRegressor | 0.030862
Monero           | CatBoostRegressor | 0.017075
TRON             | LinearRegression  | 0.077180
Stellar          | XGBRegressor      | 0.020009
Cardano          | LinearRegression  | 0.044004
IOTA             | LinearRegression  | 0.006028
Maker            | LinearRegression  | 0.041885
Dogecoin         | LinearRegression  | 0.025703


### Submission

In [13]:
def get_row_upper_shadow(row):
    return row.High - max(row.Open, row.Close)

def get_row_lower_shadow(row):
    return min(row.Open, row.Close) - row.Low

In [14]:
import gresearch_crypto
env = gresearch_crypto.make_env()
iter_test = env.iter_test()
for (df_test, sample_prediction_df) in iter_test:
    for i, row in df_test.iterrows():
        asset_id = row["Asset_ID"]
        model = results[asset_id][results[asset_id]["best_model"]]["model"]
        X_test = row.copy()
        X_test["Upper_Shadow"] = get_row_upper_shadow(row)
        X_test["Lower_Shadow"] = get_row_lower_shadow(row)
        X_test = X_test[FEATURES]
        y_pred = model.predict(np.array([X_test]))[0]
        sample_prediction_df.loc[sample_prediction_df['row_id'] == row['row_id'], 'Target'] = y_pred
    env.predict(sample_prediction_df)

ModuleNotFoundError: No module named 'gresearch_crypto'