In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.stats import pearsonr
import seaborn as sns
from tqdm.auto import tqdm
import time

import warnings
import lightgbm as lgb
import pickle
# Directory holding the serialized per-fold models (lgb_fold{n}.txt, cat_fold{n}.pkl) loaded later in the notebook.
modelpath = './catM'



In [2]:
# Per-asset metadata ordered by Asset_ID; the Weight column is what the
# weighted cross-asset feature helpers look up.
df_asset_details = pd.read_csv(
    r"../input/g-research-crypto-forecasting/asset_details.csv"
).sort_values(by="Asset_ID")

In [3]:
def get_weighted_asset_feature(df, col):
    """Attach the per-timestamp, asset-weight-averaged value of ``col`` as ``'W_' + col``.

    For every distinct ``datetime`` the result is
    ``sum(weight * col) / sum(weight)`` over the rows sharing that datetime,
    broadcast back onto each of those rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'Asset_ID'``, ``'datetime'`` and ``col``.
    col : str
        Name of the column to average.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, modified in place: a helper column ``'w'``
        (the asset weight) and ``'W_' + col`` are added, the index is replaced
        by a fresh default index, and ``'datetime'`` becomes the first column.

    Notes
    -----
    Weights come from the module-level ``df_asset_details`` table.
    NOTE(review): pandas ``sum`` skips NaN, so a row whose ``col`` is NaN is
    left out of the numerator while its weight still counts in the
    denominator. Left as-is because changing it would alter the feature
    values — confirm whether this is intended.
    """
    asset_weights = df_asset_details.set_index(keys='Asset_ID')['Weight']
    df['w'] = df['Asset_ID'].map(asset_weights)
    # Temporarily holds weight * value; overwritten with the weighted mean below.
    df['W_'+col] = df.w * df[col]
    time_group = df.groupby('datetime')
    m = time_group['W_'+col].sum() / time_group['w'].sum()
    # Index by datetime so that assigning `m` aligns each timestamp's mean
    # onto every row of that timestamp.
    df.set_index(keys=['datetime'], inplace=True)
    df['W_'+col] = m
    df.reset_index(inplace=True)
    return df

In [4]:
def get_data_for_asset(df_train, asset_id):
    """Build the feature matrix and target vector for a single asset.

    Rows of ``df_train`` whose ``Asset_ID`` equals ``asset_id`` are passed
    through ``get_features``; the ``Target`` column is attached as ``y`` and
    every row containing a missing value is discarded.

    Returns
    -------
    (X, y) : tuple
        ``X`` is the feature frame without the ``y`` column, ``y`` the
        matching target series; both keep the surviving original row labels.
    """
    asset_rows = df_train.loc[df_train["Asset_ID"] == asset_id]
    complete = (
        get_features(asset_rows)
        .assign(y=asset_rows['Target'])
        .dropna(how="any")
    )
    return complete.drop(columns="y"), complete["y"]

In [5]:

def get_features(df, row = False):
    """Derive the candle-based feature frame from raw OHLCV columns.

    Copies ``Count, Open, High, Low, Close, Volume, VWAP`` out of ``df`` and
    appends, in this order: ``upper_Shadow``, ``lower_Shadow``,
    ``high_div_low``, ``trade``, ``gtrade``, ``shadow1``, ``shadow3``,
    ``shadow5``, ``diff1`` and ``mean1`` .. ``mean5``. ``df`` itself is not
    modified.

    ``row`` is accepted but not used by this function.
    """
    raw_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    feats = df[raw_columns].copy()

    # The shadow helpers are invoked one after the other on the growing frame,
    # so the second one sees the column produced by the first.
    feats['upper_Shadow'] = upper_shadow(feats)
    feats['lower_Shadow'] = lower_shadow(feats)

    volume, count = feats['Volume'], feats['Count']
    upper, lower = feats['upper_Shadow'], feats['lower_Shadow']

    feats["high_div_low"] = feats["High"] / feats["Low"]

    trade = feats['Close'] - feats['Open']
    gtrade = trade / count
    shadow1 = trade / volume
    shadow3 = upper / volume
    shadow5 = lower / volume
    diff1 = volume - count

    # Insertion order of this mapping fixes the column order of the result.
    derived = {
        'trade': trade,
        'gtrade': gtrade,
        'shadow1': shadow1,
        'shadow3': shadow3,
        'shadow5': shadow5,
        'diff1': diff1,
        'mean1': (shadow5 + shadow3) / 2,
        'mean2': (shadow1 + volume) / 2,
        'mean3': (trade + gtrade) / 2,
        'mean4': (diff1 + upper) / 2,
        'mean5': (diff1 + lower) / 2,
    }
    for feature_name, values in derived.items():
        feats[feature_name] = values
    return feats

In [6]:
# The asset metadata table is already loaded (and sorted by Asset_ID) near the
# top of the notebook; re-reading the same CSV here was redundant, so this
# cell only displays it.
df_asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin
3,5,1.386294,EOS.IO
5,6,5.894403,Ethereum
4,7,2.079442,Ethereum Classic
11,8,1.098612,IOTA
6,9,2.397895,Litecoin


In [7]:
# Load the per-fold LightGBM boosters (folds 0-3) from their text model files under `modelpath`.
modellist_lgb = []
for fold in range(4):
    model_name = f"{modelpath}/lgb_fold{fold}.txt"
    model = lgb.Booster(model_file=model_name) 
    # NOTE: `model` stays bound to the last booster once the loop ends; keep
    # that binding intact, since code outside this cell may read it.
    modellist_lgb.append(model)


In [8]:
# Feature columns the models expect, taken explicitly from the last loaded
# LightGBM booster. Reading the bare name `model` here depended on a leaked
# loop variable, which other cells rebind to non-LightGBM objects — re-running
# this cell out of order would then pick up the wrong object.
features = modellist_lgb[-1].feature_name()

In [9]:

# Load the per-fold CatBoost models (folds 0-3) pickled under `modelpath`.
# SECURITY: unpickling executes code stored in the file — only load model
# files that come from a trusted source.
modellist_cat = []
for fold in range(4):
    model_name = f"{modelpath}/cat_fold{fold}.pkl"
    # The context manager closes the file handle right away instead of
    # leaving it open until garbage collection.
    with open(model_name, "rb") as model_file:
        model = pickle.load(model_file)
    modellist_cat.append(model)

In [10]:

def hist_fea(df_test_keep,test_df,num,colname):
    """Add the ratio of a lagged value of ``colname`` to its current value.

    The lagged values are read from ``df_test_keep[-num-1]``, i.e. the frame
    ``num`` positions before the last element of the history list, and are
    matched to ``test_df`` rows by ``Asset_ID`` (left join, so assets missing
    from the lagged frame get NaN).

    Returns a new frame equal to ``test_df`` plus ``f'{colname}_now_{num}'``
    (lagged / current) and a ``datetime`` column parsed from ``timestamp``
    (interpreted as seconds). ``test_df`` itself is not modified.
    """
    lagged_name = f'{colname}{num}'
    lagged = (
        df_test_keep[-(num + 1)][['Asset_ID',colname]]
        .rename(columns = {colname: lagged_name})
    )
    merged = test_df.merge(lagged, how='left', on=['Asset_ID'])
    merged[f'{colname}_now_{num}'] = merged[lagged_name] / merged[colname]
    merged["datetime"] = pd.to_datetime(merged["timestamp"], unit="s")
    # The raw lagged column was only needed to form the ratio.
    return merged.drop(columns=lagged_name)

In [11]:
def asset_feature(df, col):
    """Attach the per-timestamp, asset-weight-averaged value of ``col`` as ``'W_' + col``.

    For every distinct ``datetime`` the result is
    ``sum(weight * col) / sum(weight)`` over the rows sharing that datetime,
    broadcast back onto each of those rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'Asset_ID'``, ``'datetime'`` and ``col``.
    col : str
        Name of the column to average.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, modified in place: a helper column ``'w'``
        (the asset weight) and ``'W_' + col`` are added, the index is replaced
        by a fresh default index, and ``'datetime'`` becomes the first column.

    Notes
    -----
    Weights come from the module-level ``df_asset_details`` table.
    NOTE(review): pandas ``sum`` skips NaN, so a row whose ``col`` is NaN is
    left out of the numerator while its weight still counts in the
    denominator. Left as-is because changing it would alter the feature
    values fed to the models — confirm whether this is intended.
    """
    asset_weights = df_asset_details.set_index(keys='Asset_ID')['Weight']
    df['w'] = df['Asset_ID'].map(asset_weights)
    # Temporarily holds weight * value; overwritten with the weighted mean below.
    df['W_'+col] = df.w * df[col]
    time_group = df.groupby('datetime')
    m = time_group['W_'+col].sum() / time_group['w'].sum()
    # Index by datetime so that assigning `m` aligns each timestamp's mean
    # onto every row of that timestamp.
    df.set_index(keys=['datetime'], inplace=True)
    df['W_'+col] = m
    df.reset_index(inplace=True)
    return df

In [12]:
# Competition API: build the environment and obtain the iterator that yields the test batches.
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError, so
# `gresearch_crypto` was not installed where the notebook was last executed —
# presumably it is only provided inside the competition runtime; confirm.
import gresearch_crypto
env = gresearch_crypto.make_env()
iter_test = env.iter_test()



ModuleNotFoundError: No module named 'gresearch_crypto'

In [None]:
import gc
import time

# How many history frames back each Close/Volume ratio feature looks.
LAGS = [15,30,60,90,150,600,1500]
# The largest lag needs that many earlier frames plus the current one (1501).
HISTORY_LEN = max(LAGS) + 1
# Models averaged for the final prediction; built once instead of on every
# model call of every iteration.
ensemble = modellist_lgb[:3] + modellist_cat[2:]

# NOTE(review): `t1` and `result` are not read in this cell; they are kept in
# case code outside this cell uses them.
t1 = time.time()
result = []
for i,(test_df, df_pred) in enumerate(iter_test):
    if i == 0:
        # Warm-up on the first batch: seed the rolling history with the most
        # recent training frames that precede the first test timestamp.
        dftrain = pd.read_csv("../input/g-research-crypto-forecasting/train.csv")
        dftrain_add = pd.read_csv("../input/g-research-crypto-forecasting/supplemental_train.csv")
        # Keep only supplemental rows newer than the main training file.
        dftrain_add = dftrain_add[dftrain_add.timestamp>dftrain.timestamp.max()]
        dftrain = pd.concat([dftrain,dftrain_add]).reset_index(drop=True)
        dftrain = dftrain[dftrain.timestamp<test_df.loc[0,'timestamp']].reset_index(drop=True)

        # One filter + one groupby instead of a full-table boolean scan per
        # timestamp. sort=False yields groups in order of first appearance,
        # which is the same order `unique()` returns.
        recent_timestamps = dftrain.timestamp.unique()[-HISTORY_LEN:]
        recent_rows = dftrain[dftrain.timestamp.isin(recent_timestamps)]
        df_test_keep = [frame for _, frame in recent_rows.groupby('timestamp', sort=False)]

        # Release the large training frames before the prediction loop continues.
        del dftrain, dftrain_add, recent_rows
        gc.collect()

    # Append the raw current batch (before feature columns are added) and
    # trim the history to the window the largest lag needs.
    df_test_keep.append(test_df)
    df_test_keep = df_test_keep[-HISTORY_LEN:]

    for xx in LAGS:
        test_df = hist_fea(df_test_keep,test_df,xx,'Close')
        test_df = hist_fea(df_test_keep,test_df,xx,'Volume')

    test_df = asset_feature(test_df, 'Close_now_15')

    # Equal-weight average of the ensemble members' predictions.
    model_inputs = test_df[features]
    y_pred = 0
    for model in ensemble:
        y_pred += model.predict(model_inputs)/len(ensemble)

    df_pred['Target'] = y_pred

    env.predict(df_pred)
        