## Readme

This notebook is based on [RICHMANBTC's notebook](https://www.kaggle.com/richmanbtc/20211103-gresearch-crypto-v1) and [KATSU1110's notebook](https://www.kaggle.com/code1110/gresearch-simple-lgb-starter).

The key points of this notebook are:

 - Exclude rows with large target values. (I am betting that no black-swan event will happen in the next 3 months 😀)
 
 - I use a few basic features, such as the upper shadow, plus some rolling features. The rolling features come mainly from richmanbtc's notebook.

Good luck over these 3 months!

---

## Code


In [None]:
import lzma
import pickle
import cloudpickle
import numpy as np
import pandas as pd
import lightgbm as lgb
import datatable as dt
import gresearch_crypto
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler
from sklearn.model_selection import cross_validate, cross_val_predict

In [None]:
# When True, the training data is loaded with process_data(..., is_train=True),
# which drops every row at or after that function's cutoff timestamp.
flag_strict = False
# When True, the evaluation-only imports are loaded and the CV report cell runs.
flag_cv = True

In [None]:
# Plotting and metric helpers are only used by the CV report, so they are
# imported only when flag_cv is enabled.
if flag_cv:
    import matplotlib.pyplot as plt
    from scipy.stats import pearsonr
    from sklearn.metrics import r2_score
    from sklearn.model_selection import cross_val_predict


In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place (columns are reassigned) and also returned.
    Memory usage before and after is printed.

    Only plain NumPy integer and floating-point columns are touched:

    * signed / unsigned integers are narrowed to the smallest signed / unsigned
      integer type whose range (bounds included) contains the column's
      min and max;
    * floats are narrowed to float16 or float32 when the column's min and max
      lie strictly inside that type's finite range, otherwise stored as
      float64.

    Every other dtype (bool, datetime, object, pandas extension dtypes, ...)
    is left untouched.

    Args:
        df: DataFrame to shrink.
        use_float16: When True (the default, matching the historical
            behaviour) float columns may be stored as float16, which keeps
            only about 3 significant decimal digits. Pass False to use
            float32 as the smallest float type.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Skip anything that is not a plain NumPy int/uint/float column;
        # min/max comparisons below are only meaningful for those.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    break
            else:
                # Too large for the narrow types, or min/max is NaN/inf.
                candidate = np.float64
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    break
            else:
                # No range matched (e.g. an empty column whose min/max is NaN):
                # keep the column as it is.
                continue

        df[col] = df[col].astype(candidate)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
def get_feature_columns(df):
    """Return the sorted list of column names that start with 'feature'."""
    is_feature = df.columns.str.startswith('feature')
    return sorted(df.columns[is_feature].tolist())

def save_model(model, path):
    """Serialize ``model`` with cloudpickle, LZMA-compress it and write it to ``path``."""
    compressed = lzma.compress(cloudpickle.dumps(model))
    with open(path, 'wb') as out_file:
        out_file.write(compressed)
        
def process_data(df, df_asset, is_train=False, train_end='2021-06-13 00:00:00'):
    """Normalize a raw price frame into the (timestamp, market)-indexed layout.

    Works on a copy, so the caller's frame is not modified. The steps are:

    1. convert the ``timestamp`` column from epoch seconds to UTC datetimes;
    2. if ``is_train`` is set, keep only rows strictly before ``train_end``;
    3. rename ``Asset_ID`` -> ``market``, ``Close`` -> ``cl`` and
       ``Target`` -> ``target`` (columns that are absent are simply not renamed);
    4. left-join the ``weight`` column of ``df_asset`` on ``market``;
    5. set ``['timestamp', 'market']`` as the index.

    Args:
        df: Raw frame with at least ``timestamp`` and ``Asset_ID`` columns.
        df_asset: Asset table indexed by ``market`` with a ``weight`` column.
        is_train: When True, rows at or after ``train_end`` are dropped.
        train_end: Exclusive upper bound applied when ``is_train`` is True.
            It is compared directly against the UTC timestamps, so a
            string such as the default is interpreted by pandas.

    Returns:
        A new DataFrame indexed by (timestamp, market).
    """
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', utc=True)

    # change point 1: optionally truncate the training period
    if is_train:
        df = df[df['timestamp'] < train_end]

    df = df.rename(columns={
        'Asset_ID': 'market',
        'Close': 'cl',
        'Target': 'target',
    })
    df = df.join(df_asset[['weight']], on='market', how='left')
    df = df.set_index(['timestamp', 'market'])
    return df

def sort_and_remove_duplicates(df):
    """Return ``df`` sorted by index, keeping only the last row of each index key.

    A merge sort is requested when ordering the index.
    """
    ordered = df.sort_index(kind='mergesort')
    is_earlier_copy = ordered.index.duplicated(keep='last')
    return ordered.loc[~is_earlier_copy]

def my_purge_kfold(n, n_splits=5, purge=3750 * 14):
    """Build contiguous K-fold splits over ``n`` row positions with a purge gap.

    Fold ``k`` validates on positions ``[k*n//n_splits, (k+1)*n//n_splits)``.
    Its training set is every position that lies more than ``purge`` rows
    before the validation block or at least ``purge`` rows after its end.

    Returns:
        A list of ``(train_positions, validation_positions)`` NumPy array pairs.
    """
    positions = np.arange(n)
    folds = []
    for k in range(n_splits):
        lo = k * n // n_splits
        hi = (k + 1) * n // n_splits
        # Everything inside the validation block or its purge margin is excluded.
        excluded = (positions >= lo - purge) & (positions < hi + purge)
        folds.append((positions[~excluded], positions[lo:hi]))
    return folds

In [None]:
# calc features
def calc_features(df):
    """Add the model's feature columns to a (timestamp, market)-indexed frame.

    Works on a copy of ``df``. The input needs the columns ``cl``, ``weight``,
    ``Open``, ``High``, ``Low``, ``Volume`` and ``Count`` and index levels
    named ``timestamp`` and ``market``.

    Columns added:

    * ``ln_cl``: natural log of ``cl``.
    * ``feature_cl_diff1`` / ``raw_return_causal``: ``ln_cl`` minus the value
      15 rows earlier within the same market.
    * ``market_return_causal``: weight-averaged ``raw_return_causal`` per timestamp.
    * ``feature_beta_causal``: per-market rolling mean (window 3750 rows,
      min_periods 1) of ``raw_return * market_return`` divided by the same
      rolling mean of ``market_return ** 2``.
    * ``feature_cl_diff1_mean_simple`` / ``_mean_weight`` / ``_resid`` / ``_rank``:
      per-timestamp mean, weighted mean, beta-adjusted residual and rank of
      ``feature_cl_diff1``.
    * ``feature_upper_shadow``, ``feature_lower_shadow``, ``feature_volume2count``:
      candle-based ratios.

    ``Open``, ``High``, ``Low``, ``Volume`` and ``Count`` are removed from the result.

    Returns:
        A new DataFrame with the columns described above.
    """
    df = df.copy()
    df['ln_cl'] = np.log(df['cl'])

    # shift is faster than diff.  The 15-row log return feeds both the feature
    # column and the beta calculation, so it is computed only once.
    lagged_return = df['ln_cl'] - df.groupby('market')['ln_cl'].shift(15)
    df['feature_cl_diff1'] = lagged_return
    df['raw_return_causal'] = lagged_return

    inv_weight_sum = 1.0 / df.groupby('timestamp')['weight'].transform('sum')
    df['market_return_causal'] = (df['raw_return_causal'] * df['weight']).groupby('timestamp').transform('sum') * inv_weight_sum

    df['beta_causal'] = (
        (df['raw_return_causal'] * df['market_return_causal']).groupby('market').transform(lambda x: x.rolling(3750, 1).mean())
        / (df['market_return_causal'] ** 2).groupby('market').transform(lambda x: x.rolling(3750, 1).mean())
    )

    df['feature_cl_diff1_mean_simple'] = df['feature_cl_diff1'].groupby('timestamp').transform('mean')
    df['feature_cl_diff1_mean_weight'] = (df['feature_cl_diff1'] * df['weight']).groupby('timestamp').transform('sum') * inv_weight_sum
    df['feature_cl_diff1_resid'] = df['feature_cl_diff1'] - df['beta_causal'] * df['feature_cl_diff1_mean_weight']
    df['feature_cl_diff1_rank'] = df.groupby('timestamp')['feature_cl_diff1'].transform('rank')

    # added katsu: candle-shape features
    df['feature_upper_shadow'] = df['High'] / df[['cl', 'Open']].max(axis=1)
    df['feature_lower_shadow'] = df[['cl', 'Open']].min(axis=1) / df['Low']
    df['feature_volume2count'] = df['Volume'] / (df['Count'] + 1)
    df = df.drop(columns=['Open', 'High', 'Low', 'Volume', 'Count'])

    # RSI / MACD features are currently disabled:
    # df["feature_RSI"] = RSI(df["cl"], 1)
    # macd, macd_signal = MACD(df["cl"], 30, 15, 5)
    # df["feature_MACD"] = macd
    # df["feature_MACD_signal"] = macd_signal

    # Expose beta as a model feature via the 'feature' prefix.
    df = df.rename(columns={
        'beta_causal': 'feature_beta_causal',
    })
    return df

In [None]:
# %%time
# preprocess asset data: rename to the notebook's column names and index by market
df_asset = (
    dt.fread('../input/g-research-crypto-forecasting/asset_details.csv')
    .to_pandas()
    .rename(columns={
        'Asset_ID': 'market',
        'Weight': 'weight',
        'Asset_Name': 'name',
    })
    .set_index('market')
    .sort_values('market')
)

# preprocess train data: main file followed by the supplemental file
df = pd.concat([
    dt.fread(csv_path).to_pandas()
    for csv_path in (
        '../input/g-research-crypto-forecasting/train.csv',
        '../input/g-research-crypto-forecasting/supplemental_train.csv',
    )
])

# flag_strict decides whether process_data truncates the training period.
df = process_data(df, df_asset, is_train=True) if flag_strict else process_data(df, df_asset)

df = sort_and_remove_duplicates(reduce_mem_usage(df))
# Snapshot of the cleaned frame; the submission cell reads it back from this path.
df.to_pickle('/tmp/df.pkl')

In [None]:
# %%time
# calc features, shrink dtypes, then keep only the model inputs plus
# target/weight and drop every row that still contains a missing value
df = reduce_mem_usage(calc_features(df))
features = get_feature_columns(df)
df = df[[*features, 'target', 'weight']].dropna()

In [None]:
# Exclude 2021/10/29 data. It has very large target values.
# Only markets 7, 12 and 13 strictly inside the (s_str, e_str) window are dropped.
s_str = '2021-10-29 08:49:00'
e_str = '2021-10-29 11:42:00'
ts_level = df.index.get_level_values(0)
market_level = df.index.get_level_values(1)
in_window = (ts_level > s_str) & (ts_level < e_str)
affected_market = market_level.isin([7, 12, 13])
dropped_inds = df.loc[in_window & affected_market, :].index
df.drop(dropped_inds, inplace=True)
# The helper arrays are as long as the frame; release them right away.
del ts_level, market_level, in_window, affected_market

In [None]:
# In addition, keep only rows whose abs(target) is strictly below 0.5
# (rows with abs(target) >= 0.5 are excluded).
df = df.query("abs(target) < 0.5")

In [None]:
# One (train, validation) pair of row positions per fold; folds are contiguous
# blocks of rows with a purge gap around each validation block.
cv = my_purge_kfold(df.shape[0])
x = df[features]
y = df["target"]
# Float placeholder for the out-of-fold predictions. Starting from 0.0 (not 0)
# keeps the column float so the predictions can be written into it directly.
df["y_pred"] = 0.0
y_pred_col = df.columns.get_loc("y_pred")

# Tuned by Optuna (one parameter set per fold)
params_list = [
    {"seed": 0, 'objective': 'regression', 'metric': 'rmse', 'feature_pre_filter': False, 'lambda_l1': 5.338008670144408, 'lambda_l2': 0.10892103459556557, 'num_leaves': 12, 'feature_fraction': 0.8, 'bagging_fraction': 0.8229227798972512, 'bagging_freq': 7, 'min_child_samples': 20, 'num_iterations': 100, 'early_stopping_round': 50},
    {"seed": 0, 'objective': 'regression', 'metric': 'rmse', 'feature_pre_filter': False, 'lambda_l1': 6.000232331039988, 'lambda_l2': 0.0007928024088528338, 'num_leaves': 31, 'feature_fraction': 0.7200000000000001, 'bagging_fraction': 0.758706201323496, 'bagging_freq': 7, 'min_child_samples': 25, 'num_iterations': 100, 'early_stopping_round': 50},
    {"seed": 0, 'objective': 'regression', 'metric': 'rmse', 'feature_pre_filter': False, 'lambda_l1': 7.417330368307966e-05, 'lambda_l2': 6.905607612577662e-07, 'num_leaves': 159, 'feature_fraction': 0.8, 'bagging_fraction': 1.0, 'bagging_freq': 0, 'min_child_samples': 20, 'num_iterations': 100, 'early_stopping_round': 50},
    {"seed": 0, 'objective': 'regression', 'metric': 'rmse', 'feature_pre_filter': False, 'lambda_l1': 8.151302095905674, 'lambda_l2': 0.10316839127701032, 'num_leaves': 8, 'feature_fraction': 0.4, 'bagging_fraction': 0.6875441454005267, 'bagging_freq': 4, 'min_child_samples': 20, 'num_iterations': 100, 'early_stopping_round': 50},
    {"seed": 0, 'objective': 'regression', 'metric': 'rmse', 'feature_pre_filter': False, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 45, 'feature_fraction': 1.0, 'bagging_fraction': 1.0, 'bagging_freq': 0, 'min_child_samples': 20, 'num_iterations': 100, 'early_stopping_round': 50},
]

for fold, (trn_idx, val_idx) in enumerate(cv):
    clf = lgb.LGBMRegressor(**params_list[fold])
    # trn_idx / val_idx are row positions, so both x and y are sliced with
    # .iloc (y is indexed by (timestamp, market), not by integers).
    trn_x, trn_y = x.iloc[trn_idx, :], y.iloc[trn_idx]
    val_x, val_y = x.iloc[val_idx, :], y.iloc[val_idx]
    # NOTE(review): the params set early_stopping_round=50 while fit() is given
    # early_stopping_rounds=100 -- confirm which one the installed LightGBM applies.
    clf.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
        verbose=-1,
        early_stopping_rounds=100,
    )
    print(clf.best_score_["valid_0"]["rmse"])
    # Persist the fold model; the file is closed as soon as it is written.
    with open(f"model.lgb.{fold}.pkl", 'wb') as model_file:
        pickle.dump(clf, model_file)
    # Store the out-of-fold predictions in the y_pred column.
    df.iloc[val_idx, y_pred_col] = clf.predict(val_x)


In [None]:
if flag_cv:
    # Overall fit quality of the out-of-fold predictions.
    actual = df['target']
    predicted = df['y_pred']
    print(r2_score(actual, predicted))
    print(pearsonr(actual, predicted))
    print(actual.std())

    # Correlation broken down per market.
    print('pearsonr by market')

    def _market_pearson(group):
        return pearsonr(group['target'], group['y_pred'])[0]

    display(df.groupby('market').apply(_market_pearson))

    # Rolling correlation over a window of 3 * 30 * 24 * 60 rows per market,
    # sampled every 24 * 60 rows per market.
    df2 = df.reset_index().set_index('timestamp')
    market_count = df2['market'].unique().size
    rows_per_day = 24 * 60 * market_count
    (
        df2['target']
        .rolling(3 * 30 * rows_per_day)
        .corr(df2['y_pred'])
        .iloc[::rows_per_day]
        .plot()
    )
    plt.title('3 month rolling pearsonr')
    plt.show()


In [None]:
# reload clfs: one pickled model per CV fold, read back from the
# model.lgb.<fold>.pkl files. Only load pickle files you trust.
estimators = []
for fold in range(len(cv)):
    with open(f"model.lgb.{fold}.pkl", 'rb') as model_file:
        estimators.append(pickle.load(model_file))

In [None]:
# submit
# Rolling history buffer: the features look back over earlier rows, so the
# tail of the cleaned training frame is kept and extended with each test batch.
history_rows = 14 * 4000
df_latest = pd.read_pickle('/tmp/df.pkl').tail(history_rows)

# Each iteration has to stay fast (the original note gives 0.25 s as the
# per-iteration budget), so per-batch work is kept to a minimum.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:

    # get latest data
    test_df2 = process_data(test_df, df_asset)
    df_latest = pd.concat([df_latest, test_df2]).tail(history_rows + len(test_df2))
    # If a batch repeats (timestamp, market) keys already in the buffer, keep
    # only the newest copy in index order; otherwise the .loc lookup below
    # would return extra rows and no longer line up with sample_prediction_df.
    if df_latest.index.has_duplicates:
        df_latest = sort_and_remove_duplicates(df_latest)

    # calc_features works on its own copy, so the buffer is passed directly.
    df_features = calc_features(df_latest).loc[test_df2.index, :]

    # Build the feature matrix once and average the fold models' predictions.
    feature_matrix = df_features[get_feature_columns(df_features)].values
    preds = np.zeros(len(df_features))
    for model in estimators:
        preds += model.predict(feature_matrix) / len(estimators)

    # Predictions are assigned by position: this assumes sample_prediction_df
    # rows are in the same order as test_df -- TODO confirm.
    sample_prediction_df["Target"] = preds

    # sample_prediction_df.merge(df_features[['row_id', 'Target']], how='left', on='row_id')
    env.predict(sample_prediction_df)
