# G-Research | XGBoost

### Acknowledgements 😍
My thanks go to:

* [Yam Peleg. G-Research: XGBoost with GPU (Fit in 1min).](https://www.kaggle.com/yamqwe/g-research-xgboost-with-gpu-fit-in-1min)

### My Previous Competition Notebooks
Here you can check my other notebooks for this competition: 
* [G-Research Forecast | Overlap | Score = 0.9999](https://www.kaggle.com/maricinnamon/g-research-forecast-overlap-score-0-9999)
* [G-Research | Exploratory Data Analysis | Lin Regr](https://www.kaggle.com/maricinnamon/g-research-exploratory-data-analysis-lin-regr)

# 1. Libraries

In [None]:
import traceback
import numpy as np
import pandas as pd
import xgboost as xgb
import datatable as dt
import gresearch_crypto
from lightgbm import LGBMRegressor

# 2. Read data
Here we use an **extra dataset** that you can add to your notebook: [Cryptocurrency extra data - Binance Coin](https://www.kaggle.com/yamqwe/cryptocurrency-extra-data-binance-coin). It is an extra, continually updated dataset for the G-Research Crypto Forecasting competition.


In [None]:
# Both .jay files live in the same extra-data input directory.
EXTRA_DATA_DIR = '../input/cryptocurrency-extra-data-binance-coin'
TRAIN_JAY = EXTRA_DATA_DIR + '/orig_train.jay'
ASSET_DETAILS_JAY = EXTRA_DATA_DIR + '/orig_asset_details.jay'

In [None]:
# Read the training table with datatable and convert it to a pandas DataFrame.
# The path comes from the TRAIN_JAY constant so it is defined in one place only.
df_train = dt.fread(TRAIN_JAY).to_pandas()
df_train.head()

In [None]:
# Read the asset details table, convert it to pandas and order it by Asset_ID.
# The path comes from the ASSET_DETAILS_JAY constant so it is defined in one place only.
df_asset_details = dt.fread(ASSET_DETAILS_JAY).to_pandas().sort_values("Asset_ID")
df_asset_details.head()

# 3. Feature Engineering

In [None]:
# Two new features from the competition tutorial
def upper_shadow(df):
    """Return 'High' minus the larger of 'Close' and 'Open'.

    ``df`` only needs to support ``df['High']``, ``df['Close']`` and
    ``df['Open']``; the maximum is taken element-wise with ``np.maximum``.
    """
    candle_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - candle_top


def lower_shadow(df):
    """Return the smaller of 'Close' and 'Open' minus 'Low'.

    ``df`` only needs to support ``df['Close']``, ``df['Open']`` and
    ``df['Low']``; the minimum is taken element-wise with ``np.minimum``.
    """
    candle_bottom = np.minimum(df['Close'], df['Open'])
    return candle_bottom - df['Low']


# A utility function to build features from the original df.
# It works for single rows too, so we can reuse it at prediction time.
def get_features(df):
    """Select the raw feature columns and append the two shadow features.

    A copy of the selected columns is taken, so ``df`` itself is not modified.
    The result holds 'Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP',
    plus 'Upper_Shadow' and 'Lower_Shadow'.
    """
    base_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    features = df[base_columns].copy()
    features['Upper_Shadow'] = upper_shadow(features)
    features['Lower_Shadow'] = lower_shadow(features)
    return features

# 4. Define Train Function

In [None]:
def get_Xy_and_model_for_asset(df_train, asset_id):
    """Fit an XGBoost regressor on the rows of a single asset.

    Parameters
    ----------
    df_train
        Training table; must provide "Asset_ID", "Target" and the columns
        read by ``get_features``.
    asset_id
        Value matched against the "Asset_ID" column.

    Returns
    -------
    tuple
        ``(X, y, model)``: the feature table, the target column and the
        fitted ``xgb.XGBRegressor``.
    """
    asset_rows = df_train[df_train["Asset_ID"] == asset_id]

    # TODO: Try different features here!
    table = get_features(asset_rows)
    table['y'] = asset_rows['Target']
    # Drop every row where any feature or the target is missing.
    table = table.dropna(how="any")

    X = table.drop(columns="y")
    y = table["y"]

    xgb_params = dict(
        n_estimators=500,
        max_depth=11,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.7,
        missing=-999,
        random_state=2020,
        # tree_method='gpu_hist'  # THE MAGICAL PARAMETER
    )
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X, y)

    return X, y, model


## Loop over all assets

In [None]:
# Per-asset training results, keyed by Asset_ID.
# An asset whose training failed gets None in all three dicts.
Xs = {}
ys = {}
models = {}

for asset_id, asset_name in zip(df_asset_details['Asset_ID'],
                                df_asset_details['Asset_Name']):

    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")

    try:
        X, y, model = get_Xy_and_model_for_asset(df_train, asset_id)
    except Exception:
        # Best effort: keep going with the remaining assets, but report why
        # this one failed. `except Exception` (rather than a bare `except`)
        # lets KeyboardInterrupt / SystemExit stop the loop.
        traceback.print_exc()
        X, y, model = None, None, None

    Xs[asset_id], ys[asset_id], models[asset_id] = X, y, model

In [None]:
# Check the model interface: build features for one training row and run it
# through the model stored under key 0.
sample_row = df_train.iloc[1]
x = get_features(sample_row)
y_pred = models[0].predict(pd.DataFrame([x]))
y_pred[0]

# 5. Submit to Competition

In [None]:
# Iterate over the test batches served by the competition API and fill in a
# prediction for every row before handing the batch back.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

for df_test, df_pred in iter_test:
    for _, row in df_test.iterrows():
        # Fallback used when there is no model for the asset or prediction fails.
        y_pred = 0

        # .get() instead of [] so an asset ID that has no entry in `models`
        # falls back to 0 instead of raising KeyError and aborting the run.
        model = models.get(row['Asset_ID'])
        if model is not None:
            try:
                x_test = get_features(row)
                y_pred = model.predict(pd.DataFrame([x_test]))[0]
            except Exception:
                # Keep the fallback value, but report the failure.
                # `except Exception` lets KeyboardInterrupt / SystemExit through.
                traceback.print_exc()

        df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred

    env.predict(df_pred)

#### *If you liked it, please upvote* 💖