# G-Research Crypto - Beginners Pipeline 

We train regressors using three different algorithms: `LGBMRegressor`, `XGBRegressor` and `CatBoostRegressor`. The features are built from the columns of the input frame (`['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']`); we then obtain the predictions using the iterator and submit them.

## References:
* [G-Research Crypto - Starter XGB Pipeline](https://www.kaggle.com/tarlannazarov/g-research-crypto-starter-xgb-pipeline)
* [Basic Submission Template](https://www.kaggle.com/sohier/basic-submission-template)
* [CatBoost Regressor](https://www.kaggle.com/yamqwe/crypto-prediction-catboost-regressor#Predict-&-submit)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input directory, then print them
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Different Regressor libraries 
from lightgbm import LGBMRegressor # LGBM Regressor
import xgboost as xgb # XGB Regressor
from catboost import CatBoostRegressor # CatBoost Resgressor
import gresearch_crypto
import traceback

# Reading & Feature Modification

In [None]:
# Load the training data and the asset details table, then order the asset table by Asset_ID
df_train = pd.read_csv('/kaggle/input/g-research-crypto-forecasting/train.csv')
df_asset = pd.read_csv('/kaggle/input/g-research-crypto-forecasting/asset_details.csv')
df_asset = df_asset.sort_values("Asset_ID")

In [None]:
# Clean the training dataset: treat +/-inf as missing, then drop every row with a missing value
df_train = df_train.replace([np.inf, -np.inf], np.nan).dropna()

# Clean the asset dataset in the same way
df_asset = df_asset.replace([np.inf, -np.inf], np.nan).dropna()

In [None]:
# Asset dataset details are following.
df_asset

# Utility Function
These functions are used to create some new features which are useful for the models.
Special thanks to @DATAISTA0 (JULIÁN PELLER) [notebook](https://www.kaggle.com/julian3833/g-research-starter-lgbm-pipeline) for functionality.

In [None]:
# Two new features from the competition tutorial
def upper_shadow(df):
    """Return ``High`` minus the larger of ``Close`` and ``Open``.

    ``df`` is indexed by column name, so it may be a DataFrame (the result is
    element-wise) or a single row taken from one (the result is a scalar).
    """
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of ``Close`` and ``Open`` minus ``Low``.

    ``df`` is indexed by column name, so it may be a DataFrame (the result is
    element-wise) or a single row taken from one (the result is a scalar).
    """
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

# Extract Asset features
def get_features(df):
    """Build the base feature set from ``df``.

    Copies the seven raw candle columns and appends the two shadow features.
    ``df`` is only indexed by label, so both a DataFrame and a single row
    (Series) are accepted; the result has the same kind as the input.
    """
    base_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    features = df[base_columns].copy()

    # Both shadows only read the raw candle columns, so compute them up front
    upper = upper_shadow(features)
    lower = lower_shadow(features)

    features['Upper_Shadow'] = upper
    features['Lower_Shadow'] = lower
    return features

# Extract features from dataset
def get_Xy_features(df_train, df_asset_id):
    """Return the feature matrix and target of one asset.

    Args:
        df_train: training frame with an ``Asset_ID`` column, a ``Target``
            column and the columns read by ``get_features``.
        df_asset_id: value of ``Asset_ID`` whose rows are selected.

    Returns:
        ``(X, y)`` where ``X`` holds the features and ``y`` the matching
        targets; rows containing any missing value are removed from both.
    """
    # Restrict the training data to the requested asset
    asset_rows = df_train.loc[df_train["Asset_ID"] == df_asset_id]

    # Keep the target next to the features so one dropna filters both together
    table = get_features(asset_rows)
    table['y'] = asset_rows['Target']
    table = table.dropna(how="any")

    y = table["y"]
    X = table.drop(columns="y")
    return X, y

# Implementation: LGBM Regressor

In [None]:
# LGBM Regressor Model Function
def LGBMRegressor_model(X, y):
    """Fit a LightGBM regressor on ``X``/``y`` and return the fitted model.

    The model uses ``n_estimators=40``; the original inline note ``40000``
    presumably marks the intended full-size setting (not confirmed here).
    """
    # ``fit`` returns the estimator itself, so the fitted model can be returned directly
    return LGBMRegressor(n_estimators=40).fit(X, y)

In [None]:
# Train one LGBM Regressor per asset and store it under its Asset_ID
model_lgb = {}

for asset_id, asset_name in df_asset[['Asset_ID', 'Asset_Name']].itertuples(index=False, name=None):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    X, y = get_Xy_features(df_train, asset_id)
    model = LGBMRegressor_model(X, y)
    model_lgb[asset_id] = model

In [None]:
# Checking LGBM Regressor Model interface
# Smoke test: build the feature vector of the training row at position 1 and predict it
# with the model stored under Asset_ID 0; the last expression shows the single prediction.
# NOTE(review): nothing here checks that this row actually belongs to asset 0.
x = get_features(df_train.iloc[1])
y_pred_lgb = model_lgb[0].predict([x])
y_pred_lgb[0]

In [None]:
# Remove the temporaries created by the interface check
del x, y_pred_lgb

# Implementation: XGB Regressor

In [None]:
# XGB Regressor Model Function
def XGBRegressor_model(X, y):
    """Fit an XGBoost regressor on ``X``/``y`` and return the fitted model."""
    hyper_params = dict(
        n_estimators=50,  # 500
        learning_rate=0.05,
        max_depth=12,
        subsample=0.9,
        colsample_bytree=0.7,
        missing=-999,
        random_state=1111,
        tree_method='hist',
    )

    regressor = xgb.XGBRegressor(**hyper_params)
    regressor.fit(X, y)
    return regressor

In [None]:
# Train one XGB Regressor per asset and store it under its Asset_ID
model_xgb = {}

for asset_id, asset_name in df_asset[['Asset_ID', 'Asset_Name']].itertuples(index=False, name=None):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    X, y = get_Xy_features(df_train, asset_id)
    model = XGBRegressor_model(X, y)
    model_xgb[asset_id] = model

In [None]:
# Checking XGB Regressor Model interface
# Smoke test: build the feature vector of the training row at position 1, wrap it in a
# one-row DataFrame and predict it with the model stored under Asset_ID 0.
# NOTE(review): nothing here checks that this row actually belongs to asset 0.
x = get_features(df_train.iloc[1])
y_pred_xgb = model_xgb[0].predict(pd.DataFrame([x]))
y_pred_xgb[0]

In [None]:
# Remove the temporaries created by the interface check
del x, y_pred_xgb

# Implementation: CatBoost Regressor

In [None]:
# Features for CatBoost Regressor
def get_features_catboost(df, row=False):
    """Build the CatBoost feature set from a frame of candles or a single row.

    The result contains, in this order: the seven raw candle columns, the two
    shadow features, ``high_div_low``, ``open_sub_close`` and the calendar
    fields ``hour``, ``dayofweek`` and ``day`` derived from ``timestamp``
    (interpreted as seconds since the epoch). ``timestamp`` itself is never
    part of the result.

    Args:
        df: DataFrame, or a single row (Series) of one, providing the columns
            'Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP' and
            'timestamp'.
        row: Kept for backward compatibility only. Callers used to pass
            ``True`` for a single row; the kind of input is now detected
            from the data, so the flag no longer changes the result.

    Returns:
        The features as an object of the same kind as ``df`` (DataFrame or
        Series).
    """
    # Do not copy 'timestamp' into the feature set at all. Removing it afterwards
    # with drop(columns=...) only works for DataFrames: on a Series that call drops
    # nothing, which left 'timestamp' in single-row features and shifted every
    # following feature relative to the training layout.
    df_feat = df[['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']].copy()
    df_feat['Upper_Shadow'] = upper_shadow(df_feat)
    df_feat['Lower_Shadow'] = lower_shadow(df_feat)

    ## Add some more feats
    df_feat["high_div_low"] = df_feat["High"] / df_feat["Low"]
    df_feat["open_sub_close"] = df_feat["Open"] - df_feat["Close"]

    ## possible seasonality, datetime features (unlikely to be meaningful, given very short time-frames)
    # `infer_datetime_format` is deprecated and is not needed when `unit` is given.
    times = pd.to_datetime(df["timestamp"], unit="s")
    if isinstance(times, pd.Series):
        # Column input: the datetime fields are reached through the .dt accessor
        times = times.dt
    df_feat["hour"] = times.hour
    df_feat["dayofweek"] = times.dayofweek
    df_feat["day"] = times.day
    return df_feat

# X, y Features from dataset
def get_Xy_and_Catboost(df_train, asset_id):
    """Return the CatBoost feature matrix and target of one asset.

    Args:
        df_train: training frame with an ``Asset_ID`` column, a ``Target``
            column and the columns read by ``get_features_catboost``.
        asset_id: value of ``Asset_ID`` whose rows are selected.

    Returns:
        ``(X, y)`` with rows containing any missing value removed from both.
    """
    is_asset = df_train["Asset_ID"] == asset_id
    subset = df_train[is_asset]

    # Attach the target so a single dropna keeps features and target aligned
    frame = get_features_catboost(subset)
    frame['y'] = subset['Target']
    frame = frame.dropna(how="any")

    target = frame["y"]
    features = frame.drop(columns="y")
    return features, target

In [None]:
# Implementation of CatBoost Regressor
def CatBoostRegressor_model(X, y):
    """Fit a CatBoost regressor on ``X``/``y`` and return the fitted model."""
    settings = {
        "iterations": 20,  # 1000
        "learning_rate": 0.05,
        "depth": 10,
        "random_seed": 42,
    }
    regressor = CatBoostRegressor(**settings)
    regressor.fit(X, y)
    return regressor

In [None]:
# Train one CatBoost Regressor per asset and store it under its Asset_ID
model_cat = {}

for asset_id, asset_name in df_asset[['Asset_ID', 'Asset_Name']].itertuples(index=False, name=None):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    X, y = get_Xy_and_Catboost(df_train, asset_id)
    model = CatBoostRegressor_model(X, y)
    model_cat[asset_id] = model

In [None]:
# Check the CatBoost model interface
# Smoke test: build the CatBoost feature vector of the training row at position 1
# (single-row mode) and predict it with the model stored under Asset_ID 0.
# NOTE(review): nothing here checks that this row actually belongs to asset 0.
x = get_features_catboost(df_train.iloc[1],row=True)
y_pred_cat = model_cat[0].predict([x])
y_pred_cat[0]

In [None]:
# Remove the temporaries created by the interface check
del x, y_pred_cat

# Final Prediction: LGBM + XGB + CatBoost

In [None]:
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# Ensemble weights of the three models (they sum to 1)
weight_lgbr, weight_xgbr, weight_cbtr = 0.5, 0.2, 0.3

for i, (df_test, df_pred) in enumerate(iter_test):
    for j, row in df_test.iterrows():
        asset_id = row['Asset_ID']

        # LGBM Regressor
        x_test_lgbr = get_features(row)
        y_pred_lgbr = model_lgb[asset_id].predict([x_test_lgbr])[0]

        # XGB Regressor
        x_test_xgbr = get_features(row)
        y_pred_xgbr = model_xgb[asset_id].predict(pd.DataFrame([x_test_xgbr]))[0]

        # CatBoost Regressor
        x_test_cbtr = get_features_catboost(row, row=True)
        # Series.drop(columns=...) is a no-op, so a single-row feature vector may still
        # carry 'timestamp'; remove it by label so the layout matches the training frame.
        x_test_cbtr = x_test_cbtr.drop(labels='timestamp', errors='ignore')
        y_pred_cbtr = model_cat[asset_id].predict([x_test_cbtr])[0]

        # Blend the three scalar predictions and write only the 'Target' cell of this
        # row. The earlier version blended whole frames that all aliased `df_pred`,
        # which let the last model overwrite the others and also scaled 'row_id'.
        y_pred_final = (
            weight_lgbr * y_pred_lgbr
            + weight_xgbr * y_pred_xgbr
            + weight_cbtr * y_pred_cbtr
        )
        df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred_final

    # The frame to submit is rebuilt for every batch, not only for the first one
    pred_final = df_pred

    # Display the first prediction dataframe
    if i == 0:
        display(pred_final)

    # Send submissions
    env.predict(pred_final)

**In the final prediction step a computational error occurs, due to which the submission does not go through; still, if you like the effort, please upvote!**