# INPUT

In [None]:
import os, gc, warnings, random, datetime, traceback, joblib, gresearch_crypto
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import lightgbm as lgb #new

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import numpy.polynomial.hermite as Herm
import math

In [None]:
# Create the competition API environment and the test iterator that the
# submission loop at the end of the notebook consumes.
# NOTE(review): Kaggle's time-series API reportedly allows make_env() to be called
# only once per kernel session - confirm before re-running this cell without a restart.
env       = gresearch_crypto.make_env()
iter_test = env.iter_test()

In [None]:
# Locations of the competition files inside the Kaggle input directory.
TRAIN_CSV         = '/kaggle/input/g-research-crypto-forecasting/train.csv'
ASSET_DETAILS_CSV = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'
EXAMPLE_TEST      = '/kaggle/input/g-research-crypto-forecasting/example_test.csv'

# Load all three tables up front; asset details are ordered by Asset_ID.
# Note: `df_test` is rebound later by the submission loop (`for ... (df_test, df_pred) ...`).
df_train          = pd.read_csv(TRAIN_CSV)
df_test           = pd.read_csv(EXAMPLE_TEST)
df_asset_details  = pd.read_csv(ASSET_DETAILS_CSV).sort_values('Asset_ID')

In [None]:
def fix_all_seeds(seed):
    """Seed every random source this notebook sets up.

    Writes ``seed`` (as text) to the ``PYTHONHASHSEED`` environment variable and
    seeds both the standard-library ``random`` module and NumPy's global
    generator with the same value. Returns nothing.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [None]:
# Global seed, applied immediately to the random sources handled by fix_all_seeds.
seed = 1337
fix_all_seeds(seed)

# Run switches. Each flag is toggled by (un)commenting the trailing `& False`:
#   `True & False`  evaluates to False (switch off)
#   `True #& False` leaves the value True (switch on)
# remove_*_test_overlapping_data : choose the train/validation date split
# bias_harmonic_oscillator       : get_features drops 'hermite_120v' when enabled
# features_importance_check      : fit the single-asset model and plot feature importances
# visualization                  : draw the exploratory plots
# train_models / callbacks       : train one model per asset / print a validation score after each
# save_models / load_models      : dump the trained models with joblib / load previously saved ones
# test_baseline_model            : score the loaded models on df_valid
# submissions_test               : run the submission loop against the competition API
remove_cl_test_overlapping_data = True #& False #test 2-month
remove_lb_test_overlapping_data = True & False #test 6-month(+-3 month)
remove_op_test_overlapping_data = True & False 
bias_harmonic_oscillator        = True #& False
features_importance_check       = True #& False
visualization                   = True #& False
train_models                    = True & False       
callbacks                       = True & False 
save_models                     = True & False
load_models                     = True #& False
test_baseline_model             = True #& False
submissions_test                = True #& False

In [None]:
# Build the hold-out frame `df_valid` (and, for the first two modes, truncate
# `df_train`) according to the split flags. The branches form ONE if/elif/else
# chain so that exactly one split is applied and the first enabled flag wins.
# (With a standalone first `if`, enabling only remove_cl_test_overlapping_data
# would also execute the final `else`, replacing its validation window with
# rows taken from the already truncated training frame.)
row_time = pd.to_datetime(df_train['timestamp'], unit='s')

if remove_cl_test_overlapping_data:
    # Validation: after 2021-05-21 (author's note: max is 21-09-21); training: before 2021-08-21.
    df_valid  = df_train[row_time > '2021-05-21 00:00:00'].reset_index(drop=True)
    df_train  = df_train[row_time < '2021-08-21 00:00:00'].reset_index(drop=True)

elif remove_lb_test_overlapping_data:
    # Validation: after 2021-03-23; training: before 2021-07-23.
    df_valid  = df_train[row_time > '2021-03-23 00:00:00'].reset_index(drop=True)
    df_train  = df_train[row_time < '2021-07-23 00:00:00'].reset_index(drop=True)

elif remove_op_test_overlapping_data:
    # Validation: after 2021-05-23; training frame left untouched.
    df_valid  = df_train[row_time > '2021-05-23 00:00:00'].reset_index(drop=True)

else:
    # Default: validation after 2021-02-23; training frame left untouched.
    df_valid  = df_train[row_time > '2021-02-23 00:00:00'].reset_index(drop=True)

# The timestamps-as-datetime helper is as long as the training table; free it.
del row_time

In [None]:
# Keep only validation rows that have a Target, and renumber the index from 0
# (the baseline-score cell later pairs predictions with df_valid.Target).
df_valid = df_valid.dropna(subset=['Target']).reset_index(drop=True)

In [None]:
# Simple units: hbar, mass and angular frequency are all set to 1.
hbar = 1.0
m    = 1.0
w    = 1.0

def hermite(x, n):
    """Evaluate the degree-``n`` Hermite polynomial at the scaled coordinate.

    ``x`` (scalar or array-like) is multiplied by ``sqrt(m*w/hbar)`` and passed
    to ``numpy.polynomial.hermite.hermval`` with a coefficient vector whose
    only non-zero entry is a 1 in position ``n``.
    """
    scaled       = np.sqrt(m*w/hbar)*x
    selector     = np.zeros(n+1)
    selector[-1] = 1
    return Herm.hermval(scaled, selector)

def stationary_state(x,n):
    """Return the ``n``-th harmonic-oscillator stationary state evaluated at ``x``.

    The value is ``prefactor * exp(-xi**2 / 2) * hermite(x, n)`` where
    ``xi = sqrt(m*w/hbar) * x`` and
    ``prefactor = 1/sqrt(2**n * n!) * (m*w/(pi*hbar))**0.25``.
    """
    norm     = 1.0/math.sqrt(2.0**n * math.factorial(n)) * (m*w/(np.pi*hbar))**(0.25)
    scaled   = np.sqrt(m*w/hbar)*x
    envelope = np.exp(- scaled**2 / 2)
    return norm * envelope * hermite(x,n)

In [None]:
def get_features(df, drop_train=True, drop_visualization=False, bias_harmonic_oscillator=bias_harmonic_oscillator):
    """Add the engineered price/volume features and drop the raw input columns.

    The input is copied first, so the caller's object is never modified.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Must provide 'Open', 'High', 'Low', 'Close', 'Volume' and 'Count', plus
        every label dropped by the selected mode (see below).
    drop_train : bool, default True
        Training mode: drop 'timestamp', 'VWAP' and the raw OHLCV/Count columns
        (column-wise drop). NaNs are NOT filled in this mode; callers do that.
    drop_visualization : bool, default False
        Only consulted when ``drop_train`` is False: drop 'VWAP' and the raw
        OHLCV/Count columns but keep 'timestamp' (column-wise drop), then fill
        NaN with -1.
    bias_harmonic_oscillator : bool
        When true, the 'hermite_120v' feature is removed from the result.
        The default is the notebook-level flag captured at definition time.

    When both ``drop_train`` and ``drop_visualization`` are False the labels
    (including 'Asset_ID' and 'row_id') are dropped without an axis argument -
    this is the mode the submission loop uses on a single row Series - and NaN
    is filled with -1.

    Returns
    -------
    Same type as ``df`` with the derived features added and raw inputs removed.
    """
    # Never write the new columns into the caller's frame/row.
    df = df.copy()

    #customize
    df['upper_Shadow'] = df['High'] - np.maximum(df['Close'], df['Open'])   
    df['lower_Shadow'] = np.minimum(df['Close'], df['Open']) - df['Low']                      
    df['hlco_ration']  = (df['High'] - df['Low'])/(df['Close']-df['Open'])
    df['high_div_low'] = df['High'] / df['Low']
    df['gtrade']       = (df['Close'] - df['Open']) / df['Count']
    df['shadow1']      = (df['Close'] - df['Open']) / df['Volume']
    df['shadow3']      = df['upper_Shadow'] / df['Volume']
    df['shadow5']      = df['lower_Shadow'] / df['Volume']
    df['mean2']        = (df['shadow1'] + df['Volume']) / 2
    df['spread']       = df['High'] - df['Low']
    df['log_price_co'] = np.log(df['Close']/df['Open'])
    df['log_exp_co']   = np.logaddexp(df['Close'], df['Open'])
    df['volume_count'] = df['Volume'] / (df['Count'] + 1)  
    
    # Quantum Harmonic Oscillator (QHO) indicator fed to the LightGBM models:
    # the n=115 stationary state evaluated at volume_count.
    # Author's rationale for 115: 360/pi = 114.59 (6h * 60min / pi, 6h being a
    # quarter of a day), i.e. roughly 115 +- 5.
    df['harmonic_oscillator_115v'] = stationary_state(df['volume_count'], 115) 
    # Bare Hermite value; the author suggests adjusting n if values become
    # excessive, e.g. df['hermite_n'] = hermite(..., n) with n roughly between 60 and 210.
    df['hermite_120v'] = hermite(df['volume_count'], 120)

    #drop
    if drop_train:
        df = df.drop(['timestamp', 'VWAP', 'Close', 'Low', 'High', 'Open', 'Volume', 'Count'],axis=1)
        if bias_harmonic_oscillator:
            df = df.drop(['hermite_120v'],axis=1)
    elif drop_visualization:
        df = df.drop(['VWAP', 'Close', 'Low', 'High', 'Open', 'Volume', 'Count'],axis=1) 
        if bias_harmonic_oscillator:
            df = df.drop(['hermite_120v'],axis=1)
        # Fill regardless of the flag: the training matrices are always filled
        # with -1, so inference-side features must be treated the same way.
        df = df.fillna(-1)
    else:
        df = df.drop(['timestamp', 'VWAP', 'Close', 'Low', 'High', 'Open', 'Volume', 'Count', 'Asset_ID', 'row_id'])
        if bias_harmonic_oscillator:
            df = df.drop(['hermite_120v'])
        # Same reasoning as above: NaN handling must not depend on the flag.
        df = df.fillna(-1)
    return df 

In [None]:
def get_Xys_and_model_for_asset(df_train, df_valid, asset_id, df_asset_details):   
    """Train one LightGBM regressor for a single asset and return it.

    Parameters
    ----------
    df_train, df_valid : pandas.DataFrame
        Raw training / validation tables containing 'Asset_ID', 'Target' and
        the columns required by ``get_features``.
    asset_id
        Value matched against the 'Asset_ID' column of all three frames.
    df_asset_details : pandas.DataFrame
        Must contain 'Asset_ID' and 'Weight'; the asset's weight sets the
        learning rate (weight / 10), 'poisson_max_delta_step' and
        'min_child_samples' (weight truncated to int).

    Returns
    -------
    The booster returned by ``lgb.train``, fitted on the asset's training rows
    with the asset's validation rows as the only validation set.

    Raises
    ------
    IndexError
        If ``asset_id`` has no row in ``df_asset_details``.
    """
    #X_train, X_test
    df_train, df_valid = df_train[df_train["Asset_ID"] == asset_id],  df_valid[df_valid['Asset_ID'] == asset_id]
    df_train, df_valid = df_train.dropna(subset=['Target']),          df_valid.dropna(subset=['Target'])
    
    #y_train, y_test
    y_train,  y_test   = df_train['Target'],                          df_valid['Target']  
    df_train, df_valid = df_train.drop(['Target','Asset_ID'],axis=1), df_valid.drop(['Target','Asset_ID'],axis=1)
    df_t,     df_v     = get_features(df_train),                      get_features(df_valid)
    # Missing feature values are encoded as -1 for both matrices.
    x_train,  x_test   = df_t.fillna(-1),                             df_v.fillna(-1)
            
    #weight 
    df_weight   = df_asset_details[df_asset_details["Asset_ID"] == asset_id]
    # Extract the scalar explicitly: float()/int() on a one-element Series is
    # deprecated in recent pandas.
    weight      = df_weight['Weight'].iloc[0]
    lr          = float(weight)/10
        
    best_lgb_params ={
        'objective'              : 'regression',  
        'metric'                 : ['rmse', 'poisson'],
        'feature_pre_filter'     : False,
        'lambda_l1'              : 0.010565309968664168,
        'lambda_l2'              : 0.3120057367604998,
        'poisson_max_delta_step' : float(weight),
        
        'num_leaves'        : 700,
        'feature_fraction'  : 0.7,
        'bagging_fraction'  : 0.7,
        'bagging_freq'      : 0,
        'min_child_samples' : int(weight), 'random_state' : 42,
        
        'tree_learner'        : 'voting',             
        'learning_rate'       :  lr,
        'early_stopping_round': 110,
        'n_estimators'        : 5500} #,'device': 'gpu'
    
    lgb_train, lgb_valid = lgb.Dataset(x_train, y_train), lgb.Dataset(x_test, y_test) #,weight=weights     
    # NOTE(review): `verbose_eval` is believed to have been removed from lgb.train in
    # LightGBM 4.0 - confirm against the installed version before upgrading.
    model = lgb.train(best_lgb_params, lgb_train, valid_sets=[lgb_valid], verbose_eval=1000)  
    
    return model

#  PROCESSING

In [None]:
if features_importance_check:
    # Fit one exploratory model on Asset_ID 1, print its validation correlation
    # and chart the feature importances. `model_pre` is reused by the
    # visualization cell below.
    df_tr = df_train.dropna(how="any")
    df_va = df_valid.dropna(how="any")
    #train : #test
    df_t, df_v = df_tr[df_tr['Asset_ID'] == 1],  df_va[df_va['Asset_ID'] == 1] #BTC = 1
    df_t, df_v = df_t.dropna(subset=['Target']), df_v.dropna(subset=['Target'])
    
    y_tr, y_te  = df_t['Target'],                          df_v['Target']  
    df_t, df_v  = df_t.drop(['Target','Asset_ID'],axis=1), df_v.drop(['Target','Asset_ID'],axis=1)
    df_t, df_v  = get_features(df_t),                      get_features(df_v)
    # Missing feature values are encoded as -1 for both matrices.
    x_tr, x_te  = df_t.fillna(-1),                         df_v.fillna(-1) 
      
    #weight 
    df_w   = df_asset_details[df_asset_details["Asset_ID"] == 1]
    # Extract the scalar explicitly: float()/int() on a one-element Series is
    # deprecated in recent pandas.
    weight = df_w['Weight'].iloc[0]
    lr     = float(weight)/10
        
    best_lgb_params ={
        'objective'              : 'regression',  
        'metric'                 : ['rmse', 'poisson'],
        'feature_pre_filter'     : False,
        'lambda_l1'              : 0.010565309968664168,
        'lambda_l2'              : 0.3120057367604998,
        'poisson_max_delta_step' : float(weight),
        
        'num_leaves'        : 700,
        'feature_fraction'  : 0.7,
        'bagging_fraction'  : 0.7,
        'bagging_freq'      : 0,
        'min_child_samples' : int(weight), 'random_state' : 42,
        
        'tree_learner'        : 'voting',             
        'learning_rate'       : lr,
        'early_stopping_round': 110,
        'n_estimators'        : 2500} #'device': 'gpu'
    
    lgb_t     = lgb.Dataset(x_tr, y_tr) #,weight=weights 
    lgb_v     = lgb.Dataset(x_te, y_te)
    model_pre = lgb.train(best_lgb_params, lgb_t, valid_sets=[lgb_v], verbose_eval=1000) 
    
    #callbacks
    # Pearson correlation between the validation predictions and targets.
    x_pred      = pd.DataFrame()
    x_pred['x'] = model_pre.predict(x_te)
    print('Test score for LR baseline:', f"{np.corrcoef(x_pred.x, y_te)[0,1]:.5f}")
    #x_pred  
    # (feature position, importance) pairs, most important first.
    features   = x_tr.columns
    importance = sorted(list(enumerate(model_pre.feature_importance())), key=lambda x:x[1], reverse=True)
    
    features_list = []
    score_list    = []
    for i,v in importance:
        print(f'Feature: ({i:<2}) {features[i]:<32} : score = {v:.5f}')
        features_list.append(features[i])
        score_list.append(v)
    
    fig = px.bar(x      = features_list,
                 y      = score_list, 
                 color  = score_list,
                 color_continuous_scale="bluyl")
    fig.update_xaxes(title ="Features"),  fig.update_yaxes(title="score") #x:y
    fig.update_layout(showlegend = True,
                      title      = {
                          'text'   : 'Features importance to model',
                          'x'      : 0.5,
                          'y'      : 0.95,
                          'xanchor': 'center',
                          'yanchor': 'top'},
                      template  ="plotly_white")
    fig.show()

In [None]:
if visualization:
    # Engineered features for Asset_ID 1 only, in drop_visualization mode so that
    # timestamp / Asset_ID / Target are kept; limited to the first 1000 rows that
    # survive dropna.
    df  = df_train[df_train['Asset_ID'] == 1]
    dfx = get_features(df, drop_train=False, drop_visualization=True)
    dfx = dfx.dropna(how="any")
    dfx = dfx.head(1000)
    dfx['datetime'] = pd.to_datetime(dfx['timestamp'], unit='s')
    
    #customize
    
    #regression --- target:datetime
    if features_importance_check:
        # Uses `model_pre` fitted in the feature-importance cell; that cell must
        # have run first. Note that `y_tr` and `x_te` are rebound here.
        y_pred = pd.DataFrame()
        y_tr   = dfx.Target
        x_te   = dfx.drop(['datetime','timestamp','Asset_ID','Target'],axis=1)
        
        y_pred['x'] = model_pre.predict(x_te)
           
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=dfx.datetime, y=y_tr, name='train/test', mode='markers'))
        fig.add_trace(go.Scatter(x=dfx.datetime, y=y_pred.x, name='prediction',  mode='markers'))
        fig.update_layout(title='Correlations between train/test & prediction')
        fig.show()
        
    #relationship between the harmonic_oscillator_115v feature (n=115 stationary state of volume_count) & target
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=dfx.datetime, y=dfx.harmonic_oscillator_115v, mode='lines', name='harmonic_oscillator_115v'))
    fig.add_trace(go.Scatter(x=dfx.datetime, y=dfx.Target, mode='markers', name='target'))
    fig.update_layout(title='Relationship between harmonic oscillator-115v & target')
    fig.show()

    #relationship between the harmonic_oscillator_115v feature (n=115 stationary state of volume_count) & volume count
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=dfx.datetime, y=dfx.harmonic_oscillator_115v, mode='lines', name='harmonic_oscillator_115v'))
    fig.add_trace(go.Scatter(x=dfx.datetime, y=dfx.volume_count , mode='lines', name='volume_count'))
    fig.update_layout(title='Relationship between harmonic oscillator-115v & volume_count')
    fig.show()
    
    #candlestick built from the shadow features: open=shadow3, high=upper_Shadow, low=lower_Shadow, close=shadow5
    fig = go.Figure()
    fig.add_trace(go.Candlestick(x = dfx.datetime, open = dfx.shadow3, high = dfx.upper_Shadow, low = dfx.lower_Shadow, close = dfx.shadow5))
    fig.update_layout(title='Candlestick volume upper-lower')
    fig.show()

In [None]:
%%time
if train_models:
    # `Xs` / `ys` are initialised but not filled anywhere in this cell.
    Xs     = {}
    ys     = {}
    # One fitted model per Asset_ID, keyed by the id.
    models = {}

    for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
        print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    
        model = get_Xys_and_model_for_asset(df_train, df_valid, asset_id, df_asset_details)
        models[asset_id] =  model
        
        if callbacks:
            #validation
            record = df_valid[df_valid.Asset_ID == asset_id]   
            target = record.Target 
            record = record.drop(['Target','Asset_ID'],axis=1)
            # Encode missing features as -1, exactly like the matrices the model
            # was trained and early-stopped on; otherwise this score is computed
            # on differently pre-processed inputs.
            x_test = get_features(record).fillna(-1)
            
            x_pred      = pd.DataFrame()
            x_pred['x'] = model.predict(x_test)
            print('Test score for LR baseline:', f"{np.corrcoef(x_pred.x, target)[0,1]:.5f}")
            # Release the per-asset temporaries before the next iteration.
            del record
            del x_pred
            del x_test
            #break

# OUTPUT

In [None]:
%%time
# Decide where `loaded_model` (the per-asset model dict used by the cells below)
# comes from: freshly trained and saved, loaded from a stored file, or the
# in-memory `models` as-is.
if save_models:
    # Persist the models trained in this session (requires the training cell to
    # have defined `models`). The file is written by joblib despite the '.h5' suffix.
    output_models  = 'models15f_g_crypto.h5'
    saved_models   = joblib.dump(models, output_models)
    loaded_model   = models
    gc.collect()
elif load_models:
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code - only load model files from a source you trust.
    input_models   = '../input/fork-models15f-g-crypto/models15f_g_crypto.h5'
    loaded_model   = joblib.load(input_models)
    gc.collect()
else:
    # Falls back to `models`, which only exists if the training cell ran.
    loaded_model   = models

In [None]:
%%time
if test_baseline_model:
    # Score the loaded per-asset models on the whole validation frame.
    scored_parts = []
    # Iterate over the ids actually present instead of assuming they are 0..n-1.
    for x in sorted(df_valid.Asset_ID.unique()):
        record           = df_valid[df_valid.Asset_ID == x]     
        record           = record.drop(['Target','Asset_ID'],axis=1)
        # Encode missing features as -1, matching how the training matrices are built.
        x_test           = get_features(pd.DataFrame(record)).fillna(-1)
        model            = loaded_model[x]
        x_test['y_pred'] = model.predict(x_test)
        scored_parts.append(x_test)
    
    # Single concatenation, then restore the original df_valid row order.
    x_pred = pd.concat(scored_parts).sort_index()
    # Pair each prediction with its own target by index label, not by position.
    print('Test score for LR baseline: ', f"{np.corrcoef(x_pred.y_pred, df_valid.Target.loc[x_pred.index])[0,1]:.5f}")

In [None]:
if submissions_test:
    # Each iteration of the API iterator yields a batch of test rows (`df_test`,
    # which rebinds the notebook-level name) and the prediction frame to fill in.
    for i, (df_test, df_pred) in enumerate(iter_test):
        # Predict row by row with the model that belongs to the row's asset.
        for j , row in df_test.iterrows():
            model  = loaded_model[row['Asset_ID']]
            # Both drop flags off: get_features drops labels from the row Series.
            x_test = get_features(row, drop_train=False)
            y_pred = model.predict([x_test])[0]
        
            # Write the prediction into the matching row_id of the submission frame.
            df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred 
            # Print just one sample row to get a feeling of what it looks like
            if i == 0 and j == 1:
                display(x_test)
        # Display the first prediction dataframe
        if i == 0:
            display(df_pred)
        # Send submissions (called once per batch, after all of its rows are filled)
        env.predict(df_pred)    

# REFERENCE SOURCE CODE
1. [Quantum Harmonic Oscillators : The LibreTexts project](https://rb.gy/rm6hjh)
2. [Single Coins Valid train(JP/EN) : Tensor Choko](https://www.kaggle.com/tensorchoko/g-research-for-single-coins-valid-train-jp-en?scriptVersionId=81667733)
3. [64 New Features with Autoencoders : Sayantan Mazumdar](https://www.kaggle.com/swaralipibose/64-new-features-with-autoencoders/notebook)
4. [I Purchased Bitcoin : Dragon Zhang](https://www.kaggle.com/dragonzhang/i-purchased-bitcoin)
5. [Simple LGB Starter : Katsu1110](https://www.kaggle.com/code1110/gresearch-simple-lgb-starter) 

# NEXT LEVEL 
* [Crypto Forecasting(1/1) : AE-QHO : Natapong Nitarach](https://www.kaggle.com/natnitarach/crypto-forecasting-1-1-ae-qho)