In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")

In [None]:
# Read the training candles and turn +/-inf into NaN so that every later
# missing-value check sees them as missing.
df = pd.read_csv('../input/g-research-crypto-forecasting/train.csv').replace(
    [np.inf, -np.inf], np.nan)

In [None]:
# Asset metadata table; asset_map below reads its Asset_ID and Asset_Name columns.
asset=pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')

In [None]:
def asset_map(x):
    """Return the Asset_Name of the first row of `asset` whose Asset_ID equals `x`.

    Raises IndexError when no row matches.
    """
    matching_names = asset.loc[asset.Asset_ID == x, 'Asset_Name']
    return matching_names.tolist()[0]

In [None]:
# One frame per asset id, each with a fresh 0..n-1 index.
dfs = {
    asset_id: df[df.Asset_ID == asset_id].reset_index(drop=True)
    for asset_id in np.unique(df.Asset_ID)
}

In [None]:
# Overlay the first 500 Target values of asset ids 0 and 1 on one figure.
plt.figure(figsize=(12, 8))
for asset_id in (0, 1):
    head = dfs[asset_id][:500]
    sns.lineplot(data=head, x='timestamp', y='Target', label=asset_map(asset_id))

plt.legend()
plt.show()

# Feature Engineering

# Missing Data

In [None]:
# Count how often each row-wise pattern of missing (True) / present (False) cells occurs.
df.isna().value_counts()

In [None]:
# Per-asset missing-Target statistics.
#   miss_idx    : asset id -> index labels of the rows whose Target is NaN
#   miss        : one-row frame, percentage of rows with a NaN Target per asset
#   sample_size : one-row frame, number of rows without any NaN per asset
miss_idx={}
miss=[]
sample_size=[]
for k,v in dfs.items():
    target_missing=v.Target.isna()
    miss.append(target_missing.sum()*100/v.shape[0])
    miss_idx[k]=v[target_missing].index
    # Indexing a frame with a boolean frame (v[v.isna()==False]) only masks
    # cells and keeps every row, so it always reported len(v). Count the rows
    # that are actually complete instead.
    sample_size.append(v.dropna().shape[0])

# Column labels for both summary frames are the human-readable asset names.
asset_names=list(map(asset_map,dfs.keys()))
miss=pd.DataFrame(miss,index=asset_names).T

sample_size=pd.DataFrame(sample_size,index=asset_names).T

In [None]:
# Bar chart: percentage of rows with a missing Target, one bar per asset.
fig, ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=miss, ax=ax)
ax.set_xlabel('Cryptocurrency')
ax.set_ylabel('% of missing')
plt.show()

* We can see that Dogecoin, IOTA, Maker, Monero, and Stellar have a large amount of missing data, so I simply drop their missing rows.

In [None]:
# Bar chart of the per-asset sample sizes collected in `sample_size`.
fig, ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=sample_size, ax=ax)
ax.set_xlabel('Cryptocurrency')
ax.set_ylabel('sample size')
ax.set_title('sample size(not consider NaN)')
plt.show()

* Linear Search

In [None]:
def nan_linear_search(idx,inv=False):
    """Find the run of consecutive NaN-Target rows touching one end of an asset frame.

    Reads the module-level ``miss_idx[idx]`` (index labels whose Target is NaN)
    and ``dfs[idx]`` (the asset frame).

    Args:
        idx: asset id, used as key into ``miss_idx`` and ``dfs``.
        inv: if True inspect the tail of the frame, otherwise the head.

    Returns:
        Ascending list of the index labels forming the NaN run that starts at
        the inspected edge, or ``None`` when the frame is empty, has no NaN
        Target, or the edge row itself is not NaN.
    """
    nan_labels=miss_idx[idx]
    frame=dfs[idx]
    # Guard: indexing [0]/[1] (or [-1]/[-2]) unconditionally raises IndexError
    # when there are fewer than two NaN labels.
    if len(nan_labels)==0 or len(frame)==0:
        return None

    step=-1 if inv else 1
    pos=len(nan_labels)-1 if inv else 0
    target=nan_labels[pos]

    edge_label=frame.index[-1] if inv else frame.index[0]
    if edge_label!=target: # the NaN run does not touch this edge
        return None

    # Walk away from the edge while the next NaN label is adjacent; the bounds
    # check stops cleanly when every NaN label belongs to the same run.
    end=target
    n_labels=len(nan_labels)
    while 0<=pos+step<n_labels and np.abs(nan_labels[pos+step]-end)==1:
        pos+=step
        end=nan_labels[pos]

    return list(range(end,target+1)) if inv else list(range(target,end+1))


In [None]:
# Asset names whose NaN rows are dropped outright instead of interpolated.
# The names are compared against asset_map(k), so the spelling must match
# asset_details.csv exactly ('Dogecoin', not 'Degecoin').
Drop=['Dogecoin', 'IOTA', 'Maker', 'Monero','Stellar']

In [None]:
# Clean every per-asset frame; dfs[k] is rebound to the cleaned frame.
for k in list(dfs):
    if asset_map(k) in Drop:
        # Heavily incomplete assets: discard every row that contains a NaN.
        dfs[k]=dfs[k].dropna()
        continue

    # Trim the NaN-Target run at the tail first, then the one at the head.
    for from_tail in (True,False):
        nan_idx=nan_linear_search(k,from_tail)
        # None / empty means "no NaN run at this edge": nothing to drop.
        # This replaces a bare `except: pass` that hid every other error too.
        if nan_idx:
            dfs[k]=dfs[k].drop(index=nan_idx,errors='ignore')

    # Fill the remaining interior gaps by interpolation.
    for col in ('Target','VWAP'):
        dfs[k][col]=dfs[k][col].interpolate(method='cubic',order=2)

In [None]:
# Report the remaining missing-value patterns per asset, then drop the
# Asset_ID column (redundant once the data is keyed by asset id in `dfs`).
for k,v in dfs.items():
    print(f'asset id : {k}')
    print(v.isna().value_counts())
    # errors='ignore' keeps the cell re-runnable after Asset_ID is already gone.
    dfs[k]=v.drop(columns=['Asset_ID'],errors='ignore')

## Tutorial Features

In [None]:

def upper_shadow(df):
    """Return High minus the larger of Close and Open (upper candle wick)."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top


def lower_shadow(df):
    """Return the smaller of Close and Open minus Low (lower candle wick)."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']


def get_features(df, row=False):
    """Build the candle-derived feature set from raw OHLCV data.

    Args:
        df: a DataFrame of candles, or a single candle as a Series. Must
            provide Count, Open, High, Low, Close, Volume and VWAP.
        row: kept for backward compatibility. Single-candle handling is now
            chosen from the type of `df` (Series => single candle), because
            the previous row=True path failed on a Series (`drop(axis=1)`)
            and produced a misaligned all-NaN 'Mean' on a DataFrame.

    Returns:
        Same kind of object as `df`, holding Open, High, Low, Close, VWAP, the
        derived features, and - when present in `df` - Target and timestamp.
        Count and Volume are removed after 'Volume/Count' is derived.
    """
    single_candle = isinstance(df, pd.Series)

    df_feat = df[['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']].copy()
    df_feat['Upper_Shadow'] = upper_shadow(df_feat)
    df_feat['Lower_Shadow'] = lower_shadow(df_feat)

    df_feat["Close/Open"] = df_feat["Close"] / df_feat["Open"]
    df_feat["Close-Open"] = df_feat["Close"] - df_feat["Open"]
    df_feat["High-Low"] = df_feat["High"] - df_feat["Low"]
    df_feat["High/Low"] = df_feat["High"] / df_feat["Low"]

    # Average of the four prices: a scalar for one candle, row-wise for a frame.
    ohlc = df_feat[['Open', 'High', 'Low', 'Close']]
    df_feat['Mean'] = ohlc.mean() if single_candle else ohlc.mean(axis=1)

    df_feat['High/Mean'] = df_feat['High'] / df_feat['Mean']
    df_feat['Low/Mean'] = df_feat['Low'] / df_feat['Mean']
    # +1 keeps the ratio finite when Count is 0.
    df_feat['Volume/Count'] = df_feat['Volume'] / (df_feat['Count'] + 1)

    # Author's note: adding time info to the series made the model hard to train.
    raw_only = ['Count', 'Volume']
    if single_candle:
        df_feat = df_feat.drop(raw_only)          # a Series has only the index axis
    else:
        df_feat = df_feat.drop(raw_only, axis=1)

    # Carry label and timestamp through when available (inference candles may
    # not have a Target).
    for passthrough in ('Target', 'timestamp'):
        if passthrough in df:
            df_feat[passthrough] = df[passthrough]

    return df_feat

In [None]:
# Replace every raw per-asset frame with its engineered feature frame.
dfs.update({asset_id: get_features(frame) for asset_id, frame in dfs.items()})

# KD Values

In [None]:
# Running state of the K recursion; advanced by KValue, reset by kd_value.
K=0
def KValue(rsv):
    """Advance the module-level K one step (K <- 2/3*K + 1/3*rsv) and return it."""
    global K
    K = (2/3) * K + (1/3) * rsv
    return K

# Running state of the D recursion; advanced by DValue, reset by kd_value.
D=0
def DValue(k):
    """Advance the module-level D one step (D <- 2/3*D + 1/3*k) and return it."""
    global D
    D = (2/3) * D + (1/3) * k
    return D

def kd_value(df):
    """Append the stochastic-oscillator K and D columns to a feature frame.

    Args:
        df: frame with timestamp (seconds), High, Low and Close columns,
            ordered by time. It is not modified.

    Returns:
        A new frame indexed by 'date' (timestamp converted to datetime) with K
        and D added and Open, High, Low, Close and the helper columns removed.
    """
    global K,D
    # Restart both recursions for every frame. Without this the state left by
    # the previous asset (or a previous run of the cell) seeded the next one.
    K=0
    D=0

    # Build the datetime index on a copy instead of adding 'date' to the
    # caller's frame.
    df=df.assign(date=pd.to_datetime(df.timestamp,unit='s')).set_index('date')

    #highest price in recent 9 days
    df['9DAYMAX']=df['High'].rolling('9D').max()

    #lowest price in recent 9 days
    df['9DAYMIN']=df['Low'].rolling('9D').min()

    #RSV value
    # NOTE(review): the +1 guards against a zero range but also shrinks RSV
    # whenever the 9-day range is small compared with 1 - confirm this is
    # acceptable for low-priced assets.
    df['RSV'] = 100 *\
        (df['Close'] - df['9DAYMIN']) / (df['9DAYMAX'] - df['9DAYMIN']+1) #prevent 0 divided

    df['K'] = df['RSV'].apply(KValue)

    df['D'] = df['K'].apply(DValue)

    df=df.drop(['9DAYMAX','9DAYMIN','Open','High','Low','Close','RSV'],axis=1)

    return df

In [None]:
# Swap every per-asset feature frame for its K/D-augmented version.
dfs.update({asset_id: kd_value(frame) for asset_id, frame in dfs.items()})

In [None]:
# Per asset: K and D on the left panel, Target on the right (first 1000 rows).
for k,frame in dfs.items():
    head=frame[:1000]
    name=asset_map(k)
    fig,(kd_ax,target_ax)=plt.subplots(ncols=2,figsize=(25,8))
    for col in ('K','D'):
        sns.lineplot(data=head,x=head.index,y=col,label=col,ax=kd_ax)
    kd_ax.set_ylabel('value')

    sns.lineplot(data=head,x=head.index,y='Target',label='Target',ax=target_ax)
    kd_ax.set_title(name)
    target_ax.set_title(name)
plt.legend()
plt.show()

## Splitting

* Train:0.7

* Val : 0.2

* Test : 0.1


In [None]:
# Chronological 70 / 20 / 10 split per asset, z-scored with training statistics.
train={}
val={}
test={}

# asset id -> (training mean, training std), kept for de-normalisation later.
stats={}
for k,frame in dfs.items():
    n_rows=len(frame)
    train_end=int(n_rows*0.7)
    val_end=int(n_rows*0.9)

    timestamp=frame.timestamp
    features=frame.drop(['timestamp'],axis=1)
    train_df=features.iloc[:train_end]
    val_df=features.iloc[train_end:val_end]
    test_df=features.iloc[val_end:]

    # Only training rows feed the statistics, so validation and test data stay unseen.
    # train_mean / train_std keep their names: inv_norm reads them as module globals.
    train_mean=train_df.mean()
    train_std=train_df.std()
    stats[k]=(train_mean,train_std)

    # Normalise each part, then index it by the raw timestamp.
    train[k]=((train_df-train_mean)/(train_std)).set_index(timestamp.iloc[:train_end])
    val[k]=((val_df-train_mean)/(train_std)).set_index(timestamp.iloc[train_end:val_end])
    test[k]=((test_df-train_mean)/(train_std)).set_index(timestamp.iloc[val_end:])


In [None]:
# Violin plot of every normalised training column (first 1000 rows) per asset.
for k,frame in train.items():
    long_form=frame[:1000].melt(var_name='column',value_name='normalized')
    plt.figure(figsize=(25,8))
    plt.title(asset_map(k))
    sns.violinplot(data=long_form,x='column',y='normalized')
plt.show()

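* In the prediction step, the normalized output is mapped back to the original scale with the training-set mean $\mu$ and standard deviation $\sigma$:

    $$Target = Target_{norm} \cdot \sigma + \mu$$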

# Modeling

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.losses import MeanSquaredError,MeanAbsoluteError
from tensorflow.keras.optimizers import Adam

In [None]:
class WindowGenerator:
    """Cut train/val/test frames into (inputs, labels) windows for Keras.

    Each window spans ``input_width + offset`` consecutive rows. The inputs are
    its first ``input_width`` rows and the labels its last ``label_width`` rows.

    Args:
        input_width: number of rows in each input window.
        label_width: number of rows in each label window.
        offset: extra rows appended after the input part of the window.
        train_df, val_df, test_df: frames sharing the same columns.
        label_columns: column names to predict, e.g. ['Target']; None keeps
            every column as a label.
        batch_size: number of windows per batch.
        drop_label: if True, the label columns are removed from the inputs.
        sequence_stride: rows between the starts of consecutive windows
            (default 15, the value that used to be hard-coded).
    """
    def __init__(self,input_width,label_width,offset,
                train_df,val_df,test_df,label_columns=None,batch_size=32,drop_label=True,
                sequence_stride=15):
        self.train_df=train_df
        self.val_df=val_df
        self.test_df=test_df
        self.batch_size=batch_size
        self.drop_label=drop_label
        self.sequence_stride=sequence_stride

        #if we want to predict Target and Volume, label_columns=['Target','Volume']
        self.label_columns=label_columns
        label_names=[] if label_columns is None else list(label_columns)
        self.label_columns_indices={name:i for i,name in enumerate(label_names)}
        self.column_indices={name:i for i,name in enumerate(train_df.columns)}

        #positions of the label columns; empty when label_columns is None
        #(the old code called enumerate(None) and crashed on the default)
        all_columns=train_df.columns.to_list()
        self.label_idx=[all_columns.index(name) for name in label_names]

        #positions of the input features, kept in column order (a set
        #difference does not guarantee any order)
        self.select_idx=[i for i in range(train_df.shape[-1]) if i not in self.label_idx]

        #manage indexes
        self.input_width=input_width
        self.label_width=label_width
        self.offset=offset

        self.total_window_size=input_width+offset

        #input
        self.input_slice=slice(0,input_width)
        self.input_indices=np.arange(self.total_window_size)[self.input_slice]

        #label
        self.label_start=self.total_window_size-self.label_width
        self.label_slice=slice(self.label_start,None)
        self.label_indices=np.arange(self.total_window_size)[self.label_slice]

    def split_window(self,features):
        """Split a batch of windows (batch, time, columns) into (inputs, labels)."""
        inputs=features[:,self.input_slice,:]
        labels=features[:,self.label_slice,:]

        if self.label_columns is not None:
            labels = tf.stack(
                [labels[:, :, self.column_indices[name]] for name in self.label_columns],axis=-1)

        #slicing loses static shape information; restore the known time axis
        inputs.set_shape([None,self.input_width,None])
        labels.set_shape([None,self.label_width,None])

        if self.drop_label:
            inputs=tf.gather(inputs,self.select_idx,axis=-1)

        return inputs,labels

    def make_dataset(self,data):
        """Return a shuffled, batched dataset of (inputs, labels) windows built from `data`."""
        data = np.array(data, dtype=np.float32)

        #split :  each window --> [input width , offset]
        ds = tf.keras.preprocessing.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=self.sequence_stride,
            shuffle=True,
            batch_size=self.batch_size)
        ds = ds.map(self.split_window)

        return ds

    @property
    def train(self):
        """Windowed dataset built from train_df (rebuilt on every access)."""
        return self.make_dataset(self.train_df)

    @property
    def val(self):
        """Windowed dataset built from val_df (rebuilt on every access)."""
        return self.make_dataset(self.val_df)

    @property
    def test(self):
        """Windowed dataset built from test_df (rebuilt on every access)."""
        return self.make_dataset(self.test_df)

    #if print object , this function will be called
    def __repr__(self):
        return '\n'.join([
        f'Total window size: {self.total_window_size}',
        f'Input indices: {self.input_indices}',
        f'Label indices: {self.label_indices}',
        f'Label column name(s): {self.label_columns}'])

* Hyperparameters

In [None]:
# Windowing settings that main() passes to WindowGenerator:
# rows per input window, rows per label window, extra rows after the input
# part of the window, and windows per batch.
input_width=256
label_width=256
offset=0
batch_size=256


#model
# units: LSTM size passed to Model; lr: Adam learning rate read by
# compile_and_fit; epochs: number of training epochs passed by main().
units=512
lr=1e-5
epochs=2

* Model

In [None]:
class Model(tf.keras.Model):
    """LSTM over a sequence followed by a linear Dense projection per time step.

    Args:
        units: size of the LSTM cell state.
        num_features: number of values predicted per time step.
        dropout: input dropout rate of the LSTM cell.
    """
    def __init__(self, units, num_features=1,dropout=0.3):
        super().__init__()

        self.units = units
        self.lstm_cell = layers.LSTMCell(units,dropout=dropout)
        self.lstm_rnn = layers.RNN(self.lstm_cell, return_state=True,return_sequences=True)
        # activation=None keeps the projection linear, so the output is not
        # bounded to any fixed range.
        self.dense = layers.Dense(num_features,activation=None)

    def call(self,x,training=False):
        """Return one prediction per time step of `x`; the final LSTM states are discarded."""
        x,*_=self.lstm_rnn(x,training=training)
        return self.dense(x)

    def one_step_forecast(self,x,state=None):
        """Advance the LSTM cell by one step.

        Args:
            x: features of a single time step, shaped (1, features).
            state: LSTM state returned by a previous call; None starts from zeros.

        Returns:
            (prediction, new_state); pass new_state to the next call.
        """
        #input : (1,features)
        if state is None:
            state=[tf.zeros((1,self.units)),tf.zeros((1,self.units))]
        x,state=self.lstm_cell(x,states=state,training=False)
        x=self.dense(x)
        return x,state

* Train

In [None]:
def compile_and_fit(model, window, EPOCHS,patience=2):
    """Compile `model` and fit it on the windows of `window`.

    Uses mean-squared-error loss, Adam with the module-level `lr`, a
    mean-absolute-error metric, and early stopping on `val_loss`.

    Args:
        model: Keras model to train.
        window: object exposing `train` and `val` datasets.
        EPOCHS: maximum number of epochs.
        patience: epochs without `val_loss` improvement before stopping.

    Returns:
        The History object returned by `model.fit`.
    """
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=lr),
        loss=tf.losses.MeanSquaredError(),
        metrics=[tf.metrics.MeanAbsoluteError()],
    )
    stop_on_plateau = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', mode='min', patience=patience)

    return model.fit(
        window.train,
        validation_data=window.val,
        epochs=EPOCHS,
        callbacks=[stop_on_plateau],
    )

In [None]:
def main():
    """Train one Model per asset in `dfs`, save each, and return {asset id: model}."""
    trained={}
    for asset_id in dfs:
        print(f'Asset {asset_id}')
        window=WindowGenerator(
            input_width=input_width,
            label_width=label_width,
            offset=offset,
            train_df=train[asset_id],
            val_df=val[asset_id],
            test_df=test[asset_id],
            label_columns=['Target'],
            batch_size=batch_size)
        net=Model(units)
        trained[asset_id]=net

        compile_and_fit(net, window, epochs,patience=2)
        tf.keras.models.save_model(net,f'model_{asset_id}.pt')

    return trained

In [None]:
# Train and save one model per asset; the returned dict is keyed by asset id.
models=main()

## Show Forecasting

In [None]:
# Position used by inv_norm to pick the Target entry out of train_std / train_mean.
# NOTE(review): assumes 'Target' is the third-from-last column of the normalised
# frames (followed by K and D) - confirm if the feature set changes.
target_idx=-3

In [None]:
def inv_norm(target,asset_id=None):
    """Map normalised Target value(s) back to the original scale.

    Args:
        target: normalised value(s); anything supporting `* float + float`.
        asset_id: key into the module-level `stats` dict. When given, that
            asset's training mean/std are used. When omitted, the function
            falls back to the module-level `train_mean` / `train_std`, which
            hold the statistics of whichever asset the split loop processed
            last - only correct for that asset.

    Returns:
        target * std + mean, using the entry at position `target_idx`.
    """
    if asset_id is None:
        mean,std=train_mean,train_std
    else:
        mean,std=stats[asset_id]
    # .iloc makes the positional lookup explicit; integer [] on a
    # label-indexed Series is a deprecated fallback.
    return target*std.iloc[target_idx]+mean.iloc[target_idx]

In [None]:
# Plot one 1000-step test window per asset: prediction against ground truth,
# both mapped back to the original Target scale.
for k in train:
    w=WindowGenerator(1000,
                  1000,
                  0,
                  train[k],
                  val[k],
                  test[k],
                  ['Target'],
                  batch_size=1)
    # De-normalise with this asset's own training statistics. The shared
    # train_mean / train_std globals only describe the last asset of the split loop.
    asset_mean,asset_std=stats[k]
    mu=asset_mean['Target']
    sigma=asset_std['Target']

    plt.figure()
    plt.title(asset_map(k))
    for x,y in w.test.take(1):
        y_pred=models[k](x)[0,:,0].numpy()
        plt.plot(range(1000),y_pred*sigma+mu,label='predict')
        plt.plot(range(1000),y[0,:,0].numpy()*sigma+mu,label='ground_truth')
    plt.legend()
    plt.show()

# Submit

In [None]:
import gresearch_crypto

# Create the competition environment and its test iterator; the submission
# loop below consumes iter_test and answers through env.predict.
env = gresearch_crypto.make_env()   
iter_test = env.iter_test() 

In [None]:
# Submission loop: one LSTM step per asset per test batch, carrying each
# asset's recurrent state across batches.
states={}
for k,(test_df, sample_prediction_df) in enumerate(iter_test):
    t=np.unique(test_df.timestamp)[0]
    # Column position for positional writes; avoids chained assignment
    # (frame['Target'].iloc[i] = ...), which may write to a temporary copy.
    target_col=sample_prediction_df.columns.get_loc('Target')

    for i,idx in enumerate(test_df['Asset_ID']):
        try:
            # Look the timestamp up in the pre-computed normalised features.
            # Index membership is a hash lookup, so no np.unique scan is needed.
            if t in train[idx].index:
                x=train[idx].loc[t]
            elif t in val[idx].index:
                x=val[idx].loc[t]
            else:
                x=test[idx].loc[t]

            x=x.drop(['Target'])
            x=tf.convert_to_tensor(x)
            x=tf.expand_dims(x,axis=0)

            # states.get() starts from zeros on the first step and also after
            # an earlier failure left no state for this asset.
            pred,state=models[idx].one_step_forecast(x,state=states.get(idx))

            states[idx]=state

            # De-normalise with this asset's own training statistics.
            asset_mean,asset_std=stats[idx]
            pred=pred.numpy()[0][0]*asset_std['Target']+asset_mean['Target']
        except Exception as err:
            # Best effort: a submission must cover every row, so fall back to 0
            # but report the failure instead of hiding it.
            print(f'timestamp {t}, asset {idx}: prediction failed ({err!r}); submitting 0')
            pred=0
        sample_prediction_df.iloc[i,target_col]=pred
    env.predict(sample_prediction_df)