## The Previous Notebooks : <br>
### [Questions and answers📝EDA to Forecasting🚀](https://www.kaggle.com/yassershrief/notebook00ec514ff5/edit)<br>
### [LGBM Regressor Forecasting and Evaluation 📈](https://www.kaggle.com/yassershrief/lgbm-regressor-forecasting-and-evaluation/edit)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import datatable as dt
import math
from sklearn.preprocessing import RobustScaler
from datetime import timedelta
import matplotlib.pyplot as plt
plt.style.use("bmh")
from tensorflow.keras.models import Sequential , load_model
from tensorflow.keras.layers import Activation, Dense, Dropout, LSTM ,GRU
from tensorflow.keras import layers
from keras.callbacks import ModelCheckpoint,EarlyStopping
import random
import gresearch_crypto
import traceback
from sklearn.metrics import mean_absolute_error, mean_squared_error , r2_score
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

**Datatable** (heavily inspired by R's data.table) can read large datasets fairly quickly and is often faster than pandas. It is specifically meant for data processing of tabular datasets with emphasis on speed and support for large sized data. 👌

In [None]:
import datatable as dt

In [None]:
# Parse the training CSV with datatable, then convert the result to a pandas DataFrame.
df_all=dt.fread('../input/g-research-crypto-forecasting/train.csv').to_pandas()
# Per-asset metadata; later cells read its Asset_ID, Asset_Name and Weight columns.
asset_details = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')
# Show the assets ordered by Weight, largest first (the sorted frame is displayed, not stored).
asset_details.sort_values("Weight", ascending=False)

# Bitcoin prediction

In [None]:
# Bitcoin rows only (this notebook treats Asset_ID == 1 as Bitcoin).
# Take an explicit copy: get_row_feats() adds columns to the frame it is given,
# and assigning columns onto a filtered selection of df_all is ambiguous in pandas
# (it is what triggers SettingWithCopyWarning).
df1 = df_all[(df_all.Asset_ID == 1)].copy()

def get_row_feats(df):
    """Append per-row price/volume ratio features to ``df`` in place.

    Reads the ``Open``, ``High``, ``Low``, ``Close``, ``Volume`` and ``Count``
    columns and adds, in this order: ``upper_shadow``, ``lower_shadow``,
    ``open2close``, ``high2low``, ``high2mean``, ``low2mean``,
    ``high2median``, ``low2median`` and ``volume2count``.

    Returns:
        The same frame object that was passed in, now carrying the new columns.
    """
    high, low = df['High'], df['Low']

    # Top and bottom of the candle body (the Open/Close pair).
    body = df[['Close', 'Open']]
    body_top = body.max(axis=1)
    body_bottom = body.min(axis=1)

    # Row-wise centre of the four OHLC prices.
    ohlc = df[['Open', 'High', 'Low', 'Close']]
    ohlc_mean = ohlc.mean(axis=1)
    ohlc_median = ohlc.median(axis=1)

    features = {
        'upper_shadow': high / body_top,
        'lower_shadow': body_bottom / low,
        'open2close': df['Close'] / df['Open'],
        'high2low': high / low,
        'high2mean': high / ohlc_mean,
        'low2mean': low / ohlc_mean,
        'high2median': high / ohlc_median,
        'low2median': low / ohlc_median,
        # +1 in the denominator keeps the ratio finite when Count is 0.
        'volume2count': df['Volume'] / (df['Count'] + 1),
    }
    for column, values in features.items():
        df[column] = values
    return df
df = get_row_feats(df1)

# Derive a calendar-day key from the epoch-second timestamps.
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
df['datetime_d'] = df['datetime'].dt.strftime("%Y-%m-%d")

# Collapse to one row per (asset, day) by averaging every numeric column.
# numeric_only=True keeps the non-numeric 'datetime' column out of the result on
# every pandas version; older releases dropped it silently, newer ones would
# otherwise try to average it and carry it into the model features.
df = df.groupby(by=["Asset_ID", 'datetime_d']).mean(numeric_only=True)
df = df.reset_index(["Asset_ID"])
# Drop the columns that are not wanted as model inputs.
df = df.drop(columns=['timestamp', 'Asset_ID', 'Target', 'Open', 'High', 'Low', 'Volume', 'VWAP'])
# Convert the day-string index into a DatetimeIndex.
df.index = pd.to_datetime(df.index)
# data = df[(df.index.year >= 2019)]
data = df.copy()

# Split data
Training data before `13-06-2021` and validation data from `14-06-2021` to the end<br>
Kindly refer to the [topic](https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/285505)

In [None]:
# Column the model is trained to predict.
aim = 'Close'
# Positional, chronological split: the first 1259 rows train, the remainder validates.
# NOTE(review): 1259 is presumably the row of the cut-off date quoted above — confirm.
train_data, test_data = data.iloc[:1259], data.iloc[1259:]

def line_plot(line1, line2, label1=None, label2=None, title='', lw=2):
    """Plot two series on a single 13x7 axes.

    Args:
        line1, line2: The two series to draw, in that order.
        label1, label2: Legend labels for ``line1`` and ``line2``.
        title: Axes title.
        lw: Line width applied to both series.
    """
    _, axis = plt.subplots(1, figsize=(13, 7))
    for series, legend_label in ((line1, label1), (line2, label2)):
        axis.plot(series, label=legend_label, linewidth=lw)
    axis.set_ylabel('USD', fontsize=14)
    axis.set_title(title, fontsize=16)
    axis.legend(loc='best', fontsize=16)

In [None]:
# Visualise the target column of the training and validation segments on one chart.
line_plot(train_data[aim], test_data[aim], 'training', 'test', title='')

In [None]:
def normalise_zero_base(continuous):
    """Rescale ``continuous`` relative to its first row.

    Every value is divided by the corresponding value in the first row and 1
    is subtracted, so the first row becomes 0 and later rows hold the
    relative change from it.
    """
    first_row = continuous.iloc[0]
    relative = continuous / first_row
    return relative - 1

In [None]:
def extract_window_data(continuous, window_len=5, zero_base=True):
    """Cut ``continuous`` into overlapping windows of ``window_len`` rows.

    One window is produced for each start position from 0 up to (but not
    including) ``len(continuous) - window_len``, so the final ``window_len``
    rows never start a window.

    Args:
        continuous: Sliceable pandas object (frame or series).
        window_len: Number of consecutive rows per window.
        zero_base: When true, each window is passed through
            ``normalise_zero_base`` before being stored.

    Returns:
        ``numpy.ndarray`` stacking the ``.values`` of every window.
    """
    def _window_values(start):
        chunk = continuous[start:start + window_len].copy()
        if zero_base:
            chunk = normalise_zero_base(chunk)
        return chunk.values

    n_windows = len(continuous) - window_len
    return np.array([_window_values(start) for start in range(n_windows)])
def prepare_data(continuous, aim, window_len=10, zero_base=True, test_size=0.2):
    """Build windowed inputs and next-step targets for the train/test frames.

    NOTE: ``continuous`` and ``test_size`` are accepted but never read. The
    function operates on the module-level ``train_data`` and ``test_data``
    frames, whatever they are bound to at call time.

    Args:
        continuous: Unused.
        aim: Key of the target column in ``train_data`` / ``test_data``.
        window_len: Rows per input window; targets start ``window_len`` rows in.
        zero_base: When true, windows are zero-base normalised and each target
            is expressed as a relative change from the first row of its window.
        test_size: Unused.

    Returns:
        ``(train_data, test_data, X_train, X_test, y_train, y_test)``.
    """
    X_train = extract_window_data(train_data, window_len, zero_base)
    X_test = extract_window_data(test_data, window_len, zero_base)

    train_target, test_target = train_data[aim], test_data[aim]
    y_train = train_target[window_len:].values
    y_test = test_target[window_len:].values

    if zero_base:
        # Row i of [:-window_len] is the first value of window i, i.e. its base.
        train_base = train_target[:-window_len].values
        test_base = test_target[:-window_len].values
        y_train = y_train / train_base - 1
        y_test = y_test / test_base - 1

    return train_data, test_data, X_train, X_test, y_train, y_test


In [None]:
def build_lstm_model(input_data, output_size, neurons, activ_func='linear',
                     dropout=0.2, loss='mse', optimizer='adam'):
    """Assemble and compile a single-layer LSTM regressor.

    Layer stack: LSTM(``neurons``) -> Dropout(``dropout``) ->
    Dense(``output_size``) -> Activation(``activ_func``).

    Args:
        input_data: Array whose ``shape[1]`` and ``shape[2]`` give the LSTM
            input shape (only the shape is read).
        output_size: Units in the final dense layer.
        neurons: Units in the LSTM layer.
        activ_func: Activation applied after the dense layer.
        dropout: Dropout rate applied after the LSTM layer.
        loss: Loss passed to ``compile``.
        optimizer: Optimizer passed to ``compile``.

    Returns:
        The compiled, untrained model.
    """
    window_shape = (input_data.shape[1], input_data.shape[2])
    network = Sequential([
        LSTM(neurons, input_shape=window_shape),
        Dropout(dropout),
        Dense(units=output_size),
        Activation(activ_func),
    ])
    network.compile(loss=loss, optimizer=optimizer)
    return network
# Seeds NumPy's global RNG only.
# NOTE(review): presumably meant to make training reproducible; the deep-learning
# backend keeps its own random state — confirm whether it should be seeded too.
np.random.seed(0)
# Rows per input window fed to the LSTM.
window_len = 7
# Passed to prepare_data, which (as written in this file) does not read it.
test_size = 0.2
# Normalise each window/target relative to the window's first row.
zero_base = True
# Units in the LSTM layer.
lstm_neurons = 50
# Upper bound on training epochs and the mini-batch size handed to model.fit.
epochs = 150
batch_size = 32
loss = 'mse'
# Dropout rate; also read as a global by get_Xy_and_model_for_asset further down.
dropout = 0.25
optimizer = 'adam'
# Build the windowed inputs/targets. This rebinds train_data and test_data to the
# frames prepare_data returns.
train_data, test_data, X_train, X_test, y_train, y_test = prepare_data(
    data, aim, window_len=window_len, zero_base=zero_base, test_size=test_size)

In [None]:
# Sanity check of the training tensor: (number of windows, window_len, number of columns).
print(X_train.shape)

In [None]:
model = build_lstm_model(
    X_train, output_size=1, neurons=lstm_neurons, dropout=dropout, loss=loss,
    optimizer=optimizer)
# Stop once val_loss has gone 10 consecutive epochs without improving.
early_stopping = EarlyStopping(monitor='val_loss', verbose=1, patience=10)
# shuffle=False keeps the windows in chronological order within each epoch.
# `callbacks` is documented as a list of callback instances, so pass a list
# rather than the bare callback object.
modelfit = model.fit(
    X_train, y_train, validation_data=(X_test, y_test), epochs=epochs,
    batch_size=batch_size, shuffle=False, callbacks=[early_stopping])


In [None]:
# Training loss first, then validation loss, so the legend order below matches.
for curve in ('loss', 'val_loss'):
    plt.plot(modelfit.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper left')

In [None]:
# Actual target values aligned with the prediction windows (the first window_len rows have none).
targets = test_data[aim][window_len:]
preds = model.predict(X_test).squeeze()
# Absolute error is symmetric, so (y_true, y_pred) order yields the same value.
print('MAE for Bitcoin Model :', mean_absolute_error(y_test, preds))

In [None]:
# Recompute the predictions so this cell can run on its own.
preds = model.predict(X_test).squeeze()
# Squared error is symmetric, so (y_true, y_pred) order yields the same value.
SCORE_MSE = mean_squared_error(y_test, preds)
print('MSE for Bitcoin Model :', SCORE_MSE)

In [None]:
from sklearn.metrics import r2_score
# Keep the result under its own name: assigning it to `r2_score` would replace
# the imported function with a float for every later cell.
SCORE_R2 = r2_score(y_test, preds)
print('R2 for Bitcoin Model :', SCORE_R2 * 100)

In [None]:
# Undo the zero-base normalisation: predicted relative change * window's first price.
# NOTE: `preds` is overwritten here, so re-running this cell without first
# recomputing `preds` applies the rescaling a second time.
window_start_prices = test_data[aim].values[:-window_len]
preds = pd.Series(window_start_prices * (preds + 1), index=targets.index)
line_plot(targets, preds, 'actual', 'prediction', lw=3)

# Evaluate 2022 price
Check the accuracy of LSTM prediction. We got live price to date from [Yahoo Finance](https://finance.yahoo.com/quote/BTC-USD/history/)

BTC from 26/10/2021 to 26/01/2022

In [None]:
# Externally downloaded BTC-USD price history; later cells read its 'Close' column.
BTC = pd.read_csv('../input/btc-2612022/BTC-USD.csv')
# Display the frame.
BTC

In [None]:
# btc_quote = web.DataReader('BTC-USD', data_source='yahoo', start='2018-01-01', end='2021-09-20')
# Create a new dataframe holding only the Close column
new_df = BTC.filter(['Close'])
# Get the last 60 days closing price
last_60_days = new_df[-60:].values
# Scale the data to be values between 0 and 1
scaler = MinMaxScaler(feature_range=(0,1))
last_60_days_scaled = scaler.fit_transform(last_60_days)
# Create an empty list
# NOTE(review): this rebinds the global X_test that the Bitcoin model was
# validated on in the cells above.
X_test = []
# Append the past 60 days
X_test.append(last_60_days_scaled)
# convert to numpy array
X_test = np.array(X_test)
# Reshape
# NOTE(review): prepare_data (defined earlier in this file) does not read its first
# argument; it windows the globals train_data/test_data and uses its second argument
# as the column key, which here is the DataFrame new_df rather than a column name —
# verify this call does what is intended.
train_data1, test_data1, X_train1, X_test1, y_train1, y_test1 = prepare_data(
    X_test, new_df, window_len=window_len, zero_base=zero_base, test_size=test_size)
# Get the predicted scaled price
pred_price = model.predict(X_test1)
# undo the scaling
# NOTE(review): `scaler` was fitted on raw Close prices, while the model's targets
# were built with zero_base normalisation — confirm inverse_transform is the right
# inverse for these predictions.

pred_price1 = scaler.inverse_transform(pred_price)[-1]
# # btc_quote2 = web.DataReader('BTC-USD', data_source='yahoo', start='2021-09-20', end='2021-09-20')
# Last available close in the downloaded history
actual_price1 = BTC['Close'][-1:].values
# actual_price1
# Signed percentage error of the prediction relative to the actual close
# (despite the variable name, this is an error, not an accuracy).
accuracy = ((pred_price1-actual_price1)/actual_price1)*100
print('Prediction close price at 26-01-2022: $', pred_price1, sep='')
print('Actual price at 26/01/2022: $', actual_price1, sep='')
print('error: ', accuracy, '%', sep='')

# Loop over all assets

In [None]:
# Fresh 0..n-1 index over the full multi-asset training frame.
temp = df_all.reset_index(drop = True) 
temp['datetime'] = pd.to_datetime(temp['timestamp'], unit='s')
# NOTE: this rebinds the global train_data (previously the Bitcoin daily training
# frame that prepare_data reads) to the full minute-level, all-asset frame.
train_data = temp

In [None]:
# Fit one LSTM per asset on a small leading slice (0.1% of that asset's rows) of its scaled Close series

def get_Xy_and_model_for_asset(df_train, asset_id, train_fraction=.001, lookback=60):
    """Fit a small stacked-LSTM next-step model on one asset's Close series.

    The ``Close`` values of the rows whose ``Asset_ID`` equals ``asset_id``
    are min-max scaled to [0, 1]. The leading ``train_fraction`` of the
    scaled rows is cut into sliding windows of ``lookback`` values, each
    paired with the value that follows it, and the network is trained on
    those pairs for one epoch with batch size 1.

    Args:
        df_train: Frame with at least ``Asset_ID`` and ``Close`` columns.
        asset_id: Value of ``Asset_ID`` to select.
        train_fraction: Share of the asset's rows, taken from the start, used
            for training. Defaults to the previously hard-coded 0.001.
        lookback: Number of consecutive values per input window. Defaults to
            the previously hard-coded 60.

    Returns:
        ``(x_train, y_train, model)``: ``x_train`` has shape
        ``(n_windows, lookback, 1)``, ``y_train`` has shape ``(n_windows,)``
        and ``model`` is the fitted network.

    Raises:
        ValueError: If the training slice holds no more than ``lookback``
            rows, so no window/target pair can be formed.

    Note:
        The dropout rate is read from the module-level ``dropout`` variable,
        and the scaler is fitted on the asset's whole Close series, not only
        on the training slice.
    """
    closes = df_train[df_train["Asset_ID"] == asset_id].filter(['Close']).values
    training_data_len = math.ceil(len(closes) * train_fraction)

    # Scale the data
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(closes)
    train_data = scaled_data[0:training_data_len, :]

    # Fail early with a clear message instead of an IndexError from the reshape below.
    n_windows = len(train_data) - lookback
    if n_windows <= 0:
        raise ValueError(
            f"Asset {asset_id}: {len(train_data)} training rows are not enough "
            f"for a lookback of {lookback}"
        )

    # Window i covers rows [i - lookback, i); its target is row i.
    x_train = np.array(
        [train_data[i - lookback:i, 0] for i in range(lookback, len(train_data))]
    )
    y_train = train_data[lookback:, 0].copy()
    # Add the trailing feature axis the LSTM layer expects.
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

    # Build the LSTM model
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dense(25))
    model.add(Dropout(dropout))
    model.add(Dense(units=1))
    model.add(Activation('linear'))
    model.compile(optimizer='adam', loss='mse')

    # Train the model
    model.fit(x_train, y_train, batch_size=1, epochs=1)
    return x_train, y_train, model

# Per-asset training inputs, targets and fitted models, keyed by Asset_ID.
Xs = {}
ys = {}
models = {}

for asset_id, asset_name in zip(asset_details['Asset_ID'], asset_details['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    # The training call is the step that can fail, so it is what the try guards.
    # A failed asset is reported and recorded as None so the remaining assets
    # still get trained.
    try:
        X, y, model = get_Xy_and_model_for_asset(train_data, asset_id)
    except Exception:
        traceback.print_exc()
        Xs[asset_id], ys[asset_id], models[asset_id] = None, None, None
    else:
        Xs[asset_id], ys[asset_id], models[asset_id] = X, y, model

In [None]:
# Load only the needed columns of the supplemental training file; int8 keeps Asset_ID compact.
sup_train = pd.read_csv('../input/g-research-crypto-forecasting/supplemental_train.csv', 
                 usecols=['Close', 'Target', 'Asset_ID','timestamp'], dtype={'Asset_ID': 'int8'})
# Index by datetime derived from the epoch-second timestamp, then drop the raw column.
sup_train['datetime'] = pd.to_datetime(sup_train['timestamp'], unit='s')
sup_train = sup_train.set_index('datetime').drop('timestamp', axis=1)
# sup_train = sup_train[(sup_train.index.year == 2021) & (sup_train.index.month > 5)]
# One frame per asset, resampled onto a 1-minute grid with gaps filled by interpolation.
# NOTE(review): sup_trains is not referenced again in the visible part of this
# notebook (the prediction cell builds a similar mapping named `dfs`) — confirm
# whether this cell is still needed.
sup_trains = {asset_id: sup_train[sup_train['Asset_ID'] == asset_id].resample('1min').interpolate().copy() for asset_id in sup_train['Asset_ID'].unique()}
# Free the combined frame once the per-asset copies exist.
del sup_train

# Predict & submit

In [None]:
import gresearch_crypto
# Create the competition environment and the iterator that yields the
# (test batch, prediction template) pairs consumed by the loop below.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

In [None]:
# Load only the columns needed to look up historical targets.
df = pd.read_csv('../input/g-research-crypto-forecasting/supplemental_train.csv',
                 usecols=['Target', 'Asset_ID', 'timestamp'], dtype={'Asset_ID': 'int8'})
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
df = df.set_index('datetime').drop('timestamp', axis=1)
# df = df[(df.index.year == 2021) & (df.index.month > 5)]
# One frame per asset on a 1-minute grid; minutes missing from the file are interpolated.
dfs = {asset_id: df[df['Asset_ID'] == asset_id].resample('1min').interpolate().copy() for asset_id in df['Asset_ID'].unique()}
del df

# Report only the first lookup failure so a systematic problem is visible
# without flooding the output once per row.
lookup_failure_reported = False

for df_test, df_pred in iter_test:
    df_test['datetime'] = pd.to_datetime(df_test['timestamp'], unit='s')
    for _, row in df_test.iterrows():
        row_mask = df_pred['row_id'] == row['row_id']
        try:
            df = dfs[row['Asset_ID']]
            # Index.get_loc(..., method='nearest') was removed in pandas 2.0;
            # get_indexer is the supported way to find the nearest position.
            nearest_pos = df.index.get_indexer([row['datetime']], method='nearest')[0]
            closest_train_sample = df.iloc[nearest_pos]
            df_pred.loc[row_mask, 'Target'] = closest_train_sample['Target']
        except Exception:
            # Deliberate best effort: a failed lookup must not abort the
            # submission loop, so fall back to a neutral prediction of 0.
            if not lookup_failure_reported:
                traceback.print_exc()
                lookup_failure_reported = True
            df_pred.loc[row_mask, 'Target'] = 0
    # Interpolation can leave NaN targets; submit 0 for those.
    df_pred['Target'] = df_pred['Target'].fillna(0)
    env.predict(df_pred)