# Created by Corey Levinson

In [None]:
import pandas as pd # python dataframes
import numpy as np # python numerics
import matplotlib.pyplot as plt # python plotting
import seaborn as sns

# Keras imports
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, GRU, SpatialDropout1D, GlobalMaxPool1D
from keras.layers.embeddings import Embedding

from kaggle.competitions import twosigmanews # Needed to obtain training/test data

from tqdm import tqdm
import gc

In [None]:
# Change DEBUG to False when you're ready, Corey.
DEBUG = False

# Change YEARMIN to change the cutoff point for your data
# All data must be greater than YEARMIN
YEARMIN = 2011

In [None]:
#random seeds for stochastic parts of neural network 
np.random.seed(100)
from tensorflow import set_random_seed
set_random_seed(150)

In [None]:
env = twosigmanews.make_env()

In [None]:
# Load in market data, garbage collect news data
(market_train, _) = env.get_training_data()

# Light preprocessing:

In [None]:
# Require all data to be more recent than YEARMIN
market_train = market_train.loc[market_train['time'].dt.year > YEARMIN]

In [None]:
market_train.columns

In [None]:
# # Require all TARGETS be in range (-1, 1)
# market_train['returnsOpenNextMktres10'] = market_train['returnsOpenNextMktres10'].clip(-1,1)

In [None]:
# Report how many missing values each market column contains.
# The network cannot consume NaNs, so every column listed with a non-zero
# count has to be imputed before training.
print('MARKET TRAIN:')
na_counts = market_train.isna().sum()
for column_name, n_missing in na_counts.items():
    print('{} has {} NAs'.format(column_name, n_missing))

# Four columns have NA's. Let's impute them with the median of the group

In [None]:
# If DEBUG, then don't read in all of the data.
if DEBUG:
    market_train = market_train.sample(50000, random_state=4)

In [None]:
# Step 1: fill gaps in the market-residual return columns with the median of
# the same column within the row's own assetCode.
def _fill_with_own_median(values):
    """Return `values` with its NaNs replaced by the median of `values`."""
    return values.fillna(values.median())


for _mktres_col in ('returnsClosePrevMktres1',
                    'returnsOpenPrevMktres1',
                    'returnsClosePrevMktres10',
                    'returnsOpenPrevMktres10'):
    market_train[_mktres_col] = (
        market_train.groupby(['assetCode'])[_mktres_col].transform(_fill_with_own_median)
    )

# Step 2: an assetCode with no non-null value at all still has NaNs after
# step 1, so fall back to the median of the whole column.
market_train = market_train.fillna(market_train.median())

In [None]:
market_train = market_train.sort_values(['assetCode','time']) # Sort it by time for use in LSTM later
market_train.reset_index(drop=True,inplace=True)
market_train.head()

In [None]:
market_train.columns

# Hypothesis: I think I don't have enough RAM to construct the list, so I am reducing the amount of information being fed in.

In [None]:
market_train['time'] = pd.to_datetime(market_train['time'].dt.date) # Change from datetime to date for less memory and easier merge with news

In [None]:
# Feature Engineering
market_train['margin1'] = market_train['open'] / market_train['close']
market_train['TARGET'] = np.sign(market_train['returnsOpenNextMktres10'])

In [None]:
# Seed the prediction-time history with the last 30 rows of each asset, taken
# from 1 September 2016 onwards; 'universe' and 'returnsOpenNextMktres10' are
# dropped from the seed frame.
# The cutoff is compared as one date: testing `year >= 2016` and `month >= 9`
# separately would also discard January-August rows of any later year.
_history_start = pd.Timestamp('2016-09-01')
_recent_rows = market_train.loc[market_train['time'] >= _history_start]
total_market_obs_df = [
    _recent_rows.groupby('assetCode').tail(30).drop(['universe', 'returnsOpenNextMktres10'], axis=1)
]

# Train LSTM model now

In [None]:
LSTM_COLUMNS_TO_USE = ['time', # Time variable is necessary
                       'assetCode', # AssetCode is necessary to perform merges/historical analysis
                       'universe', # binary variable indicating if entry will be used in metric
                       'returnsOpenNextMktres10',
                       'TARGET',
                       'volume',
                       'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
                       'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
                       'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
                       'returnsClosePrevMktres10', 'returnsOpenPrevMktres10',
                       'margin1',
                     ]

In [None]:
# Drop columns not in use
market_train = market_train[LSTM_COLUMNS_TO_USE]

In [None]:
gc.collect()

In [None]:
# If the assetCode has no non-null values, then impute with column median
market_train = market_train.fillna(market_train.median())

In [None]:
INFORMATION_COLS = ['time','assetCode','universe','returnsOpenNextMktres10','TARGET']
INPUT_COLS = [f for f in market_train.columns if f not in INFORMATION_COLS]

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
market_train[INPUT_COLS] = scaler.fit_transform(market_train[INPUT_COLS])

In [None]:
market_train.head(20)

In [None]:
# Adapted from: https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/

def series_to_supervised(dataset, n_in=1, n_info_cols=None):
    """Turn one asset's rows into fixed-length look-back windows.

    Parameters
    ----------
    dataset : 2-D numpy array
        One row per observation. Columns 0-4 are read as time, assetCode,
        universe, returns and TARGET; every column from `n_info_cols`
        onwards is treated as a model input feature.
    n_in : int
        Window length. The window for row ``i`` holds rows
        ``i - n_in + 1 .. i``; rows before the start of `dataset` are
        replaced by all-zero padding at the top of the window.
    n_info_cols : int or None
        Number of leading non-feature columns. ``None`` (the default) uses
        ``len(INFORMATION_COLS)``.

    Returns
    -------
    tuple of six lists, each with one entry per row of `dataset`:
    (windows, time, assetCode, universe, returns, TARGET). Every window has
    shape ``(n_in, n_features)``.
    """
    if n_info_cols is None:
        n_info_cols = len(INFORMATION_COLS)
    # Derive the feature width from the data instead of assuming exactly 10
    # input columns, so the padding always matches the window it is joined to.
    n_features = dataset.shape[1] - n_info_cols

    data_X = []
    data_time = []
    data_assetCode = []
    data_universe = []
    data_returns = []
    data_TARGET = []

    # input sequence (t-n, ... t-1, t)
    for i in range(len(dataset)):
        row = dataset[i]
        data_time.append(row[0])
        data_assetCode.append(row[1])
        data_universe.append(row[2])
        data_returns.append(row[3])
        data_TARGET.append(row[4])

        # Zero rows standing in for history that does not exist yet.
        padding = np.zeros(shape=(max(0, n_in - 1 - i), n_features))
        window = dataset[max(0, i - n_in + 1):i + 1, n_info_cols:]
        data_X.append(np.append(padding, window, axis=0))

    return data_X, data_time, data_assetCode, data_universe, data_returns, data_TARGET

In [None]:
LOOK_BACK = 15

In [None]:
# Build the LSTM input one assetCode at a time and write each asset's windows
# into a single pre-allocated array (one window per market_train row).
lstm_df_list = np.empty(shape=(market_train.shape[0], LOOK_BACK, len(INPUT_COLS)))
the_time = []
the_assetCode = []
the_universe = []
the_returns = []
the_TARGET = []

row_at = 0

# Iterate the groups directly: this visits every row once, instead of
# re-scanning the whole frame with a boolean mask for every assetCode.
for _, asset_rows in tqdm(market_train.groupby('assetCode')):
    res = series_to_supervised(asset_rows.values, n_in=LOOK_BACK)
    n_rows = len(asset_rows)
    # Windows go into the next free slice of the output array.
    lstm_df_list[row_at:row_at + n_rows] = np.array(res[0])
    row_at += n_rows
    the_time.append(res[1])
    the_assetCode.append(res[2])
    the_universe.append(res[3])
    the_returns.append(res[4])
    the_TARGET.append(res[5])

In [None]:
# FLATTEN LISTS
# Each of these holds one sub-list per assetCode; collapse every one of them
# into a single flat list, keeping the existing order.
import itertools

the_time, the_assetCode, the_universe, the_returns, the_TARGET = (
    list(itertools.chain.from_iterable(per_asset_lists))
    for per_asset_lists in (the_time, the_assetCode, the_universe, the_returns, the_TARGET)
)

In [None]:
the_TARGET[-5:]

In [None]:
del market_train
gc.collect()

In [None]:
print(lstm_df_list.shape)

In [None]:
from keras import callbacks

In [None]:
# https://medium.com/@thongonary/how-to-compute-f1-score-for-each-epoch-in-keras-a1acd17715a2

class Metrics(callbacks.Callback):
    """Keras callback that prints and records the sigma score after each epoch.

    The score is computed by `sigma_scorelstm` from predictions on the
    validation inputs, rescaled from [0, 1] to [-1, 1].
    """

    def on_train_begin(self, logs=None):
        # One dict per finished epoch; see `get_data`.
        self._data = []

    def on_epoch_end(self, batch, logs=None):
        # NOTE: the parameter is named `batch` but this hook fires once per epoch.
        X_val, y_val = self.validation_data[0], self.validation_data[1]
        # Use the model Keras attached to this callback rather than relying on
        # a module-level `model` variable happening to be the one in training.
        raw_predictions = self.model.predict(X_val)
        y_predict = (pd.DataFrame(raw_predictions) * 2) - 1  # Need to convert it back to [-1, 1] instead of [0, 1]
        _sigmascore = sigma_scorelstm(y_val, y_predict)
        print(" — sigmascore: %f" % (_sigmascore))

        self._data.append({
            'val_sigmascore': _sigmascore
        })

    def get_data(self):
        """Return the list of per-epoch ``{'val_sigmascore': ...}`` records."""
        return self._data

metrics = Metrics()

In [None]:
# Positions of the rows used for training: every row of lstm_df_list, in order.
train_index = list(range(len(lstm_df_list)))

In [None]:
def sigma_scorelstm(y_true, y_pred):
    """Score predictions as mean / std of the per-day weighted returns.

    Each prediction is multiplied by the matching entry of `the_returns` and
    `the_universe` (rows selected by `train_index`), the products are summed
    per day of `the_time`, and the mean of those daily sums is divided by
    their standard deviation.

    Parameters
    ----------
    y_true : unused; kept so the signature stays (y_true, y_pred).
    y_pred : confidence values, one row per entry of `train_index`.
        NOTE(review): the caller passes a single-column DataFrame with a
        default integer index; the arithmetic below aligns on index/column
        labels, so other inputs may not line up - confirm before reusing.

    Returns
    -------
    float: mean / std of the daily sums, or 0.0 when the standard deviation
    is exactly zero (the plain division would give inf or nan there).
    """
    returns = pd.DataFrame([the_returns[i] for i in train_index])
    universe = pd.DataFrame([the_universe[i] for i in train_index])
    days = pd.DataFrame([the_time[i] for i in train_index])

    # Multiply my confidence by return multiplied by universe
    x_t_i = y_pred * returns * universe
    data = pd.concat([days, x_t_i], axis=1)
    data.columns = ['day', 'x_t_i']
    x_t = data.groupby('day').sum().values.flatten()

    std = np.std(x_t)
    if std == 0:
        # A flat daily series has no spread to normalise by.
        return 0.0
    return np.mean(x_t) / std

In [None]:
# Collect the windows and labels for the selected rows, in train_index order.
_selected_windows = [lstm_df_list[row] for row in train_index]
_selected_targets = [the_TARGET[row] for row in train_index]

trainX = np.array(_selected_windows)
# Shift the labels from the [-1, 1] range of TARGET to [0, 1] for the sigmoid output.
trainY = (np.array(_selected_targets) + 1) / 2

In [None]:
trainY[-5:]

In [None]:
# Network: GRU over the look-back window -> max-pool over time -> two dense
# layers -> single sigmoid unit.
model = Sequential([
    GRU(50, return_sequences=True, input_shape=(LOOK_BACK, trainX.shape[2])),
    SpatialDropout1D(0.5),
    GlobalMaxPool1D(),
    Dropout(0.50),

    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.25),

    Dense(1, activation='sigmoid'),
])
# RMS prop is supposed to be better for recurrent neural networks.
model.compile(optimizer='rmsprop', loss='binary_crossentropy')

# The training arrays are also passed as validation data; the `metrics`
# callback reads them to print the sigma score after every epoch.
_fit_options = dict(
    epochs=2,
    batch_size=1028,
    validation_data=(trainX, trainY),
    verbose=2,
    shuffle=True,
    callbacks=[metrics],
)
history = model.fit(trainX, trainY, **_fit_options)

In [None]:
# Draw the training and validation loss curves recorded by model.fit.
for _curve in ('loss', 'val_loss'):
    plt.plot(history.history[_curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'valid'], loc='upper right')
plt.show()

In [None]:
# Gain memory
del trainX, trainY
gc.collect()

# Predictions:

In [None]:
# You can only iterate through a result from `get_prediction_days()` once
# so be careful not to lose it once you start iterating.
days = env.get_prediction_days()

In [None]:
# Remove 'universe', 'returnsOpenNextMktres10' and 'TARGET' from the column
# list before it is used to select columns in the prediction loop.
_excluded_at_prediction = ('universe', 'returnsOpenNextMktres10', 'TARGET')
LSTM_COLUMNS_TO_USE = [name for name in LSTM_COLUMNS_TO_USE if name not in _excluded_at_prediction]

In [None]:
# Drop this
total_market_obs_df[0].drop('TARGET', axis=1, inplace=True)

In [None]:
# Adapted from: https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/

def series_to_supervised(dataset, curdate, n_in=1, n_info_cols=2):
    """Build look-back windows for the rows of one asset dated `curdate`.

    Parameters
    ----------
    dataset : 2-D numpy array
        One row per observation; column 0 is compared against `curdate` and
        every column from `n_info_cols` onwards is treated as an input
        feature.
    curdate : value compared (``==``) with column 0 to pick the rows to emit.
    n_in : int
        Window length. The window for row ``i`` holds rows
        ``i - n_in + 1 .. i``; rows before the start of `dataset` are
        replaced by all-zero padding at the top of the window.
    n_info_cols : int
        Number of leading non-feature columns (default 2, the offset the
        original hard-coded).

    Returns
    -------
    list of arrays of shape ``(n_in, n_features)``, one per row whose first
    column equals `curdate`; empty when no row matches.
    """
    # Derive the feature width from the data instead of assuming exactly 10
    # input columns, so the padding always matches the window it is joined to.
    n_features = dataset.shape[1] - n_info_cols
    data_X = []

    # input sequence (t-n, ... t-1, t)
    for i in range(len(dataset)):
        # Only create with the prediction date
        if dataset[i][0] != curdate:
            continue
        padding = np.zeros(shape=(max(0, n_in - 1 - i), n_features))
        window = dataset[max(0, i - n_in + 1):i + 1, n_info_cols:]
        data_X.append(np.append(padding, window, axis=0))

    return data_X


In [None]:
# Number of most recent frames (30 + 1, the window the original code sliced)
# that are concatenated into the rolling history on each prediction day.
_HISTORY_FRAMES = 31

for (market_obs_df, _, predictions_template_df) in days:
    #######################
    # Prepare the day's market observations:

    market_obs_df = market_obs_df.fillna(market_obs_df.median())
    market_obs_df = market_obs_df.sort_values('assetCode')

    market_obs_df['time'] = pd.to_datetime(market_obs_df['time'].dt.date)

    # Feature Engineering
    market_obs_df['margin1'] = market_obs_df['open'] / market_obs_df['close']

    # Save to history df. Only the last _HISTORY_FRAMES frames are ever read,
    # so drop older ones instead of letting the list grow for the whole run.
    total_market_obs_df.append(market_obs_df)
    del total_market_obs_df[:-_HISTORY_FRAMES]
    history_df = pd.concat(total_market_obs_df)

    ###################################
    # LSTM modeling:

    tmp = history_df[LSTM_COLUMNS_TO_USE]

    # If the assetCode has no non-null values, then impute with column median
    tmp = tmp.fillna(tmp.median())

    # Scale with the statistics learned on the training data. Re-fitting the
    # scaler here would standardise every day against its own short history,
    # giving the model inputs on a different scale from the one it was
    # trained on.
    tmp[INPUT_COLS] = scaler.transform(tmp[INPUT_COLS])

    # One look-back window per row of the predictions template. Zero-filled
    # so that a row which never gets written holds defined values rather
    # than uninitialised memory.
    lstm_df_list = np.zeros(shape=(predictions_template_df.shape[0], LOOK_BACK, len(INPUT_COLS)))

    # The prediction date is the same for every asset; compute it once.
    curdate = tmp['time'].max()

    # NOTE(review): windows are written in sorted-assetCode order and later
    # assigned to the template by position - this assumes the template rows
    # follow the same order and count; confirm against the competition API.
    for row_at, asset in enumerate(market_obs_df['assetCode'].unique()):
        res = series_to_supervised(tmp.loc[tmp['assetCode'] == asset].values, curdate=curdate, n_in=LOOK_BACK)
        lstm_df_list[row_at] = np.array(res)

    yhat_lstm = model.predict(lstm_df_list).flatten()

    # Convert from [0, 1] to [-1, 1]
    preds = (yhat_lstm * 2) - 1

    # Assign as a plain array so values are placed by position, regardless of
    # the template's index labels.
    predictions_template_df['confidenceValue'] = preds
    env.predict(predictions_template_df)

In [None]:
env.write_submission_file()