In [13]:
import gc
from itertools import chain
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
# widen jupyter display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_columns', 100)

#### in a kaggle kernel ####
# from kaggle.competitions import twosigmanews
# env = twosigmanews.make_env()
# train = env.get_training_data()
# test = []
# for (market_obs, news_obs, predictions_template) in env.get_prediction_days():
#     test.append((market_obs, news_obs, predictions_template))
#     predictions_template.confidenceValue = 0.0
#     env.predict(predictions_template)
# markettuple = (train, test)
#### to dump on kaggle kernel to local machine
# pickle.dump((train, test), open('data.p', 'wb'))

#### On local machine ####
markettuple = pickle.load(open('data.p', "rb"))

#### markettuple is a tuple
#### ((train_mkt, train_news), [(test_mkt, test_news, predictions_template)])
#### Where train is 2008 - 2016 and test is 2017 -

In [14]:
TRAIN_SAMPLING = 5_000 # None or 0 to turn off
TEST_SAMPLING = 5_000

# NN params
BATCH_SIZE = 10000
LOOK_BACK = 30
LOOK_BACK_STEPS = 5
EPOCHS = 20
STEPS_PER_EPOCHS = 50
VALIDATION_STEPS = 5

In [15]:
if TRAIN_SAMPLING:
    # sample on tail instead of randomly
    # since random sampling will turn up missing classes
    mkt = markettuple[0][0].sample(TRAIN_SAMPLING)
    nw = markettuple[0][1].sample(TRAIN_SAMPLING)
else:
    mkt = markettuple[0][0]
    nw = markettuple[0][1]

markettuple = None
gc.collect()
mkt = mkt.sort_values(by='time'
        ).reset_index(drop=True)
nw = nw.sort_values(by='time'
      ).reset_index(drop=True)

gc.collect()

14

In [16]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [56]:
def market_prepro(
    df_mkt, 
    clean_data=True, 
    extract_time_info=True,
    scale_numeric=False,
    encode_asset=True,
    fillna_clean_outliers=True,
    ):
    """
    Preprocess market data
    """
    df = df_mkt.copy()
    label_col = 'returnsOpenNextMktres10'
    # numeric columns
    ncols = ['volume', 'close', 'open', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1', 
             'returnsClosePrevMktres1', 'returnsOpenPrevMktres1', 'returnsClosePrevRaw10', 
             'returnsOpenPrevRaw10', 'returnsClosePrevMktres10', 'returnsOpenPrevMktres10']
    if scale_numeric is True and fillna_clean_outliers is False:
        raise ValueError("scale_numeric can't be done without fillna")
    # Clean bad data. We fit on train dataset and it's ok to remove bad data
    # Remove strange cases with close/open ratio > 2
    if clean_data:
        max_ratio  = 2
        df = df[(df['close'] / df['open']).abs() <= max_ratio].loc[:]
        df = df.reset_index(drop=True)
    # Fill na, fix outliers. Safe for test dataset, no rows removed.
    if fillna_clean_outliers:
        # Fill nans
        df[ncols] = df[['assetCode'] + ncols
                                  ].groupby('assetCode'
                                  ).transform(lambda g: g.fillna(method='bfill'))
        df[ncols] = df[ncols].fillna(0)
        # Fix outliers
        df[ncols] = df[ncols].clip(
                        df[ncols].quantile(0.01), 
                        df[ncols].quantile(0.99), 
                        axis=1)
        
    # Extract day, week, year from time
    if extract_time_info:
        df = df.join(pd.get_dummies(
            df.time.dt.year, prefix="year",
                dummy_na=True, drop_first=True))
        df = df.join(pd.get_dummies(
            df.time.dt.day, prefix="day",
                dummy_na=True, drop_first=True))
        df = df.join(pd.get_dummies(
            df.time.dt.week, prefix="week",
                dummy_na=True, drop_first=True))
        df = df.join(pd.get_dummies(
            df.time.dt.dayofweek, prefix="dayofweek",
                dummy_na=True, drop_first=True))
        # Create linear time index column
        udays = sorted(list(df.time.unique()))
        timeindexdf = pd.DataFrame(np.arange(len(udays)), 
                                   index=udays
                                  ).reset_index(drop=False)
        timeindexdf.columns = ['time', 'timeIndex']
        df = pd.merge(df, timeindexdf, how='left', on='time')
    if encode_asset:
        df = df.join(pd.get_dummies(
                df['assetCode'], prefix="assetCode",
                dummy_na=True, drop_first=True))
    if scale_numeric:
        # Fit for numeric and time
        df[ncols] = StandardScaler().fit_transform(
                                df[ncols].astype(float))
        df = df.fillna(0)
    feature_cols = list(df.columns)
    return df[feature_cols].copy()

In [57]:
def news_prepro(
    df_news,
    scale_numeric = True,
    ):
    """
    Aggregate news by day and asset. Normalize numeric values.
    Prepare news batch for generator.
    Asset can have many news per day, so group them by asset, day and aggregate. 
    Then normalize numerical values. News aggregation part is based on this kernel: 
    https://www.kaggle.com/bguberfain/a-simple-model-using-the-market-and-news-data#
    """
    news_cols_agg = {
        'urgency': ['min', 'count'],
        'takeSequence': ['max'],
        'bodySize': ['min', 'max', 'mean', 'std'],
        'wordCount': ['min', 'max', 'mean', 'std'],
        'sentenceCount': ['min', 'max', 'mean', 'std'],
        'companyCount': ['min', 'max', 'mean', 'std'],
        'marketCommentary': ['min', 'max', 'mean', 'std'],
        'relevance': ['min', 'max', 'mean', 'std'],
        'sentimentNegative': ['min', 'max', 'mean', 'std'],
        'sentimentNeutral': ['min', 'max', 'mean', 'std'],
        'sentimentPositive': ['min', 'max', 'mean', 'std'],
        'sentimentWordCount': ['min', 'max', 'mean', 'std'],
        'noveltyCount12H': ['min', 'max', 'mean', 'std'],
        'noveltyCount24H': ['min', 'max', 'mean', 'std'],
        'noveltyCount3D': ['min', 'max', 'mean', 'std'],
        'noveltyCount5D': ['min', 'max', 'mean', 'std'],
        'noveltyCount7D': ['min', 'max', 'mean', 'std'],
        'volumeCounts12H': ['min', 'max', 'mean', 'std'],
        'volumeCounts24H': ['min', 'max', 'mean', 'std'],
        'volumeCounts3D': ['min', 'max', 'mean', 'std'],
        'volumeCounts5D': ['min', 'max', 'mean', 'std'],
        'volumeCounts7D': ['min', 'max', 'mean', 'std'],
    }
    news_cols_numeric = set(news_cols_agg.keys()) - set(['assetCode', 'time'])
    # Fill na with previous value for the asset
    df = df_news.copy()
    # Aggregate by time, asset code
    # Fix asset codes (str -> list)
    # Since asset codes are in {1, 2, 3} format
    # We need to repeat rows for each asset code
    df['assetCodes'] = df['assetCodes'].str.findall(f"'([\w\./]+)'")
    # Group to date level
    df.time = df.time.astype('datetime64[D, UTC]')
    # Expand assetCodes
    assetCodes_expanded = list(chain(*df['assetCodes']))
    if not df.empty: 
        assetCodes_index = df.index.repeat(df['assetCodes'].apply(len)) 
    else: assetCodes_index = df.index
    assert len(assetCodes_index) == len(assetCodes_expanded)
    df_assetCodes = pd.DataFrame({'level_0': assetCodes_index, 
                                  'assetCode': assetCodes_expanded})
    # Create expanded news (will repeat every assetCodes' row)
    news_cols = ['time', 'assetCodes'] + sorted(list(news_cols_agg.keys()))
    df_expanded = pd.merge(df_assetCodes, df[news_cols], 
                           left_on='level_0', right_index=True, 
                           suffixes=(['','_old']))
    # Aggregate numerical news features
    df_agg = df_expanded.groupby(['time', 'assetCode']).agg(news_cols_agg)
    # Flat columns
    df_agg.columns = ['_'.join(col).strip() for col in df_agg.columns.values]
    # Normalize, fillna etc. Don't remove rows.
    df_agg = df_agg.fillna(0)
    if not df_agg.empty:
        news_df_numeric = df_agg._get_numeric_data().astype(float)
        df_agg[news_df_numeric.columns] = StandardScaler().fit_transform(news_df_numeric)
    return df_agg

In [58]:
def joined_prepro(
    mkt_df, nw_df,
    clean_data=False, 
    extract_time_info=True,
    scale_numeric=False,
    encode_asset=True,
    fillna_clean_outliers=True
    ):
    """
    
    Returns X
    """
    # market has index [time, assetCode]
    dfmkt = market_prepro(mkt_df,
        clean_data=clean_data, 
        extract_time_info=extract_time_info,
        scale_numeric=scale_numeric,
        encode_asset=encode_asset,
        fillna_clean_outliers=fillna_clean_outliers)
    # We select news in train time interval
    dfnews = news_prepro(nw_df,
        scale_numeric=scale_numeric)
    # Join by index, which is time, assetCode. 
    # Some assets have no news at all, so left join and 0 nans
    X = dfmkt.merge(dfnews, how='left', left_on=['time', 'assetCode'], 
                    right_on=['time','assetCode'],  right_index=True)
    if fillna_clean_outliers:
        # Some market data can be without news, fill nans
        X = X.fillna(0)
    # Drop non-training columns
    y = X.returnsOpenNextMktres10.copy()
    X = X.drop(['time', 'assetCode', 'assetName', 'returnsOpenNextMktres10'], axis=1)
    return X, y

# Flat Data Model

In [59]:
X, y = joined_prepro(
    mkt,
    nw,
    clean_data=False,
    extract_time_info=True,
    scale_numeric=False,
    encode_asset=True,
    fillna_clean_outliers=True)

ValueError: fill value must be in categories

In [48]:
X

Unnamed: 0,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,universe,year_2008.0,year_2009.0,year_2010.0,year_2011.0,year_2012.0,year_2013.0,year_2014.0,year_2015.0,year_2016.0,year_nan,day_2.0,day_3.0,day_4.0,day_5.0,day_6.0,day_7.0,day_8.0,day_9.0,day_10.0,day_11.0,day_12.0,day_13.0,day_14.0,day_15.0,day_16.0,day_17.0,day_18.0,day_19.0,day_20.0,day_21.0,day_22.0,day_23.0,day_24.0,day_25.0,day_26.0,day_27.0,day_28.0,day_29.0,...,sentimentNeutral_mean,sentimentNeutral_std,sentimentPositive_min,sentimentPositive_max,sentimentPositive_mean,sentimentPositive_std,sentimentWordCount_min,sentimentWordCount_max,sentimentWordCount_mean,sentimentWordCount_std,noveltyCount12H_min,noveltyCount12H_max,noveltyCount12H_mean,noveltyCount12H_std,noveltyCount24H_min,noveltyCount24H_max,noveltyCount24H_mean,noveltyCount24H_std,noveltyCount3D_min,noveltyCount3D_max,noveltyCount3D_mean,noveltyCount3D_std,noveltyCount5D_min,noveltyCount5D_max,noveltyCount5D_mean,noveltyCount5D_std,noveltyCount7D_min,noveltyCount7D_max,noveltyCount7D_mean,noveltyCount7D_std,volumeCounts12H_min,volumeCounts12H_max,volumeCounts12H_mean,volumeCounts12H_std,volumeCounts24H_min,volumeCounts24H_max,volumeCounts24H_mean,volumeCounts24H_std,volumeCounts3D_min,volumeCounts3D_max,volumeCounts3D_mean,volumeCounts3D_std,volumeCounts5D_min,volumeCounts5D_max,volumeCounts5D_mean,volumeCounts5D_std,volumeCounts7D_min,volumeCounts7D_max,volumeCounts7D_mean,volumeCounts7D_std
0,1407200.0,45.3300,45.0000,0.007781,-0.007280,0.000459,-0.004250,0.047608,0.041908,0.024337,0.028301,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,13049891.0,32.3500,31.8300,0.016337,-0.006864,0.014055,-0.011509,0.071902,0.076793,0.077818,0.091703,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,228100.0,24.5600,24.8500,-0.016420,-0.013889,-0.014170,-0.016563,0.045106,0.047639,0.000000,0.000000,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1082054.0,16.3100,16.2300,0.010533,-0.008552,0.009529,-0.007177,0.027078,0.026565,0.064453,0.056102,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,297504.0,41.8500,41.8100,0.000957,0.024504,0.000647,0.025248,0.012337,0.019259,0.044485,0.071996,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,203200.0,27.2700,27.2000,0.005531,-0.012346,0.003455,-0.012780,0.001837,-0.007299,-0.008511,0.005626,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,24207200.0,22.3900,22.5200,0.000894,0.047442,-0.002719,0.046507,0.071292,0.114300,0.063591,0.053211,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,373800.0,33.3000,34.5000,-0.031977,-0.028716,-0.033773,-0.029138,0.063898,0.088328,0.089996,0.031993,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,10243595.0,42.5500,42.4700,0.002356,-0.007478,0.000906,-0.007758,-0.054444,-0.050738,-0.023430,0.005084,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,2461100.0,73.9200,74.1600,-0.000135,0.006242,-0.001241,0.005865,-0.007119,0.003518,-0.001189,0.002537,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# LSTM Model

In [None]:
raise Exception("Just making sure you don't train the LSTM needlessly")

In [None]:
class JoinedGenerator:
    """
    Keras standard approach to generage batches for model.fit_generator() call.
    """
    def __init__(self, prepro, market, news, index_df):
        """
        @param preprocessor: market and news join preprocessor
        @param market: full loaded market df
        @param news: full loaded news df
        @param index_df: df with assetCode and time of train or validation market data. Batches will be taken from them.
        """
        self.market = market
        self.prepro = prepro
        self.news = news
        self.index_df = index_df
        self.asset_codes = self.index_df['assetCode'].unique().tolist()

    def flow_lstm(self, batch_size, is_train, look_back, look_back_step):
        """
        Generates batch data for LSTM NN
        Each cycle in a loop we yield a batch for one training step in epoch. 
        """
        while True:
            # Get market indices of random assets, sorted by assetCode, time.
            batch_index_df = self.get_random_assets_idx(batch_size)
            # Get X, y data for this batch, containing market and news, but without look back yet
            X, y = self.get_batch(batch_index_df, is_train)
            # Add look back data to X, y
            X, y = self.with_look_back(X,y,look_back,look_back_step)
            yield X,y
    
    def get_random_assets_idx(self, batch_size):
        """
        Get random asset and it's last market data indices.
        Repeat for next asset until we reach batch_size.
        """
        # Insert first asset
        asset = np.random.choice(self.asset_codes)
        asset_codes.remove(asset)
        batch_index_df = self.index_df[self.index_df.assetCode == asset].tail(batch_size)
        # Repeat until reach batch_size records
        while (batch_index_df.index.size < batch_size) and (len(asset_codes) > 0):
            asset = np.random.choice(asset_codes)
            asset_codes.remove(asset)
            asset_index_df = self.index_df[self.index_df.assetCode == asset].tail(batch_size - batch_index_df.index.size)
            batch_index_df = pd.concat([batch_index_df, asset_index_df])
        
        return batch_index_df.sort_values(by=['assetCode', 'time'])
            
    def get_batch(self, batch_idx, is_train):
        """
        Get batch of market-news data without lookback
        """
        market_df = self.market.loc[batch_idx.index]
        # Select subset of news for future merge by assetCode and time. 
        news_df = self.news.merge(batch_idx, on=['time'])
        # Remove bad rows, clean the data. It's ok for train.
        if is_train: 
            market_df, news_df = prepro.fix_train(market_df, news_df)
        # Join market and news using preprocessor       
        X = self.prepro.get_X(market_df, news_df)
        y = self.prepro.get_y(market_df)
        return(X, y)
    
    # convert an array of values into a dataset matrix
    def with_look_back(self, X, y, look_back, look_back_step):
        """
        Add look back window values to prepare dataset for LSTM
        """
        X_processed, y_processed = [], []
        # Fix last window in batch, can be not full
        if look_back > len(X): 
            look_back = len(X)
            look_back_step = min(look_back_step, look_back)
            
        for i in range(0,len(X)-look_back+1):
            # Add lookback to X
            x_window = X.values[i:(i+look_back):look_back_step, :]
            X_processed.append(x_window)
            # If input is X only, we'll not output y
            if y is None: continue
            # Add lookback to y
            y_window = y.values[i+look_back-1, :]
            y_processed.append(y_window)
        # Return Xy for train/test or X for prediction
        if(y is not None): return np.array(X_processed), np.array(y_processed)
        else: return np.array(X_processed)

    
# Train data generator instance
join_generator = JoinedGenerator(prepro, mkt, nw, mkt)

# Validation data generator instance
val_generator = JoinedGenerator(prepro, mkt, nw, mkt)

In [None]:
from keras.preprocessing.sequence import TimeseriesGenerator
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPool2D, Dropout, BatchNormalization, LSTM, Embedding
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, ReduceLROnPlateau
from keras.utils import to_categorical

model = Sequential()
# Add an input layer market + news
input_size = len(market_prepro.feature_cols) + len(news_prepro.feature_cols)
# input_shape=(timesteps, input features)
model.add(LSTM(units=128, return_sequences=True, input_shape=(None,input_size)))
model.add(LSTM(units=64, return_sequences=True ))
model.add(LSTM(units=32, return_sequences=False))
# Add an output layer 
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
weights_file='best_weights.h5'
# We'll stop training if no improvement after some epochs
earlystopper = EarlyStopping(patience=5, verbose=1)
# Low, avg and high scor training will be saved here
# Save the best model during the traning
checkpointer = ModelCheckpoint(
    weights_file,
    verbose=1,
    save_best_only=True,
    save_weights_only=True)
reduce_lr = ReduceLROnPlateau(factor=0.1, patience=2, min_lr=0.001)

training = model.fit_generator(
    join_generator.flow_lstm(
        batch_size=BATCH_SIZE,
        is_train=True, 
        look_back=LOOK_BACK, 
        look_back_step=LOOK_BACK_STEPS),
        epochs=EPOCHS, 
        validation_data=val_generator.flow_lstm(
            batch_size=BATCH_SIZE,
            is_train=False, 
            look_back=LOOK_BACK, 
            look_back_step=LOOK_BACK_STEPS),
    steps_per_epoch=STEPS_PER_EPOCHS,
    validation_steps=VALIDATION_STEPS,
    callbacks=[earlystopper, checkpointer, reduce_lr])