## _Text Classification -  Deep Learning_

**Script to predict if the stock will go up or down based on text in the 8-K Document. Models include
MLP, CNN, RNN-CNN. RNN-CNN results in an average accuracy of 87.5%**

### _Import Libraries_

In [12]:
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import pickle
import json
tqdm.pandas()


import tensorflow as tf
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Dropout, GRU,Input, LSTM, Embedding, Bidirectional
from keras.layers import Flatten, Conv1D, MaxPooling1D, GlobalMaxPooling1D, TimeDistributed, BatchNormalization
from keras.layers import concatenate as lconcat
from keras.optimizers import SGD
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K


from sklearn.metrics import roc_auc_score
from keras.utils import np_utils,plot_model, multi_gpu_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler

In [69]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6066842343966597470
]


### _Helper Functions for text embedding_

In [70]:

# Define number of words, and embedding dimensions
max_words = 34603
embed_dim = 100

def load_embeddings(vec_file):
    """
    Function to load Glove Embeddings.
    
    Args:
        vec_file (text file): Text file containing dictonary of words and their pre-trained weights.
    
    Returns:
        dict: Dictionary of words and the corresponding Glove embeddings as values.
    """
    print("Loading Glove Model")
    f = open(vec_file,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done. {} words loaded!".format(len(model)))
    return model

def tokenize_and_pad(docs,max_words=max_words):
    """
    Function to tokenize and pad word sequences using Keras.
    
    Args:
        docs (str): Text to be tokenized and padded.
        max_words (int): Maximum number of words to be kept in each sample.
        
    Returns:
        str: Tokenized and padded text.
    """
    global t
    t = Tokenizer()
    t.fit_on_texts(docs)
    docs = pad_sequences(sequences = t.texts_to_sequences(docs),maxlen = max_words, padding = 'post')
    global vocab_size
    vocab_size = len(t.word_index) + 1
    
    return docs

In [78]:
df = pd.read_csv("./Data/lemmatized_text_all.csv")

In [79]:
df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,GICS Sector,GICS Sub Industry,cik,doc_name,items,release_date,text,ticker,txt_link,price_change,vix,rm_week,rm_month,rm_qtr,rm_year,signal,processed_text,text_len
0,0,0,Information Technology,Information Technology,320193.0,0000320193-19-000026.txt,['Item 5.02'],2019-02-06 08:00:34,0000320193-19-000026.txt : 20190206 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,,,,7.47,-19.41,-2.18,down,"['accession', 'number', 'conformed', 'submissi...",315
1,1,1,Information Technology,Information Technology,320193.0,0000320193-19-000007.txt,"['Item 2.02', 'Item 9.01']",2019-01-29 16:30:18,0000320193-19-000007.txt : 20190129 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,,,1.52,-4.93,-26.05,-8.14,down,"['accession', 'number', 'conformed', 'submissi...",1065
2,2,2,Information Technology,Information Technology,320193.0,0000320193-19-000002.txt,"['Item 2.02', 'Item 9.01']",2019-01-02 16:30:03,0000320193-19-000002.txt : 20190102 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,-8.11,2510.030029,-0.43,-6.48,-23.16,-8.65,down,"['accession', 'number', 'conformed', 'submissi...",1522
3,3,3,Information Technology,Information Technology,320193.0,0000320193-18-000142.txt,"['Item 2.02', 'Item 9.01']",2018-11-01 16:30:21,0000320193-18-000142.txt : 20181101 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,-5.56,2740.370117,0.52,0.24,12.14,31.58,down,"['accession', 'number', 'conformed', 'submissi...",1090
4,4,4,Information Technology,Information Technology,320193.0,0000320193-18-000098.txt,"['Item 2.02', 'Item 9.01']",2018-07-31 16:30:20,0000320193-18-000098.txt : 20180731 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,5.21,2816.290039,-1.77,1.72,14.20,22.16,up,"['accession', 'number', 'conformed', 'submissi...",986
5,5,5,Information Technology,Information Technology,320193.0,0001193125-18-154948.txt,[],2018-05-07 21:31:32,0001193125-18-154948.txt : 20180508 0001193125...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,1.09,2672.629883,11.30,2.78,7.84,11.68,up,"['accession', 'number', 'conformed', 'submissi...",326
6,6,6,Information Technology,Information Technology,320193.0,0001193125-18-154515.txt,[],2018-05-07 16:30:56,0001193125-18-154515.txt : 20180507 0001193125...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,1.09,2672.629883,11.30,2.78,7.84,11.68,up,"['accession', 'number', 'conformed', 'submissi...",2915
7,7,7,Information Technology,Information Technology,320193.0,0000320193-18-000067.txt,"['Item 2.02', 'Item 9.01']",2018-05-01 16:30:17,0000320193-18-000067.txt : 20180501 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,4.77,2654.800049,0.77,-3.47,1.62,12.02,up,"['accession', 'number', 'conformed', 'submissi...",1062
8,8,8,Information Technology,Information Technology,320193.0,0001193125-18-045761.txt,[],2018-02-14 16:54:21,0001193125-18-045761.txt : 20180214 0001193125...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,2.00,2698.629883,4.96,-8.67,-12.19,21.31,up,"['accession', 'number', 'conformed', 'submissi...",914
9,9,9,Information Technology,Information Technology,320193.0,0000320193-18-000005.txt,"['Item 2.02', 'Item 9.01']",2018-02-01 16:30:17,0000320193-18-000005.txt : 20180201 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,0.88,2821.979980,-2.26,-4.93,3.10,36.79,stay,"['accession', 'number', 'conformed', 'submissi...",993


In [80]:
mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('items')),columns=mlb.classes_,),sort=False,how="left")

# df.drop('items', axis = 1, inplace = True)

In [81]:
df.isnull().sum().sum()

16

In [82]:
df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,GICS Sector,GICS Sub Industry,cik,doc_name,release_date,text,ticker,txt_link,...,6,7,8,9,I,[,],e,m,t
0,0,0,Information Technology,Information Technology,320193.0,0000320193-19-000026.txt,2019-02-06 08:00:34,0000320193-19-000026.txt : 20190206 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,...,0,0,0,0,1,1,1,1,1,1
1,1,1,Information Technology,Information Technology,320193.0,0000320193-19-000007.txt,2019-01-29 16:30:18,0000320193-19-000007.txt : 20190129 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,...,0,0,0,1,1,1,1,1,1,1
2,2,2,Information Technology,Information Technology,320193.0,0000320193-19-000002.txt,2019-01-02 16:30:03,0000320193-19-000002.txt : 20190102 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,...,0,0,0,1,1,1,1,1,1,1
3,3,3,Information Technology,Information Technology,320193.0,0000320193-18-000142.txt,2018-11-01 16:30:21,0000320193-18-000142.txt : 20181101 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,...,0,0,0,1,1,1,1,1,1,1
4,4,4,Information Technology,Information Technology,320193.0,0000320193-18-000098.txt,2018-07-31 16:30:20,0000320193-18-000098.txt : 20180731 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,...,0,0,0,1,1,1,1,1,1,1
5,5,5,Information Technology,Information Technology,320193.0,0001193125-18-154948.txt,2018-05-07 21:31:32,0001193125-18-154948.txt : 20180508 0001193125...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,...,0,0,0,0,0,1,1,0,0,0
6,6,6,Information Technology,Information Technology,320193.0,0001193125-18-154515.txt,2018-05-07 16:30:56,0001193125-18-154515.txt : 20180507 0001193125...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,...,0,0,0,0,0,1,1,0,0,0
7,7,7,Information Technology,Information Technology,320193.0,0000320193-18-000067.txt,2018-05-01 16:30:17,0000320193-18-000067.txt : 20180501 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,...,0,0,0,1,1,1,1,1,1,1
8,8,8,Information Technology,Information Technology,320193.0,0001193125-18-045761.txt,2018-02-14 16:54:21,0001193125-18-045761.txt : 20180214 0001193125...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,...,0,0,0,0,0,1,1,0,0,0
9,9,9,Information Technology,Information Technology,320193.0,0000320193-18-000005.txt,2018-02-01 16:30:17,0000320193-18-000005.txt : 20180201 0000320193...,Apple Inc.,https://www.sec.gov/Archives/edgar/data/320193...,...,0,0,0,1,1,1,1,1,1,1


In [95]:
# df['release_date'] = pd.to_datetime(df['release_date'])

# filter_train = (df['release_date'] <= '2014-01-01')
# filter_test = (df['release_date'] > '2014-01-01')

# train_data = df[filter_train]
# test_data = df[filter_test]

# train_data.tail()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,GICS Sector,GICS Sub Industry,cik,doc_name,release_date,text,ticker,txt_link,price_change,vix,rm_week,rm_month,rm_qtr,rm_year,signal,processed_text,text_len
343,345,345,Industrials,Industrials,40545.0,0000040545-06-000013.txt,2006-04-13 07:43:58,-----BEGIN PRIVACY-ENHANCED MESSAGE----- Proc-...,General Electric,https://www.sec.gov/Archives/edgar/data/40545/...,54.68,1288.119995,0.92,0.26,-1.96,-6.14,up,"['message', 'webmaster', 'rsa', 'accession', '...",1240
344,346,346,Industrials,Industrials,40545.0,0000040545-06-000001.txt,2006-01-20 07:50:31,-----BEGIN PRIVACY-ENHANCED MESSAGE----- Proc-...,General Electric,https://www.sec.gov/Archives/edgar/data/40545/...,55.08,1285.040039,-0.87,-2.76,-1.69,-1.32,up,"['message', 'webmaster', 'rsa', 'accession', '...",1030
345,347,347,Industrials,Industrials,40545.0,0001193125-05-239758.txt,2005-12-09 12:29:43,-----BEGIN PRIVACY-ENHANCED MESSAGE----- Proc-...,General Electric,https://www.sec.gov/Archives/edgar/data/40545/...,-36.77,1255.839966,-0.08,2.49,7.24,0.95,down,"['message', 'webmaster', 'rsa', 'accession', '...",25077
346,348,348,Industrials,Industrials,40545.0,0001193125-05-232239.txt,2005-11-25 15:36:47,-----BEGIN PRIVACY-ENHANCED MESSAGE----- Proc-...,General Electric,https://www.sec.gov/Archives/edgar/data/40545/...,-36.53,1265.609985,-0.21,3.13,4.97,-2.83,down,"['message', 'webmaster', 'rsa', 'accession', '...",587
347,349,349,Industrials,Industrials,40545.0,0000040545-05-000054.txt,2005-10-14 07:44:34,-----BEGIN PRIVACY-ENHANCED MESSAGE----- Proc-...,General Electric,https://www.sec.gov/Archives/edgar/data/40545/...,59.67,1176.839966,0.37,1.97,-4.06,-4.47,up,"['message', 'webmaster', 'rsa', 'accession', '...",1105


In [53]:
# Separate into X and Y
cols = ['GICS Sector','vix','rm_week','rm_month', 'rm_qtr', 'rm_year']
cols.extend(list(mlb.classes_))
X = df[cols]
docs = df['processed_text']

y = df['signal']

# Get Dummies

docs = tokenize_and_pad(docs)
X = pd.get_dummies(columns = ['GICS Sector'],prefix="sector",data=X)
y = pd.get_dummies(columns=['signal'],data=y)

aux_shape = len(X.columns)

In [54]:
# # Split into train and test data
X_train, X_test, y_train, y_test, docs_train, docs_test = train_test_split(X, y,docs,
                                                    stratify=y, 
                                                    test_size=0.3,
                                                    random_state = 20)

In [55]:
embeddings_index = load_embeddings("./Data/glove.6B.100d.txt")

words_not_found = []

embedding_matrix = np.zeros((vocab_size, embed_dim))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Loading Glove Model
Done. 400000 words loaded!
number of null word embeddings: 66003


In [56]:
from sklearn.metrics import roc_auc_score

# define roc_callback, inspired by https://github.com/keras-team/keras/issues/6050#issuecomment-329996505
def auc_roc(y_true, y_pred):
    # any tensorflow metric
    value, update_op = tf.metrics.auc(y_true,y_pred)

    # find all variables created for this metric
    metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]]

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for v in metric_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

    # force to update metric values
    with tf.control_dependencies([update_op]):
        value = tf.identity(value)
        return value

In [57]:

def build_model(output_classes,architecture,aux_shape=aux_shape,vocab_size=vocab_size,embed_dim=embed_dim,embedding_matrix=embedding_matrix,max_seq_len=max_words):
    
    with tf.device('/cpu:0'):
        main_input= Input(shape=(max_seq_len,),name='doc_input')
        main = Embedding(input_dim = vocab_size,
                            output_dim = embed_dim,
                            weights=[embedding_matrix], 
                            input_length=max_seq_len, 
                            trainable=False)(main_input)

    if architecture == 'mlp':
        # Densely Connected Neural Network (Multi-Layer Perceptron)
        main = Dense(32, activation='relu')(main)
        main = Dropout(0.2)(main)
        main = Flatten()(main)
    elif architecture == 'cnn':
        # 1-D Convolutional Neural Network
        main = Conv1D(64, 3, strides=1, padding='same', activation='relu')(main)
        #Cuts the size of the output in half, maxing over every 2 inputs
        main = MaxPooling1D(pool_size=3)(main)
        main = Dropout(0.2)(main)
        main = Conv1D(32, 3, strides=1, padding='same', activation='relu')(main)
        main = GlobalMaxPooling1D()(main)
        #model.add(Dense(output_classes, activation='softmax'))
    elif architecture == 'rnn':
        # LSTM network
        main = Bidirectional(GRU(64, return_sequences=False),merge_mode='concat')(main)
        main = BatchNormalization()(main)
    elif architecture =="rnn_cnn":
        main = Conv1D(64, 5, padding='same', activation='relu')(main)
        main = MaxPooling1D()(main)
        main = Dropout(0.2)(main)
        main = Bidirectional(GRU(32,return_sequences=False),merge_mode='concat')(main)
        main = BatchNormalization()(main)
   
    else:
        print('Error: Model type not found.')
        
    auxiliary_input = Input(shape=(aux_shape,), name='aux_input')
    x = lconcat([main, auxiliary_input])
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(32, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    main_output = Dense(output_classes, activation='sigmoid', name='main_output')(x)
    model = Model(inputs=[main_input, auxiliary_input], outputs=[main_output],name=architecture)
        
    #sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
#     model = multi_gpu_model(model)
    model.compile('adam', 'categorical_crossentropy',metrics=['accuracy'])
    
    return model

In [58]:
def plot_metrics(model_dict,metric,x_label,y_label):
    plots = 1
    plt.figure(figsize=[15,10])
    for model, history in model_dict.items():
        plt.subplot(2,2,plots)
        plt.plot(history[metric])
        #plt.plot(history.history['val_acc'])
        plt.title('{0} {1}'.format(model,metric))
        plt.ylabel(y_label)
        plt.xlabel(x_label)
        plots += 1
    #plt.legend(['train', 'test'], loc='upper left')
    plt.tight_layout()
    plt.savefig("Graphs/{}.png".format(metric),format="png")
    plt.show()
    
def gen():
    print('generator initiated')
    idx = 0
    while True:
        yield [docs_train[:32], X_train[:32]], y_train[:32]
        print('generator yielded a batch %d' % idx)
        idx += 1

In [59]:
# Save data
np.save("./Data/docs_train.npy",docs_train)
np.save("./Data/docs_test.npy",docs_test)

X_train.to_pickle("./Data/X_train.pkl")
X_test.to_pickle("./Data//X_test.pkl")

y_train.to_pickle("./Data/y_train.pkl")
y_test.to_pickle("./Data/y_test.pkl") 

np.save("./Data/embedding_matrix.npy",embedding_matrix)

In [60]:
model_dict = dict()

In [61]:
mlp = build_model(3,"mlp")

model_dict["mlp"] = mlp.fit([docs_train, X_train],y_train,batch_size=64,epochs=10,verbose=1) 



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
mlp.predict()

In [28]:
mlp.save("./models/mlp.hdf5")
# with open('./models/mlp.pkl', 'wb') as file_pi:
#     pickle.dump(model_dict["mlp"], file_pi)

TypeError: can't pickle _thread.RLock objects

In [63]:
rnn_cnn = build_model(3,"rnn_cnn")

model_dict["rnn_cnn"] = rnn_cnn.fit([docs_train,X_train],y_train,batch_size=32,epochs=50,verbose=1)

rnn_cnn.save("./models/rnn_cnn.hdf5")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50

KeyboardInterrupt: 

In [32]:
X_test = pd.read_pickle("./Data/X_test.pkl")
y_test = pd.read_pickle("./Data/y_test.pkl")
docs_test = np.load("./Data/docs_test.npy")

mlp_hist = pickle.load(open("Data/trainHistory/mlp.pkl","rb"))
cnn_hist = pickle.load(open("Data/trainHistory/cnn.pkl","rb"))
rnn_hist = pickle.load(open("Data/trainHistory/rnn.pkl","rb"))
rnn_cnn_hist = pickle.load(open("Data/trainHistory/rnn_cnn.pkl","rb"))

In [34]:
mlp.evaluate([docs_test,X_test],y_test,batch_size=64)



[0.779270589351654, 0.875, 0.8825430274009705]

In [35]:
rnn_cnn = load_model("./models/rnn_cnn.hdf5",custom_objects={"auc_roc":auc_roc})
rnn_cnn.evaluate([docs_test,X_test],y_test,batch_size=64)



[2.0147619247436523, 0.875, 0.0]