In [1]:
import os

import numpy as np
import pandas as pd
import math
import random

import datetime
import time
import io

import tensorflow as tf
from keras import backend as K

from keras.layers import Embedding, LSTM, Dense, TimeDistributed, Dropout, Bidirectional, Input, concatenate, add, multiply
from keras.layers import Conv1D, MaxPooling1D, Flatten, Reshape, GlobalMaxPooling1D, Highway, Permute, Lambda
from keras.layers.advanced_activations import PReLU, LeakyReLU
from keras.models import Model
from keras.optimizers import Adam

from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVR

from gensim.models.wrappers import FastText

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org
  ipython-dev@scipy.org""")
Using TensorFlow backend.


In [2]:
wordnet_lemmatizer = WordNetLemmatizer()

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 

session = tf.Session(config = tf.ConfigProto(inter_op_parallelism_threads = 0,
                                             intra_op_parallelism_threads = 0,
                                             log_device_placement = True))

K.set_session(session)

In [3]:
# Reading dataset from files

def isNaN(num):
    return num != num

def is_ascii(s):
    return (len(s) == len(s.encode()))

def read_pr(file, newline_token):
    xl = pd.ExcelFile(file)
    df = xl.parse("SignificantDevelopment")

    df['Release Date'] = df['Release Date'].apply(lambda x: x.split(" ")[0])
    
    non_ascii_count = 0

    for i in df.index:
        if isNaN(df.loc[i, 'Headline']):
            df.at[i, :] = np.nan
        elif (not is_ascii(df.loc[i, 'Headline'])):
                df.at[i, :] = np.nan
                non_ascii_count += 1

    df.dropna(inplace = True)
    
    for d in set(df['Release Date']):
        df_d = df[df['Release Date'] == d]

        if (len(df_d.index) > 1):
            df_row = {}
            df_row['Topic'] = newline_token.join(set(df_d['Topic'].values))
            df_row['Release Date'] = d
            df_row['Company'] = df_d['Company'].values[0]
            df_row['Headline'] = newline_token.join(df_d['Headline'].values)
            
            df = df.drop(df_d.index)
            df = df.append(df_row, ignore_index=True)
    
    return df, non_ascii_count

def read_sd(file1, file2):
    df1 = pd.read_excel(file1, sheet_name='Sheet1', skiprows=[0])
    df2 = pd.read_excel(file2, sheet_name='Sheet1', skiprows=[0])
    df = pd.concat([df1, df2], ignore_index=True, sort=True).copy().sort_values(by='Date')

    if (len(df.index) != len(df1.index) + len(df2.index)):
        print ("Some rows are missed!") 

    df.fillna(method='ffill', inplace = True)
    df.fillna(df.mean(), inplace = True)

    ci_s = [ci for c in df.columns for ci in c.split() if ci != 'Close']
    stock_name = max(set(ci_s), key = ci_s.count)
    df.columns = [c.replace(stock_name, '').strip() for c in df.columns]
    
    df['r_stock'] = (df['Close'] / df['Close'].shift(1) - 1.0)
    df['r_index'] = (df['.SPX-US Close'] / df['.SPX-US Close'].shift(1) - 1.0)
    df['Release Date'] = df['Date'].apply(lambda x: "/".join([x.split("/")[1], x.split("/")[2], x.split("/")[0][-2:]]))
    
    df.dropna(inplace = True)
    
    cols = ['Release Date', 'r_stock', 'r_index', 'MFI', 'ForPE', 'SIP']
    for c in cols:
        if c not in df.columns:
            df[c] = np.nan
    
    df['MFI'] = df['MFI'].shift(1)
    df['ForPE'] = df['ForPE'].shift(1)
    df['SIP'] = df['SIP'].shift(1)
    
    return df[cols]

newline_token = " nnnewlineee "

dfs = []
for f in os.listdir("press_releases"):
    if f.endswith(".xls"):
        df, non_ascii_count = read_pr("press_releases\\" + f, newline_token)
        
        if (f not in os.listdir('stock_data\\10-14\\')) or (f not in os.listdir('stock_data\\15-19\\')):
            print ("Stock", f ,"was not found!")
            pass
        sd = read_sd('stock_data\\10-14\\' + f, 'stock_data\\15-19\\' + f)
        
        dfs.append(df.merge(sd, on = 'Release Date', how='left'))       
        print (f, "was processed. Non ascii lines removed:", non_ascii_count, "Total lines left:", len(df.index))

data = pd.concat(dfs, ignore_index = True)
data = data[~data['r_stock'].isnull()]
data.fillna(data.mean(), inplace = True)

del df, dfs

005930-SE.xls was processed. Non ascii lines removed: 5 Total lines left: 182
7203-TO.xls was processed. Non ascii lines removed: 10 Total lines left: 304
AAPL-US.xls was processed. Non ascii lines removed: 10 Total lines left: 249
ABBV-US.xls was processed. Non ascii lines removed: 6 Total lines left: 161
ABT-US.xls was processed. Non ascii lines removed: 4 Total lines left: 228
ACN-US.xls was processed. Non ascii lines removed: 5 Total lines left: 165
AMGN-US.xls was processed. Non ascii lines removed: 3 Total lines left: 243
AMZN-US.xls was processed. Non ascii lines removed: 11 Total lines left: 252
BA-US.xls was processed. Non ascii lines removed: 11 Total lines left: 374
BABA-US.xls was processed. Non ascii lines removed: 6 Total lines left: 145
BAYN-XE.xls was processed. Non ascii lines removed: 22 Total lines left: 260
BBL-US.xls was processed. Non ascii lines removed: 4 Total lines left: 195
BP.-LN.xls was processed. Non ascii lines removed: 17 Total lines left: 303
BRK'A-US.x

In [4]:
# Company to index

Company_num = len(set(data['Company']))

Company2idx = {}
Company_embeddings = np.identity(Company_num, float)

for company in set(data['Company']):
    if company not in Company2idx:
        Company2idx[company] = len(Company2idx)

print('List of companies:')
Company2idx.keys()

List of companies:


dict_keys(['BP plc (ADR)', 'Merck & Co., Inc.', 'Siemens AG (ADR)', 'Bayer AG (ADR)', 'Chevron Corporation', 'Toyota Motor Corp (ADR)', 'Nestle SA', 'Banco Santander SA (ADR)', 'Samsung Electronics Co Ltd', 'Royal Dutch Shell Plc', 'AbbVie Inc', 'The Chase Manhattan Corp', 'Oracle Corporation', 'Pfizer, Inc. (OLD)', 'Honeywell International Inc.', '3M Co', 'HSBC Holdings plc (ADR)', 'Boeing Co', 'Alphabet Inc', 'Abbott Laboratories', 'Accenture Plc', 'Roche Holding Ltd. (ADR)', 'Wells Fargo & Co', 'International Business Machines Corp.', 'Microsoft Corporation', 'Johnson & Johnson (OLD)', 'Amazon.com, Inc.', 'Berkshire Hathaway Inc.', 'Exxon Corporation', 'Novartis AG (ADR)', 'Walmart Inc', 'Travelers Group Inc', 'Novo Nordisk A/S (ADR)', 'BHP Group PLC', 'Volkswagen AG (ADR)', 'Apple Inc.', 'Total SA (ADR)', 'Cisco Systems, Inc.', 'General Electric Company', 'Facebook Inc', 'AT&T Inc.', 'Alibaba Group Holding Ltd', 'Amgen, Inc.', 'Intel Corporation'])

In [5]:
# Char to index

char2idx = {}
char2idx["PADDING_TOKEN"] = 0
char2idx["NEWLINE_TOKEN"] = 1

# Token length for char CNN implementation
cnn_len = 16

for sentence in data['Headline'].values:
    for char in list(sentence):
        if char not in char2idx:
            char2idx[char] = len(char2idx)

print('Char vocabulary:')
char2idx.keys()

Char vocabulary:


dict_keys(['PADDING_TOKEN', 'NEWLINE_TOKEN', 'S', 'a', 'm', 's', 'u', 'n', 'g', ' ', 'E', 'l', 'e', 'c', 'y', 'F', 'i', 't', 'I', 'v', 'M', 'o', 'r', 'C', 'h', 'p', 'W', 'D', 'T', 'Y', '-', 'Q', '4', 'O', 'P', 'f', 'U', '1', '3', '6', '2', '5', 'B', 'K', 'H', 'x', 'L', 'w', '0', '9', ',', 'd', 'R', '$', '.', '8', 'A', 'k', 'N', 'b', 'G', 'X', 'J', "'", 'V', '&', 'q', '7', '/', 'z', 'j', '<', '>', ':', ';', '+', '%', '"', 'Z', '(', ')', '`', '!', '[', ']', '*', '#', '_'])

In [6]:
# Loading FASTTEXT english model bin

word_embeddings_path = 'embeddings\cc.en.300.bin'
lang_model = FastText.load_fasttext_format(word_embeddings_path)

embedding_size = len(lang_model['size'])
print ('Embedding size:', embedding_size)

Embedding size: 300


In [7]:
# Lemmatization of text tokens

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return {'pos': wordnet.ADJ}
    elif treebank_tag.startswith('V'):
        return {'pos': wordnet.VERB}
    elif treebank_tag.startswith('N'):
        return {'pos': wordnet.NOUN}
    elif treebank_tag.startswith('R'):
        return {'pos': wordnet.ADV}
    else:
        return {}

headlines_processed = []

for ind, row in data.iterrows():
    sentence_clean = row['Headline'].replace('"', '').replace("'", '')
    sentence_list = pos_tag(word_tokenize(sentence_clean))
    sentence_parsed = [wordnet_lemmatizer.lemmatize(token.lower(), **get_wordnet_pos(pos)) for token, pos in sentence_list]
    headlines_processed.append(sentence_parsed)

data['Headline_proc'] = headlines_processed
data.iloc[:5]

Unnamed: 0,Release Date,Company,Headline,Topic,r_stock,r_index,MFI,ForPE,SIP,Headline_proc
0,01/30/19,Samsung Electronics Co Ltd,Samsung Elec Says Facility Investment In Memor...,Other Earnings Pre-Announcement,0.01978,0.015549,72.672,8.507,1.239389,"[samsung, elec, say, facility, investment, in,..."
1,01/28/19,Samsung Electronics Co Ltd,Samsung Electro-Mechanics Q4 Operating Profit ...,Earnings Announcements,0.006704,-0.007847,66.051,8.225,1.239389,"[samsung, electro-mechanics, q4, operating, pr..."
2,01/23/19,Samsung Electronics Co Ltd,"SK Hynix Expects Lower 2019 Capex, Demand Reco...",Other Earnings Pre-Announcement,-0.003559,0.002203,60.384,7.588,1.239389,"[sk, hynix, expects, lower, 2019, capex, ,, de..."
3,11/30/18,Samsung Electronics Co Ltd,Samsung Faces $5.8 Bln Loss In Sales After Tec...,"Regulatory / Company Investigation, Other Earn...",-0.030127,0.008185,59.982,6.33,1.239389,"[samsung, face, $, 5.8, bln, loss, in, sale, a..."
4,11/15/18,Samsung Electronics Co Ltd,Credit Suisse Says Surprised About Mobile Paym...,Regulatory / Company Investigation,0.003401,0.010594,53.966,6.395,1.239389,"[credit, suisse, say, surprise, about, mobile,..."


In [8]:
# Word to embedding

word2idx = {}
word_embeddings = []

# Initialization of embeddings for pads and OOV
word2idx["PADDING_TOKEN"] = len(word2idx)
word_embeddings.append(np.zeros(embedding_size))

word2idx["UNKNOWN_TOKEN"] = len(word2idx)
word_embeddings.append(np.random.uniform(-0.25, 0.25, embedding_size))

# Получаем вектора токенов из словаря
for sentence_list in data['Headline_proc'].values:
    for token in sentence_list:
        if token not in word2idx:
            try:
                word_embeddings.append(lang_model[token])
                word2idx[token] = len(word2idx)
            except:
                pass

word_embeddings = np.array(word_embeddings, dtype='float32')

print ('Word embeddings:')
word_embeddings[:4]

Word embeddings:


array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.2075986 ,  0.06262114,  0.05612901, ..., -0.22106387,
        -0.21707186,  0.11902369],
       [-0.02436286, -0.06004606,  0.00459439, ...,  0.08166368,
        -0.00982456, -0.04823476],
       [-0.1724217 ,  0.04652646,  0.02433003, ...,  0.20093898,
         0.01823837, -0.18172157]], dtype=float32)

In [9]:
# Train-dev-test split

train_dataset = data.sample(frac=0.95, random_state=1234)
test_dataset = data.drop(train_dataset.index)

print ('Train size:', len(train_dataset.index))
print ('Test size:', len(test_dataset.index))

Train size: 9235
Test size: 486


In [10]:
# Converting unstructured data to matrices

def create_matrices(df, df_type):   
    total_tokens = 0
    unknown_tokens = 0
    dataset = []
    
    for ind, row in df.iterrows():
        
        # Get company code
        company_index = Company2idx[row['Company']]
        
        # Get word embedding indices
        word_indices = [word2idx['PADDING_TOKEN']]
        
        for token in row['Headline_proc']:
            if (token != newline_token):
                total_tokens += 1
                
                if token in word2idx:
                    word_idx = word2idx[token]
                else:
                    word_idx = word2idx["UNKNOWN_TOKEN"]
                    unknown_tokens += 1
            else:
                word_idx = word2idx['PADDING_TOKEN']
            
            word_indices.append(word_idx)
        
        word_indices.append(word2idx['PADDING_TOKEN'])
        
        # Get char indices
        char_codes = [[char2idx["PADDING_TOKEN"]] * cnn_len]
        
        for token in word_tokenize(row['Headline'].replace('"', '').replace("'", '')):
            if (token != newline_token):
                token_trunc = token[:cnn_len]
                token_chars = [char2idx[char] for char in list(token_trunc)]
                token_chars = token_chars + [char2idx["PADDING_TOKEN"]] * (cnn_len - len(token_trunc))
            else:
                token_chars = [char2idx["PADDING_TOKEN"]] * cnn_len
                
            char_codes.append(token_chars)
            
        char_codes.append([char2idx["PADDING_TOKEN"]] * cnn_len)        
        
        # Get true label
        label = row['r_stock']
        
        # Get numerical features
        r_index = row['r_index']
        MFI = row['MFI']
        ForPE = row['ForPE']
        SIP = row['SIP']        

        # Save sample
        dataset.append([company_index, np.array(word_indices), np.array(char_codes), r_index, MFI, ForPE, SIP, label])
        
    unknown_percent = 0.0
    if total_tokens != 0:
        unknown_percent = 100 * float(unknown_tokens) / total_tokens
    print(df_type + " data: {} tokens, {} unknown, {:.3}%".format(total_tokens, unknown_tokens, unknown_percent))
    
    return np.array(dataset)

train_data = create_matrices(train_dataset, 'Train')
test_data = create_matrices(test_dataset, 'Test')

Train data: 133517 tokens, 868 unknown, 0.65%
Test data: 7040 tokens, 40 unknown, 0.568%


## Full model

In [11]:
dim_HIDDEN = 16
CNN_FILTERS = 64
dim_company = 4
dim_CHAR = 32
CNN_WIN = 5

# Input layers and embeddings
company_input = Input(dtype='int32', shape=(1,), name='company_input')
company_embedding_layer = Embedding(input_dim=len(Company2idx), output_dim=dim_company,
                                    trainable=True, name='company_embeddings')
company = company_embedding_layer(company_input)
company = Lambda(lambda x: K.squeeze(x, 1), name='company')(company)

r_index = Input(dtype='float32', shape=(1,), name='r_index')
MFI = Input(dtype='float32', shape=(1,), name='MFI')
ForPE = Input(dtype='float32', shape=(1,), name='ForPE')
SIP = Input(dtype='float32', shape=(1,), name='SIP')

token_input = Input(dtype='int32', shape=(None,), name='token_input')
token_embedding_layer = Embedding(input_dim=word_embeddings.shape[0], 
                                   output_dim=word_embeddings.shape[1],
                                   weights=[word_embeddings], trainable=False, 
                                   name='token_embeddings')
tokens = token_embedding_layer(token_input)

char_input = Input(dtype='int32', shape=(None, cnn_len), name='char_input')
char_embedding_layer = Embedding(input_dim=len(char2idx), output_dim=dim_CHAR, name='char_embedding_layer')
char_embeddings = char_embedding_layer(char_input)

# Implementation of char CNN
char_cnn = TimeDistributed(Conv1D(filters=CNN_FILTERS, kernel_size=CNN_WIN), name='char_cnn')(char_embeddings)
char_activation = TimeDistributed(PReLU(), name='char_activation')(char_cnn)
char_pooling = TimeDistributed(GlobalMaxPooling1D(), name='char_pooling')(char_activation)
char_highway = TimeDistributed(Highway(), name='char_highway')(char_pooling)
chars = TimeDistributed(Dropout(0.30), name = "chars")(char_highway)

merged_embeddings = concatenate([tokens, chars], name='merged_embeddings')

# Implementation of BLSTM
blstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(
    LSTM(dim_HIDDEN, return_sequences=True, return_state=True, implementation=2), name='blstm')(merged_embeddings)

# Implementation of attention
state_h_concat = concatenate([forward_h, backward_h], name = 'state_h_concat')
state_h = Lambda(lambda x: tf.expand_dims(x, axis = 1), name = 'state_h')(state_h_concat)

attention_W1 = TimeDistributed(Dense(dim_HIDDEN), name = 'attention_W1')(blstm)
attention_W2 = TimeDistributed(Dense(dim_HIDDEN), name = 'attention_W2')(state_h)
attention_W = add([attention_W1, attention_W2], name = 'attention_W')

attention_scores = Lambda(lambda x: tf.nn.tanh(x), name = 'attention_scores')(attention_W)
attention_V = TimeDistributed(Dense(1), name = 'attention_V')(attention_scores)
attention_weights = Lambda(lambda x: tf.nn.softmax(x, axis = 1), name = 'attention_weights')(attention_V)

# Weighting context embeddings by attention
context_vector = multiply([attention_weights, blstm], name = "context_vector")
context_agg = Lambda(lambda x: tf.reduce_sum(x, axis=1), name = "context_agg")(context_vector)
context = Dense(dim_HIDDEN, name='context')(context_agg)

# Combining vector of features
features = concatenate([company, context, r_index, MFI, ForPE, SIP], name = 'features')
features_dropout = Dropout(0.30, name = "features_dropout")(features)

# Output regression
dense = Dense(dim_HIDDEN, name='dense')(features_dropout)
activation = PReLU(name='activation')(dense)
result = Dense(1, name='result')(activation)

# Compiling model
model = Model(inputs=[company_input, token_input, char_input, r_index, MFI, ForPE, SIP], outputs=result)
model.compile(loss='mse', optimizer=Adam())
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, None, 16)     0                                            
__________________________________________________________________________________________________
char_embedding_layer (Embedding (None, None, 16, 32) 2816        char_input[0][0]                 
__________________________________________________________________________________________________
char_cnn (TimeDistributed)      (None, None, 12, 64) 10304       char_embedding_layer[0][0]       
__________________________________________________________________________________________________
char_activation (TimeDistribute (None, None, 12, 64) 768         char_cnn[0][0]                   
__________________________________________________________________________________________________
char_pooli

In [12]:
number_of_epochs = 5
lr_decay = 0.5
K.set_value(model.optimizer.lr, 0.01)
random.seed(1234)
print("%d epochs" % number_of_epochs)
print()

def iterate_minibatches(dataset):   
    for sentence in dataset:
        companies, tokens, chars, r_index, MFI, ForPE, SIP, label = sentence     
        yield (np.asarray([companies]), np.asarray([tokens]), np.asarray([chars]), np.asarray([r_index]), 
               np.asarray([MFI]), np.asarray([ForPE]), np.asarray([SIP]), np.asarray([label]))

def tag_dataset(dataset):
    predicted_returns = []
    true_returns = []
    for company, tokens, chars, r_index, MFI, ForPE, SIP, label in dataset:
        pred = model.predict_on_batch([np.asarray([company]), np.asarray([tokens]), np.asarray([chars]), np.asarray([r_index]),
                                       np.asarray([MFI]), np.asarray([ForPE]), np.asarray([SIP])])[0]
        predicted_returns.append(pred)
        true_returns.append(label)
    return predicted_returns, true_returns

def compute_rmse(y_pred, y_true):
    return np.sqrt(mean_squared_error(y_true = y_true, y_pred = y_pred))

print("%d train sentences" % len(train_data))
print("%d test sentences" % len(test_data))

for epoch in range(number_of_epochs):    
    print()
    print("--------- Epoch %d -----------" % epoch)
    random.shuffle(train_data)
    
    start_time = time.time()    
    for batch in iterate_minibatches(train_data):
        companies, tokens, chars, r_index, MFI, ForPE, SIP, label = batch       
        model.train_on_batch([companies, tokens, chars, r_index, MFI, ForPE, SIP], label)   
    print("%.2f sec for training" % (time.time() - start_time))
    print()
    
    #Train Dataset       
    start_time = time.time()  
    print("================================== Train Data ==================================")    
    predicted, correct = tag_dataset(train_data)  
    RMSE = compute_rmse(predicted, correct)
    print("RMSE = ", RMSE)

    #Test Dataset 
    print("================================== Test Data: ==================================")
    predicted, correct = tag_dataset(test_data)  
    RMSE = compute_rmse(predicted, correct)
    print("RMSE = ", RMSE)
    print()
    print("%.2f sec for evaluation" % (time.time() - start_time))
    
    current_lr = K.get_value(model.optimizer.lr)
    K.set_value(model.optimizer.lr, current_lr * (1.0 - lr_decay))

5 epochs

9235 train sentences
486 test sentences

--------- Epoch 0 -----------
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
72.68 sec for training

RMSE =  0.27435272143270856
RMSE =  0.2669889999616818

15.13 sec for evaluation

--------- Epoch 1 -----------
65.35 sec for training

RMSE =  0.09505458342477073
RMSE =  0.11046102768238672

18.66 sec for evaluation

--------- Epoch 2 -----------
67.69 sec for training

RMSE =  0.019311870189981508
RMSE =  0.02444978597783986

17.43 sec for evaluation

--------- Epoch 3 -----------
67.98 sec for training

RMSE =  0.014523009180562358
RMSE =  0.020994951059156303

17.01 sec for evaluation

--------- Epoch 4 -----------
66.66 sec for training

RMSE =  0.013693130099892223
RMSE =  0.021128044641349507

15.70 sec for evaluation


## Baseline 2

In [111]:
dim_HIDDEN = 16
dim_company = 4

# Input layers and embeddings
company_input = Input(dtype='int32', shape=(1,), name='company_input')
company_embedding_layer = Embedding(input_dim=len(Company2idx), output_dim=dim_company,
                                    trainable=True, name='company_embeddings')
company = company_embedding_layer(company_input)
company = Lambda(lambda x: K.squeeze(x, 1), name='company')(company)

noise_vector = Input(dtype='float32', shape=(dim_HIDDEN,), name='noise_vector')

r_index = Input(dtype='float32', shape=(1,), name='r_index')
MFI = Input(dtype='float32', shape=(1,), name='MFI')
ForPE = Input(dtype='float32', shape=(1,), name='ForPE')
SIP = Input(dtype='float32', shape=(1,), name='SIP')

# Combining vector of features
features = concatenate([company, noise_vector, r_index, MFI, ForPE, SIP], name = 'features')
features_dropout = Dropout(0.30, name = "features_dropout")(features)

# Output regression
dense = Dense(dim_HIDDEN, name='dense')(features_dropout)
activation = PReLU(name='activation')(dense)
result = Dense(1, name='result')(activation)

# Compiling model
model = Model(inputs=[company_input, noise_vector, r_index, MFI, ForPE, SIP], outputs=result)
model.compile(loss='mse', optimizer=Adam())
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
company_input (InputLayer)      (None, 1)            0                                            
__________________________________________________________________________________________________
company_embeddings (Embedding)  (None, 1, 4)         176         company_input[0][0]              
__________________________________________________________________________________________________
company (Lambda)                (None, 4)            0           company_embeddings[0][0]         
__________________________________________________________________________________________________
noise_vector (InputLayer)       (None, 16)           0                                            
__________________________________________________________________________________________________
r_index (I

In [112]:
number_of_epochs = 5
lr_decay = 0.5
K.set_value(model.optimizer.lr, 0.01)
random.seed(1234)
print("%d epochs" % number_of_epochs)
print()

random_train = np.random.normal(size=(len(train_dataset.index), dim_HIDDEN))
random_test = np.random.normal(size=(len(test_dataset.index), dim_HIDDEN))

def iterate_minibatches(dataset, random_vecs):
    i = -1
    for sentence in dataset:
        i += 1
        companies, _, _, r_index, MFI, ForPE, SIP, label = sentence     
        yield (np.asarray([companies]), np.asarray([r_index]), 
               np.asarray([MFI]), np.asarray([ForPE]), np.asarray([SIP]), np.asarray([label]), np.asarray([random_vecs[i]]))

def tag_dataset(dataset, random_vecs):
    predicted_returns = []
    true_returns = []
    i = -1
    for company, _, _, r_index, MFI, ForPE, SIP, label in dataset:
        i += 1
        pred = model.predict_on_batch([np.asarray([company]), np.asarray([random_vecs[i]]), np.asarray([r_index]),
                                       np.asarray([MFI]), np.asarray([ForPE]), np.asarray([SIP])])[0]
        predicted_returns.append(pred)
        true_returns.append(label)
    return predicted_returns, true_returns

def compute_rmse(y_pred, y_true):
    return np.sqrt(mean_squared_error(y_true = y_true, y_pred = y_pred))

print("%d train observations" % len(train_data))
print("%d test observations" % len(test_data))

for epoch in range(number_of_epochs):    
    print()
    print("--------- Epoch %d -----------" % epoch)
    random.shuffle(train_data)
    
    start_time = time.time()    
    for batch in iterate_minibatches(train_data, random_train):
        companies, r_index, MFI, ForPE, SIP, label, random_vec = batch       
        model.train_on_batch([companies, random_vec, r_index, MFI, ForPE, SIP], label)   
    print("%.2f sec for training" % (time.time() - start_time))
    print()
    
    #Train Dataset       
    start_time = time.time()  
    print("================================== Train Data ==================================")    
    predicted, correct = tag_dataset(train_data, random_train)  
    RMSE = compute_rmse(predicted, correct)
    print("RMSE = ", RMSE)

    #Test Dataset 
    print("================================== Test Data: ==================================")
    predicted, correct = tag_dataset(test_data, random_test)  
    RMSE = compute_rmse(predicted, correct)
    print("RMSE = ", RMSE)
    print()
    
    print("%.2f sec for evaluation" % (time.time() - start_time))
    
    current_lr = K.get_value(model.optimizer.lr)
    K.set_value(model.optimizer.lr, current_lr * (1.0 - lr_decay))

5 epochs

9235 train observations
486 test observations

--------- Epoch 0 -----------
12.09 sec for training

RMSE =  0.0020940749952854418
RMSE =  0.024600661473915614

7.61 sec for evaluation

--------- Epoch 1 -----------
8.58 sec for training

RMSE =  0.00047133445018445136
RMSE =  0.025216087776783278

6.58 sec for evaluation

--------- Epoch 2 -----------
8.71 sec for training

RMSE =  0.0002665597918634258
RMSE =  0.024591328508430592

6.56 sec for evaluation

--------- Epoch 3 -----------
8.83 sec for training

RMSE =  6.831015763747323e-05
RMSE =  0.024664564952236127

6.64 sec for evaluation

--------- Epoch 4 -----------
8.67 sec for training

RMSE =  0.002510423378195602
RMSE =  0.02362537984107042

6.40 sec for evaluation


## Baseline 3

In [113]:
def identity_tokenizer(text):
    return text

vectorizer = TfidfVectorizer(lowercase=False, input='content', stop_words='english',
                             ngram_range=(1,1), tokenizer=identity_tokenizer)
trSVD = TruncatedSVD(n_components=500, n_iter=100)

tv = vectorizer.fit_transform(list(data['Headline_proc'].values))
tr_tv = trSVD.fit_transform(tv)

data['co_dummy'] = pd.get_dummies(data['Company']).astype(float).values.tolist()
data['tv'] = tr_tv.tolist()

In [114]:
# Train-dev-test split

train_dataset_v = data.sample(frac=0.95, random_state=1234)
test_dataset_v = data.drop(train_dataset_v.index)

print ('Train size:', len(train_dataset_v.index))
print ('Test size:', len(test_dataset_v.index))

Train size: 9235
Test size: 486


In [115]:
svr_model = SVR()

X = ((train_dataset_v[['r_index', 'MFI', 'ForPE', 'SIP']]
     ).join(train_dataset_v['tv'].apply(pd.Series).add_prefix('tv_'))
    ).join(train_dataset_v['co_dummy'].apply(pd.Series).add_prefix('co_'))
y = train_dataset_v['r_stock']

svr_model.fit(X, y)

y_train = svr_model.predict(X)

print("================================== Train Data: ==================================")
RMSE = compute_rmse(y_train, y)
print("RMSE = ", RMSE)
print()

RMSE =  0.027232685386578026



In [116]:
X_test = ((test_dataset_v[['r_index', 'MFI', 'ForPE', 'SIP']]
     ).join(test_dataset_v['tv'].apply(pd.Series).add_prefix('tv_'))
    ).join(test_dataset_v['co_dummy'].apply(pd.Series).add_prefix('co_'))
y_true = test_dataset_v['r_stock']

y_test = svr_model.predict(X_test)

print("================================== Test Data: ==================================")
RMSE = compute_rmse(y_test, y_true)
print("RMSE = ", RMSE)
print()

RMSE =  0.029775049558993457

