In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import itertools

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.utils import resample, shuffle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.models import Sequential
from keras.layers import Embedding, Dropout, Bidirectional, LSTM, GlobalMaxPool1D, Dense

import gensim

In [None]:
# Configure CUDA device selection before any model is built.
# Devices are ordered by PCI bus id, and CUDA_VISIBLE_DEVICES is set to an
# empty string, i.e. no GPU index is listed (presumably to force CPU-only
# execution; put a device index here to use a GPU).
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "",
})

In [None]:
## Load dataset
# Each line of the file is read as a single cell of the form
# "<sentence>@<sentiment>".
# header=None keeps the first line of the file as data; with the default
# header inference it would be consumed as the column name and that sentence
# would be silently dropped. engine='python' is stated explicitly because a
# multi-character delimiter is not supported by the C parser.
df = pd.read_table('FinancialPhraseBank-v1.0/Sentences_50Agree.txt',
                   delimiter='\r\n', header=None, engine='python')

# Split on the LAST '@' only, so a sentence that itself contains '@' still
# yields exactly two fields (sentence, sentiment).
values = df[0].str.rsplit('@', n=1, expand=True).values
data = pd.DataFrame({'sentence': values[:, 0], 'sentiment': values[:, 1]})

# Show first rows
data.head()

In [None]:
## Split dataset into training and testing sets
# Inputs are the raw sentences, targets are the sentiment labels.
X, y = data['sentence'], data['sentiment']

# Hold out 20% for testing; stratify on the label and fix the seed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=7
)

y_train.shape

In [None]:
## Sampling the dataset
# Collect, for each sentiment class, the index labels of its training rows.
neu_index, pos_index, neg_index = (
    y_train.index[(y_train == sentiment).values]
    for sentiment in ('neutral', 'positive', 'negative')
)

In [None]:
# Balance the three classes against the size of the positive class.

# Down-sample neutral class: draw len(pos_index) rows without replacement
X_train_neu, y_train_neu = resample(X_train.loc[neu_index], y_train.loc[neu_index],
                                    n_samples=len(pos_index), replace=False,
                                    random_state=7)

# Up-sample negative class: draw len(pos_index) rows with replacement
X_train_neg, y_train_neg = resample(X_train.loc[neg_index], y_train.loc[neg_index],
                                    n_samples=len(pos_index), replace=True,
                                    random_state=7)

# Combine resampled classes (positive, neutral, negative — in that order).
# pd.concat is used because Series.append was removed in pandas 2.0.
X_train_resample = pd.concat([X_train.loc[pos_index], X_train_neu, X_train_neg])

In [None]:
# Remove neutral sentences so that only positive/negative samples remain
neu_ind = y[y == 'neutral'].index

X = X.drop(neu_ind)
y = y.drop(neu_ind)

# Re-split the two-class data. Note that X_test / y_test from the earlier
# three-class split are overwritten here.
X_train2, X_test, y_train2, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=7)

# Separate classes
pos_index = y_train2[y_train2 == 'positive'].index
neg_index = y_train2[y_train2 == 'negative'].index

# Up-sample negative class: draw with replacement until it matches the
# number of positive training samples
X_train_neg, y_train_neg = resample(X_train2.loc[neg_index], y_train2.loc[neg_index],
                                    n_samples=len(pos_index), replace=True, random_state=7)

# Combine resampled classes.
# pd.concat is used because Series.append was removed in pandas 2.0.
X_train2 = pd.concat([X_train2.loc[pos_index], X_train_neg])
y_train2 = pd.concat([y_train2.loc[pos_index], y_train_neg])

# Shuffle samples (same permutation for inputs and targets)
X_train2, y_train2 = shuffle(X_train2, y_train2, random_state=7)

In [None]:
## Tokenize training and testing sets
# NOTE(review): the vocabulary is fitted on X_train_resample (the three-class
# resampled training set), while the texts encoded below come from the
# positive/negative split (X_train2 / X_test) — confirm this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_resample)

# Mapping word -> integer index built by the tokenizer
word_index = tokenizer.word_index
print("Number of words: {}".format(len(word_index)))

# Replace each text by its sequence of word indices
X_train2, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train2, X_test)
)

In [None]:
# Pad the sequences
# Every sequence is brought to the same fixed length of max_len entries.
max_len = 71

X_train2, X_test = (
    pad_sequences(X_train2, maxlen=max_len),
    pad_sequences(X_test, maxlen=max_len),
)

In [None]:
# Encode target values as integers
# The encoder is fitted on the training labels only and then applied to both
# the training and the testing labels.
le = LabelEncoder().fit(y_train2)

y_train2, y_test = le.transform(y_train2), le.transform(y_test)

In [None]:
## Create embedding matrix
# Load Google's pre-trained Word2Vec vectors from the binary word2vec file
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin",
    binary=True
)

In [None]:
def word2vec_matrix(model, word_index, embedding_dim=None):
    """
    Create an embedding matrix from Word2Vec word embeddings.

    Parameters
    ----------
    model : word-vector model
        Must support ``word in model`` and ``model[word]`` (as gensim
        ``KeyedVectors`` does). Unless ``embedding_dim`` is given it must
        also expose ``vector_size``.
    word_index : dict
        Mapping word -> integer row index in the resulting matrix.
    embedding_dim : int, optional
        Number of columns of the matrix. Defaults to ``model.vector_size``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Row ``i``
        holds the vector of the word mapped to ``i``; rows of words missing
        from ``model`` (and any row no word maps to) stay all-zero.
    """
    if embedding_dim is None:
        embedding_dim = model.vector_size

    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        # Ask the model itself for membership: scanning the model's full
        # vocabulary list for every word is linear in the vocabulary size.
        if word in model:
            embedding_matrix[i] = model[word]

    return embedding_matrix

In [None]:
# Embedding size (number of components per word vector)
EMBEDDING_DIM = 300

# Embedding matrix: one row per entry of word_index, plus row 0
w2v_matrix = word2vec_matrix(word2vec, word_index)

# Save embedding matrix.
# The target directory is created first so that a fresh checkout does not
# fail on the write; np.save appends the ".npy" extension to the given name.
if not os.path.isdir('embeddings'):
    os.makedirs('embeddings')
np.save('embeddings/posneg_emb_matrix_w2v', w2v_matrix)

In [None]:
# Create the model
# Frozen pre-trained embedding -> bidirectional LSTM over the whole sequence
# -> max-pooling over time -> small dense layer -> single sigmoid output.
# Dropout (rate 0.25) follows the embedding, the BLSTM and the hidden dense layer.
model = Sequential([
    Embedding(len(word_index)+1, EMBEDDING_DIM,
              weights=[w2v_matrix],   # initialise with the Word2Vec matrix
              trainable=False,        # keep the pre-trained vectors fixed
              input_length=max_len,
              name='posneg_embedding_1'),
    Dropout(0.25, name='posneg_dropout_1'),
    Bidirectional(LSTM(200, return_sequences=True),
                  merge_mode='concat', name='posneg_blstm_1'),
    Dropout(0.25, name='posneg_dropout_2'),
    GlobalMaxPool1D(name='posneg_pool_1'),
    Dense(50, activation='sigmoid', name='posneg_dense_1'),
    Dropout(0.25, name='posneg_dropout_3'),
    Dense(1, activation='sigmoid', name='posneg_dense_2'),
])

model.summary()

In [None]:
## Compile the model
# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# tracked as the only metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc']
)

In [None]:
## Train the model
# Training hyper-parameters; batch_size is reused later for prediction.
batch_size, epochs = 64, 40

# verbose=0 suppresses the per-epoch progress output
model.fit(X_train2, y_train2, batch_size=batch_size, epochs=epochs, verbose=0)

In [None]:
## Make predictions
# The model output is rounded to the nearest integer to obtain hard class
# labels; the test labels are passed through the same rounding.
y_pred = np.round(model.predict(X_test, batch_size=batch_size))
y_test = np.round(y_test)

In [None]:
# Precision, recall, f1 score and support, computed per class.
# The integer codes 0..n_classes-1 are passed explicitly so that the result
# rows always line up with le.classes_, even if a class is absent from the
# test labels or the predictions.
class_ids = np.arange(len(le.classes_))
p, r, f1, s = precision_recall_fscore_support(y_test, np.ravel(y_pred),
                                              labels=class_ids, average=None)

# Collect the scores in a table indexed by the original class names
# (the previous index, `labels`, was never defined and raised a NameError).
results = pd.DataFrame({'1-Precision': p, '2-Recall': r, '3-F1 score': f1, '4-Support': s},
                       index=le.classes_)

# Print precision, recall, f1 score and support
print(results.round(decimals=3))

In [None]:
# Serialize model to JSON
model_name = 'models/posneg_model'

# Make sure the output directory exists so the writes below do not fail on a
# fresh checkout.
model_dir = os.path.dirname(model_name)
if model_dir and not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# Architecture only (no weights) goes into the JSON file
model_json = model.to_json()
with open(model_name + '.json', 'w') as json_file:
    json_file.write(model_json)

# Serialize weights to HDF5
model.save_weights(model_name + '.h5')