In [1]:
# imports
import pandas as pd
import numpy as np
import random
import string

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from tabulate import tabulate
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.random import set_seed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import callbacks
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, \
    Dropout, BatchNormalization
from imblearn.over_sampling import SMOTE

In [2]:
# set seed

seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
set_seed(seed_value)

In [3]:
# import Dataset 1

with open('financial_phrasebank/Sentences_75Agree.txt', 'r') as file:
    lines = file.readlines()

# Initialize empty lists for text and sentiment
texts = []
sentiments = []

# Iterate over each line in the file
for line in lines:
    # Split the line at the '@' symbol
    parts = line.strip().split('@')
    if len(parts) == 2:
        # Extract the text and sentiment
        text = parts[0].strip()
        sentiment = parts[1].strip()

        # Append the text and sentiment to the respective lists
        texts.append(text)
        sentiments.append(sentiment)

# Create a dataframe from the extracted data
ds_one = pd.DataFrame({'text': texts, 'sentiment': sentiments})

In [4]:
# EDA

# sentiment distribution
print(ds_one['sentiment'].value_counts())

neutral     2146
positive     887
negative     420
Name: sentiment, dtype: int64


In [5]:
# Preprocess the text data

# use stopwords and wordnet collections in the nltk package
nltk.download('stopwords')
nltk.download('wordnet')
# use pre-trained model for initial tokenization
# i.e. the sentence will split at what punctuations
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):

    # Tokenize the lowercased text into individual words
    words = nltk.word_tokenize(text.lower())

    # Remove stopwords: common words like "a," "an," "the," "and," etc.
    words = [word for word in words if word not in stop_words]

    # Remove punctuations 
    words = [word for word in words if word not in string.punctuation]

    # Lemmatize words: reduces words to their base or dictionary form
    # i.e. "running" to "run"
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join words back into a sentence
    preprocessed_text = ' '.join(words)

    return preprocessed_text

ds_one['text'] = ds_one['text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Star\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Star\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Star\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
X = ds_one['text'].values
y = ds_one['sentiment'].values

# encode the sentiment labels to numeric format
le = LabelEncoder()
y = le.fit_transform(y)

# uncomment the following code to get the encoding mapping
# negative: 0
# neutral: 1
# positive: 2

# class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
# for class_label, encoded_value in class_mapping.items():
#     print(f"{class_label}: {encoded_value}")

# Split the dataset into training and testing sets
# note that we preserve the sentiment distribution
# to handle the sentiment imbalance

# val -> validation
X_train_val, X_test, y_train_val, y_test = \
train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Further split the training set into training and validation sets
# preserve the sentiment distribution
X_train, X_val, y_train, y_val = \
    train_test_split(X_train_val, y_train_val, test_size=0.2,
                      random_state=42, stratify=y_train_val)

In [7]:
# Second tokenization and pad the sequences

# we switch to the numerical tokenizer from tensorflow package
tokenizer = Tokenizer()

# builds the vocabulary and assigns a unique index to each word in the text
tokenizer.fit_on_texts(X_train)

# convert the text data into sequences of numerical tokens
# based on the vocabulary learned by the tokenizer
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)



# the total number of unique tokens in the vocabulary
# later used in the embedding layer of the model
vocab_size = len(tokenizer.word_index) + 1



# LSTM models require input sequences to have the same length
# To address this, we pad the sequences to a maximum length as follows:
# Sequences longer than max_len are truncated, 
# and shorter sequences are padded with zeros at the beginning

# *5 because the announcement paragraphs are usually 4-5 sentences
# per paragraph
max_len = max([len(x)*5 for x in X_train])

X_train = pad_sequences(X_train, maxlen=max_len)
X_val = pad_sequences(X_val, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

In [8]:
# save y_train before one-hot coding for optimization trial 4
y_train_opt = y_train

In [9]:
num_classes = 3  # Number of sentiment categories (positive, negative, neutral)

# Convert class labels to one-hot encoding:

# converts the categorical labels into a binary matrix representation
# where each class label is represented by a row vector with a value of 1
# in the corresponding class index and 0 elsewhere.
y_train = to_categorical(y_train, num_classes)
y_val = to_categorical(y_val, num_classes)
y_test = to_categorical(y_test, num_classes)


# build the LSTM model

# model structure: 

# sequential model
model = Sequential()
# embedding layer
model.add(Embedding(input_dim=vocab_size, output_dim=32, input_length=max_len))
# LSTM layer
model.add(LSTM(units=32))
# dense layer for classification

# choice of activation function: softmax
# common choice for multi-class classification
# it outputs a probability distribution across the sentiments (pos/neg/neutral), 
# ensuring that the predicted probabilities sum up to 1.
model.add(Dense(units=3, activation='softmax'))

# Specify the loss function, optimizer, and evaluation metrics

# choice of loss: categorical_crossentropy
# common choice for multi-class classification
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', metrics=['accuracy'])


# Define early stopping callback
early_stopping = callbacks.EarlyStopping(patience=5, restore_best_weights=True)

# train the model
# epochs and batch size can be tuned later on
model.fit(X_train, y_train, validation_data=(X_val, y_val), 
          epochs=30, batch_size=128, callbacks=[early_stopping])



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30


<keras.callbacks.History at 0x1fe62991cd0>

In [10]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Test Loss: 0.6097
Test Accuracy: 0.7583


In [11]:
# import Dataset 2A (small sample manually reviewed and collected)
paragraph_reviewed = pd.read_csv('sentiment_by_paragraphs.csv')
article_reviewed = pd.read_csv('overall_sentiment_by_article.csv')

In [12]:
# EDA & data cleaning for Dataset 2A

# number of paragraphs
print("number of paragraphs in total:", paragraph_reviewed.shape[0])

# sentiment distribution
print(paragraph_reviewed['sentiment'].value_counts())

# print(paragraph_reviewed.head())
# found invalid columns

# drop invalid columns
paragraph_reviewed = paragraph_reviewed.drop(
    columns=['Unnamed: 3', 'Unnamed: 4'])

# cleaned datasets
print(paragraph_reviewed.head())


number of paragraphs in total: 127
neutral     52
positive    48
negative    27
Name: sentiment, dtype: int64
   aID                                       text_content sentiment
0    1  The Bank of Canada and the Bank of Korea today...  positive
1    2  MONTRÉAL, QUEBEC—The fundamental forces that h...   neutral
2    2  In a speech to CFA Montréal and the Montreal C...  positive
3    2  Inflation has underperformed forecasts mostly ...   neutral
4    2  The fundamental drivers of inflation, along wi...   neutral


In [13]:
# Preprocess the paragraphs

# same procedure as preprocessing Dataset 1

paragraph_reviewed['text_content'] = paragraph_reviewed[
    'text_content'].apply(preprocess_text)

paragraph_reviewed_X = paragraph_reviewed['text_content'].values
paragraph_reviewed_y = paragraph_reviewed['sentiment'].values

# tokenize and pad the sequences
paragraph_reviewed_X = tokenizer.texts_to_sequences(paragraph_reviewed_X)
paragraph_reviewed_X = pad_sequences(paragraph_reviewed_X, maxlen=max_len)

# convert to numeric category & one-hot code y
paragraph_reviewed_y = le.fit_transform(paragraph_reviewed_y)
paragraph_reviewed_y = to_categorical(paragraph_reviewed_y, num_classes)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(paragraph_reviewed_X, paragraph_reviewed_y)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Test Loss: 1.6110
Test Accuracy: 0.4173


In [14]:
# only 40% accuracy, could be attributed to the different distribution of 
# sentiments in Dataset 2A than in Dataset 1

# let's look at the confusion matrix



predictions = model.predict(paragraph_reviewed_X)

# Convert the probabilities to class labels
predicted_labels = np.argmax(predictions, axis=1)

# Compute confusion matrix
confusion_matrix = metrics.confusion_matrix(np.argmax(paragraph_reviewed_y, 
                                                      axis=1), 
                                            predicted_labels)
print("\nConfusion Matrix:")
print(confusion_matrix)


Confusion Matrix:
[[ 0 17 10]
 [ 0 36 16]
 [ 0 31 17]]


In [15]:
# Count the occurrences of each class label
label_counts = np.bincount(predicted_labels)

# Get the class names
class_names = ['negative', 'neutral', 'positive']

# Create a dictionary to store the distribution
distribution = dict(zip(class_names, label_counts))

# Print the distribution
for label, count in distribution.items():
    print(f"{label}: {count}")

negative: 0
neutral: 84
positive: 43


In [16]:
# from the confusion matrix:
# None of the negative paragraphs are correctly classified
# too many paragraphs are classified as neutral 
# less than half of the positive paragraphs are correctly classified

# a likely reason is the distribution of sentiment in the training dataset (Dataset 1)
# in which:
# 62% sentences are neutral
# 12% sentences are negative
# 26% sentences are positive

# possible ways to reduce the distribution bias:

# - regularize, ex. dropout

# - custom loss functions that penalize incorrect predictions on the minority classes more heavily

# - assign different weights to each class during the model training

# - undersampling / oversampling by randomly duplicating minority samples

# - Ensemble Models: Train multiple LSTM models using different subsets of the data, then aggregate the models

In [17]:
# optimize the model





# trial 1 (discarded): regularize by adding dropout
# and apply a smaller learning rate to the optimizer

model_1 = Sequential()
model_1.add(Embedding(input_dim=vocab_size, output_dim=32, input_length=max_len))
# dropput: 20% of the LSTM layer's input units are randomly set to 0
model_1.add(LSTM(units=32, dropout=0.2)) 

model_1.add(Dense(units=num_classes, activation='softmax'))
# adam optimizer with a slower learning rate (default is 0.001)
optimizer = Adam(learning_rate=0.0005)
model_1.compile(loss='categorical_crossentropy', optimizer=optimizer, 
              metrics=['accuracy'])

model_1.fit(X_train, y_train, validation_data=(X_val, y_val), 
          epochs=30, batch_size=128, callbacks=[early_stopping])


# Evaluate the model on the test set
loss, accuracy = model_1.evaluate(X_test, y_test)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Evaluate the model on the bank announcement data
loss, accuracy = model_1.evaluate(paragraph_reviewed_X, paragraph_reviewed_y)

print(f"Announcement data test Loss: {loss:.4f}")
print(f"Announcement data test Accuracy: {accuracy:.4f}")


# result: some improvement on the initial model

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Test Loss: 0.6284
Test Accuracy: 0.7641
Announcement data test Loss: 1.4485
Announcement data test Accuracy: 0.4016


In [18]:
# confusion matrix under trial 1(discarded)

predictions_1 = model_1.predict(paragraph_reviewed_X)

# Convert the probabilities to class labels
predicted_labels_1 = np.argmax(predictions_1, axis=1)

# Compute confusion matrix
confusion_matrix = metrics.confusion_matrix(np.argmax(paragraph_reviewed_y, 
                                                      axis=1), 
                                            predicted_labels_1)
print("\nConfusion Matrix:")
print(confusion_matrix)


Confusion Matrix:
[[ 0 15 12]
 [ 0 32 20]
 [ 0 29 19]]


In [19]:
# trial 2(discarded): regularize by adding batch normalization

model_2 = Sequential()
model_2.add(Embedding(input_dim=vocab_size, output_dim=32, input_length=max_len))
model_2.add(LSTM(units=32)) 
# batch normalization layer added
model_2.add(BatchNormalization())
model_2.add(Dense(units=num_classes, activation='softmax'))
# keep the adam optimizer with a slower learning rate from trial 1
optimizer = Adam(learning_rate=0.0005)
model_2.compile(loss='categorical_crossentropy', optimizer=optimizer, 
              metrics=['accuracy'])

model_2.fit(X_train, y_train, validation_data=(X_val, y_val), 
          epochs=30, batch_size=128, callbacks=[early_stopping])


# Evaluate the model on the test set
loss, accuracy = model_2.evaluate(X_test, y_test)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Evaluate the model on the bank announcement data
loss, accuracy = model_2.evaluate(paragraph_reviewed_X, paragraph_reviewed_y)

print(f"Announcement data test Loss: {loss:.4f}")
print(f"Announcement data test Accuracy: {accuracy:.4f}")


# result: bad accuracy, do not include in ensemble model

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Test Loss: 0.9727
Test Accuracy: 0.7135
Announcement data test Loss: 1.1215
Announcement data test Accuracy: 0.3701


In [20]:
# confusion matrix under trial 2(discarded)

predictions_2 = model_2.predict(paragraph_reviewed_X)

# Convert the probabilities to class labels
predicted_labels_2 = np.argmax(predictions_2, axis=1)

# Compute confusion matrix
confusion_matrix = metrics.confusion_matrix(np.argmax(paragraph_reviewed_y, 
                                                      axis=1), 
                                            predicted_labels_2)
print("\nConfusion Matrix:")
print(confusion_matrix)


Confusion Matrix:
[[ 4 18  5]
 [13 28 11]
 [ 8 25 15]]


In [21]:
# trial 3 (now trial 1). use custom loss functions that penalize incorrect predictions
# on the minority classes (i.e. 'negative') more heavily


# define the custom loss function
def penalized_loss(y_true, y_pred):
    # Define weights for each class, higher weight for 'negative' class
    class_weights = tf.constant([1.0, 1.0, 2.0]) 

    # Apply weights to the loss calculation
    weighted_loss = tf.multiply(y_true * tf.math.log(y_pred), class_weights)
    loss = -tf.reduce_mean(weighted_loss)
    return loss





model_3 = Sequential()
model_3.add(Embedding(input_dim=vocab_size, output_dim=32, input_length=max_len))
model_3.add(LSTM(units=32)) 
model_3.add(Dense(units=num_classes, activation='softmax'))
# adam optimizer with a slower learning rate (default is 0.001)
optimizer = Adam(learning_rate=0.0005)
model_3.compile(loss=penalized_loss, optimizer=optimizer, 
              metrics=['accuracy'])

model_3.fit(X_train, y_train, validation_data=(X_val, y_val), 
          epochs=30, batch_size=128, callbacks=[early_stopping])


# Evaluate the model on the test set
loss, accuracy = model_3.evaluate(X_test, y_test)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Evaluate the model on the bank announcement data
loss, accuracy = model_3.evaluate(paragraph_reviewed_X, paragraph_reviewed_y)

print(f"Announcement data test Loss: {loss:.4f}")
print(f"Announcement data test Accuracy: {accuracy:.4f}")


# result: performance similar to trial 1 (discarded)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Test Loss: 0.2558
Test Accuracy: 0.7569
Announcement data test Loss: 0.6583
Announcement data test Accuracy: 0.3780


In [22]:
# confusion matrix under trial 3 (now trial 1)

predictions_3 = model_3.predict(paragraph_reviewed_X)

# Convert the probabilities to class labels
predicted_labels_3 = np.argmax(predictions_3, axis=1)

# Compute confusion matrix
confusion_matrix = metrics.confusion_matrix(np.argmax(paragraph_reviewed_y, 
                                                      axis=1), 
                                            predicted_labels_3)
print("\nConfusion Matrix:")
print(confusion_matrix)


Confusion Matrix:
[[ 0 15 12]
 [ 0 24 28]
 [ 0 24 24]]


In [23]:
# trial 4 (now trial 2): utilize class weighting during model training

# computes the class weights based on the 'balanced' strategy
# this strategy assigns weights inversely proportional to the class frequencies
# i.e. 'negative' has very low frequency -> receive higher weights

class_weights = class_weight.compute_class_weight(
    class_weight = 'balanced', 
    classes = np.unique(np.argmax(y_train, axis=1)), 
    y = np.argmax(y_train, axis=1)
    )

# Convert class weights to dictionary format
class_weights_dict = dict(enumerate(class_weights))






model_4 = Sequential()
model_4.add(Embedding(input_dim=vocab_size, output_dim=32, input_length=max_len))
model_4.add(LSTM(units=32)) 
model_4.add(Dense(units=num_classes, activation='softmax'))

optimizer = Adam(learning_rate=0.0005)
model_4.compile(loss='categorical_crossentropy', optimizer=optimizer, 
              metrics=['accuracy'])

# add class weight param
model_4.fit(X_train, y_train, validation_data=(X_val, y_val), 
          epochs=30, batch_size=128, callbacks=[early_stopping],
          class_weight=class_weights_dict)


# Evaluate the model on the test set
loss, accuracy = model_4.evaluate(X_test, y_test)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Evaluate the model on the bank announcement data
loss, accuracy = model_4.evaluate(paragraph_reviewed_X, paragraph_reviewed_y)

print(f"Announcement data test Loss: {loss:.4f}")
print(f"Announcement data test Accuracy: {accuracy:.4f}")


# result: performance similar to trial 1 (discarded)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Test Loss: 0.6417
Test Accuracy: 0.7395
Announcement data test Loss: 1.4212
Announcement data test Accuracy: 0.3307


In [24]:
# confusion matrix under trial 4 (now trial 2)

predictions_4 = model_4.predict(paragraph_reviewed_X)

# Convert the probabilities to class labels
predicted_labels_4 = np.argmax(predictions_4, axis=1)

# Compute confusion matrix
confusion_matrix = metrics.confusion_matrix(np.argmax(paragraph_reviewed_y, 
                                                      axis=1), 
                                            predicted_labels_4)
print("\nConfusion Matrix:")
print(confusion_matrix)


Confusion Matrix:
[[ 2 16  9]
 [ 5 31 16]
 [ 3 36  9]]


In [25]:
# trial 5 (now trial 3): undersampling on the majority class ('neutral')
# and oversampling on the minority classes ('positive' and 'negative')


# undersample 'neutral' class
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_train_balanced, y_train_balanced = undersampler.fit_resample(
    X_train, y_train_opt)

# oversample 'positive' and 'negative' classes
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_train_balanced, y_train_balanced = oversampler.fit_resample(
    X_train_balanced, y_train_balanced)

# oversample twice to create equal distribution of sentiments
X_train_balanced, y_train_balanced = oversampler.fit_resample(
    X_train_balanced, y_train_balanced)

# uncomment code below to look at the resampled sentiment distribution
# unique_values, counts = np.unique(y_train_balanced, return_counts=True)
# for value, count in zip(unique_values, counts):
#     print(f"{value}: {count}")
# it's now 1:1:1, balanced!

# one-hot code resampled y
y_train_balanced = to_categorical(y_train_balanced, num_classes)







model_5 = Sequential()
model_5.add(Embedding(input_dim=vocab_size, output_dim=32, 
                      input_length=max_len))
model_5.add(LSTM(units=32)) 

model_5.add(Dense(units=num_classes, activation='softmax'))

optimizer = Adam(learning_rate=0.0005)
model_5.compile(loss='categorical_crossentropy', optimizer=optimizer, 
              metrics=['accuracy'])

model_5.fit(X_train_balanced, y_train_balanced, validation_data=(X_val, y_val), 
          epochs=30, batch_size=128, callbacks=[early_stopping])


# Evaluate the model on the test set
loss, accuracy = model_5.evaluate(X_test, y_test)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Evaluate the model on the bank announcement data
loss, accuracy = model_5.evaluate(paragraph_reviewed_X, paragraph_reviewed_y)

print(f"Announcement data test Loss: {loss:.4f}")
print(f"Announcement data test Accuracy: {accuracy:.4f}")


# result: performance slightly worse than trial 1 (discarded)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Test Loss: 0.7110
Test Accuracy: 0.6918
Announcement data test Loss: 1.3747
Announcement data test Accuracy: 0.2126


In [26]:
# confusion matrix under trial 5 (now trial 3)

predictions_5 = model_5.predict(paragraph_reviewed_X)

# Convert the probabilities to class labels
predicted_labels_5 = np.argmax(predictions_5, axis=1)

# Compute confusion matrix
confusion_matrix = metrics.confusion_matrix(np.argmax(paragraph_reviewed_y, 
                                                      axis=1), 
                                            predicted_labels_5)
print("\nConfusion Matrix:")
print(confusion_matrix)


Confusion Matrix:
[[10  8  9]
 [26 10 16]
 [29 12  7]]


In [27]:
# trial 6 (now trial 4): trial 5 (now trial 3) but with a mix of both dataset 1 & 2A
# this means we randomly select 20% paragraphs in dataset 2A
# to put in the training set


# data preprocessing

# split the 20% paragraphs out
paragraph_reviewed_train, paragraph_reviewed_test = train_test_split(
    paragraph_reviewed, test_size=0.8, random_state=42)

paragraph_reviewed_train_X = paragraph_reviewed_train['text_content'].values
paragraph_reviewed_train_y = paragraph_reviewed_train['sentiment'].values

paragraph_reviewed_test_X = paragraph_reviewed_test['text_content'].values
paragraph_reviewed_test_y = paragraph_reviewed_test['sentiment'].values

# tokenize and pad X
paragraph_reviewed_train_X = tokenizer.texts_to_sequences(
    paragraph_reviewed_train_X)

paragraph_reviewed_test_X = tokenizer.texts_to_sequences(
    paragraph_reviewed_test_X)

paragraph_reviewed_train_X = pad_sequences(paragraph_reviewed_train_X, 
                                           maxlen=max_len)
paragraph_reviewed_test_X = pad_sequences(paragraph_reviewed_test_X, 
                                          maxlen=max_len)

# convert to numeric category & one-hot code y
paragraph_reviewed_train_y = le.fit_transform(paragraph_reviewed_train_y)
paragraph_reviewed_train_y = to_categorical(
    paragraph_reviewed_train_y, num_classes)

paragraph_reviewed_test_y = le.fit_transform(paragraph_reviewed_test_y)
paragraph_reviewed_test_y = to_categorical(
    paragraph_reviewed_test_y, num_classes)

# convert to np array for concatenation
paragraph_reviewed_train_X = np.array(paragraph_reviewed_train_X)
paragraph_reviewed_test_X = np.array(paragraph_reviewed_test_X)

# create new training and test sets containing mixed data
mix_training_X = np.concatenate((paragraph_reviewed_train_X, X_train_balanced))
mix_training_y = np.concatenate((paragraph_reviewed_train_y, y_train_balanced))








# modelling

model_6 = Sequential()
model_6.add(Embedding(input_dim=vocab_size, output_dim=32, 
                      input_length=max_len))
model_6.add(LSTM(units=32)) 

model_6.add(Dense(units=num_classes, activation='softmax'))

optimizer = Adam(learning_rate=0.0005)
model_6.compile(loss='categorical_crossentropy', optimizer=optimizer, 
              metrics=['accuracy'])

model_6.fit(mix_training_X, mix_training_y, validation_data=(X_val, y_val), 
          epochs=30, batch_size=128, callbacks=[early_stopping])


# Evaluate the model on the test set
loss, accuracy = model_6.evaluate(X_test, y_test)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Evaluate the model on the split out test set from bank announcement data
loss, accuracy = model_6.evaluate(paragraph_reviewed_test_X, 
                                  paragraph_reviewed_test_y)

print(f"Announcement data test Loss: {loss:.4f}")
print(f"Announcement data test Accuracy: {accuracy:.4f}")




Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Test Loss: 0.6585
Test Accuracy: 0.7438
Announcement data test Loss: 1.3399
Announcement data test Accuracy: 0.4020


In [28]:
# confusion matrix under trial 6 (now trial 4)

predictions_6 = model_6.predict(paragraph_reviewed_X)

# Convert the probabilities to class labels
predicted_labels_6 = np.argmax(predictions_6, axis=1)

# Compute confusion matrix
confusion_matrix = metrics.confusion_matrix(np.argmax(paragraph_reviewed_y, 
                                                      axis=1), 
                                            predicted_labels_6)
print("\nConfusion Matrix:")
print(confusion_matrix)


Confusion Matrix:
[[ 6  6 15]
 [12 25 15]
 [10 10 28]]


In [29]:
# trial 7(now trial 5): ensemble model 3 (now trial 1), 4 (now trial 2), 5 (now trial 3), 6 (now trial 4)


# ensemble method: majority vote across predictions make by diff. models

def major_vote(arr):
    # i.e. arr = [pred_1, pred_3, pred_4]
    # loop thorugh all 691 observations
    # for each observarion:
    # find the most common label given by the pred in arr
    # return all the most common labels
    result = []
    for index in range(len(arr[0])):
        column = [row[index] for row in arr]
        counts = Counter(column)
        most_common = counts.most_common(1)[0][0]
        result.append(most_common)
    return result


# ensemble model on phrasebank data
pred_3 = model_3.predict(X_test)
pred_4 = model_4.predict(X_test)
pred_5 = model_5.predict(X_test)
pred_6 = model_6.predict(X_test)

# Convert predictions to one-hot encoded format
pred_3 = np.argmax(pred_3, axis=1)
pred_4 = np.argmax(pred_4, axis=1)
pred_5 = np.argmax(pred_5, axis=1)
pred_6 = np.argmax(pred_6, axis=1)

# Combine the predictions using a voting approach
pred_3456 = np.array(major_vote([pred_6, pred_3, pred_4, pred_5]))
# one-hot code pred to match the format of y_test
pred_3456 = to_categorical(pred_3456, num_classes)

pred_accuracy = accuracy_score(pred_3456, y_test)

print(f"Test Accuracy (phrasebank data): {pred_accuracy:.4f}")

# result: test accuracy for phrasebank data is similar to single models







# ensemble model on bank announcement data
pred_3456 = np.array(major_vote([predicted_labels_3, 
                                 predicted_labels_4, 
                                 predicted_labels_5,
                                 predicted_labels_6]))
pred_3456 = to_categorical(pred_3456, num_classes)

pred_accuracy = accuracy_score(pred_3456, paragraph_reviewed_y)
print(f"Test Accuracy (announcement data): {pred_accuracy:.4f}")

# result: no improvement

Test Accuracy (phrasebank data): 0.7656
Test Accuracy (announcement data): 0.3386


In [30]:
# # test if the article predictions are correct with word2vec_model
# # replace word2vec_model with your model name

# # predict the sentiment
# predictions_word2vec = word2vec_model.predict(paragraph_reviewed_X)

# # Convert the probabilities to class labels
# predictions_word2vec = np.argmax(predictions_word2vec, axis=1)

# # add predicted labels to dataset 2, i.e. 'paragraph_reviewed'
# paragraph_reviewed['predicted_sentiment'] = predictions_word2vec

# # look at what dataset 2A (paragraph_reviewed) looks like
# # print(paragraph_reviewed.head())

# # aggregate sentiment labels via majority voting
# # group by ArticleID and calculate the mode (most common prediction)
# sentiment_by_article = paragraph_reviewed.groupby('aID'
#         )['predicted_sentiment'].apply(lambda x: x.mode()[0]).reset_index()

# # look at the new dataset containing the article id's and its corresponding
# # sentiment predictions
# # print(sentiment_by_article.head())

# # note:
# # the encoding mapping of sentiment labels is as below
# # negative: 0
# # neutral: 1
# # positive: 2

# # also encode true sentiment labels in the above format
# true_sentiments = le.fit_transform(article_reviewed['sentiment_overall'])

# # test if the sentiments by article is correctly predicted
# print("sentiment accuracy by article:", accuracy_score(true_sentiments, 
#                                 sentiment_by_article['predicted_sentiment']))

In [31]:
# import Dataset 2 (web scrapped)

paragraph = pd.read_csv('bank_publications_in_paragraph.csv')
article = pd.read_csv('bank_publications.csv')

In [32]:
# EDA & data cleaning for Dataset 2

# drop the NAs
paragraph = paragraph.dropna()
article = article.dropna()

# number of paragraphs
print("number of paragraphs in total:", paragraph.shape[0])



# number of articles, notice the discrepancy between article # in each data file
# to be fixed after cleaning
print("number of articles in paragraph file:", 
      len(paragraph["ArticleID"].unique()))

print("number of articles in article file:", article.shape[0])

# the % invalid paragraphs that are meaninglessly short
invalid_paras = paragraph[paragraph['Paragraph'].str.len() < 160]
print("the percentage of meaninglessly short paragraphs", 
       f'{len(invalid_paras) / paragraph.shape[0]:.2f}')

# remove these paragraphs
paragraph = paragraph.drop(invalid_paras.index)
print("number of paragraphs in total after filtering:", paragraph.shape[0])

# remove the articles that are gone as the paragraphs gets deleted
# as well as the articles that didn't exist at the beginning (discrepancy)
article = article[article["ArticleID"].isin(
    paragraph["ArticleID"].unique())]

# check if the number of articles match
print("number of articles in paragraph file after filtering:", 
      len(paragraph["ArticleID"].unique()))
print("number of articles in article file after filtering:", 
      len(article["ArticleID"].unique()))
# they do


# cleaned datasets
print(paragraph.head())
print("\n", article.head())



number of paragraphs in total: 14222
number of articles in paragraph file: 1118
number of articles in article file: 1289
the percentage of meaninglessly short paragraphs 0.16
number of paragraphs in total after filtering: 11903
number of articles in paragraph file after filtering: 859
number of articles in article file after filtering: 859
    ArticleID                                          Paragraph
6          14  Good morning. I’m pleased to be here with Seni...
9          14  Inflation is coming down quickly and is foreca...
10         14  Our destination is the 2% inflation target, an...
11         14  We are focused on these indicators, and the ev...
13         14  Since we last updated our economic projection ...

     ArticleID                                               Link  \
13         14  https://www.bankofcanada.ca/2023/04/opening-st...   
14         15  https://www.bankofcanada.ca/multimedia/speech-...   
22         23  https://www.bankofcanada.ca/multimedia/speech-.

In [33]:
# Preprocess the paragraphs

# same procedure as preprocessing Dataset 1

paragraph['Paragraph'] = paragraph['Paragraph'].apply(preprocess_text)


In [34]:
paragraph_X = paragraph['Paragraph'].values

# tokenize and pad the sequences
paragraph_X = tokenizer.texts_to_sequences(paragraph_X)
paragraph_X = pad_sequences(paragraph_X, maxlen=max_len)

In [35]:
# predict the sentiment of all web scrapped announcements

predictions_word2vec = model_6.predict(paragraph_X)

# Convert the probabilities to class labels
predictions_word2vec = np.argmax(predictions_word2vec, axis=1)

# add predicted labels to dataset 2, i.e. 'paragraph'
paragraph['predicted_sentiment'] = predictions_word2vec

# look at what dataset 2 (paragraph) looks like
print(paragraph.head())

# aggregate sentiment labels via majority voting
# group by ArticleID and calculate the mode (most common prediction)
sentiment_by_article = paragraph.groupby('ArticleID'
        )['predicted_sentiment'].apply(lambda x: x.mode()[0]).reset_index()

# look at the new dataset containing the article id's and its corresponding
# sentiment predictions
print(sentiment_by_article.head())
# note:
# the encoding mapping of sentiment labels is as below
# negative: 0
# neutral: 1
# positive: 2

  1/372 [..............................] - ETA: 15s

    ArticleID                                          Paragraph  \
6          14  good morning ’ pleased senior deputy governor ...   
9          14  inflation coming quickly forecast around 3 sum...   
10         14  destination 2 inflation target several thing s...   
11         14  focused indicator evolution core inflation ens...   
13         14  since last updated economic projection january...   

    predicted_sentiment  
6                     2  
9                     2  
10                    2  
11                    0  
13                    2  
   ArticleID  predicted_sentiment
0         14                    2
1         15                    1
2         23                    1
3         41                    1
4         63                    1


In [36]:
# step 3. observe daily stock index change & BoC sentiments



# find the time range to extract from dataset 3


# Convert the date column to datetime format
article['Publication Date'] = pd.to_datetime(
    article['Publication Date'], format='%B %d, %Y')

# Find the earliest and latest dates
earliest_date = article['Publication Date'].min()
latest_date = article['Publication Date'].max()

print(earliest_date, latest_date)
# printed 2010-01-11 and 2023-06-07


2010-01-11 00:00:00 2023-06-07 00:00:00


In [37]:
# import Dataset 3
stock_price = pd.read_csv('tsx_index_prices.csv')

# Convert the "Date" column in stock data to datetime format
stock_price['Date'] = pd.to_datetime(stock_price['Date'])

# descriptive stats
print(stock_price.tail())

           Date          Open          High           Low         Close  \
3372 2023-06-12  19862.599609  19933.599609  19805.900391  19921.300781   
3373 2023-06-13  19998.300781  20099.199219  19975.900391  19990.400391   
3374 2023-06-14  20027.800781  20098.500000  19930.300781  20015.099609   
3375 2023-06-15  19986.099609  20057.400391  19952.199219  20027.400391   
3376 2023-06-16  20057.300781  20112.400391  19973.199219  19975.400391   

         Adj Close     Volume  
3372  19921.300781  212148400  
3373  19990.400391  205098400  
3374  20015.099609  212250000  
3375  20027.400391  199331900  
3376  19975.400391  499628700  


In [38]:
# Merge the dataframes based on the common date and publish date columns
merged_df = pd.merge(article, stock_price, left_on=article[
    'Publication Date'].dt.strftime('%Y-%m-%d'),
                     right_on=stock_price['Date'].dt.strftime('%Y-%m-%d'))

# Calculate the average stock prices 3 days before and
# 3 days after the publish date
# Initialize empty lists to store the average stock prices
prev_stock_avg_list = []
after_stock_avg_list = []

# Iterate over the rows of announcement data
# for each announcement, find the avg
# of the closing price 3 days before and 3 days after
# its publication date
# store each avg in a list
for _, row in merged_df.iterrows():
    publish_date = row['Publication Date']

    # note to Jasmeet: adjust the number in tail() and head()
    # to control how many days to average on :)

    # i.e. (3) means take avg price of 3 days prev. and after the publish date
    prev_dates = stock_price[stock_price['Date'] < 
                             publish_date]['Date'].tail(3)
    after_dates = stock_price[stock_price['Date'] > 
                              publish_date]['Date'].head(3)
    prev_stock_avg = stock_price[stock_price['Date'].isin(
        prev_dates)]['Close'].mean()
    after_stock_avg = stock_price[stock_price['Date'].isin(
        after_dates)]['Close'].mean()
    prev_stock_avg_list.append(prev_stock_avg)
    after_stock_avg_list.append(after_stock_avg)

# Add the average stock prices as columns in merged_df
merged_df['prev_stock_avg'] = prev_stock_avg_list
merged_df['after_stock_avg'] = after_stock_avg_list


# keep only the relevant columns
merged_df = merged_df[['ArticleID', 'Date', 'Close',
                       'prev_stock_avg', 'after_stock_avg']] 


print(merged_df.head())

   ArticleID       Date         Close  prev_stock_avg  after_stock_avg
0         14 2023-04-12  20454.300781    20298.133464     20595.466797
1        883 2023-04-12  20454.300781    20298.133464     20595.466797
2         15 2023-03-29  19837.699219    19594.566406     20106.400391
3         23 2023-02-16  20606.400391    20709.133464     20320.366536
4         41 2022-11-22  20220.000000    19947.533203     20336.733724


In [39]:
# calculate the % change in stock prices 
# merged_df['Close'] = stock price at publication date
merged_df['prev_change'] = (merged_df['Close'] - prev_stock_avg_list) \
    / prev_stock_avg_list
merged_df['after_change'] = (after_stock_avg_list - merged_df['Close']) \
    / merged_df['Close']

print(merged_df.head())

# it seems like the % changes within 3 days prior and after are quite small.
# I've tried 1 day & 7 days as well and got similar results.

   ArticleID       Date         Close  prev_stock_avg  after_stock_avg  \
0         14 2023-04-12  20454.300781    20298.133464     20595.466797   
1        883 2023-04-12  20454.300781    20298.133464     20595.466797   
2         15 2023-03-29  19837.699219    19594.566406     20106.400391   
3         23 2023-02-16  20606.400391    20709.133464     20320.366536   
4         41 2022-11-22  20220.000000    19947.533203     20336.733724   

   prev_change  after_change  
0     0.007694      0.006902  
1     0.007694      0.006902  
2     0.012408      0.013545  
3    -0.004961     -0.013881  
4     0.013659      0.005773  


In [40]:
# now we look at if there is a discontinuity in relative price changes

# merged_df['relative_change'] = 