In [None]:
!pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import seaborn as sns
import nltk
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import time

In [None]:
import sys
sys.path.insert(0, '/content/drive/My Drive/Bachelor Scriptie KI/Programming/Notebooks')

In [None]:
#Setting the random seeds for reproducibility
import random
np.random.seed(42)
random.seed(42)

In [None]:
#Import a custom library that allows for performance evaluation
import multi_class_performance_eval as mce

In [None]:
full_dataset = pd.read_csv("/content/drive/MyDrive/Bachelor Scriptie KI/FinancialPhraseBank-v1.0/Sentences_50Agree.txt", sep="@", names=["Sentence", "Sentiment"], encoding="latin-1")

# Preprocessing

In [None]:
full_dataset.head()

Unnamed: 0,Sentence,Sentiment
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive


In [None]:
def numeric_labels(dataframe):
  """
  Converts the string sentiment labels into numeric labels, in place.

  "positive" becomes 2, "neutral" becomes 1 and every other value becomes 0.
  Because anything that is not one of those two strings maps to 0, calling this
  twice on the same dataframe turns every label into 0.

  param dataframe: the dataframe containing the sentiment labels in a Sentiment column.
  """
  def to_number(label):
    if label == "positive":
      return 2
    if label == "neutral":
      return 1
    return 0

  # Assign the whole column at once. The previous `dataframe.Sentiment[i] = number`
  # was a chained assignment (not guaranteed to write through to the dataframe)
  # and only addressed the right rows when the index was exactly 0..n-1.
  dataframe["Sentiment"] = [to_number(label) for label in dataframe["Sentiment"]]

In [None]:
numeric_labels(full_dataset)

To compare performance with and without stopwords, we will now remove the stopwords.

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import string

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def remove_punctuation(tokenized_sent):
  """
  Filters unwanted single-character punctuation tokens out of a tokenized sentence.
  Only tokens that are exactly equal to one of the listed characters are dropped;
  all other tokens are kept in their original order.

  param tokenized_sent: list of tokens containing punctuation.
  Returns a new list with the punctuation tokens removed.
  """
  unwanted = {'!', '#', '&', '(', ')', ',', ':', ';', '?', '[', ']', '@', '.'}
  return [token for token in tokenized_sent if token not in unwanted]

def tokenize_and_clean(dataframe):
  """
  Tokenizes each sentence in the dataframe and removes punctuation, in place.

  After the call, every entry of the Sentence column is a list of tokens.

  param dataframe: Pandas DataFrame containing sentences in a Sentence column.
  """
  # Replace the column in one assignment. The previous `dataframe.Sentence[i] = clean`
  # was a chained assignment (not guaranteed to write through to the dataframe)
  # and relied on the index being exactly 0..n-1.
  dataframe["Sentence"] = dataframe["Sentence"].map(
      lambda sentence: remove_punctuation(word_tokenize(sentence)))

In [None]:
print(full_dataset.head())
financial_data_stopwords = full_dataset.copy(deep=True)

tokenize_and_clean(financial_data_stopwords)
print(financial_data_stopwords.head())

financial_data_clean = financial_data_stopwords.copy(deep=True)
print(financial_data_clean.head())

                                            Sentence Sentiment
0  According to Gran , the company has no plans t...         1
1  Technopolis plans to develop in stages an area...         1
2  The international electronic industry company ...         0
3  With the new production plant the company woul...         2
4  According to the company 's updated strategy f...         2
                                            Sentence Sentiment
0  [According, to, Gran, the, company, has, no, p...         1
1  [Technopolis, plans, to, develop, in, stages, ...         1
2  [The, international, electronic, industry, com...         0
3  [With, the, new, production, plant, the, compa...         2
4  [According, to, the, company, 's, updated, str...         2
                                            Sentence Sentiment
0  [According, to, Gran, the, company, has, no, p...         1
1  [Technopolis, plans, to, develop, in, stages, ...         1
2  [The, international, electronic, industry, com...   

In [None]:
def convert_to_lower(dataframe):
  """
  Converts all tokens of all sentences in the dataframe to lowercase, in place.

  param dataframe: Pandas DataFrame containing tokenized sentences (lists of
                   strings) in a Sentence column.
  """
  # Replace the column in one assignment. The previous `dataframe.Sentence[i] = lowered`
  # was a chained assignment (not guaranteed to write through to the dataframe)
  # and relied on the index being exactly 0..n-1.
  dataframe["Sentence"] = dataframe["Sentence"].map(
      lambda tokens: [w.lower() for w in tokens])

In [None]:
convert_to_lower(financial_data_stopwords)
convert_to_lower(financial_data_clean)
financial_data_clean.head()

Unnamed: 0,Sentence,Sentiment
0,"[according, to, gran, the, company, has, no, p...",1
1,"[technopolis, plans, to, develop, in, stages, ...",1
2,"[the, international, electronic, industry, com...",0
3,"[with, the, new, production, plant, the, compa...",2
4,"[according, to, the, company, 's, updated, str...",2


In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords

def remove_stopwords(dataframe):
  """
  Removes English stopwords from the tokenized sentences in the dataframe, in place.

  Tokens are compared to the stopword list as-is, so they should already be lowercased.

  param dataframe: Pandas DataFrame containing tokenized sentences in a Sentence column.
  """
  # Fetch the stopword list once instead of once per token, and keep it in a set
  # so each membership test is a hash lookup rather than a scan of the whole list.
  english_stopwords = set(stopwords.words('english'))
  # Replace the column in one assignment. The previous `dataframe.Sentence[j] = temp`
  # was a chained assignment (not guaranteed to write through to the dataframe)
  # and relied on the index being exactly 0..n-1.
  dataframe["Sentence"] = dataframe["Sentence"].map(
      lambda tokens: [w for w in tokens if w not in english_stopwords])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
print(financial_data_clean.head())
remove_stopwords(financial_data_clean)
print(financial_data_clean.head())

                                            Sentence Sentiment
0  [according, to, gran, the, company, has, no, p...         1
1  [technopolis, plans, to, develop, in, stages, ...         1
2  [the, international, electronic, industry, com...         0
3  [with, the, new, production, plant, the, compa...         2
4  [according, to, the, company, 's, updated, str...         2
                                            Sentence Sentiment
0  [according, gran, company, plans, move, produc...         1
1  [technopolis, plans, develop, stages, area, le...         1
2  [international, electronic, industry, company,...         0
3  [new, production, plant, company, would, incre...         2
4  [according, company, 's, updated, strategy, ye...         2


In [None]:
#Join all the tokens together again to gain full sentences, ready for classification.
#This is needed for the VADER lexicon, which expects full sentences.
#The column is replaced in one assignment per dataframe; writing element by element through
#`frame.Sentence[i] = ...` is a chained assignment that is not guaranteed to reach the dataframe.

for dataset in (financial_data_stopwords, financial_data_clean):
  dataset["Sentence"] = dataset["Sentence"].map(" ".join)
  print(dataset.head())


                                            Sentence Sentiment
0  according to gran the company has no plans to ...         1
1  technopolis plans to develop in stages an area...         1
2  the international electronic industry company ...         0
3  with the new production plant the company woul...         2
4  according to the company 's updated strategy f...         2
                                            Sentence Sentiment
0  according gran company plans move production r...         1
1  technopolis plans develop stages area less 100...         1
2  international electronic industry company elco...         0
3  new production plant company would increase ca...         2
4  according company 's updated strategy years 20...         2


In [None]:
#The test set can be loaded from a file I created earlier, after preprocessing
test = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/FinancialPhraseBank-v1.0/test.csv", sep=",", names=["Sentence", "Sentiment"], encoding="utf-8", skiprows=[0])

In [None]:
test.head()

Unnamed: 0,Sentence,Sentiment
2318,tiimari latvian representative ineta zaharova ...,2
177,cargo volume increased approximately 5 %,2
4836,operating profits half 0.8 0.9 glisten investe...,0
495,delivers different user experience enables us ...,2
4394,strike finnair estimates incur net loss eur2m ...,0


In [None]:
X_test = test.Sentence.tolist()
y_test = test.Sentiment.tolist()

# Sentiment Classification

In [None]:
def classify_VADER(sentences=None):
  """
  Uses the SentimentIntensityAnalyzer from the VADER lexicon to produce a prediction for each sentence.

  Positive (2): compound >= 0.05
  Neutral (1): -0.05 <= compound < 0.05
  Negative (0): compound < -0.05

  param sentences: the sentences to classify. Either an iterable of strings, or a
                   Pandas DataFrame with a Sentence column (that column is then used).
                   When omitted, the global X_test is classified.

  Returns a tuple (y_head, scores): the list of predicted labels and the list of
  compound scores, both in the order of the input sentences.
  """
  if sentences is None:
    sentences = X_test
  elif hasattr(sentences, "columns") and "Sentence" in sentences.columns:
    # Iterating a DataFrame yields its column names, not its rows, so pick
    # the sentence column explicitly when a whole dataframe is passed in.
    sentences = sentences["Sentence"]

  sent_analyzer = SentimentIntensityAnalyzer()
  y_head = []
  scores = []
  for sentence in sentences:
    compound = sent_analyzer.polarity_scores(sentence)['compound']
    scores.append(compound)
    if compound >= 0.05:
      y_head.append(2)
    elif compound >= -0.05:
      y_head.append(1)
    else:
      # Final else keeps y_head exactly as long as the input.
      y_head.append(0)
  return y_head, scores


In [None]:
y_head_stop, scores_stop = classify_VADER(financial_data_stopwords)  #This is for the data including stopwords --> should I still include this?
#NOTE: if classify_VADER simply reads the global X_test, passing financial_data_stopwords here has no effect.
#If these two tests (i.e. the ones with stopwords) are to be kept, that needs to be rewritten.

In [None]:
#Let VADER produce predictions and record how long it takes
#perf_counter_ns is a monotonic clock, so the difference is not affected by system clock adjustments
start_time = time.perf_counter_ns()
y_head_clean, scores_clean = classify_VADER(financial_data_clean)
end_time = time.perf_counter_ns()
elapsed_ns = end_time - start_time
print("Elapsed time: ", str(elapsed_ns), "ns")
#1 s = 1,000,000,000 ns (the previous divisor had one zero too many, reporting a time 10x too small)
print(str(elapsed_ns / 1_000_000_000), "s")

Elapsed time:  81032841 ns
0.0081032841 s


In [None]:
#Evaluate the performance using the predictions and the real sentiments
performance_stop = mce.evaluate_performance(y_head_stop, financial_data_stopwords.Sentiment)
for metric in performance_stop:
  print(metric, "\n", performance_stop[metric], "\n")

Accuracy 
 48.4536 

Base Positive 
 {'TP': 199, 'FP': 56, 'TN': 43, 'FN': 187} 

Base Neutral 
 {'TP': 36, 'FP': 152, 'TN': 238, 'FN': 59} 

Base Negative 
 {'TP': 0, 'FP': 42, 'TN': 439, 'FN': 4} 

Advanced Positive 
 {'Precision': 0.7803921568627451, 'Recall': 0.5155440414507773, 'Specificity': 0.43434343434343436} 

Advanced Neutral 
 {'Precision': 0.19148936170212766, 'Recall': 0.37894736842105264, 'Specificity': 0.6102564102564103} 

Advanced Negative 
 {'Precision': 0.0, 'Recall': 0.0, 'Specificity': 0.9126819126819127} 

Balanced Accuracy 
 0.29816380329061 

F_Score 
 0.02999485734226144 



In [None]:
#Again, evaluate performance of VADER's predictions
performance_clean = mce.evaluate_performance(y_head_clean, financial_data_clean.Sentiment)
for metric in performance_clean:
  print(metric, "\n", performance_clean[metric], "\n")

Accuracy 
 48.4536 

Base Positive 
 {'TP': 199, 'FP': 56, 'TN': 43, 'FN': 187} 

Base Neutral 
 {'TP': 36, 'FP': 152, 'TN': 238, 'FN': 59} 

Base Negative 
 {'TP': 0, 'FP': 42, 'TN': 439, 'FN': 4} 

Advanced Positive 
 {'Precision': 0.7803921568627451, 'Recall': 0.5155440414507773, 'Specificity': 0.43434343434343436} 

Advanced Neutral 
 {'Precision': 0.19148936170212766, 'Recall': 0.37894736842105264, 'Specificity': 0.6102564102564103} 

Advanced Negative 
 {'Precision': 0.0, 'Recall': 0.0, 'Specificity': 0.9126819126819127} 

Balanced Accuracy 
 0.29816380329061 

F_Score 
 0.02999485734226144 



#Classify on the test set

In [None]:
#Classification on the test set

#perf_counter_ns is a monotonic clock, so the difference is not affected by system clock adjustments
start_time = time.perf_counter_ns()
y_head, scores = classify_VADER(X_test)
end_time = time.perf_counter_ns()
elapsed_ns = end_time - start_time
print("Elapsed time: ", str(elapsed_ns), "ns")
#1 s = 1,000,000,000 ns (the previous divisor had one zero too many, reporting a time 10x too small)
print(str(elapsed_ns / 1_000_000_000), "s")

Elapsed time:  57518056 ns
0.0057518056 s


In [None]:
#Evaluate performance on the test set

performance = mce.evaluate_performance(y_head, y_test)
for metric in performance:
  print(metric, "\n", performance[metric], "\n")

Accuracy 
 51.7526 

Base Positive 
 {'TP': 90, 'FP': 165, 'TN': 190, 'FN': 40} 

Base Neutral 
 {'TP': 140, 'FP': 48, 'TN': 152, 'FN': 145} 

Base Negative 
 {'TP': 21, 'FP': 21, 'TN': 394, 'FN': 49} 

Advanced Positive 
 {'Precision': 0.35294117647058826, 'Recall': 0.6923076923076923, 'Specificity': 0.5352112676056338} 

Advanced Neutral 
 {'Precision': 0.7446808510638298, 'Recall': 0.49122807017543857, 'Specificity': 0.76} 

Advanced Negative 
 {'Precision': 0.5, 'Recall': 0.3, 'Specificity': 0.9493975903614458} 

Balanced Accuracy 
 0.4945119208277103 

F_Score 
 0.1350505668744676 



In [None]:
#Calculate the confusion matrix for VADER's predictions.
print(mce.confusion_matrix(y_head, y_test))

[[ 21  11  10]
 [ 18 140  30]
 [ 31 134  90]]


# Sentence-level inspection of errors

In [None]:
def get_specific_errors(dataframe, y_pred, y_real, vertical, horizontal):
  """
  Collects the index labels of all rows that fall into one cell of the confusion
  matrix, e.g. neutral instances predicted as positive.

  param dataframe: Pandas DataFrame containing the sentences; its index supplies the returned ids.
  param y_pred: sequence of predictions made by the model, one per dataframe row (by position).
  param y_real: sequence of actual sentiment labels, one per dataframe row (by position).
  param vertical: (int) the real sentiment to select.
  param horizontal: (int) the predicted sentiment to select.

  Returns a list with the dataframe index label of every matching row, in row order.
  """
  row_count = len(dataframe.Sentence)
  return [
    dataframe.index[pos]
    for pos in range(row_count)
    if horizontal == y_pred[pos] and vertical == y_real[pos]
  ]

In [None]:
#Neutral sentences predicted to be positive.
pos_neu_errors_index = get_specific_errors(test, y_head, y_test, 1, 2)

In [None]:
#Save the indexes to a file so that we can use them later on for analysis
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Financial/updated_LB_financial_pos_neu.txt", "w") as writefile:
  # One index per line: print emits str(index) followed by a newline
  for index in pos_neu_errors_index:
    print(index, file=writefile)

In [None]:
#Positive sentences predicted to be neutral.
neu_pos_errors_index = get_specific_errors(test, y_head, y_test, 2, 1)

In [None]:
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Financial/updated_LB_financial_neu_pos.txt", "w") as writefile:
  # One index per line: print emits str(index) followed by a newline
  for index in neu_pos_errors_index:
    print(index, file=writefile)

In [None]:
#True positives for the neutral class.
tp_neu_index = get_specific_errors(test, y_head, y_test, 1, 1)

In [None]:
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Financial/updated_LB_financial_tp_neu.txt", "w") as writefile:
  # One index per line: print emits str(index) followed by a newline
  for index in tp_neu_index:
    print(index, file=writefile)