In [None]:
import pandas as pd
import nltk
import sklearn
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Setting the random seeds for reproducability
import random
np.random.seed(42)
random.seed(42)

In [None]:
#Load the full dataset to train the Word2Vec model later on
full_dataset = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/Programming/findat_clean_nonlem.csv", sep=",", names=["Sentence", "Sentiment"], encoding="latin-1", skiprows=[0])

In [None]:
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4846 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   4846 non-null   object
 1   Sentiment  4846 non-null   object
dtypes: object(2)
memory usage: 113.6+ KB


In [None]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
#Tokenize the sentences in the dataset
i = 0
for sentence in full_dataset.Sentence:
  tokenized = word_tokenize(sentence)
  full_dataset.Sentence[i] = tokenized
  i += 1

In [None]:
full_dataset

Unnamed: 0,Sentence,Sentiment
0,"[according, gran, company, plans, move, produc...",neutral
1,"[technopolis, plans, develop, stages, area, le...",neutral
2,"[international, electronic, industry, company,...",negative
3,"[new, production, plant, company, would, incre...",positive
4,"[according, company, 's, updated, strategy, ye...",positive
...,...,...
4841,"[london, marketwatch, --, share, prices, ended...",negative
4842,"[rinkuskiai, 's, beer, sales, fell, 6.5, per, ...",neutral
4843,"[operating, profit, fell, eur, 35.4, mn, eur, ...",negative
4844,"[net, sales, paper, segment, decreased, eur, 2...",negative


In [None]:
from google.colab import drive

def load_and_prepare():
  """
  Loads the train and test sets. Makes it easy to quickly get new instances during fine-tuning later on.
  """
  train = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/FinancialPhraseBank-v1.0/train.csv", sep=",", names=["Sentence", "Sentiment"], encoding="utf-8", skiprows=[0])
  test = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/FinancialPhraseBank-v1.0/test.csv", sep=",", names=["Sentence", "Sentiment"], encoding="utf-8", skiprows=[0])

  return train, test

# Train Embeddings

In [None]:
from gensim.models import Word2Vec

In [None]:
#Train the Word2Vec model on the full dataset (train + test)
embeddings = Word2Vec(full_dataset.Sentence, min_count=1, vector_size=300)

In [None]:
embeddings

<gensim.models.word2vec.Word2Vec at 0x7f0b40db8790>

In [None]:
embeddings.wv.most_similar('financial')

[('also', 0.9998996257781982),
 ("'s", 0.9998944401741028),
 ('plant', 0.9998935461044312),
 ('finland', 0.9998883008956909),
 ('business', 0.9998868107795715),
 ('well', 0.9998852014541626),
 ('group', 0.9998826384544373),
 ("'", 0.9998780488967896),
 ('``', 0.9998753666877747),
 ('new', 0.9998742938041687)]

# Load train-test split

In [None]:
train = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/FinancialPhraseBank-v1.0/train.csv", sep=",", names=["Sentence", "Sentiment"], encoding="utf-8", skiprows=[0])

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3876 entries, 3738 to 1624
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   3876 non-null   object
 1   Sentiment  3876 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.8+ KB


In [None]:
test = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/FinancialPhraseBank-v1.0/test.csv", sep=",", names=["Sentence", "Sentiment"], encoding="utf-8", skiprows=[0])
val = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/FinancialPhraseBank-v1.0/validation.csv", sep=",", names=["Sentence", "Sentiment"], encoding="utf-8", skiprows=[0])
print(test.info())
print(val.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 485 entries, 2318 to 4756
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   485 non-null    object
 1   Sentiment  485 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 485 entries, 47 to 1935
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   485 non-null    object
 1   Sentiment  485 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.4+ KB
None


# Generate Vector Representations

First, we will convert each sentence into a list of vectors, one for each word.
We will store this in a separate list, so that it can be added to the dataset's DataFrame afterwards.

In [None]:
def convert_to_vector(dataframe, embeddings):
  """
  Converts the sentences in the dataframe into vector representations.

  param dataframe: Pandas DataFrame containing the sentences.
  param embeddings: Word2Vec model trained on the full dataset.
  """
  total_embeddings = []
  i = 0
  j = 0
  for sentence in dataframe.Sentence:
    sent = sentence.split()
    embedded_sent = []
    for word in sent:
      representation = [0] * 300          #If we encounter a word that is not present in the Word2Vec model, we give this a vector with 0's as representation
      try:
        representation = embeddings.wv[word]
      except:
        i += 1
      embedded_sent.append(representation)
      j += 1
    total_embeddings.append(embedded_sent)
  print("Unknown words: ", str(i))
  print("Total words seen: ", str(j))
  return total_embeddings


In [None]:
train_vect = convert_to_vector(train, embeddings)
test_vect = convert_to_vector(test, embeddings)

Unknown words:  2
Total words seen:  53532
Unknown words:  1
Total words seen:  6765


In [None]:
#The sentences will be represented by vectors of numbers. However, the SVM needs all inputs to have the same dimension, even though the sentences are not necessarily the same length.
#Since the Google Vectors that will be used later on have length=300, this is the value we'll use for the custom vectors as well
desired_length = 300

In [None]:
import statistics

def calculate_average_vector(vect_list, des_length):
  """
  Given a list of vectors (a sentence), calculate the average vector with des_length.

  param vect_list: (list) list with vectors.
  param des_length: (int) the length we want our vectors to be.
  """
  averaged_vector = []
  i = 0
  while i < des_length:
    values = []
    j = 0
    while j < len(vect_list):
      values.append(vect_list[j][i])
      j += 1
    i += 1
    average = statistics.mean(values)
    averaged_vector.append(average)
  return averaged_vector



In [None]:
def equalize_vector_lengths(vectors, des_length):
  """
  Equalizes all sentences in the dataset to vectors of the same length.

  param vectors: (list) list containing vectors.
  param des_length: (int) length we want the vectors representing sentences to be.
  """
  equalized_vectors = []
  for sentence in vectors:
    equalized_vectors.append(calculate_average_vector(sentence, des_length=desired_length))
  return equalized_vectors


In [None]:
print(len(train_vect))
print(len(test_vect))

3876
485


In [None]:
#Each sentence will now be encoded, with the 300 features vectors.
equalized_vects_train = equalize_vector_lengths(train_vect, des_length = desired_length)
equalized_vects_test = equalize_vector_lengths(test_vect, des_length = desired_length)

In [None]:
#Add another column to the train and test sets with the vector representations of the sentences.
train['Vectorized'] = equalized_vects_train
test['Vectorized'] = equalized_vects_test

In [None]:
train.head()

Unnamed: 0,Sentence,Sentiment,Vectorized
3738,rsa 's shares closed 156.9 p time going press,1,"[-0.021886272, 0.20334792, 0.06589593, 0.08188..."
3518,alexandria va. march 20 -- patrik flykt timo a...,1,"[-0.019545518, 0.156337, 0.050415944, 0.064684..."
4736,however net sales declined eur 803.6 million e...,0,"[0.012967357, 0.30594933, 0.122266866, 0.11062..."
1776,finnish raute precision large glass batch plan...,2,"[-0.019487185, 0.18337992, 0.058941558, 0.0733..."
2904,finnish property investor sponda said agreed 1...,1,"[0.0078882035, 0.2690966, 0.098811924, 0.09539..."


In [None]:
def determine_weights(dataframe):
  """
  Determine the inverse class weights so the SVM can compensate for the imbalance in the dataset.

  param dataframe: Pandas DataFrame containing the sentiment labels.
  """
  total = 0
  pos = 0
  neu = 0
  neg = 0
  i = 0

  for sent in dataframe.Sentiment:
    if sent == 2:
      pos += 1
    elif sent == 1:
      neu += 1
    else:
      neg += 1
    total += 1
    i += 1

  weights = {2: 1/(pos/total), 1: 1/(neu/total), 0: 1/(neg/total)}
  return weights

In [None]:
train_weights = determine_weights(train)

In [None]:
print(train_weights)

{2: 3.5462031107044827, 1: 1.677922077922078, 0: 8.194503171247357}


In [None]:
X_train = train.Vectorized.tolist()
X_test = test.Vectorized.tolist()

y_train = train.Sentiment.tolist()
y_test = test.Sentiment.tolist()

# Fine-tuning process

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.svm import SVC

In [None]:
param_grid = {'kernel': ['linear', 'rbf', 'poly'], 'C': [0, 0.5, 1, 1.5, 2], 'gamma': [0, 0.5, 1, 1.5, 2]}

In [None]:
grid = GridSearchCV(SVC(), param_grid, verbose=3)

In [None]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV 1/5] END .........C=0, gamma=0, kernel=linear;, score=nan total time=   0.0s
[CV 2/5] END .........C=0, gamma=0, kernel=linear;, score=nan total time=   0.0s
[CV 3/5] END .........C=0, gamma=0, kernel=linear;, score=nan total time=   0.0s
[CV 4/5] END .........C=0, gamma=0, kernel=linear;, score=nan total time=   0.0s
[CV 5/5] END .........C=0, gamma=0, kernel=linear;, score=nan total time=   0.0s
[CV 1/5] END ............C=0, gamma=0, kernel=rbf;, score=nan total time=   0.0s
[CV 2/5] END ............C=0, gamma=0, kernel=rbf;, score=nan total time=   0.0s
[CV 3/5] END ............C=0, gamma=0, kernel=rbf;, score=nan total time=   0.0s
[CV 4/5] END ............C=0, gamma=0, kernel=rbf;, score=nan total time=   0.0s
[CV 5/5] END ............C=0, gamma=0, kernel=rbf;, score=nan total time=   0.0s
[CV 1/5] END ...........C=0, gamma=0, kernel=poly;, score=nan total time=   0.0s
[CV 2/5] END ...........C=0, gamma=0, kernel=po

75 fits failed out of a total of 375.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 180, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.u

In [None]:
grid.best_params_

{'C': 2, 'gamma': 1.5, 'kernel': 'rbf'}

We can see that the rbf kernel achieves the highest performance. It makes sense that this kernel performs better than the linear one, since the relationship between words and sentiment is not really linear. The poly kernel also achieved fairly good performance, but took way longer to train and is therefore not really optimal.

The C parameter determines the width of the margin. A smaller margin will mean lower training error, but also worse generalization. The C parameter is inversely related to the margin --> so we want C to be as small as possible to have the margin as large as possible.

With gamma, the higher the value the curvier the boundary will be --> higher gamma might cause overfitting. Therefore, we also want gamma to be as small as possible.

In [None]:
param_grid = {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]}
grid = GridSearchCV(SVC(), param_grid, verbose=3)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.595 total time=   2.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.596 total time=   1.9s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.596 total time=   2.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.596 total time=   1.9s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.596 total time=   2.6s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.602 total time=   2.6s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.610 total time=   1.9s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.609 total time=   1.9s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.627 total time=   1.9s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.614 total time=   2.0s
[CV 1/5] END .......C=0.1, gamma=10, kernel=rbf;, score=0.610 total time=   2.1s
[CV 2/5] END .......C=0.1, gamma=10, kernel=rbf;,

In [None]:
param_grid = {'kernel': ['rbf'], 'C': [8.5, 9, 9.5, 10, 10.5], 'gamma': [1, 5, 8, 10]}
grid = GridSearchCV(SVC(), param_grid, verbose=3)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END ........C=8.5, gamma=1, kernel=rbf;, score=0.612 total time=   2.2s
[CV 2/5] END ........C=8.5, gamma=1, kernel=rbf;, score=0.606 total time=   2.3s
[CV 3/5] END ........C=8.5, gamma=1, kernel=rbf;, score=0.609 total time=   2.2s
[CV 4/5] END ........C=8.5, gamma=1, kernel=rbf;, score=0.628 total time=   3.1s
[CV 5/5] END ........C=8.5, gamma=1, kernel=rbf;, score=0.619 total time=   2.7s
[CV 1/5] END ........C=8.5, gamma=5, kernel=rbf;, score=0.603 total time=   2.4s
[CV 2/5] END ........C=8.5, gamma=5, kernel=rbf;, score=0.606 total time=   2.3s
[CV 3/5] END ........C=8.5, gamma=5, kernel=rbf;, score=0.615 total time=   2.3s
[CV 4/5] END ........C=8.5, gamma=5, kernel=rbf;, score=0.643 total time=   2.8s
[CV 5/5] END ........C=8.5, gamma=5, kernel=rbf;, score=0.639 total time=   3.3s
[CV 1/5] END ........C=8.5, gamma=8, kernel=rbf;, score=0.608 total time=   2.5s
[CV 2/5] END ........C=8.5, gamma=8, kernel=rbf

In [None]:
param_grid = {'kernel': ['rbf'], 'C': [9, 9.5, 10, 10.5, 11], 'gamma': [7, 8, 9, 10, 11]}
grid = GridSearchCV(SVC(), param_grid, verbose=3)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ..........C=9, gamma=7, kernel=rbf;, score=0.603 total time=   2.9s
[CV 2/5] END ..........C=9, gamma=7, kernel=rbf;, score=0.606 total time=   2.3s
[CV 3/5] END ..........C=9, gamma=7, kernel=rbf;, score=0.614 total time=   2.4s
[CV 4/5] END ..........C=9, gamma=7, kernel=rbf;, score=0.646 total time=   2.5s
[CV 5/5] END ..........C=9, gamma=7, kernel=rbf;, score=0.634 total time=   3.4s
[CV 1/5] END ..........C=9, gamma=8, kernel=rbf;, score=0.608 total time=   2.5s
[CV 2/5] END ..........C=9, gamma=8, kernel=rbf;, score=0.612 total time=   2.3s
[CV 3/5] END ..........C=9, gamma=8, kernel=rbf;, score=0.617 total time=   2.4s
[CV 4/5] END ..........C=9, gamma=8, kernel=rbf;, score=0.644 total time=   2.3s
[CV 5/5] END ..........C=9, gamma=8, kernel=rbf;, score=0.630 total time=   3.0s
[CV 1/5] END ..........C=9, gamma=9, kernel=rbf;, score=0.611 total time=   3.0s
[CV 2/5] END ..........C=9, gamma=9, kernel=rbf

In [None]:
param_grid = {'kernel': ['rbf'], 'C': [10, 11, 11.5], 'gamma': [9.5, 9.75, 10, 10,5]}
grid = GridSearchCV(SVC(), param_grid, verbose=3)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END .......C=10, gamma=9.5, kernel=rbf;, score=0.610 total time=   3.8s
[CV 2/5] END .......C=10, gamma=9.5, kernel=rbf;, score=0.617 total time=   3.2s
[CV 3/5] END .......C=10, gamma=9.5, kernel=rbf;, score=0.619 total time=   2.3s
[CV 4/5] END .......C=10, gamma=9.5, kernel=rbf;, score=0.643 total time=   2.4s
[CV 5/5] END .......C=10, gamma=9.5, kernel=rbf;, score=0.632 total time=   2.3s
[CV 1/5] END ......C=10, gamma=9.75, kernel=rbf;, score=0.611 total time=   2.4s
[CV 2/5] END ......C=10, gamma=9.75, kernel=rbf;, score=0.617 total time=   3.4s
[CV 3/5] END ......C=10, gamma=9.75, kernel=rbf;, score=0.619 total time=   2.6s
[CV 4/5] END ......C=10, gamma=9.75, kernel=rbf;, score=0.643 total time=   2.4s
[CV 5/5] END ......C=10, gamma=9.75, kernel=rbf;, score=0.632 total time=   2.3s
[CV 1/5] END ........C=10, gamma=10, kernel=rbf;, score=0.611 total time=   2.4s
[CV 2/5] END ........C=10, gamma=10, kernel=rbf;

In [None]:
param_grid = {'kernel': ['rbf'], 'C': [11.5, 15, 25], 'gamma': [10]}
grid = GridSearchCV(SVC(), param_grid, verbose=3)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ......C=11.5, gamma=10, kernel=rbf;, score=0.612 total time=   3.6s
[CV 2/5] END ......C=11.5, gamma=10, kernel=rbf;, score=0.617 total time=   2.4s
[CV 3/5] END ......C=11.5, gamma=10, kernel=rbf;, score=0.622 total time=   2.3s
[CV 4/5] END ......C=11.5, gamma=10, kernel=rbf;, score=0.644 total time=   2.3s
[CV 5/5] END ......C=11.5, gamma=10, kernel=rbf;, score=0.635 total time=   3.0s
[CV 1/5] END ........C=15, gamma=10, kernel=rbf;, score=0.616 total time=   3.0s
[CV 2/5] END ........C=15, gamma=10, kernel=rbf;, score=0.619 total time=   2.4s
[CV 3/5] END ........C=15, gamma=10, kernel=rbf;, score=0.617 total time=   2.4s
[CV 4/5] END ........C=15, gamma=10, kernel=rbf;, score=0.640 total time=   2.4s
[CV 5/5] END ........C=15, gamma=10, kernel=rbf;, score=0.635 total time=   2.6s
[CV 1/5] END ........C=25, gamma=10, kernel=rbf;, score=0.619 total time=   3.6s
[CV 2/5] END ........C=25, gamma=10, kernel=rbf;,

In [None]:
param_grid = {'kernel': ['rbf'], 'C': [10], 'gamma': [9.75, 10, 10.25, 10.5, 10.75, 11]}
grid = GridSearchCV(SVC(), param_grid, verbose=3)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ......C=10, gamma=9.75, kernel=rbf;, score=0.611 total time=   3.3s
[CV 2/5] END ......C=10, gamma=9.75, kernel=rbf;, score=0.617 total time=   3.2s
[CV 3/5] END ......C=10, gamma=9.75, kernel=rbf;, score=0.619 total time=   2.8s
[CV 4/5] END ......C=10, gamma=9.75, kernel=rbf;, score=0.643 total time=   2.4s
[CV 5/5] END ......C=10, gamma=9.75, kernel=rbf;, score=0.632 total time=   2.4s
[CV 1/5] END ........C=10, gamma=10, kernel=rbf;, score=0.611 total time=   2.4s
[CV 2/5] END ........C=10, gamma=10, kernel=rbf;, score=0.617 total time=   2.9s
[CV 3/5] END ........C=10, gamma=10, kernel=rbf;, score=0.622 total time=   3.2s
[CV 4/5] END ........C=10, gamma=10, kernel=rbf;, score=0.644 total time=   2.4s
[CV 5/5] END ........C=10, gamma=10, kernel=rbf;, score=0.631 total time=   2.3s
[CV 1/5] END .....C=10, gamma=10.25, kernel=rbf;, score=0.610 total time=   2.4s
[CV 2/5] END .....C=10, gamma=10.25, kernel=rbf;,

# Evaluate on test set

So, it appears the optimal parameters for the SVC are C=10, gamma=10 and kernel='rbf'. We can now train this model and evaluate it on the test set.

In [None]:
import sys
sys.path.insert(0, '/content/drive/My Drive/Bachelor Scriptie KI/Programming/Notebooks')

In [None]:
from sklearn.svm import SVC
import multi_class_performance_eval as mce

In [None]:
svc = SVC(class_weight = train_weights, C=10, gamma=10, kernel='rbf')

In [None]:
import time

start_time = time.time_ns()
svc.fit(X_train, y_train)
end_time = time.time_ns()
el_time = (end_time - start_time)/1000000000
print("Elapsed time: ", str(el_time), "seconds")


Elapsed time:  4.218312117 seconds


In [None]:
y_pred = svc.predict(X_test)

In [None]:
evaluation = mce.evaluate_performance(y_pred, y_test)

In [None]:
for metric in evaluation:
  print(metric, ": ", evaluation[metric], "\n")

Accuracy :  57.1134 

Base Positive :  {'TP': 50, 'FP': 80, 'TN': 275, 'FN': 80} 

Base Neutral :  {'TP': 194, 'FP': 65, 'TN': 135, 'FN': 91} 

Base Negative :  {'TP': 33, 'FP': 63, 'TN': 352, 'FN': 37} 

Advanced Positive :  {'Precision': 0.38461538461538464, 'Recall': 0.38461538461538464, 'Specificity': 0.7746478873239436} 

Advanced Neutral :  {'Precision': 0.749034749034749, 'Recall': 0.6807017543859649, 'Specificity': 0.675} 

Advanced Negative :  {'Precision': 0.34375, 'Recall': 0.4714285714285714, 'Specificity': 0.8481927710843373} 

Balanced Accuracy :  0.5122485701433069 

F_Score :  0.12667830858710485 



In [None]:
print(mce.confusion_matrix(y_pred, y_test))

[[ 33  30  33]
 [ 18 194  47]
 [ 19  61  50]]


#Pretrained Word2Vec

In this section, we will use the pre-trained Word2Vec model on the Google News dataset.

In [None]:
from gensim.models import Word2Vec, KeyedVectors

In [None]:
w2v_model = KeyedVectors.load_word2vec_format("/content/drive/My Drive/Bachelor Scriptie KI/Programming/Notebooks/GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
def convert_to_pretrained(dataframe, pretrained_emb, learned_emb):
  """
  Converts the textual sentences into vector representations using the pre-trained Google News vectors.

  param dataframe: Pandas DataFrame containing the sentences.
  param pretrained_emb: Word2Vec model containing pre-trained embeddings.
  param learned_emb: Word2Vec model learned on the dataset.
  """
  total_embeddings = []
  i = 0
  j = 0
  k = 0
  for sentence in dataframe.Sentence:
    sent = sentence.split()
    embedded_sent = []
    for word in sent:
      representation = [0] * 300
      try:
        #If the pretrained model encountered an unknown word, it would otherwise give an error
        representation = pretrained_emb[word]
        i += 1
      except:
        try:
          representation = learned_emb.wv[word]
          k += 1
        except:
          a = 0
      embedded_sent.append(representation)
      j += 1
    total_embeddings.append(embedded_sent)
  print("Total times used Google representation: ", str(i))
  print("Total times learned representation used: ", str(k))
  print("Total unknown words: ", str(a))
  print("Total amount of word seen: ", str(j))
  return total_embeddings

In [None]:
train_pre, test_pre = load_and_prepare()

In [None]:
train_vect_pre = convert_to_pretrained(train_pre, w2v_model, embeddings)
print("-------------------")
test_vect_pre = convert_to_pretrained(test_pre, w2v_model, embeddings)

Total times used Google representation:  41846
Total times learned representation used:  11684
Total unknown words:  0
Total amount of word seen:  53532
-------------------
Total times used Google representation:  5357
Total times learned representation used:  1407
Total unknown words:  0
Total amount of word seen:  6765


In [None]:
equalized_train_pre = equalize_vector_lengths(train_vect_pre, des_length=desired_length)
equalized_test_pre = equalize_vector_lengths(test_vect_pre, des_length=desired_length)

In [None]:
train_pre['Vectorized'] = equalized_train_pre
test_pre['Vectorized'] = equalized_test_pre

In [None]:
print(train_pre.info())
print(test_pre.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3876 entries, 3738 to 1624
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Sentence    3876 non-null   object
 1   Sentiment   3876 non-null   int64 
 2   Vectorized  3876 non-null   object
dtypes: int64(1), object(2)
memory usage: 121.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 485 entries, 2318 to 4756
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Sentence    485 non-null    object
 1   Sentiment   485 non-null    int64 
 2   Vectorized  485 non-null    object
dtypes: int64(1), object(2)
memory usage: 15.2+ KB
None


In [None]:
print(train_weights)

{2: 3.5462031107044827, 1: 1.677922077922078, 0: 8.194503171247357}


In [None]:
X_train_pre = train_pre.Vectorized.tolist()
X_test_pre = test_pre.Vectorized.tolist()

y_train_pre = train_pre.Sentiment.tolist()
y_test_pre = test_pre.Sentiment.tolist()

In [None]:
#Train the model with the pretrained vectors using the optimal hyperparameters found for the non-pretrained vectors.
import time
model = SVC(kernel = 'rbf', C=10, gamma=10, class_weight=train_weights)

start_time = time.time_ns()
model.fit(X_train_pre, y_train_pre)
end_time = time.time_ns()
el_time = (end_time - start_time)/1000000000
print("Elapsed time: ", str(el_time), "seconds")

Elapsed time:  4.206768583 seconds


In [None]:
y_pred = model.predict(X_test_pre)

In [None]:
evaluation = mce.evaluate_performance(y_pred, y_test_pre)

In [None]:
for metric in evaluation:
  print(metric, ": ", evaluation[metric], "\n")

Accuracy :  64.1237 

Base Positive :  {'TP': 18, 'FP': 13, 'TN': 342, 'FN': 112} 

Base Neutral :  {'TP': 284, 'FP': 156, 'TN': 44, 'FN': 1} 

Base Negative :  {'TP': 9, 'FP': 5, 'TN': 410, 'FN': 61} 

Advanced Positive :  {'Precision': 0.5806451612903226, 'Recall': 0.13846153846153847, 'Specificity': 0.9633802816901409} 

Advanced Neutral :  {'Precision': 0.6454545454545455, 'Recall': 0.9964912280701754, 'Specificity': 0.22} 

Advanced Negative :  {'Precision': 0.6428571428571429, 'Recall': 0.12857142857142856, 'Specificity': 0.9879518072289156} 

Balanced Accuracy :  0.42117473170104747 

F_Score :  0.1318692250747273 



In [None]:
print(mce.confusion_matrix(y_pred, y_test_pre))

[[  9   0   5]
 [ 49 284 107]
 [ 12   1  18]]


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['rbf'], 'C': [1, 5, 10, 20], 'gamma': [1, 10, 50]}
grid = GridSearchCV(SVC(), param_grid, verbose=3)
grid.fit(X_train_pre, y_train_pre)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.718 total time=   1.5s
[CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.754 total time=   1.4s
[CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.754 total time=   1.4s
[CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.756 total time=   1.4s
[CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.751 total time=   1.8s
[CV 1/5] END .........C=1, gamma=10, kernel=rbf;, score=0.624 total time=   2.9s
[CV 2/5] END .........C=1, gamma=10, kernel=rbf;, score=0.619 total time=   2.3s
[CV 3/5] END .........C=1, gamma=10, kernel=rbf;, score=0.632 total time=   2.3s
[CV 4/5] END .........C=1, gamma=10, kernel=rbf;, score=0.637 total time=   2.3s
[CV 5/5] END .........C=1, gamma=10, kernel=rbf;, score=0.628 total time=   2.2s
[CV 1/5] END .........C=1, gamma=50, kernel=rbf;, score=0.602 total time=   3.3s
[CV 2/5] END .........C=1, gamma=50, kernel=rbf;

In [None]:
param_grid = {'kernel': ['rbf'], 'C': [3, 4, 5, 6, 7], 'gamma': [0.5, 0.75, 1, 1.25, 1.5, 1.75]}
grid = GridSearchCV(SVC(), param_grid, verbose=3)
grid.fit(X_train_pre, y_train_pre)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END ........C=3, gamma=0.5, kernel=rbf;, score=0.753 total time=   1.2s
[CV 2/5] END ........C=3, gamma=0.5, kernel=rbf;, score=0.768 total time=   1.2s
[CV 3/5] END ........C=3, gamma=0.5, kernel=rbf;, score=0.770 total time=   1.2s
[CV 4/5] END ........C=3, gamma=0.5, kernel=rbf;, score=0.774 total time=   1.2s
[CV 5/5] END ........C=3, gamma=0.5, kernel=rbf;, score=0.770 total time=   1.2s
[CV 1/5] END .......C=3, gamma=0.75, kernel=rbf;, score=0.759 total time=   1.3s
[CV 2/5] END .......C=3, gamma=0.75, kernel=rbf;, score=0.769 total time=   1.3s
[CV 3/5] END .......C=3, gamma=0.75, kernel=rbf;, score=0.779 total time=   2.0s
[CV 4/5] END .......C=3, gamma=0.75, kernel=rbf;, score=0.773 total time=   1.7s
[CV 5/5] END .......C=3, gamma=0.75, kernel=rbf;, score=0.779 total time=   1.3s
[CV 1/5] END ..........C=3, gamma=1, kernel=rbf;, score=0.755 total time=   1.4s
[CV 2/5] END ..........C=3, gamma=1, kernel=rbf

In [None]:
param_grid = {'kernel': ['rbf'], 'C': [3.5, 3.75, 4, 4.25, 4.5], 'gamma': [0.6, 0.65, 0.7, 0.75, 0.8]}
grid = GridSearchCV(SVC(), param_grid, verbose=3)
grid.fit(X_train_pre, y_train_pre)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ......C=3.5, gamma=0.6, kernel=rbf;, score=0.762 total time=   1.9s
[CV 2/5] END ......C=3.5, gamma=0.6, kernel=rbf;, score=0.764 total time=   1.9s
[CV 3/5] END ......C=3.5, gamma=0.6, kernel=rbf;, score=0.777 total time=   1.3s
[CV 4/5] END ......C=3.5, gamma=0.6, kernel=rbf;, score=0.775 total time=   1.2s
[CV 5/5] END ......C=3.5, gamma=0.6, kernel=rbf;, score=0.777 total time=   1.3s
[CV 1/5] END .....C=3.5, gamma=0.65, kernel=rbf;, score=0.763 total time=   1.3s
[CV 2/5] END .....C=3.5, gamma=0.65, kernel=rbf;, score=0.766 total time=   1.3s
[CV 3/5] END .....C=3.5, gamma=0.65, kernel=rbf;, score=0.778 total time=   1.3s
[CV 4/5] END .....C=3.5, gamma=0.65, kernel=rbf;, score=0.775 total time=   1.3s
[CV 5/5] END .....C=3.5, gamma=0.65, kernel=rbf;, score=0.778 total time=   1.7s
[CV 1/5] END ......C=3.5, gamma=0.7, kernel=rbf;, score=0.758 total time=   2.0s
[CV 2/5] END ......C=3.5, gamma=0.7, kernel=rbf

In [None]:
param_grid = {'kernel': ['rbf'], 'C': [3.5, 3.6, 3.75, 3.8, 3.9], 'gamma': [0.6, 0.62, 0.65, 0.68, 0.7]}
grid = GridSearchCV(SVC(), param_grid, verbose=3)
grid.fit(X_train_pre, y_train_pre)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ......C=3.5, gamma=0.6, kernel=rbf;, score=0.762 total time=   1.3s
[CV 2/5] END ......C=3.5, gamma=0.6, kernel=rbf;, score=0.764 total time=   1.3s
[CV 3/5] END ......C=3.5, gamma=0.6, kernel=rbf;, score=0.777 total time=   1.3s
[CV 4/5] END ......C=3.5, gamma=0.6, kernel=rbf;, score=0.775 total time=   1.3s
[CV 5/5] END ......C=3.5, gamma=0.6, kernel=rbf;, score=0.777 total time=   1.3s
[CV 1/5] END .....C=3.5, gamma=0.62, kernel=rbf;, score=0.763 total time=   1.3s
[CV 2/5] END .....C=3.5, gamma=0.62, kernel=rbf;, score=0.765 total time=   1.9s
[CV 3/5] END .....C=3.5, gamma=0.62, kernel=rbf;, score=0.779 total time=   1.6s
[CV 4/5] END .....C=3.5, gamma=0.62, kernel=rbf;, score=0.775 total time=   1.3s
[CV 5/5] END .....C=3.5, gamma=0.62, kernel=rbf;, score=0.778 total time=   1.2s
[CV 1/5] END .....C=3.5, gamma=0.65, kernel=rbf;, score=0.763 total time=   1.3s
[CV 2/5] END .....C=3.5, gamma=0.65, kernel=rbf

## So, for the pretrained embeddings the optimal hyperparameters are C=3.75 and gamma=0.65.

In [None]:
#Test performance of the model using pre-trained embeddings and the optimal hyperparameters found for this model.
import time

svc_pretrain = SVC(kernel='rbf', C=3.75, gamma=0.65, class_weight=train_weights)

start_time = time.time_ns()
svc_pretrain.fit(X_train_pre, y_train_pre)
end_time = time.time_ns()
el_time = (end_time - start_time)/1000000000
print("Elapsed time: ", str(el_time), "seconds")

Elapsed time:  2.637742646 seconds


In [None]:
y_pred_pretrain = svc_pretrain.predict(X_test_pre)

In [None]:
evaluation_pretrain = mce.evaluate_performance(y_pred_pretrain, y_test_pre)

In [None]:
for metric in evaluation_pretrain:
  print(metric, ": ", evaluation_pretrain[metric], "\n")

Accuracy :  76.2887 

Base Positive :  {'TP': 85, 'FP': 49, 'TN': 306, 'FN': 45} 

Base Neutral :  {'TP': 235, 'FP': 43, 'TN': 157, 'FN': 50} 

Base Negative :  {'TP': 50, 'FP': 23, 'TN': 392, 'FN': 20} 

Advanced Positive :  {'Precision': 0.6343283582089553, 'Recall': 0.6538461538461539, 'Specificity': 0.8619718309859155} 

Advanced Neutral :  {'Precision': 0.8453237410071942, 'Recall': 0.8245614035087719, 'Specificity': 0.785} 

Advanced Negative :  {'Precision': 0.684931506849315, 'Recall': 0.7142857142857143, 'Specificity': 0.944578313253012} 

Balanced Accuracy :  0.7308977572135467 

F_Score :  0.38296190225180615 



In [None]:
print(mce.confusion_matrix(y_pred_pretrain, y_test_pre))

[[ 50   9  14]
 [ 12 235  31]
 [  8  41  85]]


In [None]:
import pickle

In [None]:
#Save the optimal pre-trained embeddings model.
filename = "SVC_pretrained_optimparam.pickle"
pickle.dump(svc_pretrain, open(filename, "wb"))

In [None]:
#Load the saved model and test it on the test set.
loaded_best_model = pickle.load(open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Financial_SVC_pretrained_optimparam.pickle", "rb"))
start = time.time()
y_pred_loaded = loaded_best_model.predict(X_test_pre)
end = time.time()
print("Prediction time: ", str(end-start), " seconds")

Prediction time:  0.8276116847991943  seconds


In [None]:
evaluation_pretrain = mce.evaluate_performance(y_pred_loaded, y_test_pre)

In [None]:
for metric in evaluation_pretrain:
  print(metric, ": ", evaluation_pretrain[metric], "\n")

Accuracy :  76.2887 

Base Positive :  {'TP': 85, 'FP': 50, 'TN': 305, 'FN': 45} 

Base Neutral :  {'TP': 235, 'FP': 43, 'TN': 157, 'FN': 50} 

Base Negative :  {'TP': 50, 'FP': 22, 'TN': 393, 'FN': 20} 

Advanced Positive :  {'Precision': 0.6296296296296297, 'Recall': 0.6538461538461539, 'Specificity': 0.8591549295774648} 

Advanced Neutral :  {'Precision': 0.8453237410071942, 'Recall': 0.8245614035087719, 'Specificity': 0.785} 

Advanced Negative :  {'Precision': 0.6944444444444444, 'Recall': 0.7142857142857143, 'Specificity': 0.946987951807229} 

Balanced Accuracy :  0.7308977572135467 

F_Score :  0.3842427361711301 



In [None]:
print(mce.confusion_matrix(y_pred_loaded, y_test_pre))

[[ 50   8  14]
 [ 12 235  31]
 [  8  42  85]]


#Save indices of predictions

In [None]:
def get_specific_errors(dataframe, y_pred, y_real, vertical, horizontal):
	"""
	Get the indexes from specific cells in the confusion matrix.

	param dataframe: Pandas DataFrame containing the sentences and indices.
	param y_pred: (list) contains the predicted sentiments.
	param y_real: (list) contains the real sentiments.
	param vertical: (int) corresponds to the column in the confusion matrix.
	param horizontal: (int) corresponds to the row in the confusion matrix.
	"""
	i = 0
	errors = []
	while i < len(dataframe.Sentence):
		if (horizontal == y_pred[i]) and (vertical == y_real[i]):
			errors.append(dataframe.index[i])
		i += 1
	return errors

In [None]:
#Neutral sentences predicted to be positive.
pos_neu_errors_index = get_specific_errors(test_pre, y_pred_loaded, y_test_pre, 1, 2)

In [None]:
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Financial/updated_SVM_financial_pos_neu.txt", "w") as writefile:
  for index in pos_neu_errors_index:
    writefile.write(str(index))
    writefile.write("\n")

In [None]:
#Positive sentences predicted to be neutral.
neu_pos_errors_index = get_specific_errors(test_pre, y_pred_loaded, y_test_pre, 2, 1)

In [None]:
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Financial/updated_SVM_financial_neu_pos.txt", "w") as writefile:
  for index in neu_pos_errors_index:
    writefile.write(str(index))
    writefile.write("\n")

In [None]:
#True positives for the neutral class.
tp_neu_index = get_specific_errors(test_pre, y_pred_loaded, y_test_pre, 1, 1)

In [None]:
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Financial/updated_SVM_financial_tp_neu.txt", "w") as writefile:
  for index in tp_neu_index:
    writefile.write(str(index))
    writefile.write("\n")

#Non-pretrained vector model with hyperparameters found for pretrained vector model

In [None]:
svc_basic = SVC(kernel='rbf', C=3.75, gamma=0.65, class_weight=train_weights) #This is a non-pretrained vector model with the hyperparameters found from the pretrained model

In [None]:
import time

start_time = time.time_ns()
svc_basic.fit(X_train, y_train)
end_time = time.time_ns()
el_time = (end_time - start_time)/1000000000
print("Elapsed time: ", str(el_time), "seconds")

Elapsed time:  5.583744543 seconds


In [None]:
y_pred_basic = svc_basic.predict(X_test)

In [None]:
evaluation_basic = mce.evaluate_performance(y_pred_basic, y_test)

In [None]:
for metric in evaluation_basic:
  print(metric, ": ", evaluation_basic[metric])
  print("\n")

Accuracy :  52.3711


Base Positive :  {'TP': 37, 'FP': 91, 'TN': 264, 'FN': 93}


Base Neutral :  {'TP': 185, 'FP': 64, 'TN': 136, 'FN': 100}


Base Negative :  {'TP': 32, 'FP': 76, 'TN': 339, 'FN': 38}


Advanced Positive :  {'Precision': 0.2890625, 'Recall': 0.2846153846153846, 'Specificity': 0.7436619718309859}


Advanced Neutral :  {'Precision': 0.7429718875502008, 'Recall': 0.6491228070175439, 'Specificity': 0.68}


Advanced Negative :  {'Precision': 0.2962962962962963, 'Recall': 0.45714285714285713, 'Specificity': 0.8168674698795181}


Balanced Accuracy :  0.4636270162585952


F_Score :  0.09298557802616766




In [None]:
print(mce.confusion_matrix(y_pred_basic, y_test))

[[ 32  29  47]
 [ 18 185  46]
 [ 20  71  37]]
