In [None]:
!pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import seaborn as sns
import nltk
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import time

In [None]:
#Mount Google Drive (Colab only); the dataset and the helper module are read from /content/drive below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys

#Make modules stored in the Drive notebook folder importable; the folder is searched first.
NOTEBOOK_DIR = '/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks'
sys.path.insert(0, NOTEBOOK_DIR)

In [None]:
import multi_class_performance_eval as mce

In [None]:
#Fix the random seeds of both generators so that repeated runs are reproducible.
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Load the test set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
#VADER is lexicon-based and needs no training, so only the test set is loaded.
TEST_SET_PATH = "/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/clean_test_drugscom.csv"
test_set = pd.read_csv(
    TEST_SET_PATH,
    sep=",",
    names=["Sentence", "Sentiment"],  #column names used throughout this notebook
    encoding="utf-8",
    skiprows=[0],  #skip the first line of the file (presumably its original header - confirm)
)

In [None]:
#The DL model uses a 50/50 test/validation split, so the same split is applied here to keep the test sets equal.
X_test, X_val, y_test, y_val = train_test_split(
    test_set["Sentence"],
    test_set["Sentiment"],
    test_size=0.5,
    random_state=42,
    stratify=test_set["Sentiment"],  #keep the class proportions equal in both halves
)

In [None]:
#Sanity check: number of entries, dtype and non-null count of the test sentences.
X_test.info()

<class 'pandas.core.series.Series'>
Int64Index: 26883 entries, 8500 to 27424
Series name: Sentence
Non-Null Count  Dtype 
--------------  ----- 
26883 non-null  object
dtypes: object(1)
memory usage: 420.0+ KB


In [None]:
#Convert the labels in y_test to int and give them a fresh 0..n-1 index.
#train_test_split keeps the shuffled row labels of test_set, so an assignment such as
#`y_test[i] = ...` writes to the row *labelled* i (overwriting another row, or appending a
#new row when label i is absent) instead of to position i. A vectorised cast avoids that.
#After the reset, label i and position i coincide, so y_test[i] and plain iteration both
#follow the row order of X_test (X_test keeps its original index for the error files).
y_test = y_test.astype(int).reset_index(drop=True)

# Sentiment Classification

In [None]:
def classify_VADER(dataframe, threshold=0.05, analyzer=None):
  """
  Labels every sentence with the VADER lexicon, based on its compound score.

  Positive (2): compound >= threshold
  Neutral  (1): -threshold <= compound < threshold
  Negative (0): compound < -threshold

  param dataframe: iterable of sentences (str), e.g. a pandas Series of reviews.
  param threshold: (float) non-negative cut-off on the compound score, 0.05 by default.
  param analyzer: object with a polarity_scores(text) method that returns a dict with a
                  'compound' key. A new SentimentIntensityAnalyzer is created when omitted.
  return: tuple (y_head, scores): the predicted labels (int) and the compound scores,
          both in the order of the input.
  """
  if analyzer is None:
    analyzer = SentimentIntensityAnalyzer()
  y_head = []
  scores = []
  for sentence in dataframe:
    compound = analyzer.polarity_scores(sentence)['compound']
    scores.append(compound)
    if compound >= threshold:
      y_head.append(2)
    elif compound >= -threshold:
      y_head.append(1)
    else:
      #Final catch-all branch: every score gets exactly one label, so y_head and scores always have the same length.
      y_head.append(0)
  return y_head, scores

In [None]:
#Generate predictions for the test set using VADER and time the run.
#perf_counter() is a monotonic, high-resolution clock meant for measuring durations,
#so the result cannot be skewed by adjustments of the system clock.
start_time = time.perf_counter()
y_pred, scores = classify_VADER(X_test)
end_time = time.perf_counter()
print("Elapsed time: ", str(end_time - start_time), "s")

Elapsed time:  7.252785921096802 s


In [None]:
#Score the VADER predictions with the helper module (predictions first, true labels second).
evaluation = mce.evaluate_performance(y_pred, y_test)

In [None]:
#Print every metric name followed by its value, each on its own line.
for metric, value in evaluation.items():
  print(metric, "\n", value, "\n")

Accuracy 
 55.2208 

Base Positive 
 {'TP': 10273, 'FP': 2984, 'TN': 6179, 'FN': 7447} 

Base Neutral 
 {'TP': 130, 'FP': 1180, 'TN': 23288, 'FN': 2285} 

Base Negative 
 {'TP': 4442, 'FP': 7874, 'TN': 12261, 'FN': 2306} 

Advanced Positive 
 {'Precision': 0.774911367579392, 'Recall': 0.5797404063205418, 'Specificity': 0.6743424642584307} 

Advanced Neutral 
 {'Precision': 0.09923664122137404, 'Recall': 0.053830227743271224, 'Specificity': 0.9517737452999836} 

Advanced Negative 
 {'Precision': 0.36066904839233516, 'Recall': 0.6582691167753408, 'Specificity': 0.6089396573131364} 

Balanced Accuracy 
 0.430613250279718 

F_Score 
 0.07460063136952352 



In [None]:
#Confusion matrix of the predictions against the true labels.
#NOTE(review): judging by the TP/FP counts printed above, rows appear to be the predictions
#and columns the real labels - confirm in multi_class_performance_eval.
print(mce.confusion_matrix(y_pred, y_test))

[[ 4442  1242  6632]
 [  365   130   815]
 [ 1941  1043 10273]]


# Sentence-level error inspection

In [None]:
def get_specific_errors(sentences, y_pred, y_real, vertical, horizontal):
	"""
	Collects the index labels of the sentences that fall in one cell of the confusion matrix,
	e.g. neutral instances predicted as positive.

	param sentences: object with a length and an .index (such as a pandas Series) holding the sentences.
	param y_pred: predictions made by the model, read as y_pred[i].
	param y_real: actual sentiment labels, read as y_real[i]. For a pandas Series with an
	              integer index this is a lookup by label, so it should be indexed 0..n-1.
	param vertical: (int) the real sentiment to select.
	param horizontal: (int) the predicted sentiment to select.
	return: list with sentences.index[i] for every i where prediction and real label both match.
	"""
	return [
		sentences.index[position]
		for position in range(len(sentences))
		if horizontal == y_pred[position] and vertical == y_real[position]
	]

In [None]:
#Folder that receives one text file per inspected cell of the confusion matrix.
ERROR_INDEX_DIR = "/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/"

def write_index_file(filename, indexes):
  """
  Writes the given indexes to a text file in ERROR_INDEX_DIR, one per line.
  An existing file with the same name is overwritten.

  param filename: (str) name of the text file inside ERROR_INDEX_DIR.
  param indexes: iterable of index values; each one is written via str().
  """
  with open(ERROR_INDEX_DIR + filename, "w") as writefile:
    for index in indexes:
      writefile.write(str(index) + "\n")

#The last two arguments of get_specific_errors are (real sentiment, predicted sentiment),
#with 0 = negative, 1 = neutral and 2 = positive.

#Negative sentences predicted to be positive.
pos_neg_error_index = get_specific_errors(X_test, y_pred, y_test, 0, 2)
write_index_file("updated_LB_medical_pos_neg.txt", pos_neg_error_index)

#Positive sentences predicted to be negative.
neg_pos_error_index = get_specific_errors(X_test, y_pred, y_test, 2, 0)
write_index_file("updated_LB_medical_neg_pos.txt", neg_pos_error_index)

#Neutral sentences predicted to be positive.
pos_neu_error_index = get_specific_errors(X_test, y_pred, y_test, 1, 2)
write_index_file("updated_LB_medical_pos_neu.txt", pos_neu_error_index)

#Neutral sentences predicted to be negative.
neg_neu_error_index = get_specific_errors(X_test, y_pred, y_test, 1, 0)
write_index_file("updated_LB_medical_neg_neu.txt", neg_neu_error_index)

#True positives for the neutral class.
tp_neu_index = get_specific_errors(X_test, y_pred, y_test, 1, 1)
write_index_file("updated_LB_medical_tp_neu.txt", tp_neu_index)