In [None]:
import pandas as pd
import nltk
import sklearn
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import sys
sys.path.insert(0, '/content/drive/My Drive/Bachelor Scriptie KI/Programming/Notebooks')

In [None]:
from sklearn.svm import SVC
import multi_class_performance_eval as mce

In [None]:
#Setting the random seeds for reproducability
import random
np.random.seed(42)
random.seed(42)

# Loading the dataset

In [None]:
#Both train and test have already been tokenized and cleaned during preprocessing, so we only have to load them.
train_set = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/clean_train_drugscom.csv", sep=",", names=["Sentence", "Sentiment"], skiprows=[0])
test_set = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/clean_test_drugscom.csv", sep=",", names=["Sentence", "Sentiment"], skiprows=[0])

In [None]:
print(train_set.info())
print(test_set.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161297 entries, 0 to 161296
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Sentence   161294 non-null  object 
 1   Sentiment  161297 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 53766 entries, 0 to 53765
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Sentence   53766 non-null  object 
 1   Sentiment  53766 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.2+ MB
None


Train contains 3 rows with a null value, so we will remove these now.

In [None]:
train_set.dropna(inplace=True)

In [None]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161294 entries, 0 to 161296
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Sentence   161294 non-null  object 
 1   Sentiment  161294 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
#Split the test set into test and val using the same random state as for the other two models.
X_test, X_val, y_test, y_val = train_test_split(test_set.Sentence, test_set.Sentiment, test_size=0.5, random_state=42, stratify=test_set.Sentiment)

In [None]:
X_test.head()

8500     viagra = strongest one terrible heartburn wate...
19047    5 ' 7 quot weigh 109lbs three kids 8lbs anorex...
32586    product god send minor burning sensation apply...
20840    medicine works wonders ' go anti-nausea med wi...
22141    taking medication 12 days sometimes diarrhea m...
Name: Sentence, dtype: object

# Create list of words that can be used to generate embeddings

In [None]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#Embeddings will be trained on the whole dataset, so we append train and test sets.
full_dataset = train_set.append(test_set, ignore_index=True)

  full_dataset = train_set.append(test_set, ignore_index=True)


In [None]:
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215060 entries, 0 to 215059
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Sentence   215060 non-null  object 
 1   Sentiment  215060 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.3+ MB


In [None]:
tokenized_dataset = []

for sentence in full_dataset.Sentence:
    tokenized = word_tokenize(sentence)
    tokenized_dataset.append(tokenized)

# Train embeddings on train set

In [None]:
from gensim.models import Word2Vec

In [None]:
#Train the Word2Vec model on the whole dataset, using vector_size = 300 (since Google News vectors also use length 300)
embeddings = Word2Vec(tokenized_dataset, min_count=1, vector_size=300)

In [None]:
embeddings.wv.most_similar('drug')

[('medicine', 0.6598494052886963),
 ('medication', 0.6591436266899109),
 ('med', 0.6173158288002014),
 ('drugs', 0.5617853403091431),
 ('thatmatter', 0.507120668888092),
 ('demonized', 0.46689373254776),
 ('benzo', 0.4520038366317749),
 ('parnate', 0.43318814039230347),
 ('opiate', 0.42383041977882385),
 ('benzodiazepine', 0.42211678624153137)]

# Check label balance

In [None]:
def determine_weights(dataframe):
  """
  Determine the inverse class weights so the SVM can compensate for the imbalance in the dataset.

  param dataframe: Pandas DataFrame containing the sentiment labels.
  """
  total = 0
  pos = 0
  neu = 0
  neg = 0
  i = 0

  for sent in dataframe.Sentiment:
    if sent == 2:
      pos += 1
    elif sent == 1:
      neu += 1
    else:
      neg += 1
    total += 1
    i += 1

  weights = {2: 1/(pos/total), 1: 1/(neu/total), 0: 1/(neg/total)}
  return weights

In [None]:
train_weights = determine_weights(train_set)
test_weights = determine_weights(test_set)

In [None]:
for weight in train_weights:
  print("The label ", str(weight), " occurs ", str(1 / train_weights[weight] * 100), "% in the train set")

The label  2  occurs  66.25354941907324 % in the train set
The label  1  occurs  8.900517068210846 % in the train set
The label  0  occurs  24.84593351271591 % in the train set


In [None]:
for weight in test_weights:
  print("The label ", str(weight), " occurs ", str(1 / test_weights[weight] * 100), "% in the test set")

The label  2  occurs  65.91526243350818 % in the test set
The label  1  occurs  8.981512480005952 % in the test set
The label  0  occurs  25.10322508648588 % in the test set


#Generate vectors

In [None]:
def convert_to_vector(dataframe, embeddings):
  """
  Converts the sentences in the dataframe into vector representations.

  param dataframe: Pandas DataFrame containing the sentences.
  param embeddings: Word2Vec model trained on the full dataset.
  """
  total_embeddings = []
  i = 0
  j = 0
  for sentence in dataframe:
    sent = sentence.split()
    embedded_sent = []
    for word in sent:
      representation = [0] * 300          #If we encounter a word that is not present in the Word2Vec model, then we give this a vector with 0's as representation
      try:
        representation = embeddings.wv[word]
      except:
        i += 1
      embedded_sent.append(representation)
      j += 1
    total_embeddings.append(embedded_sent)
  print("Unknown words: ", str(i))
  print("Total words seen: ", str(j))
  return total_embeddings

In [None]:
train_vect = convert_to_vector(train_set.Sentence, embeddings)
test_vect = convert_to_vector(X_test, embeddings)

Unknown words:  8336
Total words seen:  7100779
Unknown words:  1376
Total words seen:  1182551


In [None]:
#The sentences will be represented by vectors of numbers. However, the SVM needs all inputs to have the same dimension, even though the sentences are not necessarily the same length.
#Since the Google Vectors that will be used later on have length=300, this is the value we'll use for the custom vectors as well.
desired_length = 300

In [None]:
import statistics

def calculate_average_vector(vect_list, des_length):
  """
  Given a list of vectors (a sentence), calculate the average vector with des_length.

  param vect_list: (list) list with vectors.
  param des_length: (int) the length we want our vectors to be.
  """
  averaged_vector = []
  i = 0
  while i < des_length:
    values = []
    j = 0
    while j < len(vect_list):
      values.append(vect_list[j][i])
      j += 1
    i += 1
    average = statistics.mean(values)
    averaged_vector.append(average)
  return averaged_vector

In [None]:
#Now we apply this to all the sentences in the dataset
def equalize_vector_lengths(vectors, des_length):
  """
  Equalizes all sentences in the dataset to vectors of the same length.

  param vectors: (list) list containing vectors.
  param des_length: (int) length we want the vectors representing sentences to be.
  """
  equalized_vectors = []
  for sentence in vectors:
    equalized_vectors.append(calculate_average_vector(sentence, des_length=desired_length))
  return equalized_vectors

In [None]:
equalized_vects_train = equalize_vector_lengths(train_vect, des_length = desired_length)  #Each sentence will now be encoded, with the 300 features vectors
equalized_vects_test = equalize_vector_lengths(test_vect, des_length = desired_length)

## Save vector representations

Since it took a really long time to generate the vector representations due to the size of the dataset, we save the generated vectors for later use.

In [None]:
train_vects_dict = {"Vectorized": equalized_vects_train}
test_vects_dict = {"Vectorized": equalized_vects_test}

In [None]:
train_vects_df = pd.DataFrame(data=train_vects_dict)
test_vects_df = pd.DataFrame(data=test_vects_dict)

In [None]:
test_vects_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26883 entries, 0 to 26882
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Vectorized  26883 non-null  object
dtypes: object(1)
memory usage: 210.1+ KB


In [None]:
from google.colab import drive
drive.mount('/content/drive')

with open("/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/train_vectors_drugscom.csv", 'w', encoding = 'utf-8-sig') as f:
  train_vects_df.to_csv(f)

with open("/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/test_vectors_drugscom.csv", 'w', encoding = 'utf-8-sig') as f:
  test_vects_df.to_csv(f)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#The vectors can now simply be loaded.
train_vects_str = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/train_vectors_drugscom.csv", sep=",", names=["Vectorized"], skiprows=[0])
test_vects_str = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/test_vectors_drugscom.csv", sep=",", names=["Vectorized"], skiprows=[0])

In [None]:
test_vects_str.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26883 entries, 0 to 26882
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Vectorized  26883 non-null  object
dtypes: object(1)
memory usage: 420.0+ KB


In [None]:
train_vects_list = train_vects_str.Vectorized.tolist()
test_vects_list = test_vects_str.Vectorized.tolist()

In [None]:
def text_to_vector(string_vector):
  """
  Since the vectors were saved in a file, they are now strings and need to be converted to proper vectors again.

  param string_vector: (str) string representing a vector representation of a word.
  """
  remove_first = string_vector[1:]
  remove_last = remove_first[:-1]

  splitten = remove_last.split(',')

  output = []
  for item in splitten:
    if item[0] == ' ':
      item = item[1:]
    output.append(float(item))
  return output

In [None]:
#Convert all train vectors back to numbers.
train_vects = []
for i in train_vects_list:
  numerical = text_to_vector(i)
  train_vects.append(numerical)

In [None]:
print(len(train_vects))
print(len(train_vects[0]))

161294
300


In [None]:
#Convert all test vectors back to numbers.
test_vects = []
for i in test_vects_list:
  numerical = text_to_vector(i)
  test_vects.append(numerical)

In [None]:
print(len(test_vects))
print(len(test_vects[0]))

26883
300


In [None]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161294 entries, 0 to 161296
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Sentence   161294 non-null  object 
 1   Sentiment  161294 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB


In [None]:
train_set['Vectorized'] = train_vects
X_test_vect = test_vects

In [None]:
train_set.Vectorized.info()

<class 'pandas.core.series.Series'>
Int64Index: 161294 entries, 0 to 161296
Series name: Vectorized
Non-Null Count   Dtype 
--------------   ----- 
161294 non-null  object
dtypes: object(1)
memory usage: 2.5+ MB


# Train SVM

In [None]:
X_train = train_set.Vectorized.tolist()
#For test, we use X_test_vects

y_train = train_set.Sentiment.tolist()
#y_test is still the y_test created when making the test-val split.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

In [None]:
#Fine-tune the C parameter
params = [0.1, 1, 5, 10]
for param in params:
  #Due to the size of the dataset, a normal SVC was not viable. Hence, we use LinearSVC on this dataset (so an SVC with linear kernel)
  #Since we use dual=False, according to documentation there is no randomness, so we don't need to specify the random_state
  model = LinearSVC(C=param, class_weight = train_weights, dual=False)
  print("Testing model with C = ", str(param), "\n \n")
  model.fit(X_train, y_train)
  predictions = model.predict(X_test)
  evaluation = mce.evaluate_performance(predictions, y_test)
  for metric in evaluation:
    print(metric, ": ", evaluation[metric], "\n")


Testing model with C =  0.1 
 

Accuracy :  74.1119 

Base Positive :  {'TP': 30056, 'FP': 5831, 'TN': 12495, 'FN': 5384} 

Base Neutral :  {'TP': 925, 'FP': 2968, 'TN': 45969, 'FN': 3904} 

Base Negative :  {'TP': 8866, 'FP': 5120, 'TN': 35149, 'FN': 4631} 

Advanced Positive :  {'Precision': 0.837517764092847, 'Recall': 0.8480812641083522, 'Specificity': 0.6818181818181818} 

Advanced Neutral :  {'Precision': 0.2376059594143334, 'Recall': 0.19155104576516876, 'Specificity': 0.9393505936203691} 

Advanced Negative :  {'Precision': 0.6339196339196339, 'Recall': 0.6568867155664222, 'Specificity': 0.8728550497901612} 

Balanced Accuracy :  0.5655063418133144 

F_Score :  0.18285255009590184 

Testing model with C =  1 
 

Accuracy :  74.1026 

Base Positive :  {'TP': 30054, 'FP': 5837, 'TN': 12489, 'FN': 5386} 

Base Neutral :  {'TP': 930, 'FP': 2970, 'TN': 45967, 'FN': 3899} 

Base Negative :  {'TP': 8858, 'FP': 5117, 'TN': 35152, 'FN': 4639} 

Advanced Positive :  {'Precision': 0.83736

In [None]:
#Further fine-tune for the C parameter.
params = [0.05, 0.1, 0.25, 0.5, 0.75]
for param in params:
  model = LinearSVC(C=param, class_weight = train_weights, dual=False)
  print("Testing model with C = ", str(param), "\n \n")
  model.fit(X_train, y_train)
  predictions = model.predict(X_test)
  evaluation = mce.evaluate_performance(predictions, y_test)
  for metric in evaluation:
    print(metric, ": ", evaluation[metric], "\n")

Testing model with C =  0.05 
 

Accuracy :  74.1007 

Base Positive :  {'TP': 30057, 'FP': 5841, 'TN': 12485, 'FN': 5383} 

Base Neutral :  {'TP': 928, 'FP': 2962, 'TN': 45975, 'FN': 3901} 

Base Negative :  {'TP': 8856, 'FP': 5122, 'TN': 35147, 'FN': 4641} 

Advanced Positive :  {'Precision': 0.8372889854587999, 'Recall': 0.848109480812641, 'Specificity': 0.6812725090036015} 

Advanced Neutral :  {'Precision': 0.238560411311054, 'Recall': 0.19217229240008282, 'Specificity': 0.9394732002370395} 

Advanced Negative :  {'Precision': 0.6335670339104307, 'Recall': 0.65614581018004, 'Specificity': 0.8728053837939854} 

Balanced Accuracy :  0.5654758611309213 

F_Score :  0.1828975452341104 

Testing model with C =  0.1 
 

Accuracy :  74.1119 

Base Positive :  {'TP': 30056, 'FP': 5831, 'TN': 12495, 'FN': 5384} 

Base Neutral :  {'TP': 925, 'FP': 2968, 'TN': 45969, 'FN': 3904} 

Base Negative :  {'TP': 8866, 'FP': 5120, 'TN': 35149, 'FN': 4631} 

Advanced Positive :  {'Precision': 0.837517

In [None]:
#Fine-tune both C and the tolerance hyperparameter (the values for C were based on the earlier tests)
tols = [1e-5, 1e-4, 1e-3, 1e-2]
cs = [0.075, 0.1, 0.125, 0.15]
for tolerance in tols:
  for c in cs:
    model = LinearSVC(C=c, class_weight = train_weights, dual=False, tol=tolerance)
    print("Testing model with c = ", str(c), "and tolerance = ", str(tolerance), "\n \n")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    evaluation = mce.evaluate_performance(predictions, y_test)
    for metric in evaluation:
      print(metric, ": ", evaluation[metric], "\n")

Testing model with c =  0.075 and tolerance =  1e-05 
 

Accuracy :  74.1119 

Base Positive :  {'TP': 30052, 'FP': 5829, 'TN': 12497, 'FN': 5388} 

Base Neutral :  {'TP': 931, 'FP': 2965, 'TN': 45972, 'FN': 3898} 

Base Negative :  {'TP': 8864, 'FP': 5125, 'TN': 35144, 'FN': 4633} 

Advanced Positive :  {'Precision': 0.837546333714222, 'Recall': 0.8479683972911964, 'Specificity': 0.6819273163810979} 

Advanced Neutral :  {'Precision': 0.2389630390143737, 'Recall': 0.19279353903499688, 'Specificity': 0.9394118969287043} 

Advanced Negative :  {'Precision': 0.6336407177067696, 'Recall': 0.6567385344891458, 'Specificity': 0.8727308847997218} 

Balanced Accuracy :  0.5658334902717796 

F_Score :  0.18318902037332765 

Testing model with c =  0.1 and tolerance =  1e-05 
 

Accuracy :  74.1138 

Base Positive :  {'TP': 30058, 'FP': 5831, 'TN': 12495, 'FN': 5382} 

Base Neutral :  {'TP': 925, 'FP': 2966, 'TN': 45971, 'FN': 3904} 

Base Negative :  {'TP': 8865, 'FP': 5121, 'TN': 35148, 'FN': 

## We can see that the optimal values appear to be C = 0.125 and Tol = 1e-2.
Note that the differences between all the values is extremely small.

In [None]:
final_custom_model = LinearSVC(C=0.125, tol=1e-2, class_weight = train_weights, dual=False)

In [None]:
import time

print("Testing the model with optimal hyperparameters on the test set...")

start_time = time.time()
final_custom_model.fit(X_train, y_train)
end_time = time.time()
el_time = (end_time - start_time)
print("Elapsed time: ", str(el_time), "seconds")

Testing the model with optimal hyperparameters on the test set...
Elapsed time:  89.68045783042908 seconds


In [None]:
print(len(X_test_vect))
print(len(y_test))

26883
26883


In [None]:
y_pred = final_custom_model.predict(X_test_vect)

In [None]:
evaluation = mce.evaluate_performance(y_pred, y_test.tolist())

In [None]:
for metric in evaluation:
  print(metric, ": ", evaluation[metric], "\n")

Accuracy :  73.686 

Base Positive :  {'TP': 14982, 'FP': 2923, 'TN': 6240, 'FN': 2738} 

Base Neutral :  {'TP': 462, 'FP': 1503, 'TN': 22965, 'FN': 1953} 

Base Negative :  {'TP': 4365, 'FP': 2648, 'TN': 17487, 'FN': 2383} 

Advanced Positive :  {'Precision': 0.83674951130969, 'Recall': 0.8454853273137698, 'Specificity': 0.6809996725963112} 

Advanced Neutral :  {'Precision': 0.23511450381679388, 'Recall': 0.19130434782608696, 'Specificity': 0.9385728298185385} 

Advanced Negative :  {'Precision': 0.6224155140453443, 'Recall': 0.6468583283935981, 'Specificity': 0.8684877079711945} 

Balanced Accuracy :  0.5612160011778182 

F_Score :  0.17843852709026597 



In [None]:
print(mce.confusion_matrix(y_pred, y_test.tolist()))

[[ 4365   804  1844]
 [  609   462   894]
 [ 1774  1149 14982]]


In [None]:
import pickle
import time

In [None]:
#Save the optimal model
filename = "SVC_customvect_optimparam_medical.pickle"
pickle.dump(final_custom_model, open(filename, "wb"))

In [None]:
loaded_best_model = pickle.load(open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Medical_SVC_customvect_optimparam.pickle", "rb"))

In [None]:
#Use the loaded model to make predictions.
start = time.time()
predictions = loaded_best_model.predict(X_test_vect)
end = time.time()
print("Prediction time: ", str(end-start), " seconds")

Prediction time:  1.433598518371582  seconds


In [None]:
evaluation_loaded = mce.evaluate_performance(predictions, y_test.tolist())

In [None]:
for metric in evaluation_loaded:
  print(metric, ": ", str(evaluation_loaded[metric]), "\n \n")

Accuracy :  73.686 
 

Base Positive :  {'TP': 14982, 'FP': 2923, 'TN': 6240, 'FN': 2738} 
 

Base Neutral :  {'TP': 462, 'FP': 1503, 'TN': 22965, 'FN': 1953} 
 

Base Negative :  {'TP': 4365, 'FP': 2648, 'TN': 17487, 'FN': 2383} 
 

Advanced Positive :  {'Precision': 0.83674951130969, 'Recall': 0.8454853273137698, 'Specificity': 0.6809996725963112} 
 

Advanced Neutral :  {'Precision': 0.23511450381679388, 'Recall': 0.19130434782608696, 'Specificity': 0.9385728298185385} 
 

Advanced Negative :  {'Precision': 0.6224155140453443, 'Recall': 0.6468583283935981, 'Specificity': 0.8684877079711945} 
 

Balanced Accuracy :  0.5612160011778182 
 

F_Score :  0.17843852709026597 
 



In [None]:
print(mce.confusion_matrix(predictions, y_test.tolist()))

[[ 4365   804  1844]
 [  609   462   894]
 [ 1774  1149 14982]]


#Sentence-level inspection

In [None]:
def get_specific_errors(sentences, y_pred, y_real, vertical, horizontal):
	"""
	Get the indexes from specific cells in the confusion matrix.

	param dataframe: Pandas DataFrame containing the sentences and indices.
	param y_pred: (list) contains the predicted sentiments.
	param y_real: (list) contains the real sentiments.
	param vertical: (int) corresponds to the column in the confusion matrix.
	param horizontal: (int) corresponds to the row in the confusion matrix.
	"""
	i = 0
	errors = []
	while i < len(sentences):
		if (horizontal == y_pred[i]) and (vertical == y_real[i]):
			errors.append(sentences.index[i])
		i += 1
	return errors

Positive/negative and negative/positive

In [None]:
#Negative sentences predicted to be positive.
pos_neg_errors_index = get_specific_errors(X_test, y_pred, y_test.tolist(), 0, 2)

In [None]:
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/updated_SVM_medical_pos_neg.txt", "w") as writefile:
  for index in pos_neg_errors_index:
    writefile.write(str(index))
    writefile.write("\n")

In [None]:
#Positive sentences predicted to be negative.
neg_pos_errors_index = get_specific_errors(X_test, y_pred, y_test.tolist(), 2, 0)

In [None]:
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/updated_SVM_medical_neg_pos.txt", "w") as writefile:
  for index in neg_pos_errors_index:
    writefile.write(str(index))
    writefile.write("\n")

Positive/neutral and negative/neutral

In [None]:
#Neutral sentences predicted to be positive.
pos_neu_errors_index = get_specific_errors(X_test, y_pred, y_test.tolist(), 1, 2)

In [None]:
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/updated_SVM_medical_pos_neu.txt", "w") as writefile:
  for index in pos_neu_errors_index:
    writefile.write(str(index))
    writefile.write("\n")

In [None]:
#Neutral sentences predicted to be negative.
neg_neu_errors_index = get_specific_errors(X_test, y_pred, y_test.tolist(), 1, 0)

In [None]:
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/updated_SVM_medical_neg_neu.txt", "w") as writefile:
  for index in neg_neu_errors_index:
    writefile.write(str(index))
    writefile.write("\n")

In [None]:
#True positives for the neutral class.
tp_neu_index = get_specific_errors(X_test, y_pred, y_test.tolist(), 1, 1)

In [None]:
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/updated_SVM_medical_tp_neu.txt", "w") as writefile:
  for index in tp_neu_index:
    writefile.write(str(index))
    writefile.write("\n")

# Pre-trained Google Vectors

In [None]:
from gensim.models import Word2Vec, KeyedVectors

In [None]:
#Import the Word2Vec model pre-trained on the Google News 300 dataset.
w2v_model = KeyedVectors.load_word2vec_format("/content/drive/My Drive/Bachelor Scriptie KI/Programming/Notebooks/GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
def convert_to_pretrained(dataframe, pretrained_emb, learned_emb):
  """
  Converts the textual sentences into vector representations using the pre-trained Google News vectors.

  param dataframe: Pandas DataFrame containing the sentences.
  param pretrained_emb: Word2Vec model containing pre-trained embeddings.
  param learned_emb: Word2Vec model learned on the dataset.
  """
  total_embeddings = []
  i = 0
  j = 0
  k = 0
  for sentence in dataframe:
    sent = sentence.split()
    embedded_sent = []
    for word in sent:
      representation = [0] * 300
      try:
        #This is needed, because the pretrained model will give an error if it encounters an unknown word.
        representation = pretrained_emb[word]
        i += 1
      except:
        try:
          representation = learned_emb.wv[word]
          k += 1
        except:
          a = 0
      embedded_sent.append(representation)
      j += 1
    total_embeddings.append(embedded_sent)
  print("Total times used Google representation: ", str(i))
  print("Total times learned representation used: ", str(k))
  print("Total unknown words: ", str(a))
  print("Total amount of word seen: ", str(j))
  return total_embeddings

In [None]:
def load_and_prepare():
  """
  Loads the train and test set, and already drops the empty rows in the train set.
  """
  train_set = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/clean_train_drugscom.csv", sep=",", names=["Sentence", "Sentiment"], skiprows=[0])
  train_set.dropna(inplace=True)

  test_set = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/clean_test_drugscom.csv", sep=",", names=["Sentence", "Sentiment"], skiprows=[0])

  return train_set, test_set

In [None]:
train_pre, test_pre = load_and_prepare()

In [None]:
print(train_pre.info())
print(test_pre.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161294 entries, 0 to 161296
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Sentence   161294 non-null  object 
 1   Sentiment  161294 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 53766 entries, 0 to 53765
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Sentence   53766 non-null  object 
 1   Sentiment  53766 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.2+ MB
None


In [None]:
#Create the same test-val split as before.
X_test, X_val, y_test, y_val = train_test_split(test_pre.Sentence, test_pre.Sentiment, test_size=0.5, random_state=42, stratify=test_pre.Sentiment)

In [None]:
print(X_test.info())

<class 'pandas.core.series.Series'>
Int64Index: 26883 entries, 8500 to 27424
Series name: Sentence
Non-Null Count  Dtype 
--------------  ----- 
26883 non-null  object
dtypes: object(1)
memory usage: 420.0+ KB
None


In [None]:
train_vect_pre = convert_to_pretrained(train_pre.Sentence, w2v_model, embeddings)
print("-------------------")
test_vect_pre = convert_to_pretrained(X_test, w2v_model, embeddings)

Total times used Google representation:  6404553
Total times learned representation used:  688896
Total unknown words:  0
Total amount of word seen:  7100779
-------------------
Total times used Google representation:  1065527
Total times learned representation used:  115799
Total unknown words:  0
Total amount of word seen:  1182551


In [None]:
#This one takes very long (>1 hr)
equalized_train_pre = equalize_vector_lengths(train_vect_pre, des_length=desired_length)
equalized_test_pre = equalize_vector_lengths(test_vect_pre, des_length=desired_length)

In [None]:
train_vect_dict = {'Vectorized': equalized_train_pre}
test_vect_dict = {'Vectorized': equalized_test_pre}

In [None]:
pre_train_df = pd.DataFrame(data=train_vect_dict)
pre_test_df = pd.DataFrame(data=test_vect_dict)

In [None]:
print(pre_train_df.info())
print(pre_test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161294 entries, 0 to 161293
Data columns (total 1 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Vectorized  161294 non-null  object
dtypes: object(1)
memory usage: 1.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26883 entries, 0 to 26882
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Vectorized  26883 non-null  object
dtypes: object(1)
memory usage: 210.1+ KB
None


In [None]:
from google.colab import drive
drive.mount('/content/drive')

#Again, we save the vectors so that we don't have to run the long cell each time we need them.

train_path = '/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/pretrained_train_vectors_drugscom.csv'
test_path = '/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/pretrained_test_vectors_drugscom.csv'
with open(train_path, 'w', encoding = 'utf-8-sig') as f:
  pre_train_df.to_csv(f)

with open(test_path, 'w', encoding = 'utf-8-sig') as f:
  pre_test_df.to_csv(f)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Load the saved vectors from the files.
pre_train_vects_str = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/pretrained_train_vectors_drugscom.csv", sep=",", names=["Vectorized"], skiprows=[0])
pre_test_vects_str = pd.read_csv("/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/pretrained_test_vectors_drugscom.csv", sep=",", names=["Vectorized"], skiprows=[0])

In [None]:
print(pre_train_vects_str.info())
print(pre_test_vects_str.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161294 entries, 0 to 161293
Data columns (total 1 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Vectorized  161294 non-null  object
dtypes: object(1)
memory usage: 2.5+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 26883 entries, 0 to 26882
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Vectorized  26883 non-null  object
dtypes: object(1)
memory usage: 420.0+ KB
None


In [None]:
pre_train_vects_str = pre_train_vects_str.Vectorized.tolist()
pre_test_vects_str = pre_test_vects_str.Vectorized.tolist()

In [None]:
#Convert the textual vectors back to actual vectors.
train_vects_pre = []
for i in pre_train_vects_str:
  numerical = text_to_vector(i)
  train_vects_pre.append(numerical)

In [None]:
print(len(train_vects_pre))
print(len(train_vects_pre[0]))

161294
300


In [None]:
test_vects_pre = []
for i in pre_test_vects_str:
  numerical = text_to_vector(i)
  test_vects_pre.append(numerical)

In [None]:
print(len(test_vects_pre))
print(len(test_vects_pre[0]))

26883
300


In [None]:
train_pre['Vectorized'] = train_vects_pre
X_test_vects_pre =  test_vects_pre

In [None]:
X_train_pre = train_pre.Vectorized.tolist()
y_train_pre = train_pre.Sentiment.tolist()

#X_test_pre = the vectors we read from the file.
#y_test_pre = already created when performing the test-val split.

In [None]:
print(len(X_train_pre))
print(len(y_train_pre))

161294
161294


In [None]:
print(len(X_test_pre))
print(len(y_test_pre))

53766
53766


#Train SVM

In [None]:
#Fine-tune the C and tolerance parameters.
c_values = [0.1, 1, 5, 10]
tolerances = [1e-5, 1e-4, 1e-3, 1e-2]
for tol in tolerances:
  for c in c_values:
    model = LinearSVC(C=c, class_weight = train_weights, dual=False, tol=tol)
    print("Testing model with c = ", str(c), "and tolerance = ", str(tol), "\n \n")
    model.fit(X_train_pre, y_train_pre)
    predictions = model.predict(X_test_pre)
    evaluation = mce.evaluate_performance(predictions, y_test_pre)
    for metric in evaluation:
      print(metric, ": ", evaluation[metric], "\n")

Testing model with c =  0.1 and tolerance =  1e-05 
 

Accuracy :  70.6543 

Base Positive :  {'TP': 29583, 'FP': 7552, 'TN': 10774, 'FN': 5857} 

Base Neutral :  {'TP': 677, 'FP': 2394, 'TN': 46543, 'FN': 4152} 

Base Negative :  {'TP': 7728, 'FP': 5832, 'TN': 34437, 'FN': 5769} 

Advanced Positive :  {'Precision': 0.7966339033257035, 'Recall': 0.834734762979684, 'Specificity': 0.5879078904288988} 

Advanced Neutral :  {'Precision': 0.22044936502767828, 'Recall': 0.14019465727893973, 'Specificity': 0.9510799599485052} 

Advanced Negative :  {'Precision': 0.5699115044247788, 'Recall': 0.5725716825961324, 'Specificity': 0.8551739551516054} 

Balanced Accuracy :  0.5158337009515854 

F_Score :  0.14253169693719425 

Testing model with c =  1 and tolerance =  1e-05 
 

Accuracy :  70.5278 

Base Positive :  {'TP': 29484, 'FP': 7488, 'TN': 10838, 'FN': 5956} 

Base Neutral :  {'TP': 694, 'FP': 2449, 'TN': 46488, 'FN': 4135} 

Base Negative :  {'TP': 7742, 'FP': 5909, 'TN': 34360, 'FN': 575

In [None]:
#Further fine-tuning
c_values = [0.05, 0.1, 0.125, 0.15]
tolerances = [5e-3, 1e-2, 5e-2]
for tol in tolerances:
  for c in c_values:
    model = LinearSVC(C=c, class_weight = train_weights, dual=False, tol=tol)
    print("Testing model with c = ", str(c), "and tolerance = ", str(tol), "\n \n")
    model.fit(X_train_pre, y_train_pre)
    predictions = model.predict(X_test_pre)
    evaluation = mce.evaluate_performance(predictions, y_test_pre)
    for metric in evaluation:
      print(metric, ": ", evaluation[metric], "\n")

Testing model with c =  0.05 and tolerance =  0.005 
 

Accuracy :  70.7548 

Base Positive :  {'TP': 29682, 'FP': 7623, 'TN': 10703, 'FN': 5758} 

Base Neutral :  {'TP': 660, 'FP': 2330, 'TN': 46607, 'FN': 4169} 

Base Negative :  {'TP': 7700, 'FP': 5771, 'TN': 34498, 'FN': 5797} 

Advanced Positive :  {'Precision': 0.7956574185765983, 'Recall': 0.8375282167042889, 'Specificity': 0.5840336134453782} 

Advanced Neutral :  {'Precision': 0.22073578595317725, 'Recall': 0.1366742596810934, 'Specificity': 0.9523877638596563} 

Advanced Negative :  {'Precision': 0.5715982480884864, 'Recall': 0.5704971475142624, 'Specificity': 0.8566887680349649} 

Balanced Accuracy :  0.5148998746332148 

F_Score :  0.1422764643313159 

Testing model with c =  0.1 and tolerance =  0.005 
 

Accuracy :  70.6432 

Base Positive :  {'TP': 29581, 'FP': 7549, 'TN': 10777, 'FN': 5859} 

Base Neutral :  {'TP': 674, 'FP': 2398, 'TN': 46539, 'FN': 4155} 

Base Negative :  {'TP': 7727, 'FP': 5837, 'TN': 34432, 'FN': 5

# Evaluate performance with both hyperparameter sets

In [None]:
#Pre-trained Word2Vec embeddings with the optimal hyperparameters of the other model type.
other_hyperparameter_model = LinearSVC(C=0.125, tol=1e-2, class_weight = train_weights, dual=False)

In [None]:
print("Testing model with c = 0.125 and tolerance = 1e-2 \n \n")
other_hyperparameter_model.fit(X_train_pre, y_train_pre)
predictions = other_hyperparameter_model.predict(X_test_vects_pre)
evaluation = mce.evaluate_performance(predictions, y_test.tolist())
for metric in evaluation:
  print(metric, ": ", evaluation[metric], "\n")

Testing model with c = 0.125 and tolerance = 1e-2 
 

Accuracy :  70.4051 

Base Positive :  {'TP': 14741, 'FP': 3811, 'TN': 5352, 'FN': 2979} 

Base Neutral :  {'TP': 341, 'FP': 1206, 'TN': 23262, 'FN': 2074} 

Base Negative :  {'TP': 3845, 'FP': 2939, 'TN': 17196, 'FN': 2903} 

Advanced Positive :  {'Precision': 0.7945774040534713, 'Recall': 0.8318848758465012, 'Specificity': 0.5840881807268362} 

Advanced Neutral :  {'Precision': 0.22042663219133807, 'Recall': 0.14120082815734988, 'Specificity': 0.9507111329082883} 

Advanced Negative :  {'Precision': 0.5667747641509434, 'Recall': 0.5697984588026082, 'Specificity': 0.854035261981624} 

Balanced Accuracy :  0.5142947209354864 

F_Score :  0.1411956104717921 



In [None]:
print(mce.confusion_matrix(predictions, y_test.tolist()))

[[ 3845   742  2197]
 [  424   341   782]
 [ 2479  1332 14741]]


In [None]:
#Pre-trained Word2Vec model with optimal hyperparameters for this model.
tuned_hyperparameter_model = LinearSVC(C=0.15, tol=5e-3, class_weight = train_weights, dual=False)

In [None]:
print("Testing model with c = 0.15 and tolerance = 5e-3 \n \n")
tuned_hyperparameter_model.fit(X_train_pre, y_train_pre)
predictions = tuned_hyperparameter_model.predict(X_test_vects_pre)
evaluation = mce.evaluate_performance(predictions, y_test.tolist())
for metric in evaluation:
  print(metric, ": ", evaluation[metric], "\n")

Testing model with c = 0.15 and tolerance = 5e-3 
 

Accuracy :  70.2972 

Base Positive :  {'TP': 14710, 'FP': 3810, 'TN': 5353, 'FN': 3010} 

Base Neutral :  {'TP': 345, 'FP': 1218, 'TN': 23250, 'FN': 2070} 

Base Negative :  {'TP': 3843, 'FP': 2957, 'TN': 17178, 'FN': 2905} 

Advanced Positive :  {'Precision': 0.7942764578833693, 'Recall': 0.8301354401805869, 'Specificity': 0.5841973152897523} 

Advanced Neutral :  {'Precision': 0.22072936660268713, 'Recall': 0.14285714285714285, 'Specificity': 0.9502206964198137} 

Advanced Negative :  {'Precision': 0.5651470588235294, 'Recall': 0.5695020746887967, 'Specificity': 0.8531412962503104} 

Balanced Accuracy :  0.5141648859088421 

F_Score :  0.14092525104949039 



In [None]:
print(mce.confusion_matrix(predictions, y_test.tolist()))

[[ 3843   741  2216]
 [  424   345   794]
 [ 2481  1329 14710]]


#Test custom-vectored model with Google-vector tuned hyperparameters

In [None]:
#Custom Word2Vec embeddings model with Google-vector optimal hyperparameters
custom_with_googleparameters = LinearSVC(C=0.15, tol=5e-3, class_weight= train_weights, dual=False)

In [None]:
print("Testing custom-vectored model with c = 0.15 and tolerance = 5e-3 \n \n")
custom_with_googleparameters.fit(X_train, y_train)
predictions = custom_with_googleparameters.predict(X_test_vect)
evaluation = mce.evaluate_performance(predictions, y_test.tolist())
for metric in evaluation:
  print(metric, ": ", evaluation[metric], "\n")

Testing custom-vectored model with c = 0.15 and tolerance = 5e-3 
 

Accuracy :  73.7009 

Base Positive :  {'TP': 14982, 'FP': 2919, 'TN': 6244, 'FN': 2738} 

Base Neutral :  {'TP': 455, 'FP': 1518, 'TN': 22950, 'FN': 1960} 

Base Negative :  {'TP': 4376, 'FP': 2633, 'TN': 17502, 'FN': 2372} 

Advanced Positive :  {'Precision': 0.8369364839953075, 'Recall': 0.8454853273137698, 'Specificity': 0.6814362108479756} 

Advanced Neutral :  {'Precision': 0.230613279270147, 'Recall': 0.18840579710144928, 'Specificity': 0.9379597842079451} 

Advanced Negative :  {'Precision': 0.6243401341132829, 'Recall': 0.6484884410195614, 'Specificity': 0.8692326794139558} 

Balanced Accuracy :  0.5607931884782601 

F_Score :  0.17786014154642654 



In [None]:
print(mce.confusion_matrix(predictions, y_test.tolist()))

[[ 4376   803  1830]
 [  610   455   908]
 [ 1762  1157 14982]]
