In [1]:
import pandas as pd
import nltk
import sklearn
import numpy as np
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install tensorflow_text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_text
  Downloading tensorflow_text-2.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_text
Successfully installed tensorflow_text-2.12.1


In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [4]:
import sys
sys.path.insert(0, '/content/drive/My Drive/Bachelor Scriptie KI/Programming/Notebooks')

In [5]:
import multi_class_performance_eval as mce

In [6]:
#Setting the random seeds for reproducibility.
#NumPy, TensorFlow and Python's built-in random module are each seeded separately with the same value.
import random
np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)

In [7]:
#Load the cleaned and tokenized train and test sets.
#The first line of each file is skipped and the two columns are given fixed names.
train = pd.read_csv(
  "/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/clean_train_drugscom.csv",
  sep=",",
  names=["Sentence", "Sentiment"],
  skiprows=[0],
)
test = pd.read_csv(
  "/content/drive/My Drive/Bachelor Scriptie KI/DrugReviews/clean_test_drugscom.csv",
  sep=",",
  names=["Sentence", "Sentiment"],
  skiprows=[0],
)

In [8]:
#Drop the three rows in the train set that have no sentence, showing the frame summary before and after.
#DataFrame.info() prints its report itself and returns None, so it is called directly instead of
#being wrapped in print() (which only added a stray "None" line to the output).
train.info()
#Rebind instead of mutating in place, so the cell does not silently alter an object other cells may hold.
train = train.dropna()
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161297 entries, 0 to 161296
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Sentence   161294 non-null  object 
 1   Sentiment  161297 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 161294 entries, 0 to 161296
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Sentence   161294 non-null  object 
 1   Sentiment  161294 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB
None


In [9]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53766 entries, 0 to 53765
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Sentence   53766 non-null  object 
 1   Sentiment  53766 non-null  float64
dtypes: float64(1), object(1)
memory usage: 1.2+ MB


In [10]:
from sklearn.model_selection import train_test_split

In [11]:
#Create the same test-val split as is done for the other models.
#Half of the original test set becomes the validation set; stratifying keeps the label ratios equal in both halves.
X_test, X_val, y_test, y_val = train_test_split(
  test.Sentence,
  test.Sentiment,
  test_size=0.5,
  random_state=42,
  stratify=test.Sentiment,
)

In [12]:
def label_distribution(dataframe):
  """
  Count how many labels fall into each sentiment class.

  param dataframe: iterable of sentiment labels (e.g. a Pandas Series); 0 is counted as negative,
                   1 as neutral and every other value as positive.
  return: tuple (pos, neu, neg, total) with the per-class counts and the number of labels seen.
  """
  #Slots: 0 = negative, 1 = neutral, 2 = positive (anything that is neither 0 nor 1).
  counts = [0, 0, 0]

  for sentiment in dataframe:
    if sentiment == 0:
      slot = 0
    elif sentiment == 1:
      slot = 1
    else:
      slot = 2
    counts[slot] += 1

  negative, neutral, positive = counts
  return positive, neutral, negative, sum(counts)

In [13]:
#Class balance of the train set, as percentages rounded to two decimals.
pos, neu, neg, total = label_distribution(train.Sentiment)
print("Positive: ", round(pos/total*100, 2))
print("Neutral: ", round(neu/total*100, 2))
print("Negative: ", round(neg/total*100, 2))

Positive:  66.25
Neutral:  8.9
Negative:  24.85


In [14]:
#Class balance of the test split, as percentages rounded to two decimals.
pos, neu, neg, total = label_distribution(y_test)
print("Positive: ", round(pos/total*100, 2))
print("Neutral: ", round(neu/total*100, 2))
print("Negative: ", round(neg/total*100, 2))

Positive:  65.92
Neutral:  8.98
Negative:  25.1


In [15]:
#Class balance of the validation split, as percentages rounded to two decimals.
pos, neu, neg, total = label_distribution(y_val)
print("Positive: ", round(pos/total*100, 2))
print("Neutral: ", round(neu/total*100, 2))
print("Negative: ", round(neg/total*100, 2))

Positive:  65.92
Neutral:  8.98
Negative:  25.11


In [16]:
X_train = train.Sentence

In [17]:
#The categorical cross-entropy expects the labels to be one-hot encoded.
#num_classes is fixed to the three sentiment classes (0 = negative, 1 = neutral, 2 = positive) so every
#split is encoded with the same width as the 3-unit output layer, instead of inferring it from the data.
#NOTE: y_test and y_val are rebound from label Series to one-hot arrays here, so this cell must only be
#run once per kernel session (re-create the split first before running it again).
y_train = tf.keras.utils.to_categorical(train.Sentiment.tolist(), num_classes=3)
y_test = tf.keras.utils.to_categorical(y_test.tolist(), num_classes=3)
y_val = tf.keras.utils.to_categorical(y_val.tolist(), num_classes=3)

In [18]:
#Sanity check: sentences and labels should have the same length within each split.
print(len(X_train), len(y_train), "--------", len(X_test), len(y_test), sep="\n")

161294
161294
--------
26883
26883


# BERT + CNN

In [19]:
#Load the BERT preprocessor and encoder from TensorFlow Hub into KerasLayers.
#The preprocessor maps raw strings to the input_word_ids / input_mask / input_type_ids dictionary that is fed to the encoder.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [20]:
def create_BERT_CNN(preprocesser, encoder, kernel_size, filters, activation, padding):
  """
  Build a Keras text classifier: BERT preprocessing and encoding, followed by a
  1D convolution with global max pooling and a 3-unit softmax output layer.

  param preprocesser: (KerasLayer) a layer containing the BERT preprocessor.
  param encoder: (KerasLayer) a layer containing the BERT encoder.
  param kernel_size: (int) the size of the kernel used in the Convolutional layer.
  param filters: (int) the amount of filters used in the Convolutional layer.
  param activation: (str) activation function to be used in the Convolutional layer.
  param padding: (str) padding used in Convolutional layer.
  return: (tf.keras.Model) uncompiled model that takes a batch of strings and outputs 3 softmax scores per string.
  """
  #Raw strings go in; the BERT layers turn them into the encoder's 'sequence_output'.
  raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text_input')
  sequence_embeddings = encoder(preprocesser(raw_text))['sequence_output']

  #Convolve over the BERT output and keep the maximum response of every filter.
  conv_features = tf.keras.layers.Conv1D(
    filters=filters,
    kernel_size=kernel_size,
    activation=activation,
    padding=padding,
    name="conv1d",
  )(sequence_embeddings)
  pooled_features = tf.keras.layers.GlobalMaxPool1D(name="pool")(conv_features)

  #One softmax unit per sentiment class.
  class_scores = tf.keras.layers.Dense(3, activation='softmax', name="output")(pooled_features)

  return tf.keras.Model(inputs=[raw_text], outputs=[class_scores])

In [21]:
#The hyperparameters are set according to the ones found on the Financial Phrasebank dataset.
classifier = create_BERT_CNN(
  bert_preprocess,
  bert_encoder,
  kernel_size=1,
  filters=256,
  activation="relu",
  padding="same",
)

In [22]:
classifier.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text_input (InputLayer)        [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text_input[0][0]']             
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [None]:
#The optimal learning rate found on the other dataset was 0.0003.
optim = tf.keras.optimizers.Adam(learning_rate=0.0003)

In [None]:
classifier.compile(loss='categorical_crossentropy', optimizer=optim, metrics=['accuracy'])

In [None]:
#We make use of early stopping to prevent overfitting and checkpoints to save the best weights.
#Training stops once val_loss has not improved for 5 epochs; the checkpoint callback only writes
#to ./BERT_conv_model when the model improves (save_best_only=True).
early_stop =  tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode="min", patience=5)
checkpoint = tf.keras.callbacks.ModelCheckpoint("./BERT_conv_model", save_best_only=True)

In [None]:
#Train on the train set and then load the best weights back into the model.
#The reload is needed because the model still holds the weights of the last trained epoch
#after fit() returns, not those of the best checkpoint.
history = classifier.fit(X_train, y_train, batch_size=16, epochs=50, validation_data=(X_val, y_val), callbacks=[early_stop, checkpoint])
classifier.load_weights("./BERT_conv_model")

Epoch 1/50



Epoch 2/50



Epoch 3/50



Epoch 4/50



Epoch 5/50



Epoch 6/50
Epoch 7/50



Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7fbc2a4b7610>

In [None]:
#Predict the sentiment of the sentences in the test set.
predictions = classifier.predict(X_test)



In [None]:
#Convert the one-hot encoded y_pred and y_real back into 0, 1 or 2.
#argmax picks the position of the highest score (predictions) or of the 1 (one-hot labels).
y_pred = [np.argmax(pred) for pred in predictions]
y_real = [np.argmax(encoding) for encoding in y_test]

In [None]:
evaluation = mce.evaluate_performance(y_pred, y_real)

In [None]:
for metric in evaluation:
  print(metric, ": ", evaluation[metric], "\n \n")

Accuracy :  85.3476 
 

Base Positive :  {'TP': 16266, 'FP': 1560, 'TN': 7603, 'FN': 1454} 
 

Base Neutral :  {'TP': 1155, 'FP': 958, 'TN': 23510, 'FN': 1260} 
 

Base Negative :  {'TP': 5523, 'FP': 1421, 'TN': 18714, 'FN': 1225} 
 

Advanced Positive :  {'Precision': 0.9124873779872097, 'Recall': 0.9179458239277652, 'Specificity': 0.8297500818509221} 
 

Advanced Neutral :  {'Precision': 0.5466161855182206, 'Recall': 0.4782608695652174, 'Specificity': 0.9608468203367664} 
 

Advanced Negative :  {'Precision': 0.7953629032258065, 'Recall': 0.8184647302904564, 'Specificity': 0.9294263719890737} 
 

Balanced Accuracy :  0.7382238079278131 
 

F_Score :  0.4131888948950041 
 



In [None]:
print(mce.confusion_matrix(y_pred, y_real))

[[ 5523   538   883]
 [  387  1155   571]
 [  838   722 16266]]


In [None]:
#Save the trained model
classifier.save("/content/drive/My Drive/Bachelor Scriptie KI/Programming/Notebooks/medical_BERT_CNN_model")

In [None]:
#Load the trained model
load_model = tf.keras.models.load_model("/content/drive/My Drive/Bachelor Scriptie KI/Programming/Notebooks/medical_BERT_CNN_model")

In [None]:
load_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text_input (InputLayer)        [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text_input[0][0]']             
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [None]:
#Generate predictions using the loaded model
predictions_loaded = load_model.predict(X_test)

#Turn the predicted scores and the one-hot test labels back into class ids 0, 1 or 2.
y_pred_loaded = [np.argmax(pred) for pred in predictions_loaded]
y_real_loaded = [np.argmax(encoding) for encoding in y_test]

#Evaluate performance of the model using the current set of hyperparameters
evaluation_loaded = mce.evaluate_performance(y_pred_loaded, y_real_loaded)

#Print every metric on its own line, followed by a blank separator block.
print("Tested final model on the test set")
for metric in evaluation_loaded:
  print(metric, ": ", evaluation_loaded[metric], "\n")
print("\n \n \n")

Tested final model on the test set
Accuracy :  85.3513 

Base Positive :  {'TP': 16266, 'FP': 1558, 'TN': 7605, 'FN': 1454} 

Base Neutral :  {'TP': 1157, 'FP': 959, 'TN': 23509, 'FN': 1258} 

Base Negative :  {'TP': 5522, 'FP': 1421, 'TN': 18714, 'FN': 1226} 

Advanced Positive :  {'Precision': 0.9125897666068222, 'Recall': 0.9179458239277652, 'Specificity': 0.8299683509767544} 

Advanced Neutral :  {'Precision': 0.5467863894139886, 'Recall': 0.47908902691511385, 'Specificity': 0.9608059506293934} 

Advanced Negative :  {'Precision': 0.7953334293533055, 'Recall': 0.8183165382335507, 'Specificity': 0.9294263719890737} 

Balanced Accuracy :  0.7384504630254766 

F_Score :  0.4134464338262947 


 
 



In [None]:
print(mce.confusion_matrix(y_pred_loaded, y_real_loaded))

[[ 5522   539   882]
 [  387  1157   572]
 [  839   719 16266]]


# Sentence-level analysis

Positive/neutral and neutral/positive

In [None]:
def get_specific_errors(dataframe, y_pred, y_real, vertical, horizontal):
  """
  Collect the index labels of all samples that belong to one cell of the confusion matrix.

  param dataframe: Pandas object (needs len() and .index) whose rows line up by position with y_pred and y_real.
  param y_pred: (list) contains the predicted sentiments.
  param y_real: (list) contains the real sentiments.
  param vertical: (int) the real sentiment to select; corresponds to the column in the confusion matrix.
  param horizontal: (int) the predicted sentiment to select; corresponds to the row in the confusion matrix.
  return: (list) index labels of dataframe, in positional order, for which the prediction equals
          horizontal and the real sentiment equals vertical.
  """
  return [
    dataframe.index[position]
    for position in range(len(dataframe))
    if (horizontal == y_pred[position]) and (vertical == y_real[position])
  ]

In [None]:
#Neutral sentences predicted to be positive (real label 1, predicted label 2).
pos_neu_errors_index = get_specific_errors(X_test, y_pred_loaded, y_real_loaded, vertical=1, horizontal=2)

In [None]:
#Store the selected indices, one per line.
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/DL_medical_pos_neu.txt", "w") as writefile:
  writefile.writelines(str(index) + "\n" for index in pos_neu_errors_index)

In [None]:
#Positive sentences predicted to be neutral (real label 2, predicted label 1).
neu_pos_errors_index = get_specific_errors(X_test, y_pred_loaded, y_real_loaded, vertical=2, horizontal=1)

In [None]:
#Store the selected indices, one per line.
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/DL_medical_neu_pos.txt", "w") as writefile:
  writefile.writelines(str(index) + "\n" for index in neu_pos_errors_index)

Negative/neutral and neutral/negative

In [None]:
#Neutral sentences predicted to be negative (real label 1, predicted label 0).
neg_neu_errors_index = get_specific_errors(X_test, y_pred_loaded, y_real_loaded, vertical=1, horizontal=0)

In [None]:
#Store the selected indices, one per line.
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/DL_medical_neg_neu.txt", "w") as writefile:
  writefile.writelines(str(index) + "\n" for index in neg_neu_errors_index)

In [None]:
#Negative sentences predicted to be neutral (real label 0, predicted label 1).
neu_neg_errors_index = get_specific_errors(X_test, y_pred_loaded, y_real_loaded, vertical=0, horizontal=1)

In [None]:
#Store the selected indices, one per line.
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/DL_medical_neu_neg.txt", "w") as writefile:
  writefile.writelines(str(index) + "\n" for index in neu_neg_errors_index)

Positive/negative and negative/positive

In [None]:
#Negative sentences predicted to be positive (real label 0, predicted label 2).
pos_neg_errors_index = get_specific_errors(X_test, y_pred_loaded, y_real_loaded, vertical=0, horizontal=2)

In [None]:
#Store the selected indices, one per line.
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/DL_medical_pos_neg.txt", "w") as writefile:
  writefile.writelines(str(index) + "\n" for index in pos_neg_errors_index)

In [None]:
#Positive sentences predicted to be negative (real label 2, predicted label 0).
neg_pos_errors_index = get_specific_errors(X_test, y_pred_loaded, y_real_loaded, vertical=2, horizontal=0)

In [None]:
#Store the selected indices, one per line.
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/DL_medical_neg_pos.txt", "w") as writefile:
  writefile.writelines(str(index) + "\n" for index in neg_pos_errors_index)

Also save the indices of the true positives

In [None]:
#True positives for the positive class (real label 2, predicted label 2).
tp_pos_index = get_specific_errors(X_test, y_pred_loaded, y_real_loaded, vertical=2, horizontal=2)

In [None]:
#Store the selected indices, one per line.
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/DL_medical_tp_pos.txt", "w") as writefile:
  writefile.writelines(str(index) + "\n" for index in tp_pos_index)

In [None]:
#True positives for the neutral class (real label 1, predicted label 1).
tp_neu_index = get_specific_errors(X_test, y_pred_loaded, y_real_loaded, vertical=1, horizontal=1)

In [None]:
#Store the selected indices, one per line.
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/DL_medical_tp_neu.txt", "w") as writefile:
  writefile.writelines(str(index) + "\n" for index in tp_neu_index)

In [None]:
#True positives for the negative class (real label 0, predicted label 0).
tp_neg_index = get_specific_errors(X_test, y_pred_loaded, y_real_loaded, vertical=0, horizontal=0)

In [None]:
#Store the selected indices, one per line.
with open("/content/drive/MyDrive/Bachelor Scriptie KI/Programming/Notebooks/Error Indexes/Medical/DL_medical_tp_neg.txt", "w") as writefile:
  writefile.writelines(str(index) + "\n" for index in tp_neg_index)