# Performance Analysis of BioBERT in Radiology Report Classification
- Author: Eric Yang
- Created: 04/19/21

Script adapted from BERT tutorial: 
- https://colab.research.google.com/drive/1Y4o3jh3ZH70tl6mCd76vz_IxX23biCPP#scrollTo=2bBdb3pt8LuQ

## Imports

In [None]:
import tensorflow as tf                                   
import torch                                              
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pandas as pd                                       
import numpy as np                                        
from transformers import BertTokenizer                    
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from keras.preprocessing.sequence import pad_sequences    
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix   
from sklearn.utils import resample, shuffle
import random

In [None]:
# Choose the compute device that tensors and the model will be moved to.
if not torch.cuda.is_available():
    # No CUDA device is visible to PyTorch: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # At least one CUDA device is visible: use the GPU and report which one.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

## Load Data

In [None]:
# Read both tables, keep the label-based column span 'ACC'..'report_excerpts'
# (inclusive on both ends), and drop every row whose MRN is missing.
training_data = (
    pd.read_csv('training_table.csv')
    .loc[:, 'ACC':'report_excerpts']
    .dropna(subset=['MRN'])
)
testing_data = (
    pd.read_csv('testing_table.csv')
    .loc[:, 'ACC':'report_excerpts']
    .dropna(subset=['MRN'])
)

# Report text (the corpus) as plain Python lists.
training_corpus = training_data['report_excerpts'].tolist()
testing_corpus = testing_data['report_excerpts'].tolist()

# Medial (MM) and lateral (LM) meniscus labels as NumPy arrays.
training_MM_labels = np.array(training_data['medial_meniscus_R'].tolist())
training_LM_labels = np.array(training_data['lateral_meniscus_R'].tolist())
testing_MM_labels = np.array(testing_data['medial_meniscus_R'].tolist())
testing_LM_labels = np.array(testing_data['lateral_meniscus_R'].tolist())

# CHANGE n_samp to perform sampling experiments.
# Currently 10% of the training reports; shuffle() permutes the three
# collections consistently and keeps the first n_samp entries of each.
n_samp = round(len(training_corpus) * 0.1)
training_corpus, training_LM_labels, training_MM_labels = shuffle(
    training_corpus,
    training_LM_labels,
    training_MM_labels,
    n_samples=n_samp,
    random_state=2,
)

## Tokenize, pad, and convert texts into tensors

In [None]:
# load BioBERT tokenizer
# The checkpoint identifier below is the same string passed to
# BertForSequenceClassification.from_pretrained when the model is configured,
# so the tokenizer and the model are loaded from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')

In [None]:
# Encode every training report as a list of vocabulary IDs.
# add_special_tokens=True adds '[CLS]' and '[SEP]' around each report.
input_ids_train = [
    tokenizer.encode(text, add_special_tokens=True)
    for text in training_corpus
]

# Show the first training report before and after encoding.
print('Original: ', training_corpus[0])
print('Token IDs:', input_ids_train[0])

# Encode the testing reports in exactly the same way.
input_ids_test = [
    tokenizer.encode(text, add_special_tokens=True)
    for text in testing_corpus
]

In [None]:
# get training report length summary
# Token count of every encoded training report (computed once, reused below).
report_lengths = [len(rep) for rep in input_ids_train]
longest_report = max(report_lengths)

# BERT-base checkpoints (the architecture BioBERT v1.1 is built on) provide
# 512 learned position embeddings; feeding a longer sequence makes the forward
# pass fail with an out-of-range index. MAX_LEN is the length every sequence
# is padded/truncated to downstream, so it must never exceed that limit.
BERT_MAX_POSITIONS = 512
MAX_LEN = min(longest_report, BERT_MAX_POSITIONS)

print('Max report length: ', longest_report)
if longest_report > BERT_MAX_POSITIONS:
    # NOTE: the padding step truncates at the end of the sequence, so reports
    # longer than the limit lose their trailing tokens (including '[SEP]').
    print('Reports longer than', BERT_MAX_POSITIONS,
          'tokens will be truncated to', MAX_LEN)
print('Min report length: ', min(report_lengths))
print('Mean report length: ', np.mean(report_lengths))

In [None]:
# Bring every encoded report to exactly MAX_LEN token IDs: shorter ones are
# filled with 0 at the end, longer ones are cut at the end.
_pad_opts = dict(maxlen=MAX_LEN, dtype="long", value=0,
                 truncating="post", padding="post")
input_ids_train = pad_sequences(input_ids_train, **_pad_opts)
input_ids_test = pad_sequences(input_ids_test, **_pad_opts)

In [None]:
# Build attention masks: one 0/1 flag per token position, telling the model
# which positions hold real tokens and which are padding.
#   - token ID == 0  -> padding    -> mask 0
#   - token ID  > 0  -> real token -> mask 1

# train data
attention_masks_train = [
    [int(tok > 0) for tok in seq] for seq in input_ids_train
]

# test data
attention_masks_test = [
    [int(tok > 0) for tok in seq] for seq in input_ids_test
]

In [None]:
# Wrap the NumPy / list data in PyTorch tensors, grouped by split.

# Training split: token IDs, attention masks, and both label sets.
train_inputs = torch.tensor(input_ids_train)
train_masks = torch.tensor(attention_masks_train)
train_MM_labels = torch.tensor(training_MM_labels)
train_LM_labels = torch.tensor(training_LM_labels)

# Testing split: same four tensors.
test_inputs = torch.tensor(input_ids_test)
test_masks = torch.tensor(attention_masks_test)
test_MM_labels = torch.tensor(testing_MM_labels)
test_LM_labels = torch.tensor(testing_LM_labels)

In [None]:
# Mini-batch size shared by the training and testing loaders.
batch_size = 32

# Training loader: token IDs, attention masks and lateral-meniscus (LM)
# labels, served in random order.
train_data = TensorDataset(train_inputs, train_masks, train_LM_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Testing loader: same three tensors for the test split, served in their
# original order.
test_data = TensorDataset(test_inputs, test_masks, test_LM_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    dataset=test_data,
    batch_size=batch_size,
    sampler=test_sampler,
)

## Configure pre-trained model

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1", 
    num_labels = 2, # The number of output labels--2 for binary classification.  
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the device selected earlier (GPU when available, CPU
# otherwise). The previous unconditional model.cuda() raised on machines
# without CUDA even though the device cell provides a CPU fallback, and the
# batches are moved with .to(device), so model and data must agree.
model.to(device)

In [None]:
# Training length: the scheduler needs the total number of optimizer steps,
# i.e. batches per epoch times number of epochs.
epochs = 4
total_steps = epochs * len(train_dataloader)

# AdamW optimizer (Hugging Face implementation) over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Learning-rate schedule with no warm-up steps, spanning all training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

## Train model

In [None]:
# Fine-tune the classifier for `epochs` full passes over the training loader.

# Seed Python, NumPy and PyTorch (CPU and all GPUs) with the same value.
seed_val = 5
for seed_fn in (random.seed, np.random.seed,
                torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

# Number of mini-batches per epoch; used for progress output and averaging.
n_batches = len(train_dataloader)

for epoch_i in range(epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Training mode, with a fresh loss accumulator for this epoch.
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):

        # Progress line every 40 batches (skipping the very first batch).
        if step != 0 and step % 40 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

        # Each batch holds (token IDs, attention mask, labels); move all
        # three to the selected device.
        b_input_ids, b_input_mask, b_labels = (
            batch[0].to(device),
            batch[1].to(device),
            batch[2].to(device),
        )

        # Discard gradients left over from the previous step.
        model.zero_grad()

        # Forward pass; because labels are supplied, the first element of the
        # output is the loss.
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        loss = outputs[0]
        total_loss += loss.item()

        # Backward pass, then clip the gradient norm to 1.0 to guard against
        # exploding gradients.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Apply the parameter update, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Mean per-batch loss for this epoch.
    avg_train_loss = total_loss / n_batches

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
print("")
print("Training complete!")


## Model testing

In [None]:
# func to calculate specificity, the rest of the metrics imported from sklearn
def specificity(y_true, y_pred):
    """Return the specificity (true-negative rate) of binary predictions.

    Specificity is TN / (TN + FP), with class 0 as negative and class 1 as
    positive.

    Args:
        y_true: Ground-truth labels, each 0 or 1.
        y_pred: Predicted labels, each 0 or 1, aligned with ``y_true``.

    Returns:
        The specificity as a float, or NaN when ``y_true`` contains no
        negative examples (the ratio is undefined in that case).
    """
    # labels=[0, 1] forces a 2x2 matrix. Without it, a sample in which only
    # one class occurs (possible in a bootstrap resample) yields a 1x1 matrix
    # and the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    if negatives == 0:
        # Undefined without any true negatives or false positives; return NaN
        # explicitly instead of relying on a NumPy divide-by-zero warning.
        return float('nan')
    spec = tn / negatives
    return spec

# Accuracy of per-class scores against the reference labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class matches the label.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: NumPy array of reference class indices; flattened before
            comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.asarray(preds).argmax(axis=1).ravel()
    expected_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / len(expected_classes)

In [None]:
# perform testing
#
# Bootstrap evaluation on the test set (lateral-meniscus labels): the test
# examples are resampled with replacement 100 times and every metric is
# computed on each resample.
#
# The model's prediction for a report does not depend on which resample the
# report lands in, so the model is run ONCE over the full test set and the
# (prediction, label) pairs are resampled afterwards. This replaces 100 rounds
# of tokenizing, padding and inference with a single inference pass.
accuracy_scores = []
cohen_kappa_scores = []  # never populated in this cell; kept so the name stays defined
precision_scores = []
recall_scores = []
specificity_scores = []
f1_scores = []

# Put model in evaluation mode (once; it does not change between resamples).
model.eval()

# Tracking variables
predictions, true_labels = [], []

# Predict over the full test set.
# NOTE: relies on test_dataloader being the sequential loader over the whole
# test split (inputs, masks, LM labels) created in the DataLoader cell.
for batch in test_dataloader:
    # Move the batch to the selected device and unpack it.
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

    # No gradients are needed for prediction; skipping them saves memory and
    # time.
    with torch.no_grad():
        # Forward pass without labels: the first output element is the logits.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # Move logits and labels to CPU and store them.
    predictions.append(outputs[0].detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())

# Combine the per-batch results: predicted class (argmax over the logits) and
# reference label for every test example, in matching order.
all_predictions = [item for sublist in predictions for item in sublist]
all_predictions = np.argmax(all_predictions, axis=1).flatten()
all_true_labels = [item for sublist in true_labels for item in sublist]

for i in range(100):
    # Draw one bootstrap sample; resample() applies the same random indices to
    # both sequences, so predictions stay aligned with their labels.
    flat_predictions, flat_true_labels = resample(all_predictions, all_true_labels, replace=True)

    accuracy_scores.append(accuracy_score(flat_true_labels, flat_predictions))
    precision_scores.append(precision_score(flat_true_labels, flat_predictions))
    recall_scores.append(recall_score(flat_true_labels, flat_predictions))
    specificity_scores.append(specificity(flat_true_labels, flat_predictions)) 
    f1_scores.append(f1_score(flat_true_labels, flat_predictions)) 
print('    DONE.')

## Format results

In [None]:
# Summarise the bootstrap distributions as (lower, middle, upper) triples.
def _bootstrap_ci(sorted_scores):
    """Return (lower, middle, upper) read from fixed positions of a sorted list.

    lower  = mean of the values at indices 1 and 2
    middle = value at index 49
    upper  = mean of the values at indices 97 and 98

    The indices are hard-coded, so the list must hold at least 99 values; the
    testing cell produces 100 per metric.
    """
    lower = (sorted_scores[1] + sorted_scores[2]) / 2
    middle = sorted_scores[49]
    upper = (sorted_scores[97] + sorted_scores[98]) / 2
    return (lower, middle, upper)


# Sort every metric's scores in ascending order (the lists are rebound to
# their sorted versions).
accuracy_scores = sorted(accuracy_scores)
precision_scores = sorted(precision_scores)
recall_scores = sorted(recall_scores)
specificity_scores = sorted(specificity_scores)
f1_scores = sorted(f1_scores)

accuracy_CI = _bootstrap_ci(accuracy_scores)
precision_CI = _bootstrap_ci(precision_scores)
recall_CI = _bootstrap_ci(recall_scores)
specificity_CI = _bootstrap_ci(specificity_scores)
f1_CI = _bootstrap_ci(f1_scores)

# Collect all intervals under their metric names.
results = dict(
    accuracy_CI=accuracy_CI,
    precision_CI=precision_CI,
    recall_CI=recall_CI,
    specificity_CI=specificity_CI,
    f1_CI=f1_CI,
)