In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os 

# Set your working directory to a folder in your Google Drive. This way, if your notebook times out,
# your files will be saved in your Google Drive!

# the base Google Drive directory
root_dir = "/content/drive/My Drive/"

# choose where you want your project files to be saved
project_folder = "Colab Notebooks/BERT"

def create_and_set_working_directory(project_folder):
  # check if your project folder exists. if not, it will be created.
  if os.path.isdir(root_dir + project_folder) == False:
    os.mkdir(root_dir + project_folder)
    print(root_dir + project_folder + ' did not exist but was created.')

  # change the OS to use your project folder as the working directory
  os.chdir(root_dir + project_folder)

create_and_set_working_directory(project_folder)

In [None]:
!pip install transformers

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [None]:
# !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
# !unzip -o smsspamcollection.zip
# !head -10 SMSSpamCollection

In [None]:
!head -10 MAUDE_2008_2016_labeled_data_train.csv

ID,HIT,REPORT
1915321,0,"THE ROTATOR BROKE DURING A LEFT VENTRICULOGRAM PROCEDURE. NO HARM OR INJURY WAS REPORTED. THE CUSTOMER REPORTED THREE (3) DEFECTIVE DEVICES BUT HAS NOT PROVIDED ANY ADDITIONAL INFORMATION OR CLINICAL DETAILS FOR THE ADDITIONAL EVENTS. THE CUSTOMER RETURNED ONE DEVICE. THEREFORE, THIS SINGLE REPORT WILL BE SUBMITTED FOR THIS COMPLAINT. DEVICE EVALUATION: THE EVALUATION/INVESTIGATION IS NOT COMPLETE. A FOLLOW-UP REPORT WILL BE SUBMITTED WHEN THE EVALUATION IS COMPLETED. EVALUATION: CONCLUSIONS - A FOLLOW-UP REPORT WILL BE SUBMITTED WHEN THE DEVICE EVALUATION HAS BEEN COMPLETED."
2042167,0,"THE REPORTER REPORTED THAT ON (B)(6) 2011 THE PT OBTAINED THE ELEVATED BLOOD GLUCOSE READING OF 360 MG/DL AND SHE TESTED POSITIVE FOR KETONES. AT THAT TIME, THE PT EXPERIENCED THE SYMPTOMS OF NAUSEA AND BLURRED VISION. AT 11:11 PM THE PT CHANGED THE CARTRIDGE AND OBSERVED INSULIN LEAKING FROM THE TUBING CONNECTION TO THE CARTRIDGE, AND THE CARTRIDGE COMPARTMENT SMELLED OF INS

In [None]:
file_path = './MAUDE_2008_2016_labeled_data_train.csv'
df = pd.DataFrame({'label':int(), 'text':str()}, index = [])
with open(file_path) as f:
  next(f)
  for line in f.readlines():
    split = line.split(',')
    df = df.append({'label': split[1],
                    'text': split[2]},
                    ignore_index = True)
df.head()

Unnamed: 0,label,text
0,0,"""THE ROTATOR BROKE DURING A LEFT VENTRICULOGRA..."
1,0,"""THE REPORTER REPORTED THAT ON (B)(6) 2011 THE..."
2,0,THERMADRAPE THERMAL BLANKET EMITTED A STRONG F...
3,0,LEICA BIOSYSTEMS RECEIVED A COMPLAINT REGARDIN...
4,1,"""GE HEALTHCARE HAS RECEIVED NOTIFICATION OF A ..."


#Extract text and label values

In [None]:
text = df.text.values
labels = df.label.values

#Preprocessing



In [None]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
    )

In [None]:
def print_rand_sentence():
  '''Displays the tokens and respective IDs of a random text sample'''
  #index = random.randint(0, len(text)-1)
  max_len = 0
  for index in range(len(text)):
    table = np.array([tokenizer.tokenize(text[index]), 
                      tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[index]))]).T
    # print(tabulate(table,
    #               headers = ['Tokens', 'Token IDs'],
    #               tablefmt = 'fancy_grid'))
    max_len = max(max_len, len(table))
  print(max_len)
print_rand_sentence()

1820


In [None]:
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  '''
  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 512,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels.astype(int))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
token_id[6]

In [None]:

def print_rand_sentence_encoding():
  '''Displays tokens, token IDs and attention mask of a random text sample'''
  index = random.randint(0, len(text) - 1)
  tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
  token_ids = [i.numpy() for i in token_id[index]]
  attention = [i.numpy() for i in attention_masks[index]]

  table = np.array([tokens, token_ids, attention]).T
  print(tabulate(table, 
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

╒══════════════╤═════════════╤══════════════════╕
│ Tokens       │   Token IDs │   Attention Mask │
╞══════════════╪═════════════╪══════════════════╡
│ [CLS]        │         101 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ "            │        1000 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ facility     │        4322 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ reported     │        2988 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ a            │        1037 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ product      │        4031 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ problem      │        3291 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ with         │        2007 │                1 │
├──────────────┼─────────────┼──────────────────┤
│ no           │        2053 │                1 │


#Data split

In [None]:
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx], 
                        attention_masks[val_idx], 
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

#Train

In [None]:
def b_tp(preds, labels):
  '''Returns True Positives (TP): count of correct predictions of actual class 1'''
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
  '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
  '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
  '''
  Returns the following metrics:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
  '''
  preds = np.argmax(preds, axis = 1).flatten()
  labels = labels.flatten()
  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  fp = b_fp(preds, labels)
  fn = b_fn(preds, labels)
  b_accuracy = (tp + tn) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity

In [None]:
# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 2e-5,
                              eps = 1e-08
                              )

# Run on GPU
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 4

for _ in trange(epochs, desc = 'Epoch'):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables 
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids, 
                              token_type_ids = None, 
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')


Epoch:  25%|██▌       | 1/4 [05:56<17:50, 356.84s/it]


	 - Train loss: 0.5120
	 - Validation Accuracy: 0.8402
	 - Validation Precision: 0.8143
	 - Validation Recall: 0.7746
	 - Validation Specificity: 0.8886



Epoch:  50%|█████     | 2/4 [11:53<11:53, 356.81s/it]


	 - Train loss: 0.3242
	 - Validation Accuracy: 0.8466
	 - Validation Precision: 0.8194
	 - Validation Recall: 0.7675
	 - Validation Specificity: 0.8950



Epoch:  75%|███████▌  | 3/4 [17:50<05:56, 356.78s/it]


	 - Train loss: 0.2172
	 - Validation Accuracy: 0.8434
	 - Validation Precision: 0.7994
	 - Validation Recall: 0.8208
	 - Validation Specificity: 0.8595



Epoch: 100%|██████████| 4/4 [23:47<00:00, 356.84s/it]


	 - Train loss: 0.1335
	 - Validation Accuracy: 0.8402
	 - Validation Precision: 0.7943
	 - Validation Recall: 0.8075
	 - Validation Specificity: 0.8606






#Predict

In [None]:
new_sentence = "customer reported unexpected negative results when testing patient sample with known anti-e on the galileo. the sample resulted as negative for with both cells of capture-r ready-screen i and ii (crrs 2). >< review of instrument images: 2 cell plate. well e1- negative reaction, 28 reaction strength- visually the well appears weakly positive. well f1- negative reaction, 29 reaction strength- visually the well appears weakly positive. per the capture-r package insert: some igg antibodies have been shown to react poorly in solid phase red blood cell adherence assays. weak examples of clinically relevant antibodies may fail to react by cature-r ready-screen, even though the antibodies are detected by an alternative technique. the customer did not return product or sample for further serological testing. the sample could not be ruled as the cause of the event."

# We need Token IDs and Attention Mask for inference on the new sentence
test_ids = []
test_attention_mask = []

# Apply the tokenizer
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)

# Forward pass, calculate logit predictions
with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

prediction = 'HIT' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'None-HIT'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

Input Sentence:  customer reported unexpected negative results when testing patient sample with known anti-e on the galileo. the sample resulted as negative for with both cells of capture-r ready-screen i and ii (crrs 2). >< review of instrument images: 2 cell plate. well e1- negative reaction, 28 reaction strength- visually the well appears weakly positive. well f1- negative reaction, 29 reaction strength- visually the well appears weakly positive. per the capture-r package insert: some igg antibodies have been shown to react poorly in solid phase red blood cell adherence assays. weak examples of clinically relevant antibodies may fail to react by cature-r ready-screen, even though the antibodies are detected by an alternative technique. the customer did not return product or sample for further serological testing. the sample could not be ruled as the cause of the event.
Predicted Class:  None-HIT


