In [1]:
import os

import numpy as np
import torch
from torch.utils.data import DataLoader

from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from transformers import AdamW
from transformers import BertTokenizerFast, BertModel

# Report whether PyTorch can see a CUDA device (the captured output below shows True).
print(torch.cuda.is_available())

True


## Dataset Parameters

In [2]:
# Corpus size settings.
# NOTE(review): num_neg_ex presumably is the number of negative examples generated
# per document, and 2 * num_neg_ex * num_docs the total number of examples - confirm
# against the data-generation code.
num_docs = 32689
num_neg_ex = 2

# Fraction of the examples reserved for testing, and fraction of the dataset that is
# actually used in this run.
test_docs_per = 0.1
dataset_usage_per = 0.005

# Number of test / train examples to load, derived from the settings above.
# NOTE(review): num_test_docs is already scaled by dataset_usage_per when it is
# subtracted from the unscaled total below - confirm this is the intended train size.
num_test_docs = int(2 * num_neg_ex * num_docs * test_docs_per * dataset_usage_per) + 1
num_train_docs = int((2 * num_neg_ex * num_docs - num_test_docs) * dataset_usage_per)

## Model Hyperparameters

In [3]:
# Number of passes of the training loop over the training data.
num_of_epochs = 2
# Learning rate handed to the AdamW optimizer.
learning_rate = 27e-6
# Batch size used by both DataLoaders.
# NOTE(review): the training run recorded further down in this notebook ended with
# "CUDA out of memory" on a 2 GiB GPU - a smaller batch size may be required there.
batch_size = 16
# Passed to TexSumClassifier as linear_size, i.e. the width of the hidden linear
# layer (despite the name, it is not a number of layers).
hidden_layers = 8

## Utility functions

In [4]:
def read_doc(path):
    """Return the whole content of the UTF-8 encoded text file at ``path``.

    Raises OSError if the file cannot be opened and UnicodeDecodeError if it
    is not valid UTF-8.
    """
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(path, encoding="utf8") as f:
        return f.read()
    
def read_labels(path, limit=None):
    """Read one float label per line from the UTF-8 text file at ``path``.

    At most ``limit`` labels are read, starting from the first line. When
    ``limit`` is None the module-level ``num_train_docs`` is used, which is
    the cap this function has always applied; pass an explicit ``limit``
    (for example ``num_test_docs``) to read a different split.

    Returns a list of floats. Raises ValueError if a line that is read
    cannot be parsed as a float.
    """
    if limit is None:
        limit = num_train_docs

    labels = []
    with open(path, encoding="utf8") as f:
        for line in f:
            # Stop before parsing the line that would exceed the cap.
            if len(labels) >= limit:
                break
            labels.append(float(line.rstrip()))
    return labels

## Loading Dataset
Because the dataset is large, only the document indices are split into train/validation sets. The documents themselves are read from disk later, when tokenization takes place.

In [5]:
# Document files are numbered from 1, so the usable indices are 1..num_train_docs.
indices = list(range(1, num_train_docs + 1))
# os.path.join keeps the path valid on every platform.
labels = read_labels(os.path.join("data", "raw", "train", "labels.txt"))

# Fail early, with a readable message, if the labels file is shorter than expected;
# train_test_split would otherwise reject the inputs with a less specific error.
assert len(labels) == len(indices), (
    f"expected {len(indices)} labels but read {len(labels)}"
)

# Only the indices (not the documents) are split; the fixed seed keeps the split reproducible.
train_indices, val_indices, train_y, val_y = train_test_split(
    indices, labels, test_size=0.15, random_state=42
)

### Tokenization
Each document is loaded from the disk and then run through the tokenizer.

In [6]:
pretrained_model = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model)


def tokenize_documents(doc_indices, split="train"):
    """Read the documents with the given indices and tokenize them as one batch.

    ``doc_indices`` are the numbers N of the files ``data/raw/<split>/data_N.txt``.
    Returns the tokenizer's batch encoding with PyTorch tensors. Every document
    is truncated to at most 512 tokens and padded to the longest document in
    the batch.
    """
    documents = [
        read_doc(os.path.join("data", "raw", split, f"data_{i}.txt"))
        for i in doc_indices
    ]
    # padding=True is required together with return_tensors="pt": without it,
    # documents of different token length cannot be stacked into one tensor.
    return tokenizer(
        documents,
        max_length=512,
        truncation='longest_first',
        padding=True,
        return_tensors="pt",
    )


# Tokenize train data
train_X = tokenize_documents(train_indices)

# Tokenize validation data
val_X = tokenize_documents(val_indices)

### Custom PyTorch Dataset
Create a custom PyTorch Dataset that holds the tokenized text encodings together with their labels. Then wrap the train and validation Datasets in DataLoaders, which serve them in batches for training and validation.

In [7]:
class SumPubMedDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping (for example the tokenizer's batch encoding)
    whose values can be indexed per example; ``labels`` is a sequence with
    one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a tensor that does not share memory with its source."""
        # torch.tensor() on an existing tensor works but emits a UserWarning;
        # detach().clone() is the recommended way to copy a tensor.
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        # An encoding can have keys such as input_ids and attention_mask.
        # The item has the same keys, each holding the idx-th value of the
        # corresponding encoding as a tensor, plus the example's label.
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [8]:
train_dataset = SumPubMedDataset(train_X, train_y)
val_dataset = SumPubMedDataset(val_X, val_y)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation only measures the model, so a fixed order is used: shuffling brings no
# benefit there and makes any batch-wise metric differ from run to run.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

## Model Setup

In [9]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Size of the output provided by the underlying BERT model, which is the input
# width of the first linear layer of the classifier.
in_features = 768

class TexSumClassifier(torch.nn.Module):
    """BERT encoder followed by a small head that emits one sigmoid score per input.

    ``linear_size`` is the width of the hidden linear layer of the head.
    """

    def __init__(self, linear_size):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout1 = torch.nn.Dropout()
        # Take the input width from the loaded model's configuration instead of a
        # hard-coded constant, so the head always matches the encoder.
        self.linear1 = torch.nn.Linear(in_features=self.bert.config.hidden_size, out_features=linear_size)
        self.batch_norm1 = torch.nn.BatchNorm1d(num_features=linear_size)
        self.dropout2 = torch.nn.Dropout(p=0.8)
        self.linear2 = torch.nn.Linear(in_features=linear_size, out_features=1)
        self.batch_norm2 = torch.nn.BatchNorm1d(num_features=1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, tokens, attention_mask):
        """Return the sigmoid output of the head for a batch of token ids.

        ``tokens`` and ``attention_mask`` are forwarded to BERT as ``input_ids``
        and ``attention_mask``. The result has one column per example row.
        """
        bert_output = self.bert(input_ids=tokens, attention_mask=attention_mask)
        # Element 1 of BertModel's output is the pooled output.
        x = self.dropout1(bert_output[1])
        x = self.linear1(x)
        # NOTE(review): dropout2 runs before batch_norm1 here although the layers are
        # declared in the opposite order in __init__, and there is no non-linearity
        # between the two linear layers - confirm both are intended.
        x = self.dropout2(x)
        x = self.batch_norm1(x)
        x = self.linear2(x)
        x = self.batch_norm2(x)
        return self.sigmoid(x)

    def freeze_bert(self):
        """
        Freezes the parameters of BERT so when TexSumClassifier is trained
        only the weights of the custom classifier are modified.
        """
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert(self):
        """
        Unfreezes the parameters of BERT so when TexSumClassifier is trained
        both the weights of the custom classifier and of the underlying BERT are modified.
        """
        for param in self.bert.parameters():
            param.requires_grad = True

### Metrics

In [10]:
def eval_prediction(y_batch_actual, y_batch_predicted):
    """Return the accuracy and the weighted F1 score of a batch of predictions.

    Both arguments may be tensors (on any device), NumPy arrays or sequences.
    They are flattened before scoring, so predictions of shape (batch, 1) can
    be compared with labels of shape (batch,). Predicted scores are rounded to
    the nearest class (0 or 1).

    Returns the tuple ``(accuracy, f1)``.
    """
    def _as_flat_array(values):
        # Tensors must be detached and moved to the CPU before NumPy can read them.
        if isinstance(values, torch.Tensor):
            values = values.detach().cpu().numpy()
        return np.asarray(values).reshape(-1)

    # Integer class labels on both sides keep the two arrays of the same target type.
    y_true = np.round(_as_flat_array(y_batch_actual)).astype(int)
    y_pred = np.round(_as_flat_array(y_batch_predicted)).astype(int)

    acc = accuracy_score(y_true=y_true, y_pred=y_pred)
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')

    return acc, f1

### Model Initialization

In [11]:
# Build the classifier and move it to the selected device in one step
# (Module.to returns the module itself).
model = TexSumClassifier(linear_size=hidden_layers).to(device)

# Binary cross-entropy on the model's sigmoid output, optimized with AdamW.
loss_fn = torch.nn.BCELoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Training step

In [12]:
def training_step(dataloader, model, optimizer, loss_fn, if_freeze_bert):
    """Train the model for one pass over ``dataloader``.

    ``if_freeze_bert`` selects whether the BERT parameters are frozen (True)
    or trained together with the classifier head (False) during this pass.

    Returns the mean loss over the batches, or 0.0 if the dataloader is empty.
    """
    model.train()
    if if_freeze_bert:
        model.freeze_bert()
    else:
        model.unfreeze_bert()

    epoch_loss = 0.0
    num_batches = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # The model returns one column per example; flatten it to match the labels.
        outputs = torch.flatten(model(tokens=input_ids, attention_mask=attention_mask))
        loss = loss_fn(outputs, labels.float())
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    return epoch_loss / num_batches if num_batches else 0.0

### Validation step

In [13]:
def validation_step(dataloader, model, loss_fn):
    """Return the accuracy and F1 score of the model on ``dataloader``.

    The predictions of all batches are collected and scored once, so the
    result does not depend on the batch size or on a smaller last batch.
    ``loss_fn`` is accepted for interface compatibility and is not used.

    The model is put in evaluation mode; its parameters' ``requires_grad``
    flags are left untouched because no gradients are computed here.

    Returns the tuple ``(accuracy, f1)``, or ``(0.0, 0.0)`` for an empty
    dataloader.
    """
    model.eval()

    all_labels, all_predictions = [], []

    with torch.no_grad():
        for batch in dataloader:
            X = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            y = batch['labels']

            pred = model(tokens=X, attention_mask=attention_mask)

            # Keep the collected results on the CPU so they do not occupy GPU memory.
            all_labels.append(y.float().cpu())
            all_predictions.append(torch.flatten(pred).cpu())

    if not all_predictions:
        return 0.0, 0.0

    acc, f1 = eval_prediction(torch.cat(all_labels), torch.cat(all_predictions))
    return acc, f1

## Training the Model

In [14]:
best_acc, best_f1 = 0, 0
path = os.path.join("models", "sumpubmed_bert", "best_model.pt")
# torch.save does not create missing directories; make sure the target exists
# before an epoch of training is spent.
os.makedirs(os.path.dirname(path), exist_ok=True)

# BERT is fine-tuned during the first `unfrozen_epochs` epochs and frozen afterwards.
unfrozen_epochs = 5

for epoch in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(epoch+1))

    if_freeze_bert = epoch >= unfrozen_epochs
    if if_freeze_bert:
        print("Bert is frozen")
    else:
        print("Bert is not frozen")

    training_step(train_loader, model, optimizer, loss_fn, if_freeze_bert)
    train_acc, train_f1 = validation_step(train_loader, model, loss_fn)
    val_acc, val_f1 = validation_step(val_loader, model, loss_fn)

    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))

    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))

    # Keep only the checkpoint with the best validation accuracy seen so far.
    if val_acc > best_acc:
        best_acc, best_f1 = val_acc, val_f1
        torch.save(model, path)

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch: #1
Bert is not frozen


  # Remove the CWD from sys.path while we load stuff.


RuntimeError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 2.00 GiB total capacity; 5.00 GiB already allocated; 0 bytes free; 5.05 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Testing the Model

### Load Test Dataset

In [None]:
test_indices = list(range(1, num_test_docs + 1))
# read_labels caps its result at num_train_docs by default, which can be more than
# the number of test documents; keep exactly one label per test document.
test_y = read_labels(os.path.join("data", "raw", "test", "labels.txt"))[:num_test_docs]

# Tokenize test data
test_documents = [
    read_doc(os.path.join("data", "raw", "test", f"data_{i}.txt"))
    for i in test_indices
]
# Tokenize all documents in one call so that test_X is a single batch encoding
# that can be indexed with 'input_ids' / 'attention_mask'; padding=True lets
# documents of different length be stacked into one tensor.
test_X = tokenizer(
    test_documents,
    max_length=512,
    truncation='longest_first',
    padding=True,
    return_tensors="pt",
)
del test_documents

### Evaluate on Test Predictions

In [None]:
# torch.load unpickles the whole model object: only load checkpoint files you trust.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which
# rejects whole-model pickles - pass weights_only=False there if loading fails.
model = torch.load(path, map_location=device)
model.eval()

# The labels are read as a plain list; the metrics helper is given a float tensor.
test_labels = torch.as_tensor(test_y, dtype=torch.float)

# test_X is either one batch encoding or a list with one encoding per document.
# Both are turned into small (input_ids, attention_mask) batches so the whole test
# set never has to fit on the device at once.
if isinstance(test_X, list):
    test_batches = [(enc['input_ids'], enc['attention_mask']) for enc in test_X]
else:
    test_batches = [
        (test_X['input_ids'][start:start + batch_size],
         test_X['attention_mask'][start:start + batch_size])
        for start in range(0, len(test_X['input_ids']), batch_size)
    ]

batch_predictions = []
with torch.no_grad():
    for input_ids, attention_mask in test_batches:
        output = model(tokens=input_ids.to(device), attention_mask=attention_mask.to(device))
        batch_predictions.append(torch.flatten(output).cpu())
predictions = torch.cat(batch_predictions)

# Labels are stored in document order, so the first len(predictions) labels belong
# to the documents that were scored.
acc_test, f1_test = eval_prediction(test_labels[:len(predictions)], predictions)
binary_predictions = np.round(predictions.numpy()).astype(int).flatten()

print("Testing results: ")
print("Acc: {:.3f}, f1: {:.3f}".format(acc_test, f1_test))