### Used only for Google Colab
Run this cell before everything else.

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# 
# import zipfile
# zip = zipfile.ZipFile('/content/drive/MyDrive/Colab Datasets/SumPubMed/data.zip')
# zip.extractall('/tmp')
# zip.close()

In [2]:
from transformers import BertTokenizerFast, BertModel
from transformers import AdamW
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

print(torch.cuda.is_available())

True


## Dataset Parameters

In [3]:
# Corpus-level counts. NOTE(review): `num_neg_ex` presumably is the number of
# negative examples generated per document during preprocessing — confirm.
num_docs = 32689
num_neg_ex = 2

# Fraction of examples reserved for testing, and fraction of the whole
# dataset this run actually uses.
test_docs_per = 0.1
dataset_usage_per = 0.01

data_path = 'processed_data/preprocessed/results'

# Example count that both split sizes below are derived from.
_total_examples = 2 * num_neg_ex * num_docs

num_test_docs = int(_total_examples * test_docs_per * dataset_usage_per) + 1
num_train_docs = int((_total_examples - num_test_docs) * dataset_usage_per)

## Model Hyperparameters

In [4]:
num_of_epochs = 50  # number of iterations of the training loop
learning_rate = 1e-4  # learning rate handed to the optimizer
batch_size = 16  # batch size used by both DataLoaders
# Passed to TexSumClassifier as `linear_size`, which the classifier does not
# currently use. NOTE(review): despite the name this reads like a hidden-layer
# width rather than a layer count — confirm and consider renaming.
hidden_layers = 8

## Utility functions

In [5]:
def read_doc(path):
    """Return the whole content of the UTF-8 text file at ``path``.

    Args:
        path: Location of the document on disk.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file on exit, so no explicit close() is needed.
    with open(path, encoding="utf8") as f:
        return f.read()

def read_labels(path, max_labels=None):
    """Read one numeric label per line from ``path`` into a float tensor.

    Args:
        path: UTF-8 text file holding one label per line.
        max_labels: Maximum number of lines to read. When omitted, the
            notebook-level ``num_train_docs`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        A ``torch.FloatTensor`` with at most ``max_labels`` entries, in file order.

    Raises:
        ValueError: If a line that is read cannot be parsed as a float.
    """
    if max_labels is None:
        # Backward-compatible default: cap at the training-set size.
        max_labels = num_train_docs

    labels = []
    with open(path, encoding="utf8") as f:
        for line in f:
            # Stop before parsing anything beyond the requested number of labels.
            if len(labels) >= max_labels:
                break
            labels.append(float(line.rstrip()))
    return torch.FloatTensor(labels)

## Loading Dataset
Because of the large size of the Dataset, only the document indices will be used for splitting into train/validation sets. The documents will be loaded when the tokenization process takes place.

In [6]:
# Document files are addressed as data_<i>.txt with i starting at 1, so the
# index list runs from 1 to num_train_docs inclusive.
indices = list(range(1, num_train_docs + 1))
labels = read_labels(f"{data_path}/train/labels.txt")[:num_train_docs]

# Split indices (not documents) so nothing large is held in memory yet:
# 15% goes to validation, with a fixed seed so the split is repeatable.
train_indices, val_indices, train_y, val_y = train_test_split(
    indices,
    labels,
    test_size=0.15,
    random_state=42,
)

### Tokenization
Each document is loaded from the disk and then run through the tokenizer.

In [7]:
pretrained_model = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model)


def tokenize_split(doc_indices, split_dir="train"):
    """Load the documents with the given indices and tokenize them together.

    Args:
        doc_indices: Iterable of document numbers; document ``i`` is read
            from ``{data_path}/{split_dir}/data_{i}.txt``.
        split_dir: Sub-directory of ``data_path`` that holds the files.

    Returns:
        The tokenizer output with PyTorch tensors. Sequences are truncated to
        512 tokens and padded to the longest sequence in the call.
    """
    documents = [read_doc(f"{data_path}/{split_dir}/data_{i}.txt") for i in doc_indices]
    # padding=True is required with return_tensors="pt": without it the
    # tokenizer cannot build a rectangular tensor as soon as one document is
    # shorter than the others.
    return tokenizer(
        documents,
        max_length=512,
        padding=True,
        truncation='longest_first',
        return_tensors="pt",
    )


# Tokenize train data
train_X = tokenize_split(train_indices)

# Tokenize validation data
val_X = tokenize_split(val_indices)

### Custom PyTorch Dataset
Create a custom PyTorch Dataset that will contain the tokenized text encodings. Then use DataLoaders to prepare the Dataset for training and testing.

In [8]:
class SumPubMedDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``,
            ``attention_mask``) to an indexable collection with one entry per example.
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_item_tensor(value):
        """Return ``value`` as a new tensor that does not alias the source data."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) works but emits a copy-construct
            # warning on every call; clone().detach() is the recommended form.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx`` plus a ``labels`` entry."""
        item = {key: self._as_item_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_item_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

In [9]:
train_dataset = SumPubMedDataset(train_X, train_y)
val_dataset = SumPubMedDataset(val_X, val_y)

# Only the training batches are shuffled. Evaluation gains nothing from a
# random order, and a fixed order keeps batch composition (and therefore any
# per-batch metric) repeatable from run to run.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

## Model Setup

In [10]:
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Width of the BERT output vector fed to the classification layer.
in_features = 768

class TexSumClassifier(torch.nn.Module):
    """BERT encoder followed by a single linear unit and a sigmoid.

    ``forward`` returns one value in (0, 1) per input sequence.

    Args:
        linear_size: Accepted for interface compatibility; the current
            single-layer head does not use it.
    """

    def __init__(self, linear_size):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        # Registered but not applied in forward().
        self.dropout1 = torch.nn.Dropout(0.9)
        self.linear1 = torch.nn.Linear(in_features=in_features, out_features=1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, tokens, attention_mask):
        """Run BERT on the batch and map element 1 of its output to a probability.

        Args:
            tokens: Token ids, forwarded to BERT as ``input_ids``.
            attention_mask: Attention mask forwarded to BERT unchanged.

        Returns:
            The sigmoid of the linear layer's output.
        """
        bert_output = self.bert(input_ids=tokens, attention_mask=attention_mask)
        # Index 1 of the BERT output is the pooled sequence representation
        # (per the Hugging Face BertModel documentation).
        pooled = bert_output[1]
        return self.sigmoid(self.linear1(pooled))

    def _set_bert_trainable(self, trainable):
        """Set ``requires_grad`` on every parameter of the wrapped BERT model."""
        for weight in self.bert.parameters():
            weight.requires_grad = trainable

    def freeze_bert(self):
        """
        Freezes the parameters of BERT so that training TexSumClassifier
        only modifies the weights of the classification layer.
        """
        self._set_bert_trainable(False)

    def unfreeze_bert(self):
        """
        Unfreezes the parameters of BERT so that training TexSumClassifier
        modifies both the classification layer and the underlying BERT weights.
        """
        self._set_bert_trainable(True)

### Metrics

In [11]:
def eval_prediction(y_batch_actual, y_batch_predicted):
    """Score predictions against the true labels.

    Args:
        y_batch_actual: Tensor of true labels.
        y_batch_predicted: Tensor of predicted scores; values are rounded to
            the nearest integer before scoring.

    Returns:
        Tuple ``(accuracy, f1)`` where the F1 score uses ``average='weighted'``.
    """
    targets = y_batch_actual.cpu().detach().numpy()
    scores = y_batch_predicted.cpu().detach().numpy()
    # Rounding turns the scores into hard labels.
    hard_preds = np.round(scores)

    return (
        accuracy_score(y_true=targets, y_pred=hard_preds),
        f1_score(y_true=targets, y_pred=hard_preds, average='weighted'),
    )

### Model Initialization

In [12]:
# Build the classifier and move it to the selected device in one step
# (Module.to returns the module itself).
model = TexSumClassifier(linear_size=hidden_layers).to(device)

# AdamW over every model parameter; freezing is handled per epoch through
# requires_grad, not by changing the optimizer's parameter list.
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Binary cross-entropy on probabilities, matching the model's sigmoid output.
loss_fn = torch.nn.BCELoss()

### Training step

In [13]:
def training_step(dataloader, model, optimizer, loss_fn, if_freeze_bert):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        model: Module exposing ``freeze_bert()`` / ``unfreeze_bert()`` and a
            ``forward(tokens=..., attention_mask=...)`` signature.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        if_freeze_bert: When true BERT is frozen for this pass, otherwise it is unfrozen.

    Returns:
        The mean of the per-batch losses, or ``0.0`` when the dataloader
        yields no batches.
    """
    model.train()
    if if_freeze_bert:
        model.freeze_bert()
    else:
        model.unfreeze_bert()

    total_loss = 0.0
    num_batches = 0

    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Flatten the model output so it lines up with the 1-D label vector.
        outputs = torch.flatten(model(tokens=input_ids, attention_mask=attention_mask))
        loss = loss_fn(outputs, labels.float())
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Divide by the number of batches actually processed. Dividing by the last
    # loop index instead is off by one and fails for a single-batch loader.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

### Validation step

In [14]:
def validation_step(dataloader, model, loss_fn):
    """Compute accuracy and F1 of ``model`` over every example in ``dataloader``.

    Predictions from all batches are gathered and scored once, so each example
    carries the same weight whatever the batch sizes are and the F1 score is
    computed on the full set rather than averaged over batches.

    Args:
        dataloader: Iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        model: Module exposing ``freeze_bert()`` and a
            ``forward(tokens=..., attention_mask=...)`` signature.
        loss_fn: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(accuracy, f1)`` as produced by ``eval_prediction``, or
        ``(0.0, 0.0)`` when the dataloader yields no batches.
    """
    model.eval()
    # Kept from the original flow: BERT is left frozen after evaluation.
    model.freeze_bert()

    all_targets, all_preds = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            X = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            pred = model(tokens=X, attention_mask=attention_mask)

            # Collect on the CPU so only one batch at a time lives on the device.
            all_targets.append(batch['labels'].float().cpu())
            all_preds.append(pred.cpu())

    if not all_targets:
        return 0.0, 0.0

    acc, f1 = eval_prediction(torch.cat(all_targets), torch.cat(all_preds))
    return acc, f1

## Training the Model

In [None]:
tqdm.pandas()

# Best validation scores seen so far and where the best model is stored.
best_acc, best_f1 = 0, 0
path = "./best_model.pt"
os.makedirs(os.path.dirname(path), exist_ok=True)
if_freeze_bert = False

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))

    # BERT is trained during the first five epochs and frozen from the sixth on.
    if_freeze_bert = i >= 5
    print("Bert is frozen" if if_freeze_bert else "Bert is not frozen")

    epoch_loss = training_step(train_loader, model, optimizer, loss_fn, if_freeze_bert)

    # Score the training and validation sets with the same evaluation routine.
    train_acc, train_f1 = validation_step(train_loader, model, loss_fn)
    val_acc, val_f1 = validation_step(val_loader, model, loss_fn)

    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}, loss: {:.3f}".format(train_acc, train_f1, epoch_loss))

    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))

    # Keep the model with the highest validation accuracy; the whole module is
    # saved because the test cell loads it back with torch.load(path).
    improved = val_acc > best_acc
    if improved:
        best_acc = val_acc
        torch.save(model, path)

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: #1
Bert is not frozen


  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Training results: 
Acc: 0.501, f1: 0.347, loss: 0.709
Validation results: 
Acc: 0.481, f1: 0.317
Epoch: #2
Bert is not frozen


  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Training results: 
Acc: 0.498, f1: 0.340, loss: 0.706
Validation results: 
Acc: 0.548, f1: 0.404
Epoch: #3
Bert is not frozen


  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Training results: 
Acc: 0.499, f1: 0.342, loss: 0.707
Validation results: 
Acc: 0.505, f1: 0.350
Epoch: #4
Bert is not frozen


  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Training results: 
Acc: 0.501, f1: 0.344, loss: 0.708
Validation results: 
Acc: 0.466, f1: 0.308
Epoch: #5
Bert is not frozen


  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Training results: 
Acc: 0.493, f1: 0.335, loss: 0.710
Validation results: 
Acc: 0.519, f1: 0.362
Epoch: #6
Bert is frozen


  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Training results: 
Acc: 0.498, f1: 0.339, loss: 0.712
Validation results: 
Acc: 0.519, f1: 0.364
Epoch: #7
Bert is frozen


  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Training results: 
Acc: 0.507, f1: 0.351, loss: 0.703
Validation results: 
Acc: 0.495, f1: 0.342
Epoch: #8
Bert is frozen


  0%|          | 0/70 [00:00<?, ?it/s]

## Testing the Model

### Load Test Dataset

In [None]:
test_indices = list(range(1, num_test_docs + 1))
# read_labels stops after num_train_docs lines by default, so the result is
# sliced down to the test-set size.
test_y = read_labels(f"{data_path}/test/labels.txt")[:num_test_docs]

# Tokenize test data
documents = [read_doc(f"{data_path}/test/data_{i}.txt") for i in test_indices]
# padding=True is required with return_tensors="pt": without it the tokenizer
# cannot build a rectangular tensor when documents differ in length.
test_X = tokenizer(
    documents,
    max_length=512,
    padding=True,
    truncation='longest_first',
    return_tensors="pt",
)

### Evaluate on Test Predictions

In [None]:
# Load the best model saved by the training loop. map_location lets a
# checkpoint saved on a GPU be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles arbitrary objects — only load files this
# notebook produced. Recent PyTorch releases may also need weights_only=False
# to load a fully pickled module; confirm against the installed version.
model = torch.load(path, map_location=device)
model.to(device)
model.eval()

# Run inference in chunks of `batch_size` so memory use does not grow with the
# size of the test set; metrics are still computed once over all predictions.
num_test_examples = test_X['input_ids'].shape[0]
chunk_predictions = []
with torch.no_grad():
    for start in range(0, num_test_examples, batch_size):
        stop = start + batch_size
        chunk_output = model(
            tokens=test_X['input_ids'][start:stop].to(device),
            attention_mask=test_X['attention_mask'][start:stop].to(device),
        )
        chunk_predictions.append(chunk_output.cpu())

predictions = torch.cat(chunk_predictions)
acc_test, f1_test = eval_prediction(test_y, predictions)

print("Testing results: ")
print("Acc: {:.3f}, f1: {:.3f}".format(acc_test, f1_test))