In [1]:
from transformers import BertTokenizerFast, BertModel
from transformers import AdamW
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

print(torch.cuda.is_available())

True


## Dataset Parameters

In [2]:
train_docs = 287113
val_docs = 13368
test_docs = 11490

num_neg_ex = 1

docs_per = 0.002

train_docs_used = int(2 * num_neg_ex * train_docs * docs_per)
val_docs_used = int(2 * num_neg_ex * val_docs * docs_per)
test_docs_used = int(2 * num_neg_ex * test_docs * docs_per)

print(f"Number of training documents used:{train_docs_used}")
print(f"Number of validation documents used:{val_docs_used}")
print(f"Number of testing documents used:{test_docs_used}")

data_path = 'processed_data'

Number of training documents used:1148
Number of validation documents used:53
Number of testing documents used:45


## Model Hyperparameters

In [3]:
num_of_epochs = 100
learning_rate = 1e-5
batch_size = 16
hidden_layers = 84

## Loading Dataset
Because of the large size of the Dataset, only a small portion of the documents will be loaded for training and validation.

The labels are converted from 0, 1 to [1, 0] and [0, 1] respectively.

In [4]:
train_df = pd.read_csv(f"{data_path}/train.csv")
train_docs = train_df['input'][:train_docs_used].to_list()
train_y = train_df['label'][:train_docs_used].apply(lambda label: [0, 1] if label == 1 else [1, 0]).to_list()

val_df = pd.read_csv(f"{data_path}/validation.csv")
val_docs = val_df['input'][:val_docs_used].to_list()
val_y = val_df['label'][:val_docs_used].apply(lambda label: [0, 1] if label == 1 else [1, 0]).to_list()

### Tokenization
Each document is run through the tokenizer.

In [5]:
pretrained_model = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model)

# Tokenize train data
train_X = tokenizer(train_docs, max_length=512, truncation='longest_first', return_tensors="pt", padding="max_length")
val_X = tokenizer(val_docs, max_length=512, truncation='longest_first', return_tensors="pt", padding="max_length")

### Custom PyTorch Dataset
Create a custom PyTorch Dataset that will contain the tokenized text encodings. Then use DataLoaders to prepare the Dataset for training and testing.

In [6]:
class CnnDailymailDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # an encoding can have keys such as input_ids and attention_mask
        # item is a dictionary which has the same keys as the encoding has
        # and the values are the idxth value of the corresponding key (in PyTorch's tensor format)
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [7]:
train_dataset = CnnDailymailDataset(train_X, train_y)
val_dataset = CnnDailymailDataset(val_X, val_y)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

## Model Setup

In [8]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

in_features = 768 # it's 768 because that's the size of the output provided by the underlying BERT model

class TexSumClassifier(torch.nn.Module):
    def __init__(self, linear_size):
        super(TexSumClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout1 = torch.nn.Dropout()
        self.linear1 = torch.nn.Linear(in_features=in_features, out_features=linear_size)
        self.batch_norm1 = torch.nn.BatchNorm1d(num_features=linear_size)
        self.dropout2 = torch.nn.Dropout(p=0.8)
        self.linear2 = torch.nn.Linear(in_features=linear_size, out_features=2)
        self.batch_norm2 = torch.nn.BatchNorm1d(num_features=2)
        self.sigmoid = torch.nn.Sigmoid()
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, tokens, attention_mask):
        bert_output = self.bert(input_ids=tokens, attention_mask=attention_mask)
        x = self.dropout1(bert_output[1])
        x = self.linear1(x)
        x = self.batch_norm1(x)
        x = self.dropout2(x)
        x = self.linear2(x)
        x = self.batch_norm2(x)
        return self.softmax(x)

    def freeze_bert(self):
        for param in self.bert.named_parameters():
            param[1].requires_grad=False

    def unfreeze_bert(self):
        for param in self.bert.named_parameters():
            param[1].requires_grad=True

### Metrics

In [9]:
def eval_prediction(y_batch_actual, y_batch_predicted):
    """Return batches of accuracy and f1 scores."""
    y_batch_actual_np = torch.argmax(y_batch_actual, dim=1).cpu().detach().numpy()
    y_batch_predicted_np = torch.argmax(y_batch_predicted, dim=1).cpu().detach().numpy()

    acc = accuracy_score(y_true=y_batch_actual_np, y_pred=y_batch_predicted_np)
    f1 = f1_score(y_true=y_batch_actual_np, y_pred=y_batch_predicted_np, average='weighted')

    return acc, f1

### Model Initialization

In [10]:
model = TexSumClassifier(linear_size=hidden_layers)
model.to(device)

# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.BCELoss()

### Training step

In [11]:
def training_step(dataloader, model, optimizer, loss_fn):
    """Method to train the model"""

    model.train()
    model.unfreeze_bert()

    epoch_loss = 0

    for i, batch in enumerate(tqdm(dataloader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        # outputs = torch.flatten(model(tokens=input_ids, attention_mask=attention_mask))
        outputs = model(tokens=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels.float())
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()

    return epoch_loss/i

### Validation step

In [12]:
def validation_step(dataloader, model):
    """Method to test the model's accuracy and loss on the validation set"""

    model.eval()
    model.freeze_bert()

    size = len(dataloader)
    f1, acc = 0, 0

    with torch.no_grad():
        for batch in tqdm(dataloader):
            X = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            y = batch['labels'].to(device)

            pred = model(tokens=X, attention_mask=attention_mask)

            acc_batch, f1_batch = eval_prediction(y.float(), pred)
            acc += acc_batch
            f1 += f1_batch

        acc = acc/size
        f1 = f1/size

    return acc, f1

## Training the Model

In [13]:
tqdm.pandas()

best_acc = 0
path = "./best_model.pt"
os.makedirs(os.path.dirname(path), exist_ok=True)

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))

    epoch_loss = training_step(train_loader, model,optimizer, loss_fn)
    
    if i % 10 == 0: 
        # Print accuracy and F1 statistics every 10 epochs
        train_acc, train_f1 = validation_step(train_loader, model)
        val_acc, val_f1 = validation_step(val_loader, model)
        
        print("Training results: ")
        print("Acc: {:.3f}, f1: {:.3f}, loss: {:.3f}".format(train_acc, train_f1, epoch_loss))
        
        print("Validation results: ")
        print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))
        
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model, path)
    else:
        # Else print just the epoch loss
        print("Training results: ")
        print("Loss: {:.3f}".format(epoch_loss))

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: #1


  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Training results: 
Acc: 0.508, f1: 0.377, loss: 0.864
Validation results: 
Acc: 0.509, f1: 0.346
Epoch: #2


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.916
Epoch: #3


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.892
Epoch: #4


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.893
Epoch: #5


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.891
Epoch: #6


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.901
Epoch: #7


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.903
Epoch: #8


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.901
Epoch: #9


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.894
Epoch: #10


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.904
Epoch: #11


  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Training results: 
Acc: 0.555, f1: 0.471, loss: 0.885
Validation results: 
Acc: 0.569, f1: 0.485
Epoch: #12


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.888
Epoch: #13


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.832
Epoch: #14


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.822
Epoch: #15


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.817
Epoch: #16


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.748
Epoch: #17


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.753
Epoch: #18


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.704
Epoch: #19


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.698
Epoch: #20


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.648
Epoch: #21


  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Training results: 
Acc: 0.591, f1: 0.508, loss: 0.664
Validation results: 
Acc: 0.716, f1: 0.690
Epoch: #22


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.661
Epoch: #23


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.684
Epoch: #24


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.645
Epoch: #25


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.685
Epoch: #26


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.643
Epoch: #27


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.631
Epoch: #28


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.637
Epoch: #29


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.637
Epoch: #30


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.646
Epoch: #31


  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Training results: 
Acc: 0.592, f1: 0.505, loss: 0.617
Validation results: 
Acc: 0.700, f1: 0.643
Epoch: #32


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.623
Epoch: #33


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.646
Epoch: #34


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.636
Epoch: #35


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.618
Epoch: #36


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.640
Epoch: #37


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.618
Epoch: #38


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.630
Epoch: #39


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.624
Epoch: #40


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.625
Epoch: #41


  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Training results: 
Acc: 0.590, f1: 0.511, loss: 0.634
Validation results: 
Acc: 0.681, f1: 0.622
Epoch: #42


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.646
Epoch: #43


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.632
Epoch: #44


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.638
Epoch: #45


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.627
Epoch: #46


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.602
Epoch: #47


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.616
Epoch: #48


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.633
Epoch: #49


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.605
Epoch: #50


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.612
Epoch: #51


  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Training results: 
Acc: 0.587, f1: 0.502, loss: 0.637
Validation results: 
Acc: 0.634, f1: 0.559
Epoch: #52


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.611
Epoch: #53


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.640
Epoch: #54


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.630
Epoch: #55


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.627
Epoch: #56


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.628
Epoch: #57


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.615
Epoch: #58


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.707
Epoch: #59


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.622
Epoch: #60


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.644
Epoch: #61


  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Training results: 
Acc: 0.592, f1: 0.511, loss: 0.643
Validation results: 
Acc: 0.716, f1: 0.688
Epoch: #62


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.635
Epoch: #63


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.627
Epoch: #64


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.611
Epoch: #65


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.625
Epoch: #66


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.612
Epoch: #67


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.625
Epoch: #68


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.614
Epoch: #69


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.635
Epoch: #70


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.618
Epoch: #71


  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Training results: 
Acc: 0.591, f1: 0.506, loss: 0.624
Validation results: 
Acc: 0.716, f1: 0.674
Epoch: #72


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.614
Epoch: #73


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.606
Epoch: #74


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.605
Epoch: #75


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.600
Epoch: #76


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.609
Epoch: #77


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.619
Epoch: #78


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.633
Epoch: #79


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.598
Epoch: #80


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.630
Epoch: #81


  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Training results: 
Acc: 0.591, f1: 0.505, loss: 0.612
Validation results: 
Acc: 0.750, f1: 0.717
Epoch: #82


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.616
Epoch: #83


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.612
Epoch: #84


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.612
Epoch: #85


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.622
Epoch: #86


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.617
Epoch: #87


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.618
Epoch: #88


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.644
Epoch: #89


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.627
Epoch: #90


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.615
Epoch: #91


  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Training results: 
Acc: 0.593, f1: 0.509, loss: 0.618
Validation results: 
Acc: 0.716, f1: 0.679
Epoch: #92


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.619
Epoch: #93


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.602
Epoch: #94


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.605
Epoch: #95


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.609
Epoch: #96


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.620
Epoch: #97


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.625
Epoch: #98


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.626
Epoch: #99


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.626
Epoch: #100


  0%|          | 0/72 [00:00<?, ?it/s]

Training results: 
Loss: 0.605


## Testing the Model

### Load Test Dataset

In [14]:
test_df = pd.read_csv(f"{data_path}/test.csv")
test_docs = test_df['input'][:test_docs_used].to_list()
test_y = test_df['label'][:test_docs_used].apply(lambda label: [0, 1] if label == 1 else [1, 0]).to_list()

test_X = tokenizer(test_docs, max_length=512, truncation='longest_first', return_tensors="pt", padding="max_length")

### Evaluate on Test Predictions

In [15]:
test_X.to(device)
model = torch.load(path)
model.eval()
with torch.no_grad():
    predictions = model(tokens=test_X['input_ids'], attention_mask=test_X['attention_mask'])
    acc_test, f1_test = eval_prediction(torch.tensor(test_y), predictions)

print("Testing results: ")
print("Acc: {:.3f}, f1: {:.3f}".format(acc_test, f1_test))

Testing results: 
Acc: 0.689, f1: 0.653
