## Part 2 -- BERT Application 1
- Mission1: Binary Classification * 
- Mission2: Keywords Extraction

### Introduction of BERT


### Usage
1. Tokenizer  [Doc](https://huggingface.co/transformers/v3.0.2/main_classes/tokenizer.html)  
    Output format: dict
    - `input_ids`: indice of tokens (tokens_tensor)
    - `token_type_ids`: sentence segmentation, 0-first sentence; 1-second sentence (segments_tensor)
    - `attention_mask`: 1 indicate attention required for this token (mask_tensor)
2. Model  [Doc](https://huggingface.co/transformers/v3.0.2/model_doc/bert.html#bertmodel)


In [1]:
import os
from pathlib import Path

import pandas as pd

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizerFast, BertModel

In [2]:
# Global Parameter Initialization
batch_size = 16
text_max_length = 128
epochs = 100
lr = 3e-5
validation_ratio = 0.1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logging_per_step = 50

dataset_dir = Path('./data/')
model_dir = Path('./models/bert-checkpoints/classification/')
os.makedirs(model_dir) if not os.path.exists(model_dir) else ''

print(f'Device: {device}')


Device: cuda


In [3]:
# Data Loading and Preprocessing
train_data = pd.read_csv('./data/train.csv')
train_data['title'] = train_data['title'].fillna('')
train_data['abstract'] = train_data['abstract'].fillna('')

test_data = pd.read_csv('./data/test.csv')
test_data['title'] = test_data['title'].fillna('')
test_data['abstract'] = test_data['abstract'].fillna('')

# Integration
train_data['text'] = train_data['title'] + ' ' + train_data['author'].fillna('') + ' ' + train_data['abstract'] + train_data['Keywords'].fillna('')
test_data['text'] = test_data['title'] + ' ' + test_data['author'].fillna('') + ' ' + test_data['abstract'] + test_data['Keywords'].fillna('')

In [4]:
# Split Validation Dataset
validation_data = train_data.sample(frac=validation_ratio)
train_data = train_data[~train_data.index.isin(validation_data.index)]

In [5]:
# Dataset Definition
class CustomDataset(Dataset):

    def __init__(self, mode: str = 'train') -> None:
        super().__init__()
        self.mode: str = mode
        if mode == 'train':
            self.dataset: pd.DataFrame = train_data
        elif mode == 'validation':
            self.dataset: pd.DataFrame = validation_data
        elif mode == 'test':
            self.dataset: pd.DataFrame = test_data
        else:
            raise Exception(f'Unknown mode "{mode}"')
        
    def __getitem__(self, index: int) -> (str, int):
        data = self.dataset.iloc[index]
        text = data['text']
        if self.mode == 'test':
            label = data['uuid']
        else:
            label = data['label']
        return text, label
    
    def __len__(self):
        return len(self.dataset)
    
train_dataset = CustomDataset('train')
validation_dataset = CustomDataset("validation")


In [6]:
# Pre-trained Tokenizer
tokenizer: BertTokenizerFast = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [7]:
def collate_fn(batch: tuple[str, int]) -> (dict[str, torch.Tensor], torch.LongTensor):
    """
    Transform sentence into tensor, and form batch
    """
    text, label = zip(*batch)
    text, label = list(text), list(label)

    # Generate input source to BERT model
    # padding: fulfill short sentence
    # truncation: truncate long sentence
    src: dict[str, torch.Tensor] = tokenizer(text, padding="max_length", max_length=text_max_length, return_tensors='pt', truncation=True)

    return src, torch.LongTensor(label)


In [8]:
train_loader: DataLoader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
validation_loader: DataLoader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [9]:
inputs, targets = next(iter(train_loader))
print(f'inputs: {inputs}')
print(f'targets: {targets}')

inputs: {'input_ids': tensor([[  101, 13228,  9324,  ...,  5265,  1997,   102],
        [  101,  1996,  3894,  ...,  2038,  4719,   102],
        [  101, 15873,  5107,  ..., 11443,  1996,   102],
        ...,
        [  101,  1037,  3319,  ...,  2024, 13047,   102],
        [  101,  1037,  3117,  ...,  1011, 18215,   102],
        [  101,  1056, 21693,  ...,  5524,  1044,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}
targets: tensor([0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1])


In [10]:
# Model Definition
# BERT Model + Prediction Layer
class CustomModel(nn.Module):
    
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        
        self.bert: BertModel = BertModel.from_pretrained("bert-base-uncased")
        self.predictor: nn.Sequential = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, src: dict):
        outputs = self.bert(**src).last_hidden_state[:, 0, :]
        return self.predictor(outputs)

In [11]:
model = CustomModel()
model = model.to(device)

# Loss Function: Binary Cross Entropy
citeria = nn.BCELoss()
# Optimizer: Adam
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
def to_device(dict_tensors: dict[str, torch.Tensor]):
    result_tensors: dict = {}
    for k, v in dict_tensors.items():
        result_tensors[k] = v.to(device)
    return result_tensors

def validate():
    model.eval()
    total_loss = 0
    total_correct = 0

    for inputs, targets in validation_loader:
        inputs, targets = to_device(inputs), targets.to(device)
        outputs = model(inputs)
        loss = citeria(outputs.view(-1), targets.float())
        total_loss += float(loss)

        correct_num = (((outputs > 0.5).float() * 1).flatten() == targets).sum()
        total_correct += correct_num
    
    return total_correct / len(validation_dataset), total_loss / len(validation_dataset)

In [13]:
# Training Part

# Switch to Training Mode
model.train()

# Clear Cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()

total_loss = 0
step = 0

best_accuracy = 0

for epoch in range(epochs):
    model.train()
    for i, (inputs, targets) in enumerate(train_loader):
        inputs, targets = to_device(inputs), targets.to(device)
        outputs = model(inputs)
        loss = citeria(outputs.view(-1), targets.float())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += float(loss)
        step += 1

        if step % logging_per_step == 0:
            print(f'Epoch {epoch+1}/{epochs}, Step: {i+1}/{len(train_loader)}, Total Loss: {total_loss:.4f}')
            total_loss = 0
        
        del inputs, targets
    
    accuracy, validation_loss = validate()
    print(f'Epoch {epoch+1}, accuracy: {accuracy:.4f}, validation Loss: {validation_loss:.4f}\n')
    torch.save(model, model_dir / f'model_{epoch+1}.pt')

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model, model_dir / f'model_best.pt')

Epoch 1/100, Step: 50/338, Total Loss: 16.1978
Epoch 1/100, Step: 100/338, Total Loss: 7.0319
Epoch 1/100, Step: 150/338, Total Loss: 5.4716
Epoch 1/100, Step: 200/338, Total Loss: 5.1327
Epoch 1/100, Step: 250/338, Total Loss: 5.4352
Epoch 1/100, Step: 300/338, Total Loss: 6.0479
Epoch 1, accuracy: 0.9633, validation Loss: 0.0061

Epoch 2/100, Step: 12/338, Total Loss: 5.7042
Epoch 2/100, Step: 62/338, Total Loss: 4.0575
Epoch 2/100, Step: 112/338, Total Loss: 2.1664
Epoch 2/100, Step: 162/338, Total Loss: 3.9270
Epoch 2/100, Step: 212/338, Total Loss: 2.7538
Epoch 2/100, Step: 262/338, Total Loss: 3.4065
Epoch 2/100, Step: 312/338, Total Loss: 4.6778
Epoch 2, accuracy: 0.9583, validation Loss: 0.0098

Epoch 3/100, Step: 24/338, Total Loss: 3.7689
Epoch 3/100, Step: 74/338, Total Loss: 2.1891
Epoch 3/100, Step: 124/338, Total Loss: 1.9464
Epoch 3/100, Step: 174/338, Total Loss: 2.7616
Epoch 3/100, Step: 224/338, Total Loss: 2.8268
Epoch 3/100, Step: 274/338, Total Loss: 2.5298
Epoch 3