# Arash Hajian nezhad | DataCoLab interview task | BERT training

#### Installing `transformers` library

In [1]:
%%capture
!pip install transformers

#### Imports

In [2]:
%matplotlib inline

import ast

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

from transformers import BertModel, BertTokenizer, logging
logging.set_verbosity_error()

from tqdm import tqdm

#### Variables

In [3]:
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
df = pd.read_csv('/content/stories_processed.csv')
df.head()

Unnamed: 0,body,labels
0,hello and welcome to BBC News a woman who gave...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
1,news now out of North Hollywood. A 14 yearold ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
2,homelessness his city's greatest failure. That...,"[0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
3,Minneapolis police officer Kim Potter guilty o...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,Judy an update now to the wildfires that wiped...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]"


#### Function for getting `pos_weight` for BCEWithLogitsLoss.
It is a vector the size of the labels. This is used when there is an imbalance in labels
distribution, especially for multi-label classification.

For each label, it is calculated as the number of negative samples of that label divided by the number of positive samples of the same label.

In [4]:
def get_loss_pos_weights(labels):
    label_count = len(ast.literal_eval(labels[0]))

    positives = [0] * label_count
    negatives = [0] * label_count

    for label in labels:
        label = ast.literal_eval(label)

        for i, item in enumerate(label):
            if item == 0:
                negatives[i] += 1
            else:
                positives[i] += 1
    
    # division by zero exception
    for i in range(label_count):
        if positives[i] == 0:
            positives[i] = 1
    
    weights = [negatives[i] / positives[i] for i in range(label_count)]

    return weights

#### Load tokenizer for BERT

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#### Define dataset class

In [6]:
class StoriesDataset(Dataset):
    def __init__(self, df):
        super().__init__()

        self.text = [tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors='pt') for text in df['body']]
        self.labels = [torch.tensor(ast.literal_eval(label)).float() for label in df['labels'].values]
        self.__size = len(self.text)
    
    def __len__(self):
        return self.__size
    
    def __getitem__(self, idx):
        return self.text[idx], self.labels[idx]

#### Define model class

In [7]:
class BertClassifier(nn.Module):
    def __init__(self):
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(768, 15),  # we have 15 labels
        )
    
    def forward(self, input_ids, mask):
        _, x = self.bert(input_ids=input_ids, attention_mask=mask, return_dict=False)  # taking only the pooled output
        x = self.classifier(x)
        
        return x

#### Train and Validation functions

In [8]:
def validate(model, validation_dataloader, criterion):
    model.eval()

    eval_loss = 0

    with torch.no_grad():
        for data, labels in tqdm(validation_dataloader):
            data, labels = data.to(DEVICE), labels.to(DEVICE)

            input_ids = data['input_ids'].squeeze(1)  # Bx1x512 -> Bx512
            mask = data['attention_mask'].squeeze(1)  # Bx1x512 -> Bx512

            preds = model(input_ids=input_ids, mask=mask)

            loss = criterion(preds, labels)
            eval_loss += loss.item()
        
        eval_loss = eval_loss / len(validation_dataloader)

        return eval_loss


def train(model, epochs, train_dataloader, validation_dataloader, criterion, optimizer, scheduler = None):
    model.to(DEVICE)

    best_val_loss = 0
    train_loss, validation_loss = [], []

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0

        for data, labels in tqdm(train_dataloader):
            data, labels = data.to(DEVICE), labels.to(DEVICE)

            input_ids = data['input_ids'].squeeze(1)  # Bx1x512 -> Bx512
            mask = data['attention_mask'].squeeze(1)  # Bx1x512 -> Bx512
            
            optimizer.zero_grad()

            preds = model(input_ids=input_ids, mask=mask)

            loss = criterion(preds, labels)
            loss.backward()
            epoch_loss += loss.item()

            optimizer.step()
        
        if scheduler:
            scheduler.step()

        epoch_loss = epoch_loss / len(train_dataloader)
        train_loss.append(epoch_loss)

        print(f'EPOCH {epoch + 1:4d} | LOSS: {epoch_loss:.4f}')
        print('---------------------------------------------\n')

        eval_loss = validate(model, validation_dataloader, criterion)
        validation_loss
        print()
        print('[|||||||||||||||||||||||||||||]')
        print('| VALIDATION ROUND            |')
        print(f'| VALID LOSS: {eval_loss:.4f}          |')    
        print('[|||||||||||||||||||||||||||||]')
        print()
        print()

        # saving the best model's weights
        if epoch == 0:
            best_val_loss = eval_loss
            torch.save(model.state_dict(), 'bert_topic_classifier.pt')
        elif eval_loss < best_val_loss:
            best_val_loss = eval_loss
            torch.save(model.state_dict(), 'bert_topic_classifier.pt')

    return train_loss, validation_loss

#### Preparing train and validation datasets

In [9]:
from sklearn.model_selection import train_test_split


df_train, df_valid = train_test_split(df, train_size=0.85, shuffle=True)
loss_pos_weight = get_loss_pos_weights(df['labels'])

BATCH_SIZE = 16
train_dataloader = DataLoader(StoriesDataset(df_train), batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(StoriesDataset(df_valid), batch_size=BATCH_SIZE // 2, shuffle=False)

#### Prepare model and training parameters

In [10]:
EPOCHS = 5

model = BertClassifier()
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(loss_pos_weight).to(DEVICE))
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.8)

#### Train

In [11]:
train_loss, validation_loss = train(model, EPOCHS, train_dataloader, valid_dataloader, criterion, optimizer, scheduler)

100%|██████████| 274/274 [07:00<00:00,  1.53s/it]


EPOCH    1 | LOSS: 1.2550
---------------------------------------------



100%|██████████| 97/97 [00:26<00:00,  3.66it/s]



[|||||||||||||||||||||||||||||]
| VALIDATION ROUND            |
| VALID LOSS: 1.3026          |
[|||||||||||||||||||||||||||||]




100%|██████████| 274/274 [06:59<00:00,  1.53s/it]


EPOCH    2 | LOSS: 1.2501
---------------------------------------------



100%|██████████| 97/97 [00:26<00:00,  3.67it/s]



[|||||||||||||||||||||||||||||]
| VALIDATION ROUND            |
| VALID LOSS: 1.3332          |
[|||||||||||||||||||||||||||||]




100%|██████████| 274/274 [06:59<00:00,  1.53s/it]


EPOCH    3 | LOSS: 1.2513
---------------------------------------------



100%|██████████| 97/97 [00:26<00:00,  3.65it/s]



[|||||||||||||||||||||||||||||]
| VALIDATION ROUND            |
| VALID LOSS: 1.3039          |
[|||||||||||||||||||||||||||||]




100%|██████████| 274/274 [06:59<00:00,  1.53s/it]


EPOCH    4 | LOSS: 1.2496
---------------------------------------------



100%|██████████| 97/97 [00:26<00:00,  3.67it/s]



[|||||||||||||||||||||||||||||]
| VALIDATION ROUND            |
| VALID LOSS: 1.3061          |
[|||||||||||||||||||||||||||||]




100%|██████████| 274/274 [06:59<00:00,  1.53s/it]


EPOCH    5 | LOSS: 1.2485
---------------------------------------------



100%|██████████| 97/97 [00:26<00:00,  3.68it/s]



[|||||||||||||||||||||||||||||]
| VALIDATION ROUND            |
| VALID LOSS: 1.2988          |
[|||||||||||||||||||||||||||||]




#### Reading final dataframe

In [12]:
final = pd.read_csv('/content/to_fill_proccessed.csv')

#### Write a function to infere the text

In [14]:
def infere_text(model: nn.Module, text: str):
    tokenized_text = tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors='pt')
    
    input_ids = tokenized_text['input_ids']
    mask = tokenized_text['attention_mask']

    model.to('cpu')
    model.eval()
    with torch.no_grad():
        return model(input_ids=input_ids, mask=mask)

#### Checking the output

In [15]:
index = 1
output = infere_text(model, df_train['body'].iloc[index])
print(df_train['labels'].iloc[index])
print(output[0].sigmoid().numpy().tolist())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0.4622751772403717, 0.49163854122161865, 0.5101140737533569, 0.5118148922920227, 0.4966602921485901, 0.47596871852874756, 0.4965690076351166, 0.5146575570106506, 0.503700315952301, 0.5132160186767578, 0.5153558254241943, 0.44886836409568787, 0.5017725825309753, 0.5090907216072083, 0.49593818187713623]


From here on, we notice the model overfits to outputting the same values no matter the input text. This is not desirable and is a failure.
One way to fix this would be to fine-tune 15 seperate BERT models on each label, which is
left for another day as this is an interview task that is time-sensitive :)

In [16]:
output = infere_text(model, final['body'][0])
F.sigmoid(output)

tensor([[0.4623, 0.4916, 0.5101, 0.5118, 0.4967, 0.4760, 0.4966, 0.5147, 0.5037,
         0.5132, 0.5154, 0.4489, 0.5018, 0.5091, 0.4959]])

In [18]:
infere_text(model, final['body'][4]).sigmoid()

tensor([[0.4623, 0.4916, 0.5101, 0.5118, 0.4967, 0.4760, 0.4966, 0.5147, 0.5037,
         0.5132, 0.5154, 0.4489, 0.5018, 0.5091, 0.4959]])

In [19]:
infere_text(model, final['body'][12]).sigmoid()

tensor([[0.4623, 0.4916, 0.5101, 0.5118, 0.4967, 0.4760, 0.4966, 0.5147, 0.5037,
         0.5132, 0.5154, 0.4489, 0.5018, 0.5091, 0.4959]])