# Feedback Prize Predicting Effective Arguments - BERT Large Model training with KFolds and WandB 🎯

This notebook shows how you can train big transformer models on this exciting competition's data. Here I am training a BERT-Large model with a maximum sequence length of 200 tokens (`MAX_LEN` in the config below).

I hope to add more optimizations to increase the max-length without affecting training or throwing OOM errors.

<center>
    <img src="https://img.shields.io/badge/Upvote-If%20you%20like%20my%20work-07b3c8?style=for-the-badge&logo=kaggle">
</center>

In [None]:
%%sh
# Quietly (-q) upgrade the Hugging Face `transformers` package to the latest release.
pip install -q --upgrade transformers

In [None]:
import os
import wandb
import platform
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import gc
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import torch
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.simplefilter('ignore')

In [None]:
# Central experiment configuration shared by the dataset, model and training
# cells. Each key is listed exactly once (the original literal repeated
# 'N_SPLITS', which silently overwrote the first entry).
Config = {
    # --- Training schedule ---
    'NB_EPOCHS': 3,       # epochs trained per fold
    'N_SPLITS': 5,        # number of cross-validation folds
    'NUM_WORKERS': 8,     # DataLoader worker processes
    'NUM_LABELS': 3,      # Ineffective / Adequate / Effective
    # --- Optimisation ---
    'LR': 2e-5,           # AdamW base learning rate
    'T_0': 20,            # CosineAnnealingWarmRestarts: steps until the first restart
    # NOTE(review): this "minimum" LR (1e-4) is larger than 'LR' (2e-5), so the
    # cosine schedule moves the LR *up* from 2e-5 towards 1e-4 within each
    # cycle instead of annealing it down - confirm this is intended.
    'η_min': 1e-4,
    # --- Data / batching ---
    'MAX_LEN': 200,       # maximum sequence length (in tokens) after truncation/padding
    'TRAIN_BS': 16,
    'VALID_BS': 32,
    'DEVICE': 'cuda',
    'MODEL_NAME': 'bert-large-uncased',
    'CSV_PATH': '../input/feedback-prize-effectiveness/train.csv',
    'TXT_PATHS': '../input/feedback-prize-effectiveness/train/',
    # --- Shared objects ---
    'TOKENIZER': transformers.BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True),
    'scaler': GradScaler(),  # gradient scaler used by the mixed-precision training path
    # --- W&B run metadata ---
    'infra': "Kaggle",
    'competition': 'feedback_prize',
    '_wandb_kernel': 'tanaym',
}

## About W&B:
<center><img src="https://i.imgur.com/gb6B4ig.png" width="400" alt="Weights & Biases"/></center><br>
<p style="text-align:center">WandB is a developer tool that helps companies turn deep learning research projects into deployed software by helping teams track their models, visualize model performance, and easily automate training and improving models.
We will use its tools to log hyperparameters and output metrics from our runs, then visualize and compare results and quickly share findings with colleagues.<br><br></p>

To log in to W&B, you can use the snippet below.

```python
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wb_key = user_secrets.get_secret("WANDB_API_KEY")

wandb.login(key=wb_key)
```
Make sure you have your W&B key stored as `WANDB_API_KEY` under Add-ons -> Secrets

You can view [this](https://www.kaggle.com/ayuraj/experiment-tracking-with-weights-and-biases) notebook to learn more about W&B tracking.

If you don't want to login to W&B, the kernel will still work and log everything to W&B in anonymous mode.

In [None]:
# Start W&B logging.
# Try to read the API key from Kaggle secrets; when it is not available the
# run is started in W&B anonymous mode instead of crashing the notebook.
from kaggle_secrets import UserSecretsClient

try:
    user_secrets = UserSecretsClient()
    wb_key = user_secrets.get_secret("WANDB_API_KEY")
except Exception as err:  # notebook-level boundary: any secret lookup failure means "no key"
    print(f"Could not read WANDB_API_KEY from Kaggle secrets ({err}); using anonymous W&B logging")
    wb_key = None

wandb_init_kwargs = dict(
    project='pytorch',
    config=Config,
    group='nlp',
    job_type='train',
)

if wb_key:
    wandb.login(key=wb_key)
else:
    # Only pass `anonymous` when it is needed so the authenticated path makes
    # exactly the same `wandb.init` call as before.
    wandb_init_kwargs['anonymous'] = 'must'

run = wandb.init(**wandb_init_kwargs)

In [None]:
def fetchEssay(essay_id: str) -> str:
    """
    Read the text file of the specific essay_id.

    The file is looked up as ``<Config['TXT_PATHS']>/<essay_id>.txt``.

    Args:
        essay_id: Identifier of the essay (file name without the ``.txt`` suffix).

    Returns:
        The complete contents of the essay file as a string.

    Raises:
        OSError: If the file does not exist or cannot be read.
    """
    essay_path = os.path.join(Config['TXT_PATHS'], essay_id + '.txt')
    # Context manager guarantees the handle is closed (this function is called
    # once per row); explicit UTF-8 keeps decoding platform-independent.
    with open(essay_path, 'r', encoding='utf-8') as essay_file:
        return essay_file.read()

def wandb_log(**kwargs):
    """
    Logs the given key-value pairs to W&B as one record.

    All metrics are sent in a single ``wandb.log`` call so that values that
    belong together (e.g. ``train_loss`` and ``val_loss`` of the same epoch)
    share one W&B step; logging each key in its own call would advance the
    step counter once per key.

    Args:
        **kwargs: Metric names mapped to their values. Nothing is logged when
            no metrics are given.
    """
    if kwargs:
        wandb.log(kwargs)

In [None]:
class BERTDataset(Dataset):
    """
    Torch dataset that tokenizes one row of a dataframe per item.

    Args:
        data: Dataframe-like object with a ``'text'`` column and, unless
            ``is_test`` is set, a ``'discourse_effectiveness'`` column holding
            integer class labels.
        config: Mapping providing ``'TOKENIZER'`` and ``'MAX_LEN'``. Defaults
            to the notebook-level ``Config`` when omitted.
        is_test: When True, items carry no ``'targets'`` entry.
    """

    def __init__(self, data, config=None, is_test=False):
        self.data = data
        # Resolve the default lazily instead of binding the shared dict as a
        # default argument value.
        self.config = Config if config is None else config
        self.is_test = is_test

    def __getitem__(self, idx):
        """
        Tokenize row ``idx`` and return its tensors.

        Returns:
            Dict with ``'ids'``, ``'mask'`` and ``'token_type_ids'`` (all
            ``torch.long``), plus ``'targets'`` when ``is_test`` is False.
        """
        text = self.data['text'].values[idx]

        encoded = self.config['TOKENIZER'].encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            # Use the injected config (the original read the global Config
            # here, ignoring a custom `config` argument).
            max_length=self.config['MAX_LEN'],
            # Replaces the deprecated `pad_to_max_length=True` flag.
            padding='max_length',
        )

        item = {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(encoded['token_type_ids'], dtype=torch.long),
        }
        if not self.is_test:
            target_value = self.data['discourse_effectiveness'].values[idx]
            item['targets'] = torch.tensor(target_value, dtype=torch.long)
        return item

    def __len__(self):
        """Number of rows in the underlying data."""
        return len(self.data)

In [None]:
class BERTModel(nn.Module):
    """
    BERT backbone followed by dropout and a linear classification head.

    Args:
        backbone_arch: Name or path accepted by
            ``transformers.BertModel.from_pretrained``.
    """

    def __init__(self, backbone_arch):
        super().__init__()

        self.backbone = transformers.BertModel.from_pretrained(backbone_arch)
        # Read the width from the loaded checkpoint's own config rather than
        # guessing it from substrings of the name, so local paths and other
        # BERT sizes work too.
        hidden_size = self.backbone.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_size, Config['NUM_LABELS'])

    def forward(self, ids, mask, token_type_ids):
        """
        Return the classification logits for a batch.

        The head is applied to the second element of the tuple returned by the
        backbone (``return_dict=False``), which the transformers documentation
        describes as BERT's pooled output.
        """
        _, output = self.backbone(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output = self.dropout(output)
        output = self.fc(output)
        return output

    def get_model_size(self, backbone_name: str):
        """
        Map a BERT variant name to its hidden size by substring.

        Kept for callers that rely on it; ``__init__`` no longer uses it.

        Args:
            backbone_name: Checkpoint name containing ``"base"`` or ``"large"``.

        Returns:
            768 for "base" names, 1024 for "large" names.

        Raises:
            NotImplementedError: If the name contains neither substring.
        """
        if "base" in backbone_name:
            return 768
        if "large" in backbone_name:
            return 1024
        raise NotImplementedError("Unknown BERT variant")

In [None]:
class Trainer:
    """
    Training / validation loop for a classifier fed by dict-style batches.

    Batches are expected to provide the keys ``'ids'``, ``'mask'``,
    ``'token_type_ids'`` and ``'targets'``.

    Args:
        config: Experiment configuration. When it is a dict with a ``'scaler'``
            entry, that GradScaler is used for mixed-precision training.
        dataloaders: ``(train_loader, valid_loader)`` pair.
        optimizer: Optimizer updating ``model``'s parameters.
        model: Module called as ``model(ids=..., mask=..., token_type_ids=...)``.
        loss_fns: ``(train_loss_fn, valid_loss_fn)`` pair.
        scheduler: LR scheduler; stepped once per training batch.
        device: Device batches are moved to.
        apex: Enables mixed-precision training. Despite the name this uses
            ``torch.cuda.amp`` (autocast + GradScaler), not NVIDIA Apex.
    """

    # Batch keys moved to the device, in the order the loops unpack them.
    _BATCH_KEYS = ('ids', 'mask', 'token_type_ids', 'targets')

    def __init__(self, config, dataloaders, optimizer, model, loss_fns, scheduler, device="cuda:0", apex=False):
        self.train_loader, self.valid_loader = dataloaders
        self.train_loss_fn, self.valid_loss_fn = loss_fns
        self.scheduler = scheduler
        self.optimizer = optimizer
        self.model = model
        self.device = torch.device(device)
        self.apex = apex
        self.config = config
        # A single scaler is kept for the trainer's lifetime so its loss-scale
        # state persists across steps and epochs.
        self.scaler = self._resolve_scaler() if apex else None

    def _resolve_scaler(self):
        """Return the GradScaler from the config if present, else a new one."""
        configured = self.config.get('scaler') if isinstance(self.config, dict) else None
        return configured if configured is not None else GradScaler()

    def _batch_to_device(self, batch):
        """Move the ids, mask, token type ids and targets of a batch to the device."""
        return tuple(
            self._convert_if_not_tensor(batch[key], dtype=torch.long)
            for key in self._BATCH_KEYS
        )

    def train_one_epoch(self):
        """
        Trains the model for 1 epoch

        Returns:
            Mean training loss over the batches of the epoch.

        Raises:
            ValueError: If the training loader has no batches.
        """
        num_batches = len(self.train_loader)
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches")

        self.model.train()
        train_pbar = tqdm(enumerate(self.train_loader), total=num_batches)
        running_loss = 0.0
        for _, data_cache in train_pbar:
            ids, mask, ttis, targets = self._batch_to_device(data_cache)

            if self.apex:
                # Mixed precision: the forward pass and loss must run under
                # autocast, otherwise scaling the loss has no benefit.
                with autocast():
                    outputs = self.model(ids=ids, mask=mask, token_type_ids=ttis)
                    loss = self.train_loss_fn(outputs, targets)

                self.scaler.scale(loss).backward()
                self.scaler.step(self.optimizer)
                self.scaler.update()
            else:
                outputs = self.model(ids=ids, mask=mask, token_type_ids=ttis)
                loss = self.train_loss_fn(outputs, targets)

                loss.backward()
                self.optimizer.step()

            self.optimizer.zero_grad()
            # The scheduler advances once per batch, not once per epoch.
            self.scheduler.step()

            loss_value = loss.item()
            running_loss += loss_value
            train_pbar.set_description(desc=f"loss: {loss_value:.4f}")

        # Drop the last batch's tensors before asking CUDA to release cached memory.
        del outputs, targets, loss, ids, mask, ttis
        gc.collect()
        torch.cuda.empty_cache()

        return running_loss / num_batches

    @torch.no_grad()
    def valid_one_epoch(self):
        """
        Validates the model for 1 epoch

        Returns:
            Tuple ``(predictions, targets, mean_loss)`` where ``predictions``
            are the raw model outputs and ``targets`` the labels of every
            validation batch concatenated into numpy arrays, and ``mean_loss``
            is the validation loss averaged over batches.

        Raises:
            ValueError: If the validation loader has no batches.
        """
        num_batches = len(self.valid_loader)
        if num_batches == 0:
            raise ValueError("valid_loader yielded no batches")

        self.model.eval()
        valid_pbar = tqdm(enumerate(self.valid_loader), total=num_batches)
        valid_preds, valid_targets = [], []
        running_val_loss = 0.0

        for _, cache in valid_pbar:
            ids, mask, ttis, targets = self._batch_to_device(cache)

            outputs = self.model(ids=ids, mask=mask, token_type_ids=ttis)
            valid_loss = self.valid_loss_fn(outputs, targets)
            batch_loss = valid_loss.item()
            running_val_loss += batch_loss

            valid_pbar.set_description(desc=f"val_loss: {batch_loss:.4f}")

            valid_preds.append(outputs.cpu().numpy())
            valid_targets.append(targets.cpu().numpy())

        all_valid_preds = np.concatenate(valid_preds)
        all_valid_targets = np.concatenate(valid_targets)

        # Drop the last batch's tensors before asking CUDA to release cached memory.
        del ids, mask, ttis, targets, outputs, valid_loss, valid_preds, valid_targets
        gc.collect()
        torch.cuda.empty_cache()

        return all_valid_preds, all_valid_targets, running_val_loss / num_batches

    def fit(self, fold: str, epochs: int = 10, output_dir: str = "/kaggle/working/", custom_name: str = 'model.pth'):
        """
        Low-effort alternative for doing the complete training and validation process

        Trains and validates for ``epochs`` epochs, saving the model weights
        whenever the validation loss improves, and logs both losses to W&B
        after every epoch.

        Args:
            fold: Identifier of the current fold (accepted for the caller's
                convenience; not used inside the loop).
            epochs: Number of epochs to run.
            output_dir: Directory the best checkpoint is written to.
            custom_name: File name of the checkpoint.

        Returns:
            The best (lowest) validation loss seen, ``inf`` if no epoch ran.
        """
        best_loss = float('inf')
        for epx in range(epochs):
            print(f"{'='*20} Epoch: {epx+1} / {epochs} {'='*20}")

            train_running_loss = self.train_one_epoch()
            print(f"Training loss: {train_running_loss:.4f}")

            _, _, valid_loss = self.valid_one_epoch()
            print(f"Validation loss: {valid_loss:.4f}")

            if valid_loss < best_loss:
                best_loss = valid_loss
                self.save_model(output_dir, custom_name)
                print(f"Saved model with val_loss: {best_loss:.4f}")

            # Log
            wandb_log(
                train_loss=train_running_loss,
                val_loss=valid_loss
            )

        return best_loss

    def save_model(self, path, name, verbose=False):
        """
        Saves the model at the provided destination

        Args:
            path: Target directory; created (including parents) if missing.
            name: File name of the checkpoint inside ``path``.
            verbose: When True, print the final checkpoint location.

        Raises:
            OSError: If the directory cannot be created or the file written.
        """
        if path:
            # exist_ok avoids the check-then-create race; a real failure now
            # surfaces here instead of being swallowed before torch.save.
            os.makedirs(path, exist_ok=True)

        destination = os.path.join(path, name)
        torch.save(self.model.state_dict(), destination)
        if verbose:
            print(f"Model Saved at: {destination}")

    def _convert_if_not_tensor(self, x, dtype):
        """Return ``x`` as a tensor of ``dtype`` on the trainer's device."""
        if self._tensor_check(x):
            return x.to(self.device, dtype=dtype)
        return torch.tensor(x, dtype=dtype, device=self.device)

    def _tensor_check(self, x):
        """Tell whether ``x`` already is a torch tensor."""
        return isinstance(x, torch.Tensor)

In [None]:
if __name__ == '__main__':
    # Cross-validated training: one freshly initialised model is trained per fold.
    # Local import so this cell brings its own splitter into scope.
    from sklearn.model_selection import StratifiedGroupKFold

    # Folds are stratified on the label and grouped by essay: every row's text
    # contains its whole essay, so rows of one essay must not be split between
    # the training and validation side of a fold.
    kf = StratifiedGroupKFold(n_splits=Config['N_SPLITS'])
    train_file = pd.read_csv(Config['CSV_PATH'])
    train_file['discourse_effectiveness'] = train_file['discourse_effectiveness'].map(
        {'Ineffective': 0, 'Adequate': 1, 'Effective': 2}
    )
    # Read each essay file once and reuse it for all rows of that essay.
    essay_texts = {essay_id: fetchEssay(essay_id) for essay_id in train_file['essay_id'].unique()}
    train_file['essay'] = train_file['essay_id'].map(essay_texts)
    train_file['text'] = train_file['discourse_text'] + ' [SEP] ' + train_file['essay']
    train_file = train_file.drop(['discourse_text', 'essay'], axis=1)

    fold_splits = kf.split(
        X=train_file,
        y=train_file['discourse_effectiveness'],
        groups=train_file['essay_id'],
    )
    for fold_, (train_idx, valid_idx) in enumerate(fold_splits):
        print(f"{'='*40} Fold: {fold_+1} / {Config['N_SPLITS']} {'='*40}")

        # The splitter yields positional indices, hence `.iloc`.
        train_ = train_file.iloc[train_idx].reset_index(drop=True)
        valid_ = train_file.iloc[valid_idx].reset_index(drop=True)

        train_set = BERTDataset(
            data=train_,
            config=Config,
        )
        valid_set = BERTDataset(
            data=valid_,
            config=Config,
        )

        train_loader = DataLoader(
            train_set,
            batch_size=Config['TRAIN_BS'],
            shuffle=True,
            num_workers=Config['NUM_WORKERS'],
            pin_memory=True
        )

        valid_loader = DataLoader(
            valid_set,
            batch_size=Config['VALID_BS'],
            shuffle=False,
            num_workers=Config['NUM_WORKERS'],
        )

        model = BERTModel(backbone_arch=Config['MODEL_NAME'])
        model = model.to(torch.device(Config['DEVICE']))

        # Log model to WandB
        wandb.watch(model)

        optimizer = torch.optim.AdamW(model.parameters(), lr=Config['LR'])
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer,
            T_0=Config['T_0'],
            eta_min=Config['η_min']
        )
        train_lfn, valid_lfn = nn.CrossEntropyLoss(), nn.CrossEntropyLoss()

        trainer = Trainer(
            config=Config,
            dataloaders=(train_loader, valid_loader),
            loss_fns=(train_lfn, valid_lfn),
            optimizer=optimizer,
            model=model,
            scheduler=scheduler,
            # Train on the same device the model was moved to.
            device=Config['DEVICE'],
            apex=True
        )

        trainer.fit(
            fold=fold_,
            epochs=Config['NB_EPOCHS'],
            custom_name=f"{Config['MODEL_NAME']}_fold_{fold_}.bin"
        )

        # Release this fold's model and optimizer state before the next fold
        # builds a new model, so two large models do not sit on the GPU at once.
        del trainer, scheduler, optimizer, model
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
# Finish the W&B run that was created by `wandb.init` earlier in the notebook.
run.finish()

**NOTEBOOK IS STILL A WORK IN PROGRESS**