<div class="alert alert-info">
    <h1 align='center'>PyTorch Lightning ⚡ Training Notebook: RoBERTa with KFolds and TPU Support 👨‍💻</h1>
</div>

<img src="https://miro.medium.com/max/876/0*ViwaI3Vvbnd-CJSQ.png">

<p style='text-align: center'>
    This notebook fine-tunes the RoBERTa Large model from 🤗 Transformers with PyTorch Lightning. Training uses K-fold cross-validation, and both GPU and TPU training are supported.<br> This notebook was heavily inspired by <a href="https://www.kaggle.com/code/yasufuminakama/pppm-deberta-v3-large-baseline-w-w-b-train">Y Nakama's training notebook</a>. I encourage you to fork this notebook and experiment with the models and other parameters.
    <br>
    To run the model on a TPU, uncomment and run the TPU installation cell below, then change the <code>gpus=1</code> argument to <code>tpu_cores=1</code> or <code>tpu_cores=8</code> in the <code>Trainer</code> call.
</p>

<h3 style='color: #fc0362; font-size: 1.5em; font-weight: 300; font-size: 24px'>If you liked this notebook, kindly leave an upvote ⬆️</h3>

<div class="alert alert-warning" role="alert" align="center">    
    <h2>1. Installation and Imports</h2>
</div>

In [None]:
# Installation necessary for TPU training
# ! curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# ! python pytorch-xla-env-setup.py --version 1.7 --apt-packages libomp5 libopenblas-dev

In [None]:
%%sh
# The Trainer(gpus=...) / Trainer(tpu_cores=...) arguments used in this notebook
# were removed in pytorch-lightning 2.0, so stay on the 1.x series.
pip install -q "pytorch-lightning<2.0"
pip install -q --upgrade transformers

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import transformers
import torch
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import os
import re
import json
import cv2
from sklearn.model_selection import StratifiedKFold

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint

import warnings
warnings.simplefilter('ignore')

<div class="alert alert-warning" role="alert" align="center">    
    <h2>2. Functions, Variables, Configs and Preprocessing</h2>
</div>

In [None]:
# Lookup from CPC section letter to the human-readable section title.
table = dict(
    A='Human Necessities',
    B='Operations and Transport',
    C='Chemistry and Metallurgy',
    D='Textiles',
    E='Fixed Constructions',
    F='Mechanical Engineering',
    G='Physics',
    H='Electricity',
    Y='Emerging Cross-Sectional Technologies',
)

def pearson(prediction, ground_truth):
    """
    Pearson Correlation Coefficient

    Returns the off-diagonal entry of the 2x2 correlation matrix that
    `np.corrcoef` computes for the two given sequences.
    """
    correlation_matrix = np.corrcoef(prediction, ground_truth)
    return correlation_matrix[0, 1]

def _find_cpc_title(code, titles):
    """Return the title that follows `code` and two tabs in `titles` (rest of that line)."""
    match = re.search(rf'{re.escape(code)}\t\t(.+)', titles)
    if match is None:
        # Same exception type the previous `result[0]` lookup raised, with a usable message.
        raise IndexError(f"No title found for CPC code {code!r}")
    return match.group(1)


def get_cpc_texts(
    scheme_dir='../input/cpc-data/CPCSchemeXML202105',
    title_dir='../input/cpc-data/CPCTitleList202202',
):
    """
    Build a mapping from CPC context code (e.g. 'A01') to a descriptive text.

    Function adapted from Y Nakama's notebook:
    https://www.kaggle.com/code/yasufuminakama/pppm-deberta-v3-large-baseline-w-w-b-train

    The context codes are every "<capital letter><digits>" token found in the
    file names inside `scheme_dir`. For each code, the text is
    "<section title>. <code title>", where both titles are read from the
    `cpc-section-<letter>_20220201.txt` file of the code's section in `title_dir`.

    Unlike the original implementation, the code prefix is removed with a regex
    capture group rather than `str.lstrip(pattern)`. `lstrip` treats its argument
    as a *set of characters*, so it also ate leading title characters that
    happened to appear in the code (e.g. 'AGRICULTURE' -> 'GRICULTURE' for 'A01').

    Args:
        scheme_dir: directory whose file names contain the context codes.
        title_dir: directory holding the per-section title list files.

    Returns:
        dict mapping context code -> "<section title>. <code title>".

    Raises:
        IndexError: if a section or context code has no title line.
        OSError: if a directory or a section title file cannot be read.
    """
    code_pattern = re.compile(r'[A-Z]\d+')
    contexts = sorted({
        code
        for file_name in os.listdir(scheme_dir)
        for code in code_pattern.findall(file_name)
    })

    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        with open(os.path.join(title_dir, f'cpc-section-{cpc}_20220201.txt')) as f:
            s = f.read()
        section_title = _find_cpc_title(cpc, s)
        # A context belongs to the section named by its first letter.
        for context in [c for c in contexts if c[0] == cpc]:
            results[context] = section_title + ". " + _find_cpc_title(context, s)
    return results

In [None]:
class Config:
    """Hyper-parameters, file locations and the shared tokenizer used by this notebook."""
    NB_EPOCHS = 5        # max_epochs passed to the Trainer
    LR = 2e-5            # learning rate passed to the optimizer
    MAX_LEN = 128        # max_length used when tokenizing (truncate / pad)
    N_SPLITS = 5         # number of StratifiedKFold splits
    TRAIN_BS = 16        # training DataLoader batch size
    VALID_BS = 32        # validation DataLoader batch size
    NUM_WORKERS = 4      # DataLoader worker processes
    MODEL_NAME = 'roberta-large'
    TRAIN_FILE = '../input/us-patent-phrase-to-phrase-matching/train.csv'
    TEST_FILE = '../input/us-patent-phrase-to-phrase-matching/test.csv'
    # Load the tokenizer from MODEL_NAME (not a second hard-coded checkpoint name)
    # so that changing the checkpoint keeps model and tokenizer in sync.
    TOKENIZER = transformers.RobertaTokenizer.from_pretrained(MODEL_NAME)

In [None]:
cpc_texts = get_cpc_texts()
train_file = pd.read_csv(Config.TRAIN_FILE)
test_file = pd.read_csv(Config.TEST_FILE)
train_file['context_text'] = train_file['context'].map(cpc_texts)
test_file['context_text'] = test_file['context'].map(cpc_texts)

# A context without a CPC title maps to NaN, which would turn the whole `text`
# value into NaN and only fail later inside the tokenizer. Fail early instead.
missing_contexts = sorted(
    train_file.loc[train_file['context_text'].isna(), 'context'].astype(str).unique()
)
if missing_contexts:
    raise ValueError(f"No CPC title found for contexts: {missing_contexts}")

# Join the fields with the tokenizer's own separator token. The literal '[SEP]'
# is a BERT token; it is not a special token for RoBERTa (whose separator is
# '</s>'), so it would be tokenized as ordinary text.
sep_token = Config.TOKENIZER.sep_token
train_file['text'] = train_file['anchor'] + sep_token + train_file['target'] + sep_token + train_file['context_text']
test_file['text'] = test_file['anchor'] + sep_token + test_file['target'] + sep_token + test_file['context_text']

# Integer class index for each of the five possible score values.
train_file['score_map'] = train_file['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})

# Persist the enriched frames; the training cell reads ./train.csv back.
train_file.to_csv("train.csv", index=False)
test_file.to_csv("test.csv", index=False)

train_file.head()

<div class="alert alert-warning" role="alert" align="center">    
    <h2>3. Custom Dataset Class</h2>
</div>

In [None]:
class PPPMDataset(Dataset):
    """
    Dataset that tokenizes the `text` column of a DataFrame on access.

    Args:
        df: DataFrame with a `text` column and, unless `is_test` is set,
            a `score` column.
        is_test: when True no scores are read and items carry no `targets`.
    """

    def __init__(self, df, is_test=False):
        self.is_test = is_test
        self.texts = df['text'].values
        if not self.is_test:
            self.scores = df['score'].values

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Tokenize row `idx`.

        Returns:
            dict with `ids` and `mask` (long tensors of the token ids and the
            attention mask, truncated / padded to Config.MAX_LEN) and, for
            non-test data, `targets` (float tensor holding the row's score).
        """
        text = self.texts[idx]
        inputs = Config.TOKENIZER.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=Config.MAX_LEN,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding='max_length'
        )

        item = {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }
        if not self.is_test:
            item['targets'] = torch.tensor(self.scores[idx], dtype=torch.float)
        return item

<div class="alert alert-warning" role="alert" align="center">    
    <h2>4. Model class</h2>
</div>

In [None]:
class Model(pl.LightningModule):
    """
    RoBERTa encoder with dropout and a single-logit linear head, trained with
    BCEWithLogitsLoss on the score column.

    Args:
        train_df: DataFrame used to build the training PPPMDataset.
        valid_df: DataFrame used to build the validation PPPMDataset.
    """

    def __init__(self, train_df, valid_df) -> None:
        super().__init__()
        self.model = transformers.RobertaModel.from_pretrained(Config.MODEL_NAME)
        self.drop = nn.Dropout(0.3)
        # Size the head from the encoder config rather than hard-coding 1024,
        # so other checkpoints selected via Config.MODEL_NAME also work.
        self.out = nn.Linear(self.model.config.hidden_size, 1)
        # Per-epoch validation buffers; emptied in on_validation_epoch_end.
        self.all_preds = []
        self.all_targets = []
        self.train_loss_fn = nn.BCEWithLogitsLoss()
        self.valid_loss_fn = nn.BCEWithLogitsLoss()

        self.train_df = train_df
        self.valid_df = valid_df

    def forward(self, ids, mask) -> torch.Tensor:
        """Return one logit per example, computed from the encoder's second (pooled) output."""
        _, output = self.model(ids, attention_mask=mask, return_dict=False)
        output = self.drop(output)
        output = self.out(output)
        return output

    def setup(self, stage=None) -> None:
        """
        Build the training and validation datasets.

        This is done in `setup` rather than `prepare_data` because Lightning does
        not call `prepare_data` on every process, so state assigned there is
        missing on the other workers of a multi-device (e.g. 8-core TPU) run.
        """
        self.training_set = PPPMDataset(self.train_df)
        self.validation_set = PPPMDataset(self.valid_df)

    def train_dataloader(self):
        """Shuffled DataLoader over the training set."""
        return DataLoader(
            self.training_set,
            batch_size=Config.TRAIN_BS,
            shuffle=True,
            num_workers=Config.NUM_WORKERS,
            pin_memory=True
        )

    def val_dataloader(self):
        """Ordered DataLoader over the validation set."""
        return DataLoader(
            self.validation_set,
            batch_size=Config.VALID_BS,
            shuffle=False,
            num_workers=Config.NUM_WORKERS,
        )

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        outputs = self(ids=batch['ids'], mask=batch['mask'])
        train_loss = self.train_loss_fn(outputs, batch['targets'].view(-1, 1))
        self.log('train_loss', train_loss)
        return {'loss': train_loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss; buffer predictions and targets."""
        targets = batch['targets']
        outputs = self(ids=batch['ids'], mask=batch['mask'])
        valid_loss = self.valid_loss_fn(outputs, targets.view(-1, 1))

        # The head emits logits; sigmoid maps them onto the [0, 1] score scale.
        self.all_preds.extend(torch.sigmoid(outputs).view(-1).detach().cpu().tolist())
        self.all_targets.extend(targets.view(-1).detach().cpu().tolist())

        # self.log aggregates the value over the epoch for validation steps.
        # This replaces the old `validation_end` hook, which current Lightning
        # releases never call.
        self.log('val_loss', valid_loss, prog_bar=True)
        return {'val_loss': valid_loss}

    def on_validation_epoch_end(self) -> None:
        """Log the Pearson correlation over the epoch and reset the buffers."""
        if len(self.all_targets) > 1:
            score = float(pearson(self.all_preds, self.all_targets))
            self.log('val_pearson', score, prog_bar=True)
        # Clear so the buffers do not grow across epochs.
        self.all_preds.clear()
        self.all_targets.clear()

    def configure_optimizers(self):
        """AdamW with weight decay disabled for bias and LayerNorm parameters."""
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.001,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        # transformers.AdamW is deprecated in favour of the PyTorch implementation;
        # eps=1e-6 keeps the default the transformers version used.
        return torch.optim.AdamW(optimizer_parameters, lr=Config.LR, eps=1e-6)

<div class="alert alert-warning" role="alert" align="center">    
    <h2>5. Main training and Validation</h2>
    
    Note: only the first fold (fold 0) is trained here rather than all 5, because training every fold takes a long time and would exhaust my GPU quota in a single notebook run.
</div>

In [None]:
if __name__ == '__main__':
    # `platform` is used in the CPU message below but is not imported by the
    # notebook's import cell, which made the CPU path fail with a NameError.
    import platform

    # Seed everything so the shuffle (and therefore the folds) is reproducible.
    SEED = 42
    pl.seed_everything(SEED)

    use_gpu = torch.cuda.is_available()
    if use_gpu:
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))
        DEVICE = torch.device('cuda:0')
    else:
        print("\n[INFO] GPU not found. Using CPU: {}\n".format(platform.processor()))
        DEVICE = torch.device('cpu')

    data = pd.read_csv("./train.csv")
    data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)

    # Do Kfolds training and cross validation.
    # The continuous score is binned so that StratifiedKFold can stratify on it.
    kf = StratifiedKFold(n_splits=Config.N_SPLITS)
    nb_bins = int(np.floor(1 + np.log2(len(data))))
    data.loc[:, 'bins'] = pd.cut(data['score'], bins=nb_bins, labels=False)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X=data, y=data['bins'].values)):
        # Only the first fold is trained.
        if fold != 0:
            continue
        print(f"\nFold: {fold}")
        print(f"{'-'*20}\n")

        # kf.split yields positional indices, so select rows by position.
        train_data = data.iloc[train_idx]
        valid_data = data.iloc[valid_idx]

        model = Model(train_data, valid_data)
        if use_gpu:
            # For TPU training, replace gpus=1 with tpu_cores=1 or tpu_cores=8.
            trainer = pl.Trainer(max_epochs=Config.NB_EPOCHS, gpus=1)
        else:
            # Requesting a GPU that does not exist makes the Trainer fail; use CPU.
            trainer = pl.Trainer(max_epochs=Config.NB_EPOCHS)
        trainer.fit(model)

<center>
<img src="https://img.shields.io/badge/Upvote-If%20you%20like%20my%20work-07b3c8?style=for-the-badge&logo=kaggle">
</center>