# 📖 Torch Roberta - ITPT - Intra-task pre-training

![](https://storage.googleapis.com/kaggle-competitions/kaggle/31779/logos/header.png)

## Intra-task pre-training of `roberta-large` (trivially adaptable to any other MLM model) on the texts of the [Feedback Prize - Evaluating Student Writing](https://www.kaggle.com/c/feedback-prize-2021) competition


Based on this notebook by [torch](https://www.kaggle.com/rhtsingh): [CommonLit Readability Prize - RoBERTa Torch|ITPT](https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-itpt), which in turn is based on this script by huggingface: https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_mlm_no_trainer.py

Another good learning reference is Chapter 7 of the HuggingFace course, which gives a detailed step-by-step explanation of the building blocks used in this notebook: [Chapter 7 - Section 3 - Fine-tuning a masked language model](https://huggingface.co/course/chapter7/3?fw=pt).


# 🤗 Please _DO_ upvote if you find this helpful or interesting! 🤗


# Imports

In [None]:
import os
import math
import logging
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoModelForMaskedLM, AutoTokenizer,\
                         AdamW, DataCollatorForLanguageModeling,\
                         get_scheduler

# Configuration

In [None]:
class Config:
    """Settings for the masked-language-model pre-training run.

    All values are plain class attributes so they can be read from the
    single `args` instance created below.
    """

    # Checkpoint name (used for both model and tokenizer) and chunk length.
    model_name = "roberta-large"
    max_length = 512

    # Run on the GPU when one is visible to torch, otherwise on the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Fraction of chunks held out for validation, and masking probability.
    validation_size = 0.05
    mlm_probability = 0.15

    # Batch sizes for the two DataLoaders.
    train_batch_size = 4
    eval_batch_size = 4

    # Optimisation schedule.
    learning_rate = 2.5e-5
    num_train_epochs = 3
    lr_scheduler_type = "constant_with_warmup"
    num_warmup_steps = 0


# Shared settings object read by the cells that follow.
args = Config()

# Create one CSV with all the texts

In [None]:
def create_mlm_csv(input_dir='../input/feedback-prize-2021/train',
                   output_path='mlm_train.csv'):
    """Gather every training text into a one-column CSV and return it.

    Args:
        input_dir: Directory whose entries are each read as one text.
            Defaults to the competition's training folder.
        output_path: Where the CSV (single column 'text', no index) is written.

    Returns:
        pandas.DataFrame with one 'text' column and one row per file read.
    """
    texts = []

    # os.listdir gives entries in arbitrary order; sorting makes the CSV
    # (and everything derived from it) reproducible between runs.
    for name in tqdm(sorted(os.listdir(input_dir))):
        # Explicit encoding so the result does not depend on the platform's
        # locale default. NOTE(review): assumes the essays are UTF-8 — confirm.
        with open(os.path.join(input_dir, name), 'r', encoding='utf-8') as fp:
            texts.append(fp.read())

    df = pd.DataFrame({'text': texts})

    # `display` is only injected by IPython/Jupyter; fall back to print so the
    # function also works when run as a plain script.
    try:
        show = display
    except NameError:
        show = print
    show(df.head())

    df.to_csv(output_path, index=False)
    return df


df = create_mlm_csv()

# Model and Tokenizer

In [None]:
# The tokenizer and the masked-LM weights come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(args.model_name)

# Load the pretrained model and place it on the configured device.
model = AutoModelForMaskedLM.from_pretrained(args.model_name)
model = model.to(args.device)

# Dataset and DataLoader

In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch, also requesting the special-tokens mask."""
    batch_texts = examples['text']
    encoded = tokenizer(batch_texts, return_special_tokens_mask=True)
    return encoded

def group_texts(examples, max_length=None):
    """Concatenate each column of a batch and cut it into fixed-size chunks.

    Every column (a list of lists) is flattened into one long list, the
    trailing remainder that does not fill a whole chunk is dropped, and the
    rest is split into consecutive chunks of `max_length` items.

    Args:
        examples: Mapping from column name to a list of lists. The total
            length is taken from the first column and applied to all columns.
        max_length: Chunk size. When None, `args.max_length` is used.

    Returns:
        Dict with the same keys, each mapped to a list of chunks of exactly
        `max_length` items. An empty mapping yields an empty dict.
    """
    from itertools import chain

    if max_length is None:
        max_length = args.max_length

    keys = list(examples.keys())
    if not keys:
        return {}

    # chain.from_iterable flattens in linear time; sum(lists, []) copies the
    # accumulated list on every addition and is quadratic.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in keys}

    # Round the total down to a multiple of the chunk size (drops the tail).
    total_length = len(concatenated_examples[keys[0]])
    total_length = (total_length // max_length) * max_length

    return {
        k: [t[i: i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }

In [None]:
# Load the essay CSV as a dataset with a single 'train' split.
csv_files = {'train': 'mlm_train.csv'}
raw_datasets = load_dataset("csv", data_files=csv_files)

# Step 1: tokenize in batches, dropping the raw text column.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=['text'])
# Step 2: regroup the token streams into fixed-length chunks.
tokenized_datasets = tokenized_datasets.map(group_texts, batched=True)

# Hold out a fraction of the chunks and expose it under the name 'validation'.
tokenized_datasets = tokenized_datasets["train"].train_test_split(test_size=args.validation_size)
tokenized_datasets['validation'] = tokenized_datasets.pop("test")

# The collator applies the random masking when a batch is assembled.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=args.mlm_probability,
)

# Only the training loader shuffles; both use the masking collator.
dl_train = DataLoader(
    tokenized_datasets["train"],
    batch_size=args.train_batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

dl_val = DataLoader(
    tokenized_datasets["validation"],
    batch_size=args.eval_batch_size,
    collate_fn=data_collator,
)

# Optimizer and Scheduler

In [None]:
# A single parameter group: every model weight uses the same learning rate.
optimizer = AdamW(model.parameters(), lr=args.learning_rate)

# The scheduler is stepped once per batch, so the total step count is
# batches-per-epoch times the number of epochs.
steps_per_epoch = len(dl_train)
num_training_steps = args.num_train_epochs * steps_per_epoch

scheduler_kwargs = {
    'name': args.lr_scheduler_type,
    'optimizer': optimizer,
    'num_warmup_steps': args.num_warmup_steps,
    'num_training_steps': num_training_steps,
}
lr_scheduler = get_scheduler(**scheduler_kwargs)

# Training/validation loop

In [None]:
# Summarise the run before the loop starts: one call, one output line per item.
print(
    "***** Running training *****",
    f"  Num examples = {len(tokenized_datasets['train'])}",
    f"  Num Epochs = {args.num_train_epochs}",
    f"  Total training steps = {num_training_steps}",
    sep="\n",
)

In [None]:
def _to_device(batch):
    """Return a dict with every tensor of `batch` moved to the configured device."""
    return {k: v.to(args.device) for k, v in batch.items()}


progress_bar = tqdm(range(num_training_steps))
completed_steps = 0

for epoch in range(args.num_train_epochs):
    # ---- Training pass ----
    model.train()
    cum_loss = 0
    for batch_idx, batch in enumerate(dl_train, 1):
        outputs = model(**_to_device(batch))
        loss = outputs.loss
        cum_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        completed_steps += 1
        progress_bar.update(1)
        # Running mean of the training loss over the current epoch.
        progress_bar.set_postfix({'loss': cum_loss / batch_idx})

    # ---- Validation pass ----
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    for batch in dl_val:
        with torch.no_grad():
            outputs = model(**_to_device(batch))

        # outputs.loss is a per-batch mean, so a plain mean over batches would
        # over-weight a short final batch. Weight each batch by the number of
        # positions that contribute to its loss instead.
        # NOTE(review): relies on the collator marking ignored positions with
        # -100 in 'labels' — confirm against the transformers version in use.
        n_tokens = (batch['labels'] != -100).sum().item()
        if n_tokens == 0:
            # No scored position in this batch: nothing to accumulate.
            continue
        # .item() keeps a Python float rather than holding on to device tensors.
        total_loss += outputs.loss.item() * n_tokens
        total_tokens += n_tokens

    if total_tokens == 0:
        # Empty validation set: report NaN instead of dividing by zero.
        perplexity = float('nan')
    else:
        try:
            perplexity = math.exp(total_loss / total_tokens)
        except OverflowError:
            # A very large loss overflows exp(); report infinity instead of crashing.
            perplexity = float('inf')

    print(f"Epoch {epoch}: perplexity: {perplexity}")
    # One checkpoint directory per epoch.
    model.save_pretrained(f'roberta_large-itpt-e{epoch}')

progress_bar.close()

# 🤗 Please _DO_ upvote if you find this helpful or interesting! 🤗