# [CommonLit] 🤗 distil-Roberta

A starter notebook for training a model with the 🤗 Trainer.

#### Loading Libraries

In [None]:
import torch
import transformers

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

#### Loading files

In [None]:
# Competition data: the labelled training excerpts and the unlabelled test set.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

#### Splitting data into training and validation set

In [None]:
# Hold out 1% of the training rows for validation. The fixed random_state makes
# the split -- and therefore the reported validation loss -- reproducible
# between notebook runs.
train_df, val_df = train_test_split(train, test_size=0.01, random_state=42)
len(train_df), len(val_df)

#### Loading model and tokenizer.
The model is a pretrained DistilRoBERTa sequence classifier.

In [None]:
# The tokenizer and the model weights are read from the same local checkpoint
# directory. num_labels=1 requests a single-output head, which is used here as
# a regression score.
_checkpoint_dir = '../input/distilrobertabasesequenceclassification/distilroberta-base-sequence_classification/'
tokenizer = AutoTokenizer.from_pretrained(_checkpoint_dir)
model = AutoModelForSequenceClassification.from_pretrained(
    _checkpoint_dir,
    num_labels=1,
)


#### tokenizing the inputs

In [None]:
# Tokenize both splits up front. Each split is encoded in a single call with
# truncation and padding enabled, so every row within a split has equal length.
train_encodings = tokenizer(
    train_df["excerpt"].tolist(),
    truncation=True,
    padding=True,
)
val_encodings = tokenizer(
    val_df["excerpt"].tolist(),
    truncation=True,
    padding=True,
)

#### creating Datasets class for training

In [None]:
class CommonLitDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized excerpts.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a
            sequence with one entry per example, as returned by the tokenizer.
        labels: Optional sequence of regression targets, one per example.
            When omitted, items carry no ``labels`` entry, so the same class
            can be used for unlabelled (inference) data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features (and label, if available) of example ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            # Float targets are required because the loss is a regression loss.
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Return the number of examples."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, fall back to the length of any encoded feature.
        return len(next(iter(self.encodings.values())))
    
    
# Pair each split's encodings with its regression targets.
train_dataset = CommonLitDataset(
    encodings=train_encodings,
    labels=list(train_df["target"]),
)
val_dataset = CommonLitDataset(
    encodings=val_encodings,
    labels=list(val_df["target"]),
)


#### We'll use mostly default hyperparameters for training (only the batch sizes and the evaluation/logging strategies are set explicitly)

In [None]:
# Training configuration; everything not listed here keeps the library default.
training_args = TrainingArguments(
    # Where checkpoints and other outputs are written.
    output_dir='./results',
    # Run both the training and the evaluation phases.
    do_train=True,
    do_eval=True,
    # Per-device batch sizes for training and for evaluation.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Evaluate and log once per epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    # Disable reporting to external experiment trackers.
    report_to="none",
)

In [None]:
class CommonLitTrainer(Trainer):
    """Trainer that optimizes the mean-squared error between logits and targets."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the MSE regression loss for one batch.

        Args:
            model: The model to run. It may be a wrapper around ``self.model``,
                so configuration is read from ``self.model`` instead.
            inputs: Batch dictionary; its ``"labels"`` entry is removed before
                the remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Unused. Accepted so the override stays callable
                from Trainer versions that pass this keyword.

        Returns:
            The scalar loss, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)

        # Always read the config from the unwrapped model: a wrapped `model`
        # (e.g. torch.nn.DataParallel) does not expose `.config`.
        num_labels = self.model.config.num_labels
        loss = torch.nn.functional.mse_loss(
            outputs.logits.view(-1, num_labels),
            labels.float().view(-1, num_labels),
        )

        return (loss, outputs) if return_outputs else loss


In [None]:
# Wire the training configuration, the model and both datasets into the
# custom trainer.
trainer = CommonLitTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

#### Let's train !!!!

In [None]:
# Fine-tune the model on train_dataset using the settings in training_args;
# the returned training summary is shown as the cell output.
trainer.train()

#### Evaluating the model

In [None]:
# Run an evaluation pass over the eval_dataset given to the trainer
# (val_dataset) and show the resulting metrics as the cell output.
trainer.evaluate()

#### Save

In [None]:
# Store the tokenizer files and the trained model in one directory so that
# both can later be reloaded from the same path with from_pretrained().
save_dir = 'roberta-model'
tokenizer.save_pretrained(save_dir)
trainer.save_model(save_dir)

#### The Dataset class and the inference method are adapted from this [notebook](https://www.kaggle.com/abhishek/fork-of-fork-of-yum-yum-yum-93f968) by [Abhishek Thakur](https://www.kaggle.com/abhishek)

In [None]:
# Todo merge this with the Training Dataset class above 
class Dataset:
    """Inference-time dataset that tokenizes one excerpt per lookup.

    Args:
        excerpt: Indexable collection of texts.
        tokenizer: Callable that encodes a text and returns a mapping with
            ``input_ids`` and ``attention_mask``.
        max_len: Length every encoded excerpt is padded/truncated to.
    """

    # Features forwarded to the model, in the order they appear in each item.
    _FEATURES = ("input_ids", "attention_mask")

    def __init__(self, excerpt, tokenizer, max_len):
        self.excerpt = excerpt
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.excerpt)

    def __getitem__(self, item):
        """Encode excerpt ``item`` and return its features as long tensors."""
        encoded = self.tokenizer(
            str(self.excerpt[item]),
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
        )
        return {
            name: torch.tensor(encoded[name], dtype=torch.long)
            for name in self._FEATURES
        }

In [None]:
def generate_predictions(
    model_path,
    max_len,
    batch_size=32,
    device=None,
    test_csv="../input/commonlitreadabilityprize/test.csv",
):
    """Predict a score for every excerpt in the test CSV.

    Args:
        model_path: Directory holding the saved model and tokenizer.
        max_len: Token length each excerpt is padded/truncated to.
        batch_size: Number of excerpts scored per forward pass.
        device: Device to run on. Defaults to ``"cuda"`` when available,
            otherwise ``"cpu"``, so the function also works without a GPU.
        test_csv: Path of the CSV file to score; it must have an ``excerpt``
            column.

    Returns:
        A 1-D ``numpy`` array with the flattened model logits, in file order.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)
    on_cuda = device.type == "cuda"

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to(device)
    model.eval()

    df = pd.read_csv(test_csv)

    dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=4,
        # Pinned host memory only helps when batches are copied to a GPU.
        pin_memory=on_cuda,
        # Keep file order so predictions line up with the submission rows.
        shuffle=False,
    )

    final_output = []

    # Inference only: gradients are disabled once for the whole loop.
    with torch.no_grad():
        for batch in data_loader:
            batch = {key: value.to(device) for key, value in batch.items()}
            logits = model(**batch).logits
            final_output.extend(logits.cpu().numpy().ravel().tolist())

    if on_cuda:
        torch.cuda.empty_cache()
    return np.array(final_output)

#### generating predictions

In [None]:
# Score the test set with the checkpoint stored in 'roberta-model', encoding
# each excerpt to a fixed length of 256 tokens.
preds = generate_predictions(model_path='roberta-model', max_len=256)


#### At last!! Submitting predictions

In [None]:
# Fill the sample submission with the predictions and write it out.
# Item assignment is used on purpose: attribute assignment
# (`submission.target = ...`) only updates a column that already exists and
# would otherwise just set a plain Python attribute, leaving the CSV unchanged.
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")
submission["target"] = preds
submission.to_csv("submission.csv", index=False)

*Work in Progress !!!*