#### About

> Commonlit readability prize.

Dataset link - https://www.kaggle.com/c/commonlitreadabilityprize

In [1]:
!pip install transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [2]:
#importing modules
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df = pd.read_csv('train.csv')

In [4]:
df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [5]:
#creating dataset class
class CommonLitDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_length):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        target = self.targets.iloc[idx]
        encoding = self.tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        return {'input_ids': encoding['input_ids'].squeeze(), 'attention_mask': encoding['attention_mask'].squeeze(), 'target': torch.tensor(target, dtype=torch.float)}


In [6]:
#creating model class
class CommonLitModel(nn.Module):
    def __init__(self, model_name):
        super(CommonLitModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.linear = nn.Linear(self.model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        pooler_output = outputs['pooler_output']
        return self.linear(pooler_output)

In [7]:
#training loop
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        target = batch['target'].to(device)
        output = model(input_ids, attention_mask)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)

In [8]:
#validation loop
def evaluate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            target = batch['target'].to(device)
            output = model(input_ids, attention_mask)
            loss = criterion(output, target)
            running_loss += loss.item()
    return running_loss / len(dataloader)

In [9]:
#splitting the data into train, val split
train_texts, val_texts, train_targets, val_targets = train_test_split(df['excerpt'], df['target'], test_size=0.2, random_state=42)

In [10]:
#initialize the tokenizer and create datasets
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 256

train_dataset = CommonLitDataset(train_texts, train_targets, tokenizer, max_length)
val_dataset = CommonLitDataset(val_texts, val_targets, tokenizer, max_length)


In [11]:
#create dataloaders
batch_size = 32

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [12]:
for batch in train_dataloader:
    print(batch)
    break

{'input_ids': tensor([[    0,   113,   170,  ...,     1,     1,     1],
        [    0,  1213,    58,  ...,     1,     1,     1],
        [    0,   133,  2051,  ...,     1,     1,     1],
        ...,
        [    0,  1708,     6,  ...,     1,     1,     1],
        [    0,  2515,  4639,  ...,     4,     2,     1],
        [    0, 29971, 25717,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'target': tensor([-0.4455, -2.8029, -2.1589, -2.7834, -0.0268, -1.7258, -1.2391,  0.1710,
        -0.8115, -2.6296, -2.1286, -2.5245, -0.0128, -1.8778, -1.8267, -1.4839,
        -2.9348, -2.6490, -1.6965, -1.5065,  0.2470, -0.6204, -2.3627,  0.7261,
        -1.8298, -0.1298, -1.8326, -1.0071,  0.0711, -0.2123, -1.4536, -0.6980])}


In [13]:
model = CommonLitModel(model_name).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.MSELoss()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
#Training the model
num_epochs = 10

for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, optimizer, criterion, device)
    val_loss = evaluate(model, val_dataloader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/10, Train Loss: 1.2227, Validation Loss: 1.0517
Epoch 2/10, Train Loss: 1.0902, Validation Loss: 1.0466
Epoch 3/10, Train Loss: 1.0789, Validation Loss: 1.0480
Epoch 4/10, Train Loss: 1.0847, Validation Loss: 1.0549
Epoch 5/10, Train Loss: 1.0776, Validation Loss: 1.0473
Epoch 6/10, Train Loss: 1.0784, Validation Loss: 1.0469
Epoch 7/10, Train Loss: 1.0813, Validation Loss: 1.0523
Epoch 8/10, Train Loss: 1.0861, Validation Loss: 1.0602
Epoch 9/10, Train Loss: 1.0818, Validation Loss: 1.0479
Epoch 10/10, Train Loss: 1.0789, Validation Loss: 1.0627


In [16]:
#save the model
torch.save(model.state_dict(), 'commonlit_model.pth')


In [17]:
#load the model for evaluation
model.load_state_dict(torch.load('commonlit_model.pth'))


<All keys matched successfully>

In [18]:
val_loss = evaluate(model, val_dataloader, criterion, device)
print(f"Validation Loss: {val_loss:.4f}")


Validation Loss: 1.0627


In [19]:
def get_predictions(model, dataloader, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            output = model(input_ids, attention_mask)
            predictions.extend(output.squeeze().tolist())
    return predictions

val_predictions = get_predictions(model, val_dataloader, device)


In [20]:
mse = mean_squared_error(val_targets, val_predictions)
print(f"Mean Squared Error: {mse:.4f}")


Mean Squared Error: 1.0355
