In [1]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm
from copy import deepcopy
from transformers import get_linear_schedule_with_warmup
from datasets import load_dataset

2025-06-06 09:21:16.957048: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749201677.184259      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749201677.249551      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
class RobertaWithPreLSTMAttention(nn.Module):
    def __init__(self, pretrained_model="roberta-base", num_labels=2,
                 lstm_hidden_size=256, num_lstm_layers=1, bidirectional=True,
                 mlp_hidden_size=256, mlp_num_layers=1):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(pretrained_model)
        self.config = self.roberta.config
        self.num_labels = num_labels 

        self.attention_scorer = nn.Linear(self.config.hidden_size, 1)

        self.lstm = nn.LSTM(
            input_size=self.config.hidden_size,
            hidden_size=lstm_hidden_size,
            num_layers=num_lstm_layers,
            batch_first=True,
            bidirectional=bidirectional
        )

        classifier_input_dim = lstm_hidden_size * 2 if bidirectional else lstm_hidden_size
        
        mlp_layers = []
        for i in range(mlp_num_layers):
            mlp_layers.append(nn.Linear(classifier_input_dim if i == 0 else mlp_hidden_size, mlp_hidden_size))
            mlp_layers.append(nn.ReLU())
            mlp_layers.append(nn.Dropout(0.3))

        self.mlp = nn.Sequential(*mlp_layers)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(mlp_hidden_size, num_labels)


    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states_roberta = outputs.last_hidden_state

        att_scores = self.attention_scorer(hidden_states_roberta)
        att_weights = torch.softmax(att_scores, dim=1)

        weighted_hidden_states = att_weights * hidden_states_roberta

        _, (h_n, c_n) = self.lstm(weighted_hidden_states)

        if self.lstm.bidirectional:
            pooled_output = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        else:
            pooled_output = h_n[-1, :, :]

        mlp_output = self.mlp(pooled_output)
        logits = self.classifier(self.dropout(mlp_output))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        
        return {"loss": loss, "logits": logits}

In [3]:
class CustomDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.dataset['input_ids'][idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.dataset['attention_mask'][idx], dtype=torch.long)
        }
        item["labels"] = torch.tensor(self.dataset["label"][idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.dataset["label"])

dataset = load_dataset("rotten_tomatoes")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=64)

encoded_dataset = dataset.map(preprocess_function, batched=True)

train_dataset = CustomDataset(encoded_dataset["train"])
val_dataset = CustomDataset(encoded_dataset["validation"])
test_dataset = CustomDataset(encoded_dataset["test"])

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [4]:
def train_model(
    model, train_dataset, val_dataset,
    epochs=5, lr=2e-5, batch_size=16,
    patience=2, monitor="val_f1", mode="max",
    weight_decay=0.01, max_grad_norm=1.0,
    gradient_accumulation_steps=1, warmup_steps=0,
    num_dataloader_workers=0
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    
    num_training_steps = (len(train_dataset) // batch_size // gradient_accumulation_steps) * epochs
    if num_training_steps == 0:
        num_training_steps = epochs

    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_dataloader_workers, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=num_dataloader_workers, pin_memory=True)

    scaler = torch.amp.GradScaler('cuda')

    best_score = None
    patience_counter = 0
    best_model_state = None

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        all_preds, all_labels = [], []
        
        optimizer.zero_grad()
        for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1} - Training")):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.cuda.amp.autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs["loss"]
                logits = outputs["logits"]
            
            loss = loss / gradient_accumulation_steps
            scaler.scale(loss).backward()

            if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(train_loader):
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                scaler.step(optimizer)
                scaler.update()
                lr_scheduler.step()
                optimizer.zero_grad()

            train_loss += loss.item() * gradient_accumulation_steps
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.detach().cpu().numpy())

        avg_train_loss = train_loss / len(train_loader)
        train_acc = accuracy_score(all_labels, all_preds)
        train_f1 = f1_score(all_labels, all_preds, average="weighted")
        train_precision = precision_score(all_labels, all_preds, average="weighted")
        train_recall = recall_score(all_labels, all_preds, average="weighted")
        print(f"Epoch {epoch+1}: Train loss={avg_train_loss:.4f}, Acc={train_acc:.4f}, F1={train_f1:.4f}, Prec={train_precision:.4f}, Rec={train_recall:.4f}")

        model.eval()
        val_loss = 0
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} - Validation"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                with torch.cuda.amp.autocast():
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs["loss"]
                    logits = outputs["logits"]

                val_loss += loss.item()
                preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels.detach().cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)
        val_acc = accuracy_score(all_labels, all_preds)
        val_f1 = f1_score(all_labels, all_preds, average="weighted")
        val_precision = precision_score(all_labels, all_preds, average="weighted")
        val_recall = recall_score(all_labels, all_preds, average="weighted")
        print(f"Epoch {epoch+1}: Val loss={avg_val_loss:.4f}, Acc={val_acc:.4f}, F1={val_f1:.4f}, Prec={val_precision:.4f}, Rec={val_recall:.4f}")

        
        if monitor == "val_acc":
            score = val_acc
     

        if (best_score is None) or \
           (mode == "min" and score < best_score) or \
           (mode == "max" and score > best_score):
            best_score = score
            patience_counter = 0
            best_model_state = deepcopy(model.state_dict())
            print(f">>> New best model saved at epoch {epoch+1}!")
        else:
            patience_counter += 1
            print(f"Patience counter: {patience_counter}/{patience}")
            if patience_counter >= patience:
                print("Early stopping!")
                break

    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model


def test_model(model, test_dataset, batch_size=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.cuda.amp.autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                logits = outputs["logits"]
            
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.detach().cpu().numpy())
    test_acc = accuracy_score(all_labels, all_preds)
    test_f1 = f1_score(all_labels, all_preds, average="weighted")
    test_precision = precision_score(all_labels, all_preds, average="weighted")
    test_recall = recall_score(all_labels, all_preds, average="weighted")
    print(f"Test Accuracy: {test_acc:.4f} - F1: {test_f1:.4f} - "
          f"Precision: {test_precision:.4f} - Recall: {test_recall:.4f}")

In [None]:
model = RobertaWithPreLSTMAttention(
    num_labels=2,
    lstm_hidden_size=128,   
    num_lstm_layers=1,      
    bidirectional=True,     
    mlp_hidden_size=128,    
    mlp_num_layers=1        
)

trained_model = train_model(
    model, train_dataset, val_dataset,
    epochs=10, 
    lr=2e-5,
    batch_size=32,
    patience=3, 
    monitor="val_acc", 
    mode="max",
    weight_decay=0.01,
    max_grad_norm=1.0,
    gradient_accumulation_steps=2,
    warmup_steps=500,
    num_dataloader_workers=4
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.amp.autocast():
Epoch 1 - Training: 100%|██████████| 267/267 [34:54<00:00,  7.85s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1: Train loss=0.6960, Acc=0.5000, F1=0.3333, Prec=0.2500, Rec=0.5000


  with torch.cuda.amp.autocast():
Epoch 1 - Validation: 100%|██████████| 34/34 [00:33<00:00,  1.02it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1: Val loss=0.6907, Acc=0.5000, F1=0.3333, Prec=0.2500, Rec=0.5000
>>> New best model saved at epoch 1!


  with torch.cuda.amp.autocast():
Epoch 2 - Training: 100%|██████████| 267/267 [34:50<00:00,  7.83s/it]


Epoch 2: Train loss=0.5261, Acc=0.8019, F1=0.8012, Prec=0.8061, Rec=0.8019


  with torch.cuda.amp.autocast():
Epoch 2 - Validation: 100%|██████████| 34/34 [00:33<00:00,  1.03it/s]


Epoch 2: Val loss=0.3953, Acc=0.8884, F1=0.8883, Prec=0.8899, Rec=0.8884
>>> New best model saved at epoch 2!


  with torch.cuda.amp.autocast():
Epoch 3 - Training: 100%|██████████| 267/267 [34:48<00:00,  7.82s/it]


Epoch 3: Train loss=0.3985, Acc=0.8796, F1=0.8795, Prec=0.8804, Rec=0.8796


  with torch.cuda.amp.autocast():
Epoch 3 - Validation: 100%|██████████| 34/34 [00:33<00:00,  1.02it/s]


Epoch 3: Val loss=0.3807, Acc=0.8715, F1=0.8713, Prec=0.8731, Rec=0.8715
Patience counter: 1/2


  with torch.cuda.amp.autocast():
Epoch 4 - Training: 100%|██████████| 267/267 [34:57<00:00,  7.86s/it]


Epoch 4: Train loss=0.3340, Acc=0.8947, F1=0.8947, Prec=0.8952, Rec=0.8947


  with torch.cuda.amp.autocast():
Epoch 4 - Validation: 100%|██████████| 34/34 [00:32<00:00,  1.03it/s]

Epoch 4: Val loss=0.3377, Acc=0.8762, F1=0.8760, Prec=0.8777, Rec=0.8762
Patience counter: 2/2
Early stopping!





In [7]:
test_model(trained_model, test_dataset, batch_size=32)

  with torch.cuda.amp.autocast():
Testing: 100%|██████████| 34/34 [01:05<00:00,  1.91s/it]

Test Accuracy: 0.8715 - F1: 0.8712 - Precision: 0.8746 - Recall: 0.8715



