In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from datasets import load_dataset

# 1. Load your dataset

# Load the dataset
dataset = load_dataset('imdb')

# The dataset is split into train and test
train_data = dataset['train']
test_data = dataset['test']

# You can access the text and labels like this
texts = train_data['text']
labels = train_data['label']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
# 2. Preprocess the data
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [5]:
# Parameters
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "/content/roberta_sentiment.bin"

# Tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Split data into train and validation
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

# Create data loaders
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, MAX_LEN)
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer, MAX_LEN)
val_data_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# 3. Create the RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
model = model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
from tqdm import tqdm
import numpy as np
import os

# Function to calculate accuracy
def calculate_accuracy(preds, labels):
    return (preds.argmax(dim=1) == labels).float().mean()

best_valid_loss = float('inf')

for epoch in range(EPOCHS):
    model.train()

    train_loss = 0
    train_acc = 0

    for batch in tqdm(train_data_loader, desc=f'Training Epoch {epoch + 1}/{EPOCHS}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        model.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += calculate_accuracy(outputs.logits, labels).item()

    # Calculate average training loss and accuracy
    train_loss /= len(train_data_loader)
    train_acc /= len(train_data_loader)

    # Validation loop
    model.eval()

    valid_loss = 0
    valid_acc = 0

    with torch.no_grad():
        for batch in tqdm(val_data_loader, desc=f'Validating Epoch {epoch + 1}/{EPOCHS}'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            valid_loss += loss.item()
            valid_acc += calculate_accuracy(outputs.logits, labels).item()

    # Calculate average validation loss and accuracy
    valid_loss /= len(val_data_loader)
    valid_acc /= len(val_data_loader)

    # Print training and validation loss and accuracy per epoch
    print(f'\nEpoch {epoch + 1}/{EPOCHS}')
    print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'Val Loss: {valid_loss:.3f} | Val Acc: {valid_acc*100:.2f}%')

    # Save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_path)
        print(f'Model saved to {model_path}')

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/2: 100%|██████████| 1250/1250 [15:38<00:00,  1.33it/s]
Validating Epoch 1/2: 100%|██████████| 313/313 [01:21<00:00,  3.86it/s]



Epoch 1/2
Train Loss: 0.247 | Train Acc: 90.24%
Val Loss: 0.174 | Val Acc: 93.15%
Model saved to /content/roberta_sentiment.bin


Training Epoch 2/2: 100%|██████████| 1250/1250 [15:37<00:00,  1.33it/s]
Validating Epoch 2/2: 100%|██████████| 313/313 [01:20<00:00,  3.87it/s]



Epoch 2/2
Train Loss: 0.151 | Train Acc: 94.56%
Val Loss: 0.170 | Val Acc: 93.37%
Model saved to /content/roberta_sentiment.bin


In [21]:
from torch.nn.functional import softmax

# 5. Inference
model.eval()
texts = [
    "The vacation was absolutely marvelous. The scenery was breathtaking, the food was exquisite, and the locals were incredibly welcoming. It was truly a trip to remember, rejuvenating both the mind and the body.",  # Positive
    "I am utterly dissatisfied with the service at the electronics store. Not only was the staff rude and unhelpful, but the product I purchased stopped working after a week, and the return process has been a nightmare.",  # Negative
    "The book had a very interesting start, and the storyline was kinda cool and original, but the ending felt rushed. While the characters were well-developed, the plot had some inconsistencies that were hard to overlook."  # Mixed
]

for text in texts:
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    probabilities = softmax(logits, dim=1)  # Apply softmax to convert logits to probabilities
    negative_probability = probabilities[0][0].item()
    positive_probability = probabilities[0][1].item()
    print(f"Positive sentiment probability: {positive_probability:.4f}, Negative sentiment probability: {negative_probability:.4f}")

Positive sentiment probability: 0.9955, Negative sentiment probability: 0.0045
Positive sentiment probability: 0.0307, Negative sentiment probability: 0.9693
Positive sentiment probability: 0.4205, Negative sentiment probability: 0.5795
