In [1]:
!pip install torch torchvision torchaudio
!pip install transformers
!pip install scikit-learn




In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Define your dataset class
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        dataframe: pandas DataFrame with a 'review' column (converted to ``str``)
            and a 'sentiment' column holding 'positive' or 'negative'.
        tokenizer: Callable tokenizer invoked as ``tokenizer(text, ...)`` with
            ``return_tensors='pt'``; its result must expose 'input_ids' and
            'attention_mask' entries.
        max_len: Length every sequence is padded / truncated to.
    """

    # String label -> class index. Defined once on the class instead of being
    # rebuilt on every __getitem__ call.
    LABEL_MAP = {'positive': 1, 'negative': 0}

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized review and label at positional index ``idx``.

        Returns:
            dict with keys 'text' (str), 'input_ids' and 'attention_mask'
            (flattened tensors) and 'label' (scalar ``torch.long`` tensor).

        Raises:
            KeyError: if the row's sentiment is not a key of ``LABEL_MAP``.
        """
        # Fetch the row once; each .iloc lookup materializes a new Series.
        row = self.data.iloc[idx]
        text = str(row['review'])
        raw_label = row['sentiment']

        try:
            label = self.LABEL_MAP[raw_label]
        except KeyError:
            # Same exception type as a plain dict lookup, but say which row
            # and which labels are accepted.
            raise KeyError(
                f"Unknown sentiment label {raw_label!r} at row {idx}; "
                f"expected one of {sorted(self.LABEL_MAP)}"
            ) from None

        # Call the tokenizer directly (the documented entry point) rather than
        # the legacy encode_plus method; the keyword arguments are unchanged.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [3]:
# Define your model
class SentimentClassifier(nn.Module):
    """Encoder followed by dropout and a linear classification head.

    Args:
        roberta_model: Encoder module exposing ``config.hidden_size``. When
            called with ``return_dict=False`` it must return a tuple whose
            element at index 1 is the pooled output fed to the head.
        num_classes: Number of output logits.
        dropout_prob: Dropout probability applied to the pooled output.
            Defaults to 0.1, the value that was previously hard-coded.
    """

    def __init__(self, roberta_model, num_classes, dropout_prob=0.1):
        super().__init__()
        self.roberta = roberta_model
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token id tensor passed to the encoder as ``input_ids``.
            attention_mask: Mask tensor passed to the encoder as
                ``attention_mask``.

        Returns:
            Output of the linear head applied to the (dropped-out) pooled
            encoder output.
        """
        encoder_outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        # Index instead of unpacking into exactly two names: the tuple gains
        # extra trailing elements when the encoder is configured to also return
        # hidden states or attentions, which would make `_, pooled = ...` raise.
        pooled_output = encoder_outputs[1]
        return self.fc(self.dropout(pooled_output))



In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [5]:
# Read the train and test CSV files into pandas DataFrames (df / df2).
train_csv_path = '/kaggle/input/imdb-asg/train.csv'
test_csv_path = '/kaggle/input/imdb-asg/test.csv'
csv_encoding = 'ISO-8859-1'

df = pd.read_csv(train_csv_path, encoding=csv_encoding)
df2 = pd.read_csv(test_csv_path, encoding=csv_encoding)

In [14]:
import time
from tqdm import tqdm

# Constants
SEED = 42
MAX_LEN = 128
BATCH_SIZE = 64
NUM_CLASSES = 2  # Binary classification (positive/negative)
LEARNING_RATE = 5e-5

# Seed torch so the classifier-head initialisation, dropout and DataLoader
# shuffling are repeatable across runs.
torch.manual_seed(SEED)

# Hold out 20% of the training rows for validation. Previously the split was
# computed but never used: the model trained on all of `df` and was "validated"
# every epoch on the test CSV (`df2`). The DataFrame itself is split because
# SentimentDataset needs both the 'review' and 'sentiment' columns.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=SEED)

# Keep the per-column names the original split defined, in case they are used
# elsewhere in the notebook.
train_texts, val_texts = train_df['review'], val_df['review']
train_labels, val_labels = train_df['sentiment'], val_df['sentiment']

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('FacebookAI/roberta-base')
roberta_model = RobertaModel.from_pretrained('FacebookAI/roberta-base')

# Create datasets and dataloaders
train_dataset = SentimentDataset(train_df, tokenizer, MAX_LEN)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = SentimentDataset(val_df, tokenizer, MAX_LEN)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# The test CSV stays untouched during training; use this loader for a final
# evaluation only.
test_dataset = SentimentDataset(df2, tokenizer, MAX_LEN)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Initialize the model
model = SentimentClassifier(roberta_model, NUM_CLASSES)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Move the model to the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the number of epochs
num_epochs = 3


In [None]:
import time

# One pass over the training data followed by one pass over the validation
# data per epoch; timings and validation accuracy are printed after each epoch.
for epoch in range(num_epochs):
    # ---- Training ----
    model.train()
    epoch_loss = 0.0
    batches_seen = 0
    start_time = time.time()  # Start timing
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)
    for batches_seen, batch in enumerate(progress_bar, start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        # Running mean over the batches processed so far. Dividing by the total
        # number of batches (as before) under-reports the loss until the very
        # last step of the epoch.
        progress_bar.set_postfix({'training_loss': epoch_loss / batches_seen})

    # Calculate training time
    train_time = time.time() - start_time

    # The progress bar is created with leave=False, so report the epoch's mean
    # training loss explicitly (NaN when the dataloader yielded no batches).
    avg_train_loss = epoch_loss / batches_seen if batches_seen else float('nan')
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss}")

    # ---- Validation ----
    model.eval()
    val_predictions = []
    val_targets = []
    start_time = time.time()  # Start timing
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            # Predicted class = index of the largest logit.
            preds = outputs.argmax(dim=1)

            val_predictions.extend(preds.cpu().tolist())
            # Labels are only compared on the host, so they are read straight
            # from the batch without a round trip through the device.
            val_targets.extend(batch['label'].tolist())

    # Calculate validation time
    val_time = time.time() - start_time

    val_accuracy = accuracy_score(val_targets, val_predictions)
    print(f"Epoch {epoch + 1}/{num_epochs}, Validation Accuracy: {val_accuracy}, Training Time: {train_time}, Validation Time: {val_time}")


                                                                                  