# Training our RoBERTa model

# Loading the dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
import torch.nn as nn # Add this import statement
import torch.optim as optim # Add this import statement
import torch.nn.functional as F # Add this import statement
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


ModuleNotFoundError: No module named 'torch'

In [None]:


# Load your dataset
interaction_data = pd.read_csv('/content/updated_interaction_data.csv')  # Load your data here

def map_rating_to_category(rating):
    if rating in [1, 2]:
        return 'negative'
    elif rating == 3:
        return 'neutral'
    elif rating in [4, 5]:
        return 'positive'
    else:
        raise ValueError("Rating out of range")

# Apply this function to your dataset
interaction_data['Category'] = interaction_data['Label'].apply(map_rating_to_category)
# Randomly sample 5,000 rows
sampled_data = interaction_data.sample(n=5000, random_state=42)

# Prepare dataset and dataloader


In [None]:
texts = sampled_data['Review'].tolist()  # Replace with your text column
labels = sampled_data['Category'].tolist()  # Replace with your labels # Replace with your labels

# Split dataset into 60% training and 40% testing
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.4, random_state=42
)

In [8]:
!pip install transformers



# Prepare Dataset and DataLoader

In [53]:

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Create a mapping from labels to integers
        self.label_map = {'negative': [0, 1],'neutral': [3],'positive': [4, 5]}
        #Define self.category_to_index
        self.category_to_index = {'negative': 0, 'neutral': 1, 'positive': 2}


    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Convert label to integer using the label map
        label =self.category_to_index[label]

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)  # Use long for classification
        }

# Initialize tokenizer and max length
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_length = 128  # Adjust based on your data

# Create datasets
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length)
test_dataset = CustomDataset(test_texts, test_labels, tokenizer, max_length)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [43]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Adjust the mapping directly to numbers
        self.category_to_index = {0: 0, 1: 0, 3: 1, 4: 2, 5: 2}
        # This assumes you want to map 0 and 1 to 'negative', 3 to 'neutral', and 4 and 5 to 'positive'

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Convert label to integer using the label map
        label = self.category_to_index[label]

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)  # Use long for classification
        }


In [21]:
import pandas as pd

# Example dataset
data = {
    'Label': [1, 2, 3, 4, 5, 1, 3, 5, 4, 2]
}
interaction_data = pd.DataFrame(data)

# Initialize mappings
self.label_map = {'negative': [1, 2], 'neutral': [3], 'positive': [4, 5]}
self.category_to_index = {'negative': 0, 'neutral': 1, 'positive': 2}

# Map ratings to categories
interaction_data['Category'] = interaction_data['Label'].apply(lambda x: map_rating_to_category(x))

# Convert categories to indices
interaction_data['CategoryIndex'] = interaction_data['Category'].map(self.category_to_index)

print(interaction_data)


NameError: name 'self' is not defined

# Train The model

In [1]:
class FineTuneRoberta(nn.Module):
    def __init__(self, model_name='roberta-base', n_classes=3): # Changed n_classes to 3
        super(FineTuneRoberta, self).__init__()
        self.model = RobertaModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.model.config.hidden_size, n_classes) # Output layer for 3 classes

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        output = self.drop(outputs.pooler_output)
        return self.out(output)
# Initialize model and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = FineTuneRoberta().to(device)
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

def train_epoch(model, dataloader, optimizer, device):
    model.train()

    losses = []

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = F.cross_entropy(outputs, labels)  # Use cross_entropy loss for classification
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return np.mean(losses)

# Training loop
for epoch in range(3):  # Adjust the number of epochs as needed
    train_loss = train_epoch(model, train_dataloader, optimizer, device)
    print(f'Epoch {epoch + 1}/{3}, Loss: {train_loss}')


NameError: name 'nn' is not defined

Testing and Evaluating The model

In [None]:



def evaluate_model(model, dataloader, device):
    model.eval()  # Set the model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():  # No gradient calculation needed
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)  # Get the class with the highest probability

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Convert lists to numpy arrays for sklearn metrics
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)

    # Calculate precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

    return accuracy, precision, recall, f1

# Assuming test_dataset is defined and you can replace it with your actual test dataset
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Evaluate the model
accuracy, precision, recall, f1 = evaluate_model(model, test_dataloader, device)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
