<a href="https://colab.research.google.com/github/SushovitNanda/SemEval-Food-Hazards/blob/main/Ro_berta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Import necessary libraries
%%capture
#!pip install transformers
#!pip install datasets
#!pip install torch
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tqdm import tqdm
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
import warnings
warnings.filterwarnings("ignore")

# Disable W&B logging
os.environ["WANDB_MODE"] = "disabled"

In [13]:
# Load the dataset
data = pd.read_csv('incidents_train.csv')

# Combine 'title' and 'text' columns for richer input data
data['combined_text'] = data['title'] + " " + data['text']

# Encode hazard-category to numerical labels
label_mapping = {label: idx for idx, label in enumerate(data['hazard-category'].unique())}
data['label'] = data['hazard-category'].map(label_mapping)

# Stratified train-test split
train_df, val_df = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=42)

# Define tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_mapping))

# Custom Dataset Class for PyTorch
class HazardDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create DataLoaders
train_dataset = HazardDataset(train_df['combined_text'].tolist(), train_df['label'].tolist(), tokenizer)
val_dataset = HazardDataset(val_df['combined_text'].tolist(), val_df['label'].tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False)
total_steps = len(train_loader) * 5  # Assuming 5 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training function
def train_model(model, train_loader, val_loader, optimizer, scheduler, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_train_loss = 0
        for batch in tqdm(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            model.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs} - Training Loss: {avg_train_loss}")

        # Validation after each epoch
        evaluate_model(model, val_loader)

# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).flatten()

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    print("Validation Classification Report:")
    print(classification_report(true_labels, predictions, target_names=label_mapping.keys()))

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:

# Train and evaluate the model
train_model(model, train_loader, val_loader, optimizer, scheduler)

100%|██████████| 255/255 [01:38<00:00,  2.60it/s]


Epoch 1/5 - Training Loss: 0.6784446287681075
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.93      0.97      0.95       348
                foreign bodies       0.74      0.91      0.82       112
                      chemical       0.75      1.00      0.86        57
                         fraud       0.90      0.58      0.70        74
          organoleptic aspects       0.00      0.00      0.00        11
                     allergens       0.94      0.98      0.96       371
              packaging defect       0.00      0.00      0.00        11
                  other hazard       0.67      0.15      0.24        27
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         1

                      accuracy                           0.89      1017
                     macro avg       0.49      0.46   

100%|██████████| 255/255 [01:35<00:00,  2.67it/s]


Epoch 2/5 - Training Loss: 0.27939597788979026
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.92      0.98      0.95       348
                foreign bodies       0.99      0.88      0.93       112
                      chemical       0.90      0.98      0.94        57
                         fraud       0.90      0.58      0.70        74
          organoleptic aspects       0.80      0.73      0.76        11
                     allergens       0.93      0.98      0.95       371
              packaging defect       0.73      0.73      0.73        11
                  other hazard       0.77      0.63      0.69        27
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         1

                      accuracy                           0.92      1017
                     macro avg       0.69      0.65  

100%|██████████| 255/255 [01:35<00:00,  2.68it/s]


Epoch 3/5 - Training Loss: 0.17394748540850832
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.94      0.97      0.95       348
                foreign bodies       0.94      0.91      0.93       112
                      chemical       0.86      1.00      0.93        57
                         fraud       0.94      0.65      0.77        74
          organoleptic aspects       0.88      0.64      0.74        11
                     allergens       0.93      0.99      0.96       371
              packaging defect       1.00      0.73      0.84        11
                  other hazard       0.77      0.63      0.69        27
food additives and flavourings       1.00      0.40      0.57         5
                     migration       0.00      0.00      0.00         1

                      accuracy                           0.93      1017
                     macro avg       0.83      0.69  

100%|██████████| 255/255 [01:35<00:00,  2.68it/s]


Epoch 4/5 - Training Loss: 0.11919360655706887
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.93      0.97      0.95       348
                foreign bodies       0.98      0.89      0.93       112
                      chemical       0.89      1.00      0.94        57
                         fraud       0.94      0.65      0.77        74
          organoleptic aspects       0.89      0.73      0.80        11
                     allergens       0.94      0.98      0.96       371
              packaging defect       0.89      0.73      0.80        11
                  other hazard       0.73      0.70      0.72        27
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         1

                      accuracy                           0.93      1017
                     macro avg       0.72      0.67  

100%|██████████| 255/255 [01:35<00:00,  2.68it/s]


Epoch 5/5 - Training Loss: 0.0935556901363181
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.94      0.97      0.95       348
                foreign bodies       0.93      0.91      0.92       112
                      chemical       0.88      1.00      0.93        57
                         fraud       0.90      0.70      0.79        74
          organoleptic aspects       0.80      0.73      0.76        11
                     allergens       0.95      0.98      0.97       371
              packaging defect       1.00      0.73      0.84        11
                  other hazard       0.86      0.70      0.78        27
food additives and flavourings       1.00      0.40      0.57         5
                     migration       0.00      0.00      0.00         1

                      accuracy                           0.93      1017
                     macro avg       0.83      0.71   