<a href="https://colab.research.google.com/github/SushovitNanda/SemEval-Food-Hazards/blob/main/Roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
%%capture
!pip install transformers
!pip install datasets --upgrade
#!pip install torch
import os
import pandas as pd
!pip install evaluate
import evaluate
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
import warnings
warnings.filterwarnings("ignore")

# Disable W&B logging
os.environ["WANDB_MODE"] = "disabled"

In [2]:
# Load the dataset
train = pd.read_csv('incidents_train.csv')

# Combine 'title' and 'text' columns to create input data
train['input_text'] = train['title'] + " " + train['text']

# Encode the target labels
label_encoder = LabelEncoder()
train['label'] = label_encoder.fit_transform(train['hazard-category'])

# Train-test split using stratification
train_df, test_df = train_test_split(
    train[['input_text', 'label']],
    test_size=0.2,
    stratify=train['label'],
    random_state=42
)

# Convert the dataframes into Hugging Face's Dataset format
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Define a tokenization function
def tokenize_function(example):
    return tokenizer(example['input_text'], truncation=True)

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set up a data collator to pad inputs dynamically
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load RoBERTa model for sequence classification
num_labels = len(label_encoder.classes_)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/4065 [00:00<?, ? examples/s]

Map:   0%|          | 0/1017 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

# 1e-5


In [None]:
# Load the F1 metric and specify macro averaging
f1_metric = evaluate.load("f1")

# Define the compute_metrics function to maximize F1 macro average
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Compute the F1 macro average
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")
    return {"f1": f1["f1"]}

# Update training arguments to focus on F1 score
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1"  # Use F1 score as the metric for saving best model
)

# Redefine the Trainer with the updated compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  # Use F1 macro for evaluation
)

In [None]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.1599,0.301102,0.506601
2,0.2496,0.255689,0.648452
3,0.1512,0.24505,0.691729
4,0.2465,0.252353,0.735404
5,0.0814,0.26593,0.749188


TrainOutput(global_step=2545, training_loss=0.2679591785391796, metrics={'train_runtime': 2412.6102, 'train_samples_per_second': 8.424, 'train_steps_per_second': 1.055, 'total_flos': 5342613718844556.0, 'train_loss': 0.2679591785391796, 'epoch': 5.0})

In [None]:
# Evaluate the model
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# Classification report
target_names = label_encoder.classes_
print(classification_report(test_df['label'], preds, target_names=target_names))

                                precision    recall  f1-score   support

                     allergens       0.95      0.99      0.97       371
                    biological       0.99      0.99      0.99       348
                      chemical       0.86      0.96      0.91        57
food additives and flavourings       1.00      0.40      0.57         5
                foreign bodies       0.98      1.00      0.99       112
                         fraud       0.83      0.70      0.76        74
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       0.90      0.82      0.86        11
                  other hazard       0.74      0.52      0.61        27
              packaging defect       0.91      0.91      0.91        11

                      accuracy                           0.95      1017
                     macro avg       0.82      0.73      0.76      1017
                  weighted avg       0.95      0.95      0.95 

# 2e-5

In [3]:
# Load the F1 metric and specify macro averaging
f1_metric = evaluate.load("f1")

# Define the compute_metrics function to maximize F1 macro average
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Compute the F1 macro average
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")
    return {"f1": f1["f1"]}

# Update training arguments to focus on F1 score
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1"  # Use F1 score as the metric for saving best model
)

# Redefine the Trainer with the updated compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  # Use F1 macro for evaluation
)

# Train the model
trainer.train()

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,F1
1,0.1804,0.306435,0.511254
2,0.275,0.260729,0.690508
3,0.238,0.249397,0.760214
4,0.1985,0.251991,0.762603
5,0.0741,0.274073,0.749446


TrainOutput(global_step=2545, training_loss=0.22810142909877426, metrics={'train_runtime': 2287.8198, 'train_samples_per_second': 8.884, 'train_steps_per_second': 1.112, 'total_flos': 5342613718844556.0, 'train_loss': 0.22810142909877426, 'epoch': 5.0})

In [4]:
# Evaluate the model
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# Classification report
target_names = label_encoder.classes_
print(classification_report(test_df['label'], preds, target_names=target_names))

                                precision    recall  f1-score   support

                     allergens       0.96      0.99      0.97       371
                    biological       1.00      0.99      1.00       348
                      chemical       0.90      0.93      0.91        57
food additives and flavourings       1.00      0.40      0.57         5
                foreign bodies       0.99      1.00      1.00       112
                         fraud       0.79      0.73      0.76        74
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       0.91      0.91      0.91        11
                  other hazard       0.65      0.56      0.60        27
              packaging defect       0.91      0.91      0.91        11

                      accuracy                           0.95      1017
                     macro avg       0.81      0.74      0.76      1017
                  weighted avg       0.95      0.95      0.95 

# 1e-6

In [None]:
# Load the F1 metric and specify macro averaging
f1_metric = evaluate.load("f1")

# Define the compute_metrics function to maximize F1 macro average
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Compute the F1 macro average
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")
    return {"f1": f1["f1"]}

# Update training arguments to focus on F1 score
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1"  # Use F1 score as the metric for saving best model
)

# Redefine the Trainer with the updated compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  # Use F1 macro for evaluation
)

In [None]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.0238,0.276455,0.750454
2,0.0873,0.282345,0.745551
3,0.0664,0.282304,0.749057
4,0.1182,0.285832,0.752469
5,0.078,0.286967,0.75516


TrainOutput(global_step=2545, training_loss=0.07840342333426878, metrics={'train_runtime': 2265.6269, 'train_samples_per_second': 8.971, 'train_steps_per_second': 1.123, 'total_flos': 5342613718844556.0, 'train_loss': 0.07840342333426878, 'epoch': 5.0})

In [None]:
# Evaluate the model
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# Classification report
target_names = label_encoder.classes_
print(classification_report(test_df['label'], preds, target_names=target_names))

                                precision    recall  f1-score   support

                     allergens       0.96      0.98      0.97       371
                    biological       0.99      0.99      0.99       348
                      chemical       0.84      0.95      0.89        57
food additives and flavourings       1.00      0.40      0.57         5
                foreign bodies       0.98      1.00      0.99       112
                         fraud       0.82      0.73      0.77        74
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       0.82      0.82      0.82        11
                  other hazard       0.75      0.56      0.64        27
              packaging defect       0.91      0.91      0.91        11

                      accuracy                           0.95      1017
                     macro avg       0.81      0.73      0.76      1017
                  weighted avg       0.95      0.95      0.95 

# 1e-4

In [None]:
# Load the F1 metric and specify macro averaging
f1_metric = evaluate.load("f1")

# Define the compute_metrics function to maximize F1 macro average
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Compute the F1 macro average
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")
    return {"f1": f1["f1"]}

# Update training arguments to focus on F1 score
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1"  # Use F1 score as the metric for saving best model
)

# Redefine the Trainer with the updated compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  # Use F1 macro for evaluation
)

In [None]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.5196,0.36977,0.618098
2,0.3645,0.305566,0.643651
3,0.1792,0.325884,0.71316
4,0.2977,0.295478,0.737854
5,0.065,0.269571,0.751587


TrainOutput(global_step=2545, training_loss=0.28006029836321156, metrics={'train_runtime': 2339.739, 'train_samples_per_second': 8.687, 'train_steps_per_second': 1.088, 'total_flos': 5342613718844556.0, 'train_loss': 0.28006029836321156, 'epoch': 5.0})

In [None]:
# Evaluate the model
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# Classification report
target_names = label_encoder.classes_
print(classification_report(test_df['label'], preds, target_names=target_names))

                                precision    recall  f1-score   support

                     allergens       0.96      0.99      0.97       371
                    biological       0.99      0.99      0.99       348
                      chemical       0.90      0.96      0.93        57
food additives and flavourings       0.67      0.40      0.50         5
                foreign bodies       0.99      0.99      0.99       112
                         fraud       0.78      0.73      0.76        74
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       1.00      0.82      0.90        11
                  other hazard       0.76      0.48      0.59        27
              packaging defect       0.79      1.00      0.88        11

                      accuracy                           0.95      1017
                     macro avg       0.78      0.74      0.75      1017
                  weighted avg       0.95      0.95      0.95 

# 10 epoch, 1e-4

In [None]:
# Load the F1 metric and specify macro averaging
f1_metric = evaluate.load("f1")

# Define the compute_metrics function to maximize F1 macro average
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Compute the F1 macro average
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")
    return {"f1": f1["f1"]}

# Update training arguments to focus on F1 score
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1"  # Use F1 score as the metric for saving best model
)

# Redefine the Trainer with the updated compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  # Use F1 macro for evaluation
)

# Train the model
trainer.train()

In [None]:
# Evaluate the model
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# Classification report
target_names = label_encoder.classes_
print(classification_report(test_df['label'], preds, target_names=target_names))