<a href="https://colab.research.google.com/github/SushovitNanda/SemEval-Food-Hazards/blob/main/Distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:

# Import necessary libraries
%%capture
!pip install transformers
!pip install datasets --upgrade
#!pip install torch
import os
import pandas as pd
!pip install evaluate
import evaluate
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch
from transformers import Trainer, TrainingArguments, DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset
import warnings
warnings.filterwarnings("ignore")

# Disable W&B logging
os.environ["WANDB_MODE"] = "disabled"

In [None]:
!wget https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_train.csv


In [7]:
# Load the dataset
train = pd.read_csv('incidents_train.csv')

# Combine 'title' and 'text' columns to create input data.
# Missing values are replaced by empty strings first: concatenating NaN with a
# string yields NaN, and a NaN input would break tokenization later on.
train['input_text'] = train['title'].fillna("") + " " + train['text'].fillna("")

# Encode the target labels as integer ids (kept in label_encoder.classes_ order)
label_encoder = LabelEncoder()
train['label'] = label_encoder.fit_transform(train['hazard-category'])

# Train-test split using stratification so both splits keep the class balance
train_df, test_df = train_test_split(
    train[['input_text', 'label']],
    test_size=0.2,
    stratify=train['label'],
    random_state=42
)

# Convert the dataframes into Hugging Face's Dataset format.
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra `__index_level_0__` column that the model never uses.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english")

# Define a tokenization function
def tokenize_function(example):
    """Tokenize the ``input_text`` entry of ``example`` with the notebook's tokenizer.

    Truncation is enabled; no padding is requested here.

    Args:
        example: Mapping that provides an ``'input_text'`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input.
    """
    texts = example['input_text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize both splits in batches (train first, then test)
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Collator that pads the examples of each batch when the batch is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the DistilBERT checkpoint with a classification head sized for our labels
num_labels = len(label_encoder.classes_)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    num_labels=num_labels,
    # Allow loading even though the checkpoint's head size differs from num_labels
    ignore_mismatched_sizes=True,
)


Map:   0%|          | 0/4065 [00:00<?, ? examples/s]

Map:   0%|          | 0/1017 [00:00<?, ? examples/s]

In [9]:

from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import evaluate
import numpy as np

# Load the F1 metric. The averaging mode is not fixed here: compute_metrics
# passes average="macro" each time it calls f1_metric.compute.
f1_metric = evaluate.load("f1")

# Define the compute_metrics function to maximize F1 macro average
def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict: ``{"f1": value}``, where ``value`` is the ``"f1"`` entry of the
        result returned by ``f1_metric.compute``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    result = f1_metric.compute(
        predictions=predicted_ids,
        references=references,
        average="macro",
    )
    return dict(f1=result["f1"])

# Training configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the best macro-F1 when training ends.
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old name. transformers is installed unpinned, so pick
# whichever keyword the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# NOTE(review): no save_total_limit is set, so a checkpoint is written under
# ./results for every epoch that runs; consider limiting it if disk is tight.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",       # Must match the evaluation strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=50,         # Upper bound; early stopping is expected to end training sooner
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # Use F1 score as the metric for saving best model
    greater_is_better=True,      # Ensure higher F1 is considered better
    **{_eval_strategy_key: "epoch"},
)

# Build the Trainer with the macro-F1 metric and early stopping, then train.
import inspect

# Newer transformers releases take the tokenizer through `processing_class` and
# deprecate the `tokenizer` keyword. transformers is installed unpinned, so pick
# whichever keyword the installed Trainer accepts.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Use F1 macro for evaluation
    # Stop once F1 has failed to improve for 4 evaluations in a row
    # (evaluation runs once per epoch, so this is a patience of 4 epochs).
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
    **{_tokenizer_key: tokenizer},
)

# Start training
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.2672,0.294584,0.576262
2,0.2807,0.260391,0.67815
3,0.2292,0.28179,0.739425
4,0.1699,0.315819,0.7197
5,0.0672,0.350992,0.756594
6,0.0015,0.300107,0.845998
7,0.0034,0.3319,0.849495
8,0.0542,0.331187,0.849165
9,0.0004,0.356708,0.847356
10,0.0382,0.372487,0.851613


TrainOutput(global_step=8144, training_loss=0.0640140985208858, metrics={'train_runtime': 3661.8684, 'train_samples_per_second': 55.504, 'train_steps_per_second': 6.95, 'total_flos': 8607604479882900.0, 'train_loss': 0.0640140985208858, 'epoch': 16.0})

In [10]:
# Evaluate the model
# NOTE(review): test_dataset is also the Trainer's eval_dataset, i.e. the split
# that drove early stopping and best-checkpoint selection, so the scores below
# are likely optimistic. A separate held-out split would give a cleaner estimate.
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# Classification report
target_names = label_encoder.classes_
# Pass the label ids explicitly (LabelEncoder assigns 0..n_classes-1 in
# classes_ order). Otherwise the report raises when a rare class appears in
# neither the true labels nor the predictions, because the number of observed
# classes no longer matches len(target_names).
print(classification_report(
    test_df['label'],
    preds,
    labels=list(range(len(target_names))),
    target_names=target_names,
))

                                precision    recall  f1-score   support

                     allergens       0.96      0.98      0.97       371
                    biological       0.99      1.00      1.00       348
                      chemical       0.86      0.89      0.88        57
food additives and flavourings       1.00      0.60      0.75         5
                foreign bodies       0.94      1.00      0.97       112
                         fraud       0.84      0.73      0.78        74
                     migration       1.00      1.00      1.00         1
          organoleptic aspects       1.00      0.82      0.90        11
                  other hazard       0.79      0.56      0.65        27
              packaging defect       0.69      0.82      0.75        11

                      accuracy                           0.95      1017
                     macro avg       0.91      0.84      0.87      1017
                  weighted avg       0.95      0.95      0.95 