<a href="https://colab.research.google.com/github/SushovitNanda/SemEval-Food-Hazards/blob/main/Distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:

# Import necessary libraries
%%capture
!pip install transformers
!pip install datasets --upgrade
#!pip install torch
import os
import pandas as pd
!pip install evaluate
import evaluate
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch
from transformers import Trainer, TrainingArguments, DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import DataCollatorWithPadding, EarlyStoppingCallback
from datasets import Dataset
import warnings
warnings.filterwarnings("ignore")

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"

In [None]:
!wget https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_train.csv
!wget https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_val.csv


# hazard-category

In [None]:
# Load the dataset
train = pd.read_csv('incidents_train.csv')

# Combine 'title' and 'text' columns to create input data.
# Missing values are replaced with '' first: NaN + " " + str evaluates to NaN,
# and the tokenizer only accepts strings.
train['input_text'] = train['title'].fillna('') + " " + train['text'].fillna('')

# Encode the target labels as integer class ids
label_encoder = LabelEncoder()
train['label'] = label_encoder.fit_transform(train['hazard-category'])

# Train / held-out split using stratification on the encoded label
train_df, test_df = train_test_split(
    train[['input_text', 'label']],
    test_size=0.2,
    stratify=train['label'],
    random_state=42
)

# Convert the dataframes into Hugging Face's Dataset format.
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Load the DistilBERT tokenizer
MODEL_CHECKPOINT = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Define a tokenization function
def tokenize_function(example):
    """Tokenize the 'input_text' field of a batch, truncating over-long inputs."""
    return tokenizer(example['input_text'], truncation=True)

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set up a data collator to pad inputs dynamically (per batch)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the DistilBERT model for sequence classification.
# The checkpoint's classification head does not match `num_labels`, so
# ignore_mismatched_sizes=True lets the head be re-initialized at the new size.
num_labels = len(label_encoder.classes_)
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)


Map:   0%|          | 0/4065 [00:00<?, ? examples/s]

Map:   0%|          | 0/1017 [00:00<?, ? examples/s]

In [None]:
import inspect

from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import evaluate
import numpy as np

# Load the F1 metric (macro averaging is requested at compute time)
f1_metric = evaluate.load("f1")

# Define the compute_metrics function to maximize F1 macro average
def compute_metrics(eval_pred):
    """Return {"f1": macro-averaged F1} for a (logits, labels) evaluation pair."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Compute the F1 macro average
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")
    return {"f1": f1["f1"]}

# transformers is installed unpinned, and recent releases renamed
# `evaluation_strategy` to `eval_strategy` (and Trainer's `tokenizer` to
# `processing_class`). Pick whichever name the installed version accepts.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Training arguments focused on the F1 score
training_args = TrainingArguments(
    # Task-specific directory so checkpoints of the two tasks never mix
    output_dir="./results/hazard-category",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=50,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # Use F1 score as the metric for saving best model
    greater_is_better=True,      # Ensure higher F1 is considered better
    # Keep only the best and the most recent checkpoint instead of one per epoch
    save_total_limit=2,
    **{_eval_strategy_key: "epoch"},
)

# Trainer with the F1 compute_metrics and early stopping
# (stop after 4 evaluations, i.e. 4 epochs, without an F1 improvement)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Use F1 macro for evaluation
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
    **{_tokenizer_key: tokenizer},
)

# Start training
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.2847,0.31726,0.471777
2,0.3435,0.254979,0.653625
3,0.2015,0.23316,0.749141
4,0.1461,0.249783,0.752435
5,0.1068,0.286031,0.77977
6,0.0039,0.261767,0.790258
7,0.0014,0.315997,0.776759
8,0.1131,0.327987,0.764923
9,0.0008,0.364389,0.871849
10,0.0004,0.395325,0.868732


TrainOutput(global_step=8653, training_loss=0.0953774205392274, metrics={'train_runtime': 3868.9637, 'train_samples_per_second': 52.533, 'train_steps_per_second': 6.578, 'total_flos': 9145623054042900.0, 'train_loss': 0.0953774205392274, 'epoch': 17.0})

In [None]:
# Evaluate the model on the held-out split
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# Classification report.
# `labels` is passed explicitly so the report always has one row per encoded
# class; without it, a class absent from both the split and the predictions
# makes `target_names` the wrong length and classification_report raises.
target_names = label_encoder.classes_
print(classification_report(
    test_df['label'],
    preds,
    labels=np.arange(len(target_names)),
    target_names=target_names,
    zero_division=0,  # score classes with no predictions as 0 without warning
))

                                precision    recall  f1-score   support

                     allergens       0.96      0.97      0.97       371
                    biological       1.00      0.99      1.00       348
                      chemical       0.90      0.93      0.91        57
food additives and flavourings       1.00      0.80      0.89         5
                foreign bodies       0.98      0.99      0.99       112
                         fraud       0.75      0.74      0.75        74
                     migration       1.00      1.00      1.00         1
          organoleptic aspects       0.90      0.82      0.86        11
                  other hazard       0.76      0.59      0.67        27
              packaging defect       0.85      1.00      0.92        11

                      accuracy                           0.95      1017
                     macro avg       0.91      0.88      0.89      1017
                  weighted avg       0.95      0.95      0.95 

In [None]:
# Load the data to predict on
test = pd.read_csv("incidents_val.csv")
# Combine 'title' and 'text' columns to create input text.
# Missing values become '' so the concatenation never yields NaN, which the
# tokenizer would reject.
test['input_text'] = test['title'].fillna('') + " " + test['text'].fillna('')
# Convert to Hugging Face Dataset format. A separate name is used so the
# held-out split in `test_dataset` is not overwritten and the evaluation cell
# can still be re-run.
submission_dataset = Dataset.from_pandas(test[['input_text']])
# Tokenize with the same tokenizer and `tokenize_function` defined in the
# data-preparation cell
submission_dataset = submission_dataset.map(tokenize_function, batched=True)
# Use the trained model to make predictions
predictions = trainer.predict(submission_dataset)
# Get the predicted class ids (argmax on logits)
preds = np.argmax(predictions.predictions, axis=-1)
# Map the numeric predictions back to the original labels
predicted_labels = label_encoder.inverse_transform(preds)
# Add the predicted labels to the DataFrame under the 'hazard-category' column
test['hazard-category'] = predicted_labels
# Save the DataFrame with the predictions to a new CSV
test.to_csv("incidents_test_with_predictions.csv", index=False)
print("Predictions saved to 'incidents_test_with_predictions.csv'.")


Map:   0%|          | 0/565 [00:00<?, ? examples/s]

Predictions saved to 'incidents_test_with_predictions.csv'.


# product-category

In [None]:
# Load the dataset
train = pd.read_csv('incidents_train.csv')

# Combine 'title' and 'text' columns to create input data.
# Missing values are replaced with '' first: NaN + " " + str evaluates to NaN,
# and the tokenizer only accepts strings.
train['input_text'] = train['title'].fillna('') + " " + train['text'].fillna('')

# Encode the target labels as integer class ids
label_encoder = LabelEncoder()
train['label'] = label_encoder.fit_transform(train['product-category'])

# Train / held-out split using stratification on the encoded label
train_df, test_df = train_test_split(
    train[['input_text', 'label']],
    test_size=0.2,
    stratify=train['label'],
    random_state=42
)

# Convert the dataframes into Hugging Face's Dataset format.
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Load the DistilBERT tokenizer
MODEL_CHECKPOINT = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Define a tokenization function
def tokenize_function(example):
    """Tokenize the 'input_text' field of a batch, truncating over-long inputs."""
    return tokenizer(example['input_text'], truncation=True)

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set up a data collator to pad inputs dynamically (per batch)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the DistilBERT model for sequence classification.
# The checkpoint's classification head does not match `num_labels`, so
# ignore_mismatched_sizes=True lets the head be re-initialized at the new size.
num_labels = len(label_encoder.classes_)
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)


In [7]:

import inspect

# Load the F1 metric (macro averaging is requested at compute time)
f1_metric = evaluate.load("f1")

# Define the compute_metrics function to maximize F1 macro average
def compute_metrics(eval_pred):
    """Return {"f1": macro-averaged F1} for a (logits, labels) evaluation pair."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Compute the F1 macro average
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")
    return {"f1": f1["f1"]}

# transformers is installed unpinned, and recent releases renamed
# `evaluation_strategy` to `eval_strategy` (and Trainer's `tokenizer` to
# `processing_class`). Pick whichever name the installed version accepts.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Training arguments focused on the F1 score
training_args = TrainingArguments(
    # Task-specific directory so checkpoints of the two tasks never mix
    output_dir="./results/product-category",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=50,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # Use F1 score as the metric for saving best model
    greater_is_better=True,      # Ensure higher F1 is considered better
    # Keep only the best and the most recent checkpoint instead of one per epoch
    save_total_limit=2,
    **{_eval_strategy_key: "epoch"},
)

# Trainer with the F1 compute_metrics and early stopping
# (stop after 4 evaluations, i.e. 4 epochs, without an F1 improvement)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Use F1 macro for evaluation
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
    **{_tokenizer_key: tokenizer},
)

# Start training
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.0514,1.866203,0.616211
2,0.0002,2.056648,0.613752
3,0.0003,1.924056,0.67047


Epoch,Training Loss,Validation Loss,F1
1,0.0514,1.866203,0.616211
2,0.0002,2.056648,0.613752
3,0.0003,1.924056,0.67047
4,0.0022,2.14992,0.674018
5,0.0061,2.339184,0.657881
6,0.0001,2.19409,0.651852
7,0.0203,2.212574,0.659787
8,0.0358,2.404621,0.642238


TrainOutput(global_step=4072, training_loss=0.023178718715367787, metrics={'train_runtime': 1792.5731, 'train_samples_per_second': 113.384, 'train_steps_per_second': 14.197, 'total_flos': 4304401295226024.0, 'train_loss': 0.023178718715367787, 'epoch': 8.0})

In [8]:
# Evaluate the model on the held-out split
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# Classification report.
# `labels` is passed explicitly so the report always has one row per encoded
# class; without it, a class absent from both the split and the predictions
# makes `target_names` the wrong length and classification_report raises.
target_names = label_encoder.classes_
print(classification_report(
    test_df['label'],
    preds,
    labels=np.arange(len(target_names)),
    target_names=target_names,
    zero_division=0,  # score classes with no predictions as 0 without warning
))

                                                   precision    recall  f1-score   support

                              alcoholic beverages       0.92      0.92      0.92        12
                      cereals and bakery products       0.75      0.77      0.76       134
     cocoa and cocoa preparations, coffee and tea       0.83      0.71      0.77        42
                                    confectionery       0.69      0.65      0.67        34
dietetic foods, food supplements, fortified foods       0.76      0.73      0.75        26
                                    fats and oils       0.67      0.50      0.57         4
                                   feed materials       0.50      1.00      0.67         1
                   food additives and flavourings       0.20      0.50      0.29         2
                           food contact materials       0.00      0.00      0.00         1
                            fruits and vegetables       0.78      0.78      0.78       10

In [9]:
# Load the data to predict on.
# If the predictions file from the hazard-category section already exists,
# start from it so both predicted columns end up in the same CSV; re-reading
# incidents_val.csv and saving under the same name would discard the earlier
# predictions. Fall back to the raw file otherwise.
PREDICTIONS_PATH = "incidents_test_with_predictions.csv"
source_path = PREDICTIONS_PATH if os.path.exists(PREDICTIONS_PATH) else "incidents_val.csv"
test = pd.read_csv(source_path)
# Combine 'title' and 'text' columns to create input text.
# Missing values become '' so the concatenation never yields NaN, which the
# tokenizer would reject.
test['input_text'] = test['title'].fillna('') + " " + test['text'].fillna('')
# Convert to Hugging Face Dataset format. A separate name is used so the
# held-out split in `test_dataset` is not overwritten and the evaluation cell
# can still be re-run.
submission_dataset = Dataset.from_pandas(test[['input_text']])
# Tokenize with the same tokenizer and `tokenize_function` defined in the
# data-preparation cell
submission_dataset = submission_dataset.map(tokenize_function, batched=True)
# Use the trained model to make predictions
predictions = trainer.predict(submission_dataset)
# Get the predicted class ids (argmax on logits)
preds = np.argmax(predictions.predictions, axis=-1)
# Map the numeric predictions back to the original labels
predicted_labels = label_encoder.inverse_transform(preds)
# Add the predicted labels to the DataFrame under the 'product-category' column
test['product-category'] = predicted_labels
# Save the DataFrame with the predictions to the CSV
test.to_csv(PREDICTIONS_PATH, index=False)
print("Predictions saved to 'incidents_test_with_predictions.csv'.")


Map:   0%|          | 0/565 [00:00<?, ? examples/s]

Predictions saved to 'incidents_test_with_predictions.csv'.
