<a href="https://colab.research.google.com/github/food-hazard-detection-semeval-2025/food-hazard-detection-semeval-2025.github.io/blob/main/code/The_Food_Hazard_Detection_Challenge_SemEval_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
# Load the labelled incident reports; the first CSV column becomes the row index.
data = pd.read_csv(
    '../../data/incidents_train.csv',
    index_col=0,
)
# Display one randomly drawn row as a quick look at the available columns.
data.sample(n=1)

Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product
486,2010,7,1,us,2010 - Azteca Linda Corp. Recalls QUESO FRESCO...,"FOR IMMEDIATE RELEASE - July 01, 2010 - Azteca...",biological,"meat, egg and dairy products",listeria monocytogenes,cheese


In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for local evaluation; the fixed seed keeps
# the split identical between runs.
trainset, testset = train_test_split(
    data,
    random_state=2024,
    test_size=0.2,
)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Baseline text classifier: character n-gram TF-IDF features followed by
# logistic regression.
text_clf_lr = Pipeline(steps=[
    (
        'vect',
        TfidfVectorizer(
            analyzer='char',          # features are character sequences, not words
            ngram_range=(2, 5),       # n-grams of length 2 up to 5
            min_df=5,                 # drop n-grams seen in fewer than 5 documents
            max_df=0.5,               # drop n-grams seen in more than half of the documents
            strip_accents='unicode',  # normalise accented characters
        ),
    ),
    (
        'clf',
        LogisticRegression(max_iter=1000),
    ),
])

# Evaluation
* On 20% of the training data
* As suggested by [Randl et al. (2024)](https://aclanthology.org/2024.findings-acl.459)

In [9]:
from sklearn.metrics import classification_report, f1_score

# Collect the predictions for all four targets in one frame.
# The frame shares the index of `testset`, so every prediction row stays tied to
# the incident it belongs to. Without this the frame gets a fresh 0..n-1 index,
# which no longer lines up with `testset` when the two are compared label-wise
# and which drops the row ids from the exported CSV.
predictions = pd.DataFrame(index=testset.index)

for label in ('hazard-category', 'product-category', 'hazard', 'product'):
  print(label.upper())
  # fit the pipeline on the training titles for this target, then predict the held-out titles
  text_clf_lr.fit(trainset.title, trainset[label])
  lr_predictions = text_clf_lr.predict(testset.title)
  # report both averaging modes; zero_division=0 scores classes that are never predicted as 0
  for average in ('macro', 'micro'):
    print(f'{average}: {f1_score(testset[label], lr_predictions, zero_division=0, average=average):.2f}')
  # saving the predictions (the array is in the same row order as `testset`)
  predictions[label] = lr_predictions

HAZARD-CATEGORY
macro: 0.46
micro: 0.81
PRODUCT-CATEGORY
macro: 0.39
micro: 0.66
HAZARD
macro: 0.14
micro: 0.54
PRODUCT
macro: 0.07
micro: 0.27


In [10]:
from sklearn.metrics import f1_score

def compute_score(hazards_true, products_true, hazards_pred, products_pred):
  """Combine hazard and product macro-F1 into a single score.

  The hazard F1 is computed over all samples. The product F1 is computed only
  over the samples whose hazard was predicted correctly. The result is the
  mean of the two.

  All four arguments are equally long sequences of labels (lists, arrays or
  pandas Series) in the same row order. They are compared by position, so
  Series with different indexes are accepted.

  Returns:
    float: (hazard macro-F1 + product macro-F1) / 2.
  """
  import numpy as np  # local import: numpy is not imported at the top of the notebook

  # Work on plain arrays: comparing two pandas Series with different indexes
  # raises, and plain lists cannot be indexed with a boolean mask.
  hazards_true = np.asarray(hazards_true)
  hazards_pred = np.asarray(hazards_pred)
  products_true = np.asarray(products_true)
  products_pred = np.asarray(products_pred)

  # compute f1 for hazards:
  f1_hazards = f1_score(
    hazards_true,
    hazards_pred,
    average='macro'
  )

  # compute f1 for products (only where the hazard prediction was correct):
  hazard_correct = hazards_pred == hazards_true
  if hazard_correct.any():
    f1_products = f1_score(
      products_true[hazard_correct],
      products_pred[hazard_correct],
      average='macro'
    )
  else:
    # no hazard was predicted correctly, so there is nothing to score products on
    f1_products = 0.0

  return (f1_hazards + f1_products) / 2.



In [19]:
# Sub-task 1 scores the coarse categories, sub-task 2 the exact hazard/product labels.
subtask_columns = (
  ('hazard-category', 'product-category'),
  ('hazard', 'product'),
)
for subtask, (hazard_col, product_col) in enumerate(subtask_columns, start=1):
  score = compute_score(
    testset[hazard_col],
    testset[product_col],
    predictions[hazard_col],
    predictions[product_col],
  )
  print(f"Score Sub-Task {subtask}: {score:.3f}")

Score Sub-Task 1: 0.449
Score Sub-Task 2: 0.121


In [16]:
# Peek at the first five rows of the predicted labels.
predictions.head(n=5)

Unnamed: 0,hazard-category,product-category,hazard,product
2193,biological,prepared dishes and snacks,salmonella,sandwiches
2016,biological,"meat, egg and dairy products",salmonella,chicken based products
742,other hazard,"meat, egg and dairy products",other,beef
117,allergens,cereals and bakery products,milk and products thereof,cookies
1243,allergens,herbs and spices,peanuts and products thereof,ice cream


In [20]:
# Spot-check one randomly chosen prediction row.
predictions.sample(n=1)

Unnamed: 0,hazard-category,product-category,hazard,product
5315,foreign bodies,prepared dishes and snacks,glass fragment,ice cream


In [21]:
import os
from shutil import make_archive

# Create the output folder if it is missing, then write the predictions into it.
os.makedirs(name='./submission/', exist_ok=True)
predictions.to_csv(path_or_buf='./submission/submission.csv')

# Pack the folder's contents into ./submission.zip (the zip file can be uploaded
# to codalab directly); the archive path is the cell's output.
make_archive(base_name='./submission', format='zip', root_dir='./submission')

'/content/submission.zip'