<a href="https://colab.research.google.com/github/SushovitNanda/SemEval-Food-Hazards/blob/main/HC_Mix_85.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers datasets scikit-learn torch
!pip install evaluate

In [3]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from nltk.corpus import wordnet
import gensim
import torch
from tqdm import tqdm
import re
from datasets import Dataset
import evaluate
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
import os
import warnings
warnings.filterwarnings("ignore")

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
!wget https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_train.csv
!wget https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_valid.csv
!wget https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_test.csv

--2025-01-23 20:07:12--  https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13415981 (13M) [text/plain]
Saving to: ‘incidents_train.csv’


2025-01-23 20:07:12 (144 MB/s) - ‘incidents_train.csv’ saved [13415981/13415981]

--2025-01-23 20:07:13--  https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_valid.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1369261 (1.3M) [text/plain]
Saving to: 

In [7]:
# -------------------------------
# Load Dataset
# -------------------------------
csv1 = pd.read_csv('incidents_train.csv')
csv2 = pd.read_csv('incidents_valid.csv')
train = pd.concat([csv1, csv2], ignore_index = True )

# Combine title and text columns
train['combined_text'] = train['title'] + ' ' + train['text']

# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train['hazard_category_encoded'] = label_encoder.fit_transform(train['hazard-category'])

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(
    train['combined_text'], train['hazard_category_encoded'], test_size=0.2, random_state=42, stratify=train['hazard_category_encoded']
)

# -------------------------------
# Data Augmentation: Synonym Replacement
# -------------------------------
from nltk.corpus import wordnet
import random

def synonym_replacement(text, n=2):
    words = text.split()
    new_words = words.copy()
    for _ in range(n):
        word_idx = random.randint(0, len(words) - 1)
        synonyms = wordnet.synsets(words[word_idx])
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            new_words[word_idx] = synonym
    return ' '.join(new_words)

X_train_augmented = X_train.apply(lambda x: synonym_replacement(x))

# Combine original and augmented data
X_train_combined = pd.concat([X_train, X_train_augmented])
y_train_combined = pd.concat([y_train, y_train])


In [8]:
# -------------------------------
# Improved Preprocessing
# -------------------------------
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

X_train_preprocessed = X_train_combined.apply(preprocess_text)
X_val_preprocessed = X_val.apply(preprocess_text)

# -------------------------------
# Word2Vec Embeddings
# -------------------------------
X_train_tokenized = X_train_preprocessed.apply(lambda x: gensim.utils.simple_preprocess(x))
X_val_tokenized = X_val_preprocessed.apply(lambda x: gensim.utils.simple_preprocess(x))

word2vec_model = gensim.models.Word2Vec(sentences=X_train_tokenized, vector_size=100, window=5, min_count=2, workers=4)
word2vec_model.train(X_train_tokenized, total_examples=len(X_train_tokenized), epochs=10)

def get_average_word2vec(tokens, model, vector_size=100):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

X_train_word2vec = np.array([get_average_word2vec(tokens, word2vec_model) for tokens in X_train_tokenized])
X_val_word2vec = np.array([get_average_word2vec(tokens, word2vec_model) for tokens in X_val_tokenized])

# -------------------------------
# TF-IDF Features
# -------------------------------
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_preprocessed).toarray()
X_val_tfidf = tfidf_vectorizer.transform(X_val_preprocessed).toarray()





In [9]:
# -------------------------------
# DistilBERT Embeddings
# -------------------------------
tokenizer = DistilBertTokenizer.from_pretrained('distilbert/distilbert-base-uncased-finetuned-sst-2-english')

train_encodings = tokenizer(X_train_preprocessed.tolist(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(X_val_preprocessed.tolist(), truncation=True, padding=True, max_length=512)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': y_train_combined.tolist()
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': y_val.tolist()
})

model = DistilBertForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased-finetuned-sst-2-english', num_labels=len(label_encoder.classes_),
                                                             ignore_mismatched_sizes=True)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# -------------------------------
# F1 Metric and Training Arguments
# -------------------------------
# Load the F1 metric and specify macro averaging
f1_metric = evaluate.load("f1")

# Define the compute_metrics function to maximize F1 macro average
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Compute the F1 macro average
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")
    return {"f1": f1["f1"]}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # Use F1 score as the metric for saving the best model
    greater_is_better=True       # Ensure higher F1 is considered better
)

# Early stopping callback
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=callbacks
)

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.3961,0.341498,0.560275
2,0.2613,0.361204,0.671682
3,0.1191,0.366414,0.738171
4,0.0041,0.41287,0.723455
5,0.0002,0.457718,0.735854
6,0.0139,0.49868,0.720903


TrainOutput(global_step=7860, training_loss=0.1639559187167727, metrics={'train_runtime': 3248.3233, 'train_samples_per_second': 64.513, 'train_steps_per_second': 8.066, 'total_flos': 8329148557516800.0, 'train_loss': 0.1639559187167727, 'epoch': 6.0})

In [12]:
# Generate DistilBERT embeddings for ensemble
model.eval()
# Ensure model is on the appropriate device (e.g., CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_distilbert_embeddings(encodings, model):
    embeddings = []
    for i in tqdm(range(0, len(encodings['input_ids']), 8)):
        # Move input tensors to the same device as the model
        batch_input_ids = torch.tensor(encodings['input_ids'][i:i+8]).to(device)
        batch_attention_mask = torch.tensor(encodings['attention_mask'][i:i+8]).to(device)
        with torch.no_grad():
            outputs = model.distilbert(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings)

X_train_distilbert = get_distilbert_embeddings(train_encodings, model)
X_val_distilbert = get_distilbert_embeddings(val_encodings, model)

100%|██████████| 1310/1310 [02:31<00:00,  8.64it/s]
100%|██████████| 164/164 [00:18<00:00,  8.73it/s]


In [13]:
# -------------------------------
# Combine Features and Ensemble
# -------------------------------
from sklearn.decomposition import PCA

# Reduce DistilBERT embeddings to 100 dimensions using PCA
pca = PCA(n_components=100, random_state=42)
X_train_distilbert_reduced = pca.fit_transform(X_train_distilbert)
X_val_distilbert_reduced = pca.transform(X_val_distilbert)

# Compute cosine similarity between Word2Vec and reduced DistilBERT embeddings
train_cosine_similarity = np.array([
    cosine_similarity([wv], [db])[0][0]
    for wv, db in zip(X_train_word2vec, X_train_distilbert_reduced)
])
val_cosine_similarity = np.array([
    cosine_similarity([wv], [db])[0][0]
    for wv, db in zip(X_val_word2vec, X_val_distilbert_reduced)
])

# Combine features for the MLP Classifier
X_train_combined = np.hstack([X_train_word2vec, X_train_tfidf, X_train_distilbert_reduced, train_cosine_similarity.reshape(-1, 1)])
X_val_combined = np.hstack([X_val_word2vec, X_val_tfidf, X_val_distilbert_reduced, val_cosine_similarity.reshape(-1, 1)])

# Train MLPClassifier
mlp_classifier = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=500, random_state=42)
mlp_classifier.fit(X_train_combined, y_train_combined)

# Predict and evaluate
y_val_pred = mlp_classifier.predict(X_val_combined)
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))


                                precision    recall  f1-score   support

                     allergens       0.94      0.96      0.95       433
                    biological       0.97      0.98      0.97       443
                      chemical       0.91      0.91      0.91       105
food additives and flavourings       0.50      0.17      0.25         6
                foreign bodies       0.99      0.98      0.98       166
                         fraud       0.70      0.72      0.71        90
                     migration       1.00      0.33      0.50         3
          organoleptic aspects       0.73      0.79      0.76        14
                  other hazard       0.78      0.56      0.65        32
              packaging defect       0.74      0.78      0.76        18

                      accuracy                           0.93      1310
                     macro avg       0.83      0.72      0.75      1310
                  weighted avg       0.93      0.93      0.93 

In [14]:
# Load the test dataset
test = pd.read_csv('incidents_test.csv')

# Combine title and text columns in the test dataset
test['combined_text'] = test['title'] + ' ' + test['text']

# Preprocess the test data (same preprocessing as the training data)
test_preprocessed = test['combined_text'].apply(preprocess_text)

# Tokenize the test data for Word2Vec
test_tokenized = test_preprocessed.apply(lambda x: gensim.utils.simple_preprocess(x))

# Generate Word2Vec embeddings for the test data
X_test_word2vec = np.array([get_average_word2vec(tokens, word2vec_model) for tokens in test_tokenized])

# Generate TF-IDF features for the test data
X_test_tfidf = tfidf_vectorizer.transform(test_preprocessed).toarray()

# Tokenize test data for DistilBERT
test_encodings = tokenizer(test_preprocessed.tolist(), truncation=True, padding=True, max_length=512)

# Generate DistilBERT embeddings for the test data
X_test_distilbert = get_distilbert_embeddings(test_encodings, model)
# Reduce DistilBERT embeddings to 100 dimensions using PCA
X_test_distilbert_reduced = pca.transform(X_test_distilbert)
# Compute cosine similarity for the test data
test_cosine_similarity = np.array([
    cosine_similarity([wv], [db])[0][0]
    for wv, db in zip(X_test_word2vec, X_test_distilbert_reduced)
])
# Combine all features for the test data
X_test_combined = np.hstack([
    X_test_word2vec, X_test_tfidf, X_test_distilbert_reduced, test_cosine_similarity.reshape(-1, 1)
])

# Use the trained MLPClassifier to predict labels for the test data
test_predictions = mlp_classifier.predict(X_test_combined)
# Map the numerical predictions back to their original label names
test['hazard-category'] = label_encoder.inverse_transform(test_predictions)
# Save the predictions to a new CSV file
test[['hazard-category']].to_csv('hazard-category.csv', index=False)
print("Predictions saved to 'hazard-category.csv'")


100%|██████████| 125/125 [00:13<00:00,  9.42it/s]


Predictions saved to 'hazard-category.csv'
