<a href="https://colab.research.google.com/github/SushovitNanda/SemEval-Food-Hazards/blob/main/HC_Mix_85.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
%%capture
!pip install transformers datasets scikit-learn torch
!pip install evaluate

In [33]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from nltk.corpus import wordnet
import gensim
import torch
from tqdm import tqdm
import re
from datasets import Dataset
import evaluate
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
import warnings
warnings.filterwarnings("ignore")

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [28]:
# -------------------------------
# Load Dataset
# -------------------------------
train = pd.read_csv('incidents_train.csv')

# Combine title and text columns
train['combined_text'] = train['title'] + ' ' + train['text']

# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train['hazard_category_encoded'] = label_encoder.fit_transform(train['hazard-category'])

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(
    train['combined_text'], train['hazard_category_encoded'], test_size=0.2, random_state=42, stratify=train['hazard_category_encoded']
)

# -------------------------------
# Data Augmentation: Synonym Replacement
# -------------------------------
from nltk.corpus import wordnet
import random

def synonym_replacement(text, n=2):
    words = text.split()
    new_words = words.copy()
    for _ in range(n):
        word_idx = random.randint(0, len(words) - 1)
        synonyms = wordnet.synsets(words[word_idx])
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            new_words[word_idx] = synonym
    return ' '.join(new_words)

X_train_augmented = X_train.apply(lambda x: synonym_replacement(x))

# Combine original and augmented data
X_train_combined = pd.concat([X_train, X_train_augmented])
y_train_combined = pd.concat([y_train, y_train])


In [29]:
# -------------------------------
# Improved Preprocessing
# -------------------------------
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

X_train_preprocessed = X_train_combined.apply(preprocess_text)
X_val_preprocessed = X_val.apply(preprocess_text)

# -------------------------------
# Word2Vec Embeddings
# -------------------------------
X_train_tokenized = X_train_preprocessed.apply(lambda x: gensim.utils.simple_preprocess(x))
X_val_tokenized = X_val_preprocessed.apply(lambda x: gensim.utils.simple_preprocess(x))

word2vec_model = gensim.models.Word2Vec(sentences=X_train_tokenized, vector_size=100, window=5, min_count=2, workers=4)
word2vec_model.train(X_train_tokenized, total_examples=len(X_train_tokenized), epochs=10)

def get_average_word2vec(tokens, model, vector_size=100):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

X_train_word2vec = np.array([get_average_word2vec(tokens, word2vec_model) for tokens in X_train_tokenized])
X_val_word2vec = np.array([get_average_word2vec(tokens, word2vec_model) for tokens in X_val_tokenized])

# -------------------------------
# TF-IDF Features
# -------------------------------
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_preprocessed).toarray()
X_val_tfidf = tfidf_vectorizer.transform(X_val_preprocessed).toarray()





In [30]:
# -------------------------------
# DistilBERT Embeddings
# -------------------------------
tokenizer = DistilBertTokenizer.from_pretrained('distilbert/distilbert-base-uncased-finetuned-sst-2-english')

train_encodings = tokenizer(X_train_preprocessed.tolist(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(X_val_preprocessed.tolist(), truncation=True, padding=True, max_length=512)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': y_train_combined.tolist()
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': y_val.tolist()
})

model = DistilBertForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased-finetuned-sst-2-english', num_labels=len(label_encoder.classes_),
                                                             ignore_mismatched_sizes=True)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# -------------------------------
# F1 Metric and Training Arguments
# -------------------------------
# Load the F1 metric and specify macro averaging
f1_metric = evaluate.load("f1")

# Define the compute_metrics function to maximize F1 macro average
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Compute the F1 macro average
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")
    return {"f1": f1["f1"]}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # Use F1 score as the metric for saving the best model
    greater_is_better=True       # Ensure higher F1 is considered better
)

# Early stopping callback
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=callbacks
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [35]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.0303,0.297009,0.726214
2,0.0842,0.280447,0.726295
3,0.0189,0.359803,0.856389
4,0.0009,0.355132,0.842547
5,0.0021,0.404481,0.863704
6,0.0001,0.426576,0.846251
7,0.0001,0.46179,0.85644
8,0.0,0.468527,0.848298


TrainOutput(global_step=8136, training_loss=0.05991286058624762, metrics={'train_runtime': 3476.4838, 'train_samples_per_second': 46.771, 'train_steps_per_second': 5.851, 'total_flos': 8616908795904000.0, 'train_loss': 0.05991286058624762, 'epoch': 8.0})

In [37]:
# Generate DistilBERT embeddings for ensemble
model.eval()
# Ensure model is on the appropriate device (e.g., CUDA if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_distilbert_embeddings(encodings, model):
    embeddings = []
    for i in tqdm(range(0, len(encodings['input_ids']), 8)):
        # Move input tensors to the same device as the model
        batch_input_ids = torch.tensor(encodings['input_ids'][i:i+8]).to(device)
        batch_attention_mask = torch.tensor(encodings['attention_mask'][i:i+8]).to(device)
        with torch.no_grad():
            outputs = model.distilbert(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings)

X_train_distilbert = get_distilbert_embeddings(train_encodings, model)
X_val_distilbert = get_distilbert_embeddings(val_encodings, model)

100%|██████████| 1017/1017 [02:01<00:00,  8.38it/s]
100%|██████████| 128/128 [00:15<00:00,  8.34it/s]


In [40]:
# -------------------------------
# Combine Features and Ensemble
# -------------------------------
from sklearn.decomposition import PCA

# Reduce DistilBERT embeddings to 100 dimensions using PCA
pca = PCA(n_components=100, random_state=42)
X_train_distilbert_reduced = pca.fit_transform(X_train_distilbert)
X_val_distilbert_reduced = pca.transform(X_val_distilbert)

# Compute cosine similarity between Word2Vec and reduced DistilBERT embeddings
train_cosine_similarity = np.array([
    cosine_similarity([wv], [db])[0][0]
    for wv, db in zip(X_train_word2vec, X_train_distilbert_reduced)
])
val_cosine_similarity = np.array([
    cosine_similarity([wv], [db])[0][0]
    for wv, db in zip(X_val_word2vec, X_val_distilbert_reduced)
])

# Combine features for the MLP Classifier
X_train_combined = np.hstack([X_train_word2vec, X_train_tfidf, X_train_distilbert_reduced, train_cosine_similarity.reshape(-1, 1)])
X_val_combined = np.hstack([X_val_word2vec, X_val_tfidf, X_val_distilbert_reduced, val_cosine_similarity.reshape(-1, 1)])

# Train MLPClassifier
mlp_classifier = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=500, random_state=42)
mlp_classifier.fit(X_train_combined, y_train_combined)

# Predict and evaluate
y_val_pred = mlp_classifier.predict(X_val_combined)
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))


                                precision    recall  f1-score   support

                     allergens       0.95      0.98      0.96       371
                    biological       1.00      0.99      0.99       348
                      chemical       0.87      0.93      0.90        57
food additives and flavourings       0.67      0.40      0.50         5
                foreign bodies       0.99      0.98      0.99       112
                         fraud       0.74      0.72      0.73        74
                     migration       1.00      1.00      1.00         1
          organoleptic aspects       0.90      0.82      0.86        11
                  other hazard       0.75      0.67      0.71        27
              packaging defect       0.90      0.82      0.86        11

                      accuracy                           0.94      1017
                     macro avg       0.88      0.83      0.85      1017
                  weighted avg       0.94      0.94      0.94 