<a href="https://colab.research.google.com/github/SushovitNanda/SemEval-Food-Hazards/blob/main/HC_Mix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers datasets scikit-learn torch
!pip install evaluate

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from transformers import DistilBertTokenizer, DistilBertModel, AutoTokenizer, AutoModel
from tqdm import tqdm
import torch
import re
import gensim
from datasets import Dataset
import evaluate
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
import os
import warnings
warnings.filterwarnings("ignore")

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
!wget https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_train.csv
!wget https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_valid.csv
!wget https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_test.csv

--2025-01-23 20:07:12--  https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13415981 (13M) [text/plain]
Saving to: ‘incidents_train.csv’


2025-01-23 20:07:12 (144 MB/s) - ‘incidents_train.csv’ saved [13415981/13415981]

--2025-01-23 20:07:13--  https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_valid.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1369261 (1.3M) [text/plain]
Saving to: 

In [16]:
# -------------------------------
# Load Dataset
# -------------------------------
csv1 = pd.read_csv('incidents_train.csv')
csv2 = pd.read_csv('incidents_valid.csv')
train = pd.concat([csv1, csv2], ignore_index = True )

# Combine title and text columns
train['combined_text'] = train['title'] + ' ' + train['text']

# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train['hazard_category_encoded'] = label_encoder.fit_transform(train['hazard-category'])

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(
    train['combined_text'], train['hazard_category_encoded'], test_size=0.2, random_state=42, stratify=train['hazard_category_encoded']
)

# -------------------------------
# Data Preprocessing
# -------------------------------
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

X_train_preprocessed = X_train.apply(preprocess_text)
X_val_preprocessed = X_val.apply(preprocess_text)


In [17]:
# -------------------------------
# Step 1: Prepare the Knowledge Base (TF-IDF for Retrieval)
# -------------------------------
# Vectorize training data for retrieval
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_train = tfidf_vectorizer.fit_transform(X_train_preprocessed)

# Train Nearest Neighbors model for retrieval
retriever = NearestNeighbors(n_neighbors=3, metric='cosine')
retriever.fit(tfidf_train)

# Retrieval Function
def retrieve_context(input_texts, tfidf_vectorizer, retriever, train_data):
    tfidf_inputs = tfidf_vectorizer.transform(input_texts)
    _, indices = retriever.kneighbors(tfidf_inputs)
    # Retrieve top-k most relevant texts
    retrieved_contexts = [' '.join(train_data.iloc[idx] for idx in index_list) for index_list in indices]
    return retrieved_contexts

# Retrieve context for validation set
retrieved_contexts_val = retrieve_context(X_val_preprocessed, tfidf_vectorizer, retriever, X_train_preprocessed)

# Combine input text with retrieved context
X_val_enriched = X_val_preprocessed + ' ' + retrieved_contexts_val


In [18]:
# -------------------------------
# Step 2: DistilBERT Embeddings
# -------------------------------
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_distilbert_embeddings(texts, tokenizer, model):
    embeddings = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return np.array(embeddings)

# Generate DistilBERT embeddings for validation
X_val_distilbert = get_distilbert_embeddings(X_val_enriched, distilbert_tokenizer, distilbert_model)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

100%|██████████| 1310/1310 [12:46<00:00,  1.71it/s]


In [20]:
# -------------------------------
# Step 3: PubMedBERT Embeddings
# -------------------------------
pubmed_tokenizer = AutoTokenizer.from_pretrained("NeuML/pubmedbert-base-embeddings")
pubmed_model = AutoModel.from_pretrained("NeuML/pubmedbert-base-embeddings")

def get_pubmedbert_embeddings(texts, tokenizer, model):
    embeddings = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return np.array(embeddings)

# Generate PubMedBERT embeddings for validation
X_val_pubmed = get_pubmedbert_embeddings(X_val_enriched, pubmed_tokenizer, pubmed_model)


tokenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/706k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|██████████| 1310/1310 [22:01<00:00,  1.01s/it]


In [21]:
# -------------------------------
# Step 4: Word2Vec Embeddings
# -------------------------------
X_train_tokenized = X_train_preprocessed.apply(lambda x: gensim.utils.simple_preprocess(x))
word2vec_model = gensim.models.Word2Vec(sentences=X_train_tokenized, vector_size=100, window=5, min_count=2, workers=4)
word2vec_model.train(X_train_tokenized, total_examples=len(X_train_tokenized), epochs=10)

def get_average_word2vec(tokens, model, vector_size=100):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

X_val_word2vec = np.array([get_average_word2vec(gensim.utils.simple_preprocess(text), word2vec_model) for text in X_val_enriched])




In [22]:
# -------------------------------
# Step 5: PCA for Dimensionality Reduction
# -------------------------------
# Reduce PubMedBERT and DistilBERT embeddings to 100 dimensions
pca_pubmed = PCA(n_components=100, random_state=42)
X_val_pubmed_reduced = pca_pubmed.fit_transform(X_val_pubmed)

pca_distilbert = PCA(n_components=100, random_state=42)
X_val_distilbert_reduced = pca_distilbert.fit_transform(X_val_distilbert)

In [23]:
# -------------------------------
# Combine Features
# -------------------------------
# Compute cosine similarity
val_cosine_similarity = np.array([
    cosine_similarity([wv], [db])[0][0]
    for wv, db in zip(X_val_word2vec, X_val_distilbert_reduced)
])

# Combine Word2Vec, DistilBERT, PubMedBERT, and cosine similarity
X_val_combined = np.hstack([
    X_val_word2vec, X_val_distilbert_reduced, X_val_pubmed_reduced, val_cosine_similarity.reshape(-1, 1)
])

In [24]:
# -------------------------------
# Train MLPClassifier
# -------------------------------
mlp_classifier = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=500, random_state=42)
mlp_classifier.fit(X_val_combined, y_val)

# Predict and Evaluate
y_val_pred = mlp_classifier.predict(X_val_combined)
print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))

                                precision    recall  f1-score   support

                     allergens       1.00      1.00      1.00       433
                    biological       1.00      1.00      1.00       443
                      chemical       1.00      1.00      1.00       105
food additives and flavourings       1.00      1.00      1.00         6
                foreign bodies       1.00      1.00      1.00       166
                         fraud       1.00      1.00      1.00        90
                     migration       1.00      1.00      1.00         3
          organoleptic aspects       1.00      1.00      1.00        14
                  other hazard       1.00      1.00      1.00        32
              packaging defect       1.00      1.00      1.00        18

                      accuracy                           1.00      1310
                     macro avg       1.00      1.00      1.00      1310
                  weighted avg       1.00      1.00      1.00 

In [None]:
# -------------------------------
# Test Predictions
# -------------------------------
test = pd.read_csv('incidents_test.csv')
test['combined_text'] = test['title'] + ' ' + test['text']
test_preprocessed = test['combined_text'].apply(preprocess_text)

# Retrieve contexts for test set
retrieved_contexts_test = retrieve_context(test_preprocessed, tfidf_vectorizer, retriever, X_train_preprocessed)
X_test_enriched = test_preprocessed + ' ' + retrieved_contexts_test

# Generate features for test set
X_test_word2vec = np.array([get_average_word2vec(gensim.utils.simple_preprocess(text), word2vec_model) for text in X_test_enriched])
X_test_distilbert = get_distilbert_embeddings(X_test_enriched, distilbert_tokenizer, distilbert_model)
X_test_pubmed = get_pubmedbert_embeddings(X_test_enriched, pubmed_tokenizer, pubmed_model)

X_test_distilbert_reduced = pca_distilbert.transform(X_test_distilbert)
X_test_pubmed_reduced = pca_pubmed.transform(X_test_pubmed)

test_cosine_similarity = np.array([
    cosine_similarity([wv], [db])[0][0]
    for wv, db in zip(X_test_word2vec, X_test_distilbert_reduced)
])

X_test_combined = np.hstack([
    X_test_word2vec, X_test_distilbert_reduced, X_test_pubmed_reduced, test_cosine_similarity.reshape(-1, 1)
])

# Predict test labels
test_predictions = mlp_classifier.predict(X_test_combined)
test['hazard-category'] = label_encoder.inverse_transform(test_predictions)

# Save predictions
test[['hazard-category']].to_csv('hazard-category.csv', index=False)
print("Test predictions saved to 'hazard-category.csv'")
