<a href="https://colab.research.google.com/github/SushovitNanda/SemEval-Food-Hazards/blob/main/ML_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import vstack
import warnings
warnings.filterwarnings("ignore")

In [None]:
!wget https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_train.csv
!wget https://raw.githubusercontent.com/SushovitNanda/SemEval-Food-Hazards/main/Datasets/incidents_val.csv

In [30]:

# Load the dataset: `train` is used for fitting/validation, `test` is only
# transformed here so that predictions can be made for it later.
train = pd.read_csv('incidents_train.csv')
test = pd.read_csv('incidents_val.csv')

# Combine title and text columns into a single feature.
# Missing values are replaced by empty strings first: `NaN + str` stays NaN in
# pandas, and TfidfVectorizer rejects NaN documents.
train['combined_text'] = (
    train['title'].fillna('').astype(str) + ' ' + train['text'].fillna('').astype(str)
)
test['combined_text'] = (
    test['title'].fillna('').astype(str) + ' ' + test['text'].fillna('').astype(str)
)

# Encode target variables (one encoder per task so predictions can be decoded later)
label_encoder_hazard = LabelEncoder()
label_encoder_product = LabelEncoder()

train['hazard_encoded'] = label_encoder_hazard.fit_transform(train['hazard'])
train['product_encoded'] = label_encoder_product.fit_transform(train['product'])

# Define features (X) and targets (y)
X = train['combined_text']
y_hazard = train['hazard_encoded']
y_product = train['product_encoded']

# Split data into training and validation sets.
# A single call splits all three arrays with the same shuffled indices, so the
# features and both targets are row-aligned by construction instead of relying
# on two separate calls happening to share a seed.
X_train, X_val, y_hazard_train, y_hazard_val, y_product_train, y_product_val = train_test_split(
    X, y_hazard, y_product, test_size=0.2, random_state=42
)

# TF-IDF feature extraction: the vocabulary is fitted on the training split only;
# validation and test texts are merely transformed with it.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(test['combined_text'])

# GridSearchCV for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Both tasks are tuned with exactly the same search configuration, so it is
# defined once and reused to keep the two searches from drifting apart.
grid_search_kwargs = dict(
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search_hazard = GridSearchCV(RandomForestClassifier(random_state=42), **grid_search_kwargs)
grid_search_product = GridSearchCV(RandomForestClassifier(random_state=42), **grid_search_kwargs)

In [None]:
# Tune one random forest per task (hazard first, then product) on the TF-IDF
# training matrix, reporting the winning hyperparameters after each search.
tuning_jobs = (
    (
        "Tuning hyperparameters for hazard prediction...",
        "Best parameters for hazard:",
        grid_search_hazard,
        y_hazard_train,
    ),
    (
        "Tuning hyperparameters for product prediction...",
        "Best parameters for product:",
        grid_search_product,
        y_product_train,
    ),
)
for start_message, result_label, search, targets in tuning_jobs:
    print(start_message)
    search.fit(X_train_tfidf, targets)
    print(result_label, search.best_params_)

# Keep the best estimator found by each search for evaluation and prediction.
classifier_hazard = grid_search_hazard.best_estimator_
classifier_product = grid_search_product.best_estimator_

In [None]:
# Cross-validation scores (5-fold accuracy on the training split).
# cross_val_score fits clones of the estimators, so the tuned classifiers
# themselves are left untouched by this step.
cv_scores_hazard = cross_val_score(classifier_hazard, X_train_tfidf, y_hazard_train, cv=5, scoring='accuracy')
cv_scores_product = cross_val_score(classifier_product, X_train_tfidf, y_product_train, cv=5, scoring='accuracy')

print("\nHazard Cross-Validation Accuracy:", np.mean(cv_scores_hazard))
print("Product Cross-Validation Accuracy:", np.mean(cv_scores_product))

# Final evaluation on validation set.
# No explicit re-fit is needed: the classifiers are the `best_estimator_` of
# grid searches run with the default refit=True, i.e. they were already fitted
# on the full X_train_tfidf with the best parameters. Fitting them again on the
# same data with the same random_state would only repeat that work.
y_hazard_pred = classifier_hazard.predict(X_val_tfidf)
y_product_pred = classifier_product.predict(X_val_tfidf)

## Validation Results

Per-class precision, recall and F1 for both tasks on the held-out 20% validation split.


In [None]:
# Report only the hazard classes that occur in the validation labels or in the
# predictions. Passing every encoder class as target_names without `labels`
# raises ValueError as soon as one class is absent from the validation split,
# because the number of names no longer matches the number of observed labels.
hazard_report_labels = np.union1d(y_hazard_val, y_hazard_pred)
print("Hazard Classification Report:")
print(classification_report(
    y_hazard_val,
    y_hazard_pred,
    labels=hazard_report_labels,
    # Encoded labels are positions in classes_, so indexing recovers the names.
    target_names=label_encoder_hazard.classes_[hazard_report_labels],
    # Classes with no predicted/true samples score 0 (the default value, made explicit).
    zero_division=0,
))

In [None]:
# Report only the product classes that occur in the validation labels or in the
# predictions. Passing every encoder class as target_names without `labels`
# raises ValueError as soon as one class is absent from the validation split,
# because the number of names no longer matches the number of observed labels.
product_report_labels = np.union1d(y_product_val, y_product_pred)
print("\nProduct Classification Report:")
print(classification_report(
    y_product_val,
    y_product_pred,
    labels=product_report_labels,
    # Encoded labels are positions in classes_, so indexing recovers the names.
    target_names=label_encoder_product.classes_[product_report_labels],
    # Classes with no predicted/true samples score 0 (the default value, made explicit).
    zero_division=0,
))

In [None]:
# Predict encoded labels for the test texts with both tuned classifiers.
hazard_test_codes = classifier_hazard.predict(X_test_tfidf)
product_test_codes = classifier_product.predict(X_test_tfidf)

# Decode the integer predictions back to the original label strings and attach
# them to the test frame.
test['hazard_prediction'] = label_encoder_hazard.inverse_transform(hazard_test_codes)
test['product_prediction'] = label_encoder_product.inverse_transform(product_test_codes)

# Write only the two prediction columns (no index) to submission.csv.
submission_columns = ['hazard_prediction', 'product_prediction']
submission_frame = test[submission_columns]
submission_frame.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")