<a href="https://colab.research.google.com/github/SushovitNanda/SemEval-Food-Hazards/blob/main/ML_combined.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score,make_scorer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from scipy.stats import randint, uniform
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Load the labelled incidents (the CSV is read from the current working directory).
train = pd.read_csv('incidents_labelled.csv')

# Combine title and text columns into a single feature.
# Missing values are replaced by empty strings first: string concatenation with
# NaN yields NaN, and TfidfVectorizer raises on NaN documents.
train['combined_text'] = train['title'].fillna('') + ' ' + train['text'].fillna('')

# Encode the target variable 'hazard-category' as integer class ids.
# label_encoder.classes_ keeps the original names for the reports further down.
label_encoder = LabelEncoder()
train['hazard_category_encoded'] = label_encoder.fit_transform(train['hazard-category'])

# Define the features (X) and target (y)
X = train['combined_text']
y = train['hazard_category_encoded']

# Split data into training and test sets (80/20). Stratifying on y keeps the
# class proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Vectorize text data with TF-IDF. The vectorizer is fitted on the training
# split only and then applied to the test split, so no test-set vocabulary or
# document frequencies leak into the features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


# Baseline Models (Default Hyperparameters)

In [1]:

# Define the baseline models, mostly with default hyperparameters.
# The estimators that draw random numbers during training (forest, MLP,
# boosting) get the same seed the tuning cells use, so re-running this cell
# reproduces the printed scores. LogisticRegression and SVC keep their defaults.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "MLP Classifier": MLPClassifier(max_iter=300, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Train each model on the TF-IDF training matrix and evaluate it on the
# held-out test matrix: overall accuracy plus a per-class report that uses the
# original hazard-category names.
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    print("\n" + "="*60 + "\n")


Model: Logistic Regression
Accuracy: 0.8571
                                precision    recall  f1-score   support

                     allergens       0.91      0.96      0.94       391
                    biological       0.84      0.97      0.90       404
                      chemical       0.74      0.67      0.70       100
food additives and flavourings       0.60      0.60      0.60         5
                foreign bodies       0.86      0.91      0.88       154
                         fraud       0.82      0.55      0.66        82
                     migration       0.00      0.00      0.00         3
          organoleptic aspects       0.00      0.00      0.00        13
                  other hazard       0.67      0.14      0.23        29
              packaging defect       1.00      0.12      0.22        16

                      accuracy                           0.86      1197
                     macro avg       0.64      0.49      0.51      1197
                  

# Cross-Validation on MLP / Gradient Boosting

In [None]:
# Define models (seeded so the fold scores are reproducible)
mlp_model = MLPClassifier(max_iter=300, random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)

# Define a scoring function (plain accuracy)
scoring = make_scorer(accuracy_score)

# Both models are scored with 5-fold cross-validation on the training split.
# The previous cv=500 fitted every model 500 times and left only a handful of
# rows in each validation fold, which made the run impractically slow and the
# per-fold accuracies extremely noisy. n_jobs=-1 runs the folds in parallel,
# matching the search cells below.

# Cross-validation for MLP Classifier
print("Performing cross-validation for MLP Classifier...")
mlp_cv_scores = cross_val_score(mlp_model, X_train_tfidf, y_train, cv=5, scoring=scoring, n_jobs=-1)
print(f"MLP Classifier Cross-Validation Accuracy: {np.mean(mlp_cv_scores):.4f} ± {np.std(mlp_cv_scores):.4f}")

# Cross-validation for Gradient Boosting Classifier
print("Performing cross-validation for Gradient Boosting Classifier...")
gb_cv_scores = cross_val_score(gb_model, X_train_tfidf, y_train, cv=5, scoring=scoring, n_jobs=-1)
print(f"Gradient Boosting Classifier Cross-Validation Accuracy: {np.mean(gb_cv_scores):.4f} ± {np.std(gb_cv_scores):.4f}")


Performing cross-validation for MLP Classifier...


# GridSearchCV

In [None]:
# MLP Classifier Hyperparameter Grid
# Given as a list of per-solver grids: MLPClassifier's `learning_rate` schedule
# is only used when solver='sgd', so crossing it with 'adam' would just refit
# identical models. Splitting the grid keeps every distinct configuration
# (6 for adam + 12 for sgd) and drops the duplicate fits.
mlp_param_grid = [
    {
        'hidden_layer_sizes': [(100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam'],
        'max_iter': [300]
    },
    {
        'hidden_layer_sizes': [(100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['sgd'],
        'learning_rate': ['constant', 'adaptive'],
        'max_iter': [300]
    }
]

# Gradient Boosting Classifier Hyperparameter Grid (2 x 3 x 3 = 18 candidates)
gb_param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Initialize the classifiers (seeded for reproducible searches)
mlp_model = MLPClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)

# Grid Search for MLP Classifier: exhaustive search scored by 3-fold
# cross-validated accuracy on the training split, using all available cores.
print("Performing Grid Search for MLP Classifier...")
mlp_grid_search = GridSearchCV(
    estimator=mlp_model,
    param_grid=mlp_param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)
mlp_grid_search.fit(X_train_tfidf, y_train)
print(f"Best MLP Parameters: {mlp_grid_search.best_params_}")
print(f"Best MLP Cross-Validation Accuracy: {mlp_grid_search.best_score_:.4f}")

# Grid Search for Gradient Boosting Classifier (same CV and scoring settings)
print("\nPerforming Grid Search for Gradient Boosting Classifier...")
gb_grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=gb_param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)
gb_grid_search.fit(X_train_tfidf, y_train)
print(f"Best Gradient Boosting Parameters: {gb_grid_search.best_params_}")
print(f"Best Gradient Boosting Cross-Validation Accuracy: {gb_grid_search.best_score_:.4f}")


# RandomizedSearchCV

In [None]:
# Randomized hyperparameter search for the MLP, followed by a report of the
# winning model on the held-out test split.

# Candidate values for each tuned hyperparameter; every entry is a discrete list.
# NOTE(review): per the scikit-learn docs `learning_rate` only takes effect
# with solver='sgd', so some sampled adam candidates are likely duplicates — confirm.
mlp_search_space = {
    'hidden_layer_sizes': [(100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [300],
}

print("Performing Randomized Search for MLP Classifier...")
# 20 sampled candidates scored by cross-validated accuracy on all cores.
# `cv` is not passed, so scikit-learn's default splitting applies; both the
# estimator and the sampler are seeded with 42.
mlp_random_search = RandomizedSearchCV(
    MLPClassifier(random_state=42),
    mlp_search_space,
    n_iter=20,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
mlp_random_search.fit(X_train_tfidf, y_train)
print(f"Best MLP Parameters: {mlp_random_search.best_params_}")
print(f"Best MLP Cross-Validation Accuracy: {mlp_random_search.best_score_:.4f}")

# Score the best estimator found by the search on the untouched test split
# and print the per-class metrics under the original category names.
best_mlp_model = mlp_random_search.best_estimator_
y_pred_mlp = best_mlp_model.predict(X_test_tfidf)
mlp_test_report = classification_report(
    y_test,
    y_pred_mlp,
    target_names=label_encoder.classes_,
)
print("Classification Report for Best MLP Model:\n")
print(mlp_test_report)


In [None]:
# Randomized hyperparameter search for Gradient Boosting, followed by a report
# of the winning model on the held-out test split.

# Sampling distributions for the tuned hyperparameters.
# scipy's randint excludes `high`; uniform(loc, scale) covers [loc, loc + scale].
gb_search_space = {
    'n_estimators': randint(low=50, high=200),       # integers 50..199
    'learning_rate': uniform(loc=0.01, scale=0.3),   # floats in [0.01, 0.31]
    'max_depth': randint(low=3, high=10),            # integers 3..9
}

print("\nPerforming Randomized Search for Gradient Boosting Classifier...")
# 20 sampled candidates scored by cross-validated accuracy on all cores.
# `cv` is not passed, so scikit-learn's default splitting applies; both the
# estimator and the sampler are seeded with 42.
gb_random_search = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=42),
    gb_search_space,
    n_iter=20,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
gb_random_search.fit(X_train_tfidf, y_train)
print(f"Best Gradient Boosting Parameters: {gb_random_search.best_params_}")
print(f"Best Gradient Boosting Cross-Validation Accuracy: {gb_random_search.best_score_:.4f}")

# Score the best estimator found by the search on the untouched test split
# and print the per-class metrics under the original category names.
best_gb_model = gb_random_search.best_estimator_
y_pred_gb = best_gb_model.predict(X_test_tfidf)
gb_test_report = classification_report(
    y_test,
    y_pred_gb,
    target_names=label_encoder.classes_,
)
print("Classification Report for Best Gradient Boosting Model:\n")
print(gb_test_report)
