# Modules and Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from scipy.stats import randint

In [2]:
# Load datasets
trainset = pd.read_csv('incidents_labelled.csv', index_col=0)
testset = pd.read_csv('incidents_val.csv', index_col=0)

# Logistic Regression

In [11]:
text_clf_lr = Pipeline([
    ('vect', TfidfVectorizer(strip_accents='unicode', analyzer='char', ngram_range=(2, 5), max_df=0.5, min_df=5)),
    ('clf', LogisticRegression(max_iter=1000)),
])

In [12]:
# Function to train and evaluate the model with train-test split
def train_and_evaluate(label):
    # Split the trainset into training and validation sets
    X_train, X_test, y_train, y_test = train_test_split(trainset['title'], trainset[label], test_size=0.2, random_state=42)

    # Fit the model on the training data
    text_clf_lr.fit(X_train, y_train)

    # Predict on the validation set
    y_pred = text_clf_lr.predict(X_test)

    # Print classification report
    print(f"Classification report for {label}:\n")
    print(classification_report(y_test, y_pred))

In [13]:
# Evaluate model for each label
for label in ('hazard-category', 'product-category', 'hazard', 'product'):
    train_and_evaluate(label)

# Once satisfied with the model performance, make predictions on the actual testset
predictions = pd.DataFrame()

Classification report for hazard-category:

                                precision    recall  f1-score   support

                     allergens       0.76      0.87      0.81       377
                    biological       0.73      0.91      0.81       398
                      chemical       0.75      0.48      0.58       107
food additives and flavourings       0.00      0.00      0.00         7
                foreign bodies       0.72      0.62      0.66       166
                         fraud       0.65      0.40      0.50        77
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       0.00      0.00      0.00        13
                  other hazard       1.00      0.27      0.43        33
              packaging defect       0.00      0.00      0.00        18

                      accuracy                           0.74      1197
                     macro avg       0.46      0.36      0.38      1197
                  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for product-category:

                                                   precision    recall  f1-score   support

                              alcoholic beverages       1.00      0.07      0.13        14
                      cereals and bakery products       0.56      0.72      0.63       149
     cocoa and cocoa preparations, coffee and tea       0.68      0.64      0.66        44
                                    confectionery       0.91      0.26      0.41        38
dietetic foods, food supplements, fortified foods       0.85      0.55      0.67        31
                                    fats and oils       0.00      0.00      0.00         4
                                   feed materials       0.00      0.00      0.00         3
                   food additives and flavourings       0.00      0.00      0.00         1
                           food contact materials       0.00      0.00      0.00         1
                            fruits and veget

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for hazard:

                                                   precision    recall  f1-score   support

                                        Aflatoxin       0.00      0.00      0.00         3
                                  abnormal colour       0.00      0.00      0.00         1
                                  alcohol content       0.00      0.00      0.00         2
                           aliphatic hydrocarbons       0.00      0.00      0.00         1
                                        alkaloids       1.00      0.14      0.25         7
                                        allergens       0.00      0.00      0.00         3
                                           almond       0.33      0.08      0.13        12
             altered organoleptic characteristics       0.00      0.00      0.00         1
                           antibiotics, vet drugs       0.00      0.00      0.00         1
                                    bacillus spp.     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for product:

                                                                precision    recall  f1-score   support

                                        Catfishes (freshwater)       0.00      0.00      0.00         1
                                         Fishes not identified       0.30      0.38      0.33         8
                                      Not classified pork meat       0.00      0.00      0.00         3
                                    Pangas catfishes (generic)       0.00      0.00      0.00         2
                           Precooked cooked pork meat products       0.00      0.00      0.00         3
                                                 Veggie Burger       0.00      0.00      0.00         3
                                           alcoholic beverages       0.00      0.00      0.00         1
                                               alfalfa sprouts       0.50      1.00      0.67         1
                           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# SVM

In [None]:
text_clf_svm = Pipeline([
    ('vect', TfidfVectorizer(strip_accents='unicode', analyzer='char', ngram_range=(2, 5), max_df=0.5, min_df=5)),
    ('clf', SVC(kernel='linear', max_iter=1000)),
])

In [None]:
def train_and_evaluate(label):
    # Split the trainset into training and validation sets
    X_train, X_test, y_train, y_test = train_test_split(trainset['title'], trainset[label], test_size=0.2, random_state=42)

    # Fit the model on the training data
    text_clf_svm.fit(X_train, y_train)

    # Predict on the validation set
    y_pred = text_clf_svm.predict(X_test)

    # Print classification report
    print(f"Classification report for {label}:\n")
    print(classification_report(y_test, y_pred))

In [None]:
# Evaluate model for each label
for label in ('hazard-category', 'product-category', 'hazard', 'product'):
    train_and_evaluate(label)

# Once satisfied with the model performance, make predictions on the actual testset
predictions = pd.DataFrame()

Classification report for hazard-category:

                                precision    recall  f1-score   support

                     allergens       0.78      0.88      0.82       377
                    biological       0.78      0.90      0.83       398
                      chemical       0.68      0.49      0.57       107
food additives and flavourings       1.00      0.29      0.44         7
                foreign bodies       0.73      0.66      0.70       166
                         fraud       0.66      0.43      0.52        77
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       0.80      0.31      0.44        13
                  other hazard       0.84      0.48      0.62        33
              packaging defect       1.00      0.17      0.29        18

                      accuracy                           0.76      1197
                     macro avg       0.73      0.46      0.52      1197
                  

# RandomForest

In [None]:
text_clf_rf = Pipeline([
    ('vect', TfidfVectorizer(strip_accents='unicode', analyzer='char', ngram_range=(2, 5), max_df=0.5, min_df=5)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),  # RandomForest with 100 trees
])


In [None]:
def train_and_evaluate(label):
    # Split the trainset into training and validation sets
    X_train, X_test, y_train, y_test = train_test_split(trainset['title'], trainset[label], test_size=0.2, random_state=42)

    # Fit the model on the training data
    text_clf_rf.fit(X_train, y_train)

    # Predict on the validation set
    y_pred = text_clf_rf.predict(X_test)

    # Print classification report
    print(f"Classification report for {label}:\n")
    print(classification_report(y_test, y_pred))

In [None]:
# Evaluate model for each label
for label in ('hazard-category', 'product-category', 'hazard', 'product'):
    train_and_evaluate(label)


Classification report for hazard-category:

                                precision    recall  f1-score   support

                     allergens       0.81      0.86      0.83       377
                    biological       0.69      0.94      0.80       398
                      chemical       0.77      0.45      0.57       107
food additives and flavourings       1.00      0.43      0.60         7
                foreign bodies       0.78      0.59      0.67       166
                         fraud       0.74      0.48      0.58        77
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       1.00      0.08      0.14        13
                  other hazard       0.81      0.39      0.53        33
              packaging defect       0.00      0.00      0.00        18

                      accuracy                           0.75      1197
                     macro avg       0.66      0.42      0.47      1197
                  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for product-category:

                                                   precision    recall  f1-score   support

                              alcoholic beverages       1.00      0.21      0.35        14
                      cereals and bakery products       0.55      0.71      0.62       149
     cocoa and cocoa preparations, coffee and tea       0.68      0.68      0.68        44
                                    confectionery       0.91      0.26      0.41        38
dietetic foods, food supplements, fortified foods       0.77      0.55      0.64        31
                                    fats and oils       1.00      0.25      0.40         4
                                   feed materials       0.00      0.00      0.00         3
                   food additives and flavourings       0.00      0.00      0.00         1
                           food contact materials       0.00      0.00      0.00         1
                            fruits and veget

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for hazard:

                                                   precision    recall  f1-score   support

                                        Aflatoxin       0.00      0.00      0.00         3
                                  abnormal colour       0.00      0.00      0.00         1
                                  alcohol content       0.00      0.00      0.00         2
                           aliphatic hydrocarbons       0.00      0.00      0.00         1
                                        alkaloids       0.67      0.29      0.40         7
                                        allergens       0.00      0.00      0.00         3
                                           almond       0.78      0.58      0.67        12
             altered organoleptic characteristics       1.00      1.00      1.00         1
                           antibiotics, vet drugs       0.00      0.00      0.00         1
                                    bacillus spp.     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification report for product:

                                                                precision    recall  f1-score   support

                                        Catfishes (freshwater)       0.33      1.00      0.50         1
                                         Fishes not identified       0.60      0.38      0.46         8
                                      Not classified pork meat       0.00      0.00      0.00         3
                                    Pangas catfishes (generic)       0.50      0.50      0.50         2
                           Precooked cooked pork meat products       0.67      0.67      0.67         3
                                                 Veggie Burger       0.50      0.33      0.40         3
                                               adobo seasoning       0.00      0.00      0.00         0
                                           alcoholic beverages       0.00      0.00      0.00         1
                           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# MLP Classifier

In [None]:
text_clf_nn = Pipeline([
    ('vect', TfidfVectorizer(strip_accents='unicode', analyzer='char', ngram_range=(2, 5), max_df=0.5, min_df=5)),
    ('clf', MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)),
])

In [None]:
def train_and_evaluate(label):
    # Split the trainset into training and validation sets
    X_train, X_test, y_train, y_test = train_test_split(trainset['title'], trainset[label], test_size=0.2, random_state=42)

    # Fit the model on the training data
    text_clf_nn.fit(X_train, y_train)

    # Predict on the validation set
    y_pred = text_clf_nn.predict(X_test)

    # Print classification report
    print(f"Classification report for {label}:\n")
    print(classification_report(y_test, y_pred))

In [None]:
# Evaluate model for each label
for label in ('hazard-category', 'product-category', 'hazard', 'product'):
    train_and_evaluate(label)

Classification report for hazard-category:

                                precision    recall  f1-score   support

                     allergens       0.79      0.87      0.83       377
                    biological       0.82      0.87      0.85       398
                      chemical       0.65      0.56      0.60       107
food additives and flavourings       1.00      0.43      0.60         7
                foreign bodies       0.72      0.70      0.71       166
                         fraud       0.57      0.49      0.53        77
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       0.71      0.38      0.50        13
                  other hazard       0.81      0.52      0.63        33
              packaging defect       0.75      0.17      0.27        18

                      accuracy                           0.77      1197
                     macro avg       0.68      0.50      0.55      1197
                  

# GradBoost

In [14]:
text_clf_gb = Pipeline([
    ('vect', TfidfVectorizer(strip_accents='unicode', analyzer='char', ngram_range=(2, 5), max_df=0.5, min_df=5)),
    ('clf', GradientBoostingClassifier(n_estimators=100, random_state=42))  # Using GradientBoostingClassifier
])

In [15]:
def train_and_evaluate(label):
    # Split the trainset into training and validation sets
    X_train, X_test, y_train, y_test = train_test_split(trainset['title'], trainset[label], test_size=0.2, random_state=42)

    # Fit the model on the training data
    text_clf_gb.fit(X_train, y_train)

    # Predict on the validation set
    y_pred = text_clf_gb.predict(X_test)

    # Print classification report
    print(f"Classification report for {label}:\n")
    print(classification_report(y_test, y_pred))

In [17]:
train_and_evaluate('hazard-category')

Classification report for hazard-category:

                                precision    recall  f1-score   support

                     allergens       0.80      0.78      0.79       377
                    biological       0.67      0.89      0.77       398
                      chemical       0.72      0.41      0.52       107
food additives and flavourings       0.25      0.29      0.27         7
                foreign bodies       0.73      0.58      0.65       166
                         fraud       0.58      0.42      0.48        77
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       0.43      0.46      0.44        13
                  other hazard       0.57      0.36      0.44        33
              packaging defect       0.56      0.28      0.37        18

                      accuracy                           0.71      1197
                     macro avg       0.53      0.45      0.47      1197
                  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
train_and_evaluate('product-category')

Classification report for product-category:

                                                   precision    recall  f1-score   support

                              alcoholic beverages       0.36      0.29      0.32        14
                      cereals and bakery products       0.68      0.61      0.65       149
     cocoa and cocoa preparations, coffee and tea       0.66      0.61      0.64        44
                                    confectionery       0.62      0.34      0.44        38
dietetic foods, food supplements, fortified foods       0.60      0.58      0.59        31
                                    fats and oils       0.50      0.25      0.33         4
                                   feed materials       0.00      0.00      0.00         3
                   food additives and flavourings       0.00      0.00      0.00         1
                           food contact materials       0.00      0.00      0.00         1
                            fruits and veget

In [19]:
train_and_evaluate('hazard')

Classification report for hazard:

                                                   precision    recall  f1-score   support

                                        Aflatoxin       0.00      0.00      0.00         3
                                  abnormal colour       0.00      0.00      0.00         1
                                   abnormal smell       0.00      0.00      0.00         0
                                  alcohol content       0.00      0.00      0.00         2
                           aliphatic hydrocarbons       0.00      0.00      0.00         1
                                        alkaloids       0.50      0.14      0.22         7
                                        allergens       0.00      0.00      0.00         3
                                           almond       0.33      0.33      0.33        12
             altered organoleptic characteristics       0.00      0.00      0.00         1
                           antibiotics, vet drugs     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
train_and_evaluate('product')

# downloading predictions and saving it

In [None]:
for label in ('hazard-category', 'product-category'):
    text_clf_nn.fit(trainset['title'], trainset[label])
    predictions[label] = text_clf_nn.predict(testset['title'])

In [None]:
predictions = pd.DataFrame()
for label in ('hazard', 'product'):
    text_clf_rf.fit(trainset['title'], trainset[label])
    predictions[label] = text_clf_rf.predict(testset['title'])

In [None]:
import os
from shutil import make_archive

output_folder = r'C:\Users\Sushovit\Desktop\RF'
os.makedirs(output_folder, exist_ok=True)

# Save the predictions to a CSV file inside the "SemEval-Hazard" folder
csv_path = os.path.join(output_folder, 'submission.csv')
predictions.to_csv(csv_path)

# Zip the folder and save the zip file inside the "SemEval-Hazard" folder
zip_path = os.path.join(output_folder, 'submission')
make_archive(zip_path, 'zip', output_folder)

print("Predictions saved and zipped successfully in SemEval-Hazard folder.")

Predictions saved and zipped successfully in SemEval-Hazard folder.
