In [17]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np
import nltk
from nltk.corpus import stopwords
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')


# Load the dataset
train_data = pd.read_csv('incidents_labelled.csv')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Preprocessing Function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply text preprocessing to the title column
train_data['cleaned_title'] = train_data['title'].apply(preprocess_text)

# Tokenize the cleaned_title column
tokenizer = Tokenizer(num_words=5000)  # Limit vocabulary to top 5000 words
tokenizer.fit_on_texts(train_data['cleaned_title'].values)

# Convert texts to sequences of integers
X = tokenizer.texts_to_sequences(train_data['cleaned_title'].values)

# Padding sequences to ensure uniform input length
X = pad_sequences(X, maxlen=100)



In [23]:

# Converting hazard-category to binary format using LabelBinarizer (for multi-class classification)
lb_hazard = LabelBinarizer()
y_hazard = lb_hazard.fit_transform(train_data['hazard-category'])

# Split into training and validation sets
X_train, X_val, y_train_hazard, y_val_hazard = train_test_split(X, y_hazard, test_size=0.2, random_state=42, stratify=y_product)

# RNN model using LSTM
model = Sequential()
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(lb_hazard.classes_), activation='softmax'))  # Softmax for multi-class classification as it determines
# probability distribution over multiple classes, and we want to select the most probable class.

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

# Train the RNN model
hazard_train = model.fit(
    X_train, y_train_hazard,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_hazard),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for hazard-category
y_pred_hazard = model.predict(X_val)
y_pred_hazard_classes = np.argmax(y_pred_hazard, axis=1)


Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 470ms/step - accuracy: 0.4317 - loss: 1.7408 - val_accuracy: 0.6266 - val_loss: 1.1348
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 247ms/step - accuracy: 0.6635 - loss: 1.0134 - val_accuracy: 0.7009 - val_loss: 0.9130
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 240ms/step - accuracy: 0.7766 - loss: 0.7152 - val_accuracy: 0.7510 - val_loss: 0.8256
Epoch 4/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 289ms/step - accuracy: 0.8477 - loss: 0.5331 - val_accuracy: 0.7477 - val_loss: 0.8346
Epoch 5/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 438ms/step - accuracy: 0.8787 - loss: 0.3954 - val_accuracy: 0.7694 - val_loss: 0.8273
Epoch 6/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 294ms/step - accuracy: 0.9142 - loss: 0.3033 - val_accuracy: 0.7786 - val_loss: 0.8742
Epoch 6: early stoppin

In [24]:
# Generate classification report for hazard-category
classification_rep_hazard = classification_report(np.argmax(y_val_hazard, axis=1), y_pred_hazard_classes, target_names=lb_hazard.classes_)
print("Classification Report for Hazard-Category:")
print(classification_rep_hazard)


Classification Report for Hazard-Category:
                                precision    recall  f1-score   support

                     allergens       0.86      0.85      0.85       374
                    biological       0.81      0.92      0.86       406
                      chemical       0.66      0.64      0.65       108
food additives and flavourings       0.50      0.17      0.25         6
                foreign bodies       0.65      0.70      0.67       152
                         fraud       0.73      0.55      0.63        89
                     migration       0.00      0.00      0.00         4
          organoleptic aspects       0.00      0.00      0.00        13
                  other hazard       0.59      0.46      0.52        37
              packaging defect       1.00      0.12      0.22         8

                      accuracy                           0.78      1197
                     macro avg       0.58      0.44      0.46      1197
                  w

In [25]:
# Convert product-category to binary format using LabelBinarizer
lb_product = LabelBinarizer()
y_product = lb_product.fit_transform(train_data['product-category'])

# Stratified split
X_train, X_val, y_train_product, y_val_product = train_test_split(X, y_product, test_size=0.2, random_state=42, stratify=y_product)

# RNN model
model = Sequential()
model.add(Embedding(input_dim=5001, output_dim=128, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(lb_product.classes_), activation='softmax'))  # Softmax for multi-class classification

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
product_train = model.fit(
    X_train, y_train_product,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val_product),
    callbacks=[early_stopping],
    verbose=1
)

# Predict on validation set for product-category
y_pred_product = model.predict(X_val)
y_pred_product_classes = np.argmax(y_pred_product, axis=1)

# Ensure that we include all unique labels from y_val and predictions
unique_classes = unique_labels(np.argmax(y_val_product, axis=1), y_pred_product_classes)

Epoch 1/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 253ms/step - accuracy: 0.2582 - loss: 2.6241 - val_accuracy: 0.2832 - val_loss: 2.3164
Epoch 2/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 251ms/step - accuracy: 0.3267 - loss: 2.2176 - val_accuracy: 0.4127 - val_loss: 1.9579
Epoch 3/10
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 226ms/step - accuracy: 0.4525 - loss: 1.8034 - val_accuracy: 0.5104 - val_loss: 1.6504
Epoch 3: early stopping
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step


In [26]:
# Generate classification report with the correct labels
classification_rep_product = classification_report(
    np.argmax(y_val_product, axis=1),
    y_pred_product_classes,
    target_names=[lb_product.classes_[i] for i in unique_classes],  # Use only the unique classes
    labels=unique_classes
)

print("Classification Report for Product-Category:")
print(classification_rep_product)


Classification Report for Product-Category:
                                                   precision    recall  f1-score   support

                              alcoholic beverages       0.00      0.00      0.00        15
                      cereals and bakery products       0.30      0.84      0.44       156
     cocoa and cocoa preparations, coffee and tea       0.35      0.24      0.29        49
                                    confectionery       0.00      0.00      0.00        39
dietetic foods, food supplements, fortified foods       0.00      0.00      0.00        34
                                    fats and oils       0.00      0.00      0.00         4
                                   feed materials       0.00      0.00      0.00         1
                   food additives and flavourings       0.00      0.00      0.00         2
                           food contact materials       0.00      0.00      0.00         5
                            fruits and vegeta