A frequency threshold is introduced to handle class imbalance: classes that occur no more often than the threshold are grouped into a single "Other" category. In datasets with a large number of unique labels, many labels have very few samples, which makes it difficult for the model to learn those rare classes effectively.

# Libraries

In [45]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

In [32]:
# Step 1: Read the two CSV files into DataFrames
# `train` holds the labelled incidents; `test` is read from incidents_val.csv.
train = pd.read_csv(filepath_or_buffer='incidents_labelled.csv')
test = pd.read_csv(filepath_or_buffer='incidents_val.csv')


# Preprocessing

In [33]:
# Step 2: Group rare categories into 'Other' in the 'hazard' and 'product' columns
threshold = 10  # Categories with <= 10 occurrences will be grouped


def _group_rare_categories(series, min_count):
    """Return `series` with every value occurring <= `min_count` times replaced by 'Other'.

    The frequency table is computed once per column (the per-row lambda used
    previously recomputed `value_counts()` for every row). Missing values are
    not counted by `value_counts()`, so they are mapped to 'Other' as well
    instead of failing the count lookup.
    """
    counts = series.value_counts()
    frequent_values = counts[counts > min_count].index
    return series.where(series.isin(frequent_values), 'Other')


train['hazard'] = _group_rare_categories(train['hazard'], threshold)
train['product'] = _group_rare_categories(train['product'], threshold)

# Step 3: Apply label encoding for each of the target labels
# After this loop every target column holds integer class indices.
labels = ['hazard-category', 'product-category', 'hazard', 'product']
label_encoders = {label: LabelEncoder() for label in labels}

for label in labels:
    train[label] = label_encoders[label].fit_transform(train[label])

# Step 4: Vectorizing text data (TF-IDF)
# NOTE(review): the vectorizer is fitted on every training row before the
# train/validation split below, so validation rows contribute to the
# vocabulary and IDF weights. Consider fitting on the training split only.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(train['text']).toarray()
X_test_tfidf = tfidf.transform(test['text']).toarray()

# Step 5: Split data into train and validation sets
# y_* has one column per entry of `labels`, in the same order.
X_train, X_val, y_train, y_val = train_test_split(X_train_tfidf, train[labels].values, test_size=0.2, random_state=42)


# Classes

In [34]:
# Step 6: Create PyTorch Dataset class for multi-label classification
class TextDataset(Dataset):
    """Map-style dataset pairing a feature matrix with its target matrix.

    Both `X` and `y` are converted to float32 tensors at construction time;
    indexing returns the `(features, targets)` pair for one row.
    """

    @staticmethod
    def _as_float_tensor(values):
        # Single place where the float32 conversion is done.
        return torch.tensor(values, dtype=torch.float32)

    def __init__(self, X, y):
        self.X = self._as_float_tensor(X)
        self.y = self._as_float_tensor(y)

    def __len__(self):
        # Number of rows in the feature tensor.
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        targets = self.y[idx]
        return features, targets

# Step 7: Wrap the train/validation splits in datasets and batch them
train_dataset, val_dataset = TextDataset(X_train, y_train), TextDataset(X_val, y_val)
# Only the training loader shuffles; both use batches of 32 rows.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=32)


In [51]:
# Step 8: Define FFNN model
class FFNN(nn.Module):
    """Feed-forward network with a single ReLU hidden layer.

    Args:
        input_size: Number of input features per sample.
        output_size: Number of output units. The forward pass returns raw
            scores; no activation is applied to the last layer.
        hidden_size: Width of the hidden layer. Defaults to 512, the value
            that used to be hard-coded. It is accepted as a keyword so that
            `initialize_model`, which always passes `hidden_size=...`, can
            build this class the same way as the recurrent models instead of
            failing with an unexpected-keyword TypeError.
    """

    def __init__(self, input_size, output_size, hidden_size=512):
        super(FFNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output scores for a batch `x` whose last dimension is `input_size`."""
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Step 9: Define LSTM model
class LSTMModel(nn.Module):
    """Single-layer LSTM followed by a linear layer on the last time step.

    A 2-D input `(batch, input_size)` is treated as a sequence of length one;
    any other input is handed to the LSTM unchanged (batch-first layout).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        # Insert the sequence axis when the caller passed plain feature vectors.
        sequence = x.unsqueeze(1) if x.dim() == 2 else x

        hidden_states, _ = self.lstm(sequence)
        # Only the hidden state of the final time step feeds the output layer.
        final_step = hidden_states[:, -1, :]
        return self.fc(final_step)


# Step 10: Define RNN model
class RNNModel(nn.Module):
    """Single-layer Elman RNN followed by a linear output layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the recurrent hidden state.
        output_size: Number of output units.

    The forward pass returns raw scores (logits). The previous version passed
    them through a ReLU, which clamps every logit to >= 0; with a
    sigmoid-based loss/threshold that makes a probability below 0.5
    unreachable, so the activation on the output has been removed.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits of shape `(batch, output_size)`.

        `x` may be `(batch, input_size)` (treated as a length-one sequence) or
        an already batched sequence `(batch, seq_len, input_size)`.
        """
        # Same guard as LSTMModel: only add the sequence axis when it is missing.
        if x.dim() == 2:
            x = x.unsqueeze(1)
        _, h_n = self.rnn(x)
        # h_n[-1] is the final hidden state of the (only) recurrent layer.
        return self.fc(h_n[-1])

In [49]:
# Step 11: Define loss function and optimizer
def initialize_model(model_class, input_size, output_size, hidden_size=128):
    """Build a model together with its optimizer and loss function.

    Args:
        model_class: Model constructor taking `input_size` and `output_size`
            keywords, and optionally `hidden_size`.
        input_size: Number of input features.
        output_size: Number of output units.
        hidden_size: Hidden width; forwarded only to constructors that accept
            a `hidden_size` keyword (or arbitrary `**kwargs`), so classes
            without that parameter no longer raise a TypeError.

    Returns:
        Tuple `(model, optimizer, loss_fn)`: the model instance, an Adam
        optimizer over its parameters (lr=0.001) and a `BCEWithLogitsLoss`.
    """
    import inspect  # local import: only this helper needs it

    init_params = inspect.signature(model_class).parameters
    accepts_hidden_size = 'hidden_size' in init_params or any(
        param.kind is inspect.Parameter.VAR_KEYWORD for param in init_params.values()
    )

    model_kwargs = {'input_size': input_size, 'output_size': output_size}
    if accepts_hidden_size:
        model_kwargs['hidden_size'] = hidden_size

    model = model_class(**model_kwargs)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # NOTE(review): BCEWithLogitsLoss expects targets in [0, 1], but the
    # targets built in this notebook are LabelEncoder class indices. That is
    # consistent with the negative, ever-decreasing losses in the recorded
    # outputs. Switching to one cross-entropy head per target needs
    # coordinated changes to the models, dataset dtype and evaluation - TODO.
    loss_fn = nn.BCEWithLogitsLoss()
    return model, optimizer, loss_fn


def train_model(model, optimizer, loss_fn, train_dataloader, val_dataloader, epochs=100):
    """Train `model` for `epochs` passes over `train_dataloader`, printing one summary line per epoch.

    Each summary reports the mean batch loss and an element-wise accuracy: the
    fraction of target entries equal to the rounded sigmoid of the matching
    output. Nothing is returned; the model is updated in place.

    NOTE(review): `val_dataloader` is accepted but never used in this function.
    """
    model.train()  # training mode is set once, before the first epoch

    for epoch_number in range(1, epochs + 1):
        batch_losses = []
        n_matches = 0
        n_elements = 0

        for features, targets in train_dataloader:
            # Standard optimisation step.
            optimizer.zero_grad()
            logits = model(features)
            batch_loss = loss_fn(logits, targets)
            batch_loss.backward()
            optimizer.step()

            batch_losses.append(batch_loss.item())

            # Hard 0/1 predictions compared entry by entry with the targets.
            hard_preds = torch.sigmoid(logits).round()
            n_matches += (hard_preds == targets).sum().item()
            n_elements += targets.numel()

        # Epoch-level summaries.
        accuracy = n_matches / n_elements
        avg_loss = sum(batch_losses) / len(train_dataloader)

        print(f"Epoch {epoch_number}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")


In [42]:

def evaluate_model(model, val_dataloader, label_encoders):
    """Print one scikit-learn classification report per target column.

    Args:
        model: Trained model returning raw logits, one column per target.
        val_dataloader: Iterable of `(features, targets)` batches.
        label_encoders: Mapping from target name to its fitted encoder. Its
            iteration order must match the column order of the targets
            (both are built from the same `labels` list in this notebook).

    Returns:
        None. The reports are printed.

    NOTE(review): predictions are thresholded to 0/1 per column while the
    true values are class indices that can exceed 1, so only classes 0 and 1
    can ever be predicted. This follows from training with a binary loss and
    needs a coordinated redesign (one softmax head per target) - TODO.
    """
    model.eval()
    val_preds = []
    val_labels = []

    # No gradient calculation during evaluation
    with torch.no_grad():
        for batch_X, batch_y in val_dataloader:
            output = model(batch_X)
            # The model emits logits, so apply the sigmoid before thresholding.
            # This matches the sigmoid-then-round rule used for the training
            # accuracy; the previous `output > 0.5` thresholded raw logits.
            preds = (torch.sigmoid(output) > 0.5).cpu().numpy()
            val_preds.append(preds)
            val_labels.append(batch_y.cpu().numpy())

    # Stack per-batch arrays and use plain integer class ids on both sides
    # (predictions were bool, targets float) so the report compares like with like.
    val_preds = np.vstack(val_preds).astype(int)
    val_labels = np.vstack(val_labels).astype(int)

    # For each label (hazard-category, product-category, etc.)
    for i, (label_name, encoder) in enumerate(label_encoders.items()):
        print(f"Classification report for {label_name}:")

        # Report every class known to the encoder, even those absent from this split.
        class_ids = list(range(len(encoder.classes_)))

        print(classification_report(
            val_labels[:, i], val_preds[:, i],
            target_names=encoder.classes_,
            labels=class_ids
        ))

# Ensure labels parameter matches the number of expected classes for each category


In [38]:
# Initialize and train models
# Feature count comes from the TF-IDF matrix; one output unit per target column.
input_size, output_size = X_train.shape[1], y_train.shape[1]


# FFNN

In [39]:
print("Training FFNN...")
# Build the feed-forward model with its optimizer/loss, then fit it.
model_ffnn, optimizer_ffnn, loss_fn_ffnn = initialize_model(
    model_class=FFNN,
    input_size=input_size,
    output_size=output_size,
)
train_model(
    model=model_ffnn,
    optimizer=optimizer_ffnn,
    loss_fn=loss_fn_ffnn,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
)

Training FFNN...
Epoch 1, Loss: -681.9484, Accuracy: 0.1348
Epoch 2, Loss: -5980.6841, Accuracy: 0.1350
Epoch 3, Loss: -17938.2440, Accuracy: 0.1350
Epoch 4, Loss: -36497.8477, Accuracy: 0.1350
Epoch 5, Loss: -61341.7183, Accuracy: 0.1350
Epoch 6, Loss: -91907.0101, Accuracy: 0.1350
Epoch 7, Loss: -128240.9930, Accuracy: 0.1350
Epoch 8, Loss: -169988.6468, Accuracy: 0.1350
Epoch 9, Loss: -217043.4944, Accuracy: 0.1350
Epoch 10, Loss: -268613.7477, Accuracy: 0.1350
Epoch 11, Loss: -324979.6140, Accuracy: 0.1350
Epoch 12, Loss: -385752.6675, Accuracy: 0.1350
Epoch 13, Loss: -450486.6077, Accuracy: 0.1350
Epoch 14, Loss: -519645.5483, Accuracy: 0.1350
Epoch 15, Loss: -592703.6267, Accuracy: 0.1350
Epoch 16, Loss: -668985.4738, Accuracy: 0.1350
Epoch 17, Loss: -749234.3515, Accuracy: 0.1350
Epoch 18, Loss: -833025.8967, Accuracy: 0.1350
Epoch 19, Loss: -920515.0154, Accuracy: 0.1350
Epoch 20, Loss: -1011693.6296, Accuracy: 0.1350
Epoch 21, Loss: -1105970.7892, Accuracy: 0.1350
Epoch 22, Lo

In [46]:
# Per-target classification reports for the feed-forward model on the validation split.
evaluate_model(model=model_ffnn, val_dataloader=val_dataloader, label_encoders=label_encoders)

Classification report for hazard-category:
                                precision    recall  f1-score   support

                     allergens       0.00      0.00      0.00       377
                    biological       0.33      1.00      0.50       398
                      chemical       0.00      0.00      0.00       107
food additives and flavourings       0.00      0.00      0.00         7
                foreign bodies       0.00      0.00      0.00       166
                         fraud       0.00      0.00      0.00        77
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       0.00      0.00      0.00        13
                  other hazard       0.00      0.00      0.00        33
              packaging defect       0.00      0.00      0.00        18

                      accuracy                           0.33      1197
                     macro avg       0.03      0.10      0.05      1197
                  w

# LSTM

In [52]:
print("Training LSTM...")
# Sizes are re-derived here: features from the TF-IDF matrix, one output per
# entry of `labels`, and a 128-unit hidden state.
input_size, output_size, hidden_size = X_train.shape[1], len(labels), 128
model_lstm, optimizer_lstm, loss_fn_lstm = initialize_model(
    model_class=LSTMModel,
    input_size=input_size,
    output_size=output_size,
    hidden_size=hidden_size,
)
train_model(
    model=model_lstm,
    optimizer=optimizer_lstm,
    loss_fn=loss_fn_lstm,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
)

Training LSTM...
Epoch 1, Loss: -106.0281, Accuracy: 0.1347
Epoch 2, Loss: -576.1330, Accuracy: 0.1350
Epoch 3, Loss: -1026.8414, Accuracy: 0.1350
Epoch 4, Loss: -1416.6749, Accuracy: 0.1350
Epoch 5, Loss: -1777.0710, Accuracy: 0.1350
Epoch 6, Loss: -2121.5612, Accuracy: 0.1350
Epoch 7, Loss: -2452.6797, Accuracy: 0.1350
Epoch 8, Loss: -2779.3123, Accuracy: 0.1350
Epoch 9, Loss: -3099.3456, Accuracy: 0.1350
Epoch 10, Loss: -3415.1215, Accuracy: 0.1350
Epoch 11, Loss: -3726.4666, Accuracy: 0.1350
Epoch 12, Loss: -4039.5747, Accuracy: 0.1350
Epoch 13, Loss: -4345.1934, Accuracy: 0.1350
Epoch 14, Loss: -4654.6112, Accuracy: 0.1350
Epoch 15, Loss: -4961.5514, Accuracy: 0.1350
Epoch 16, Loss: -5265.8329, Accuracy: 0.1350
Epoch 17, Loss: -5569.6178, Accuracy: 0.1350
Epoch 18, Loss: -5877.4511, Accuracy: 0.1350
Epoch 19, Loss: -6177.2573, Accuracy: 0.1350
Epoch 20, Loss: -6483.0497, Accuracy: 0.1350
Epoch 21, Loss: -6778.4583, Accuracy: 0.1350
Epoch 22, Loss: -7082.3903, Accuracy: 0.1350
Epoc

In [53]:
# Per-target classification reports for the LSTM model on the validation split.
evaluate_model(model=model_lstm, val_dataloader=val_dataloader, label_encoders=label_encoders)

Classification report for hazard-category:
                                precision    recall  f1-score   support

                     allergens       0.00      0.00      0.00       377
                    biological       0.33      1.00      0.50       398
                      chemical       0.00      0.00      0.00       107
food additives and flavourings       0.00      0.00      0.00         7
                foreign bodies       0.00      0.00      0.00       166
                         fraud       0.00      0.00      0.00        77
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       0.00      0.00      0.00        13
                  other hazard       0.00      0.00      0.00        33
              packaging defect       0.00      0.00      0.00        18

                      accuracy                           0.33      1197
                     macro avg       0.03      0.10      0.05      1197
                  w

# RNN

In [54]:
print("Training RNN...")
# hidden_size is not passed, so initialize_model's default is used.
model_rnn, optimizer_rnn, loss_fn_rnn = initialize_model(
    model_class=RNNModel,
    input_size=input_size,
    output_size=output_size,
)
train_model(
    model=model_rnn,
    optimizer=optimizer_rnn,
    loss_fn=loss_fn_rnn,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
)


Training RNN...
Epoch 1, Loss: -192.9865, Accuracy: 0.1345
Epoch 2, Loss: -775.9897, Accuracy: 0.1350
Epoch 3, Loss: -1282.8210, Accuracy: 0.1350
Epoch 4, Loss: -1740.4189, Accuracy: 0.1350
Epoch 5, Loss: -2174.8073, Accuracy: 0.1350
Epoch 6, Loss: -2599.8480, Accuracy: 0.1350
Epoch 7, Loss: -3014.4494, Accuracy: 0.1350
Epoch 8, Loss: -3426.8468, Accuracy: 0.1350
Epoch 9, Loss: -3831.7996, Accuracy: 0.1350
Epoch 10, Loss: -4235.7724, Accuracy: 0.1350
Epoch 11, Loss: -4640.5496, Accuracy: 0.1350
Epoch 12, Loss: -5038.6816, Accuracy: 0.1350
Epoch 13, Loss: -5436.8084, Accuracy: 0.1350
Epoch 14, Loss: -5835.4516, Accuracy: 0.1350
Epoch 15, Loss: -6235.6565, Accuracy: 0.1350
Epoch 16, Loss: -6630.4103, Accuracy: 0.1350
Epoch 17, Loss: -7026.2826, Accuracy: 0.1350
Epoch 18, Loss: -7416.9000, Accuracy: 0.1350
Epoch 19, Loss: -7815.6826, Accuracy: 0.1350
Epoch 20, Loss: -8214.8928, Accuracy: 0.1350
Epoch 21, Loss: -8601.3812, Accuracy: 0.1350
Epoch 22, Loss: -8999.7233, Accuracy: 0.1350
Epoch

In [55]:
# Per-target classification reports for the RNN model on the validation split.
evaluate_model(model=model_rnn, val_dataloader=val_dataloader, label_encoders=label_encoders)

Classification report for hazard-category:
                                precision    recall  f1-score   support

                     allergens       0.00      0.00      0.00       377
                    biological       0.33      1.00      0.50       398
                      chemical       0.00      0.00      0.00       107
food additives and flavourings       0.00      0.00      0.00         7
                foreign bodies       0.00      0.00      0.00       166
                         fraud       0.00      0.00      0.00        77
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       0.00      0.00      0.00        13
                  other hazard       0.00      0.00      0.00        33
              packaging defect       0.00      0.00      0.00        18

                      accuracy                           0.33      1197
                     macro avg       0.03      0.10      0.05      1197
                  w

# Predict

In [None]:
# Step 12: Predict on the test features
def predict_on_test(model, features=None, batch_size=32):
    """Return the model's hard predictions for the test features as a NumPy array.

    NOTE(review): no `predict_on_test` is defined anywhere else in the visible
    notebook, so this cell failed with a NameError on a fresh kernel. This
    implementation mirrors the prediction rule used for the training accuracy
    (sigmoid, then round) - confirm that this is the intended behaviour.

    Args:
        model: Trained model returning one logit per target column.
        features: 2-D feature array; defaults to the TF-IDF matrix of the
            test set (`X_test_tfidf`).
        batch_size: Number of rows scored per forward pass.

    Returns:
        Array of shape `(n_rows, n_targets)` containing 0.0/1.0 values.
    """
    if features is None:
        features = X_test_tfidf
    inputs = torch.tensor(features, dtype=torch.float32)

    model.eval()
    batches = []
    with torch.no_grad():
        for start in range(0, len(inputs), batch_size):
            logits = model(inputs[start:start + batch_size])
            batches.append(torch.sigmoid(logits).round())
    return torch.cat(batches).cpu().numpy()


# Predict using FFNN
test_predictions = predict_on_test(model_ffnn)

# Step 13: Convert predictions back to original label format
# Column i of the predictions belongs to labels[i]; values are cast to int
# because the encoders map integer class indices back to label strings.
decoded_predictions = {
    label: label_encoders[label].inverse_transform(test_predictions[:, i].astype(int))
    for i, label in enumerate(labels)
}

# Step 14: Add predictions to the test dataframe
for label in labels:
    test[f'predicted_{label}'] = decoded_predictions[label]

# View the test dataframe with predictions
print(test[['title', 'predicted_hazard-category', 'predicted_product-category', 'predicted_hazard', 'predicted_product']].head())