<a href="https://colab.research.google.com/github/SushovitNanda/SemEval-Food-Hazards/blob/main/FFNN_Regularization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertModel
import copy
import warnings
warnings.filterwarnings("ignore")

In [18]:
# Load and preprocess data
train_df = pd.read_csv('incidents_labelled.csv')

# Combine title and text for the input feature
train_df['combined_text'] = train_df['title'] + " " + train_df['text']

# Label encode the target variable
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['hazard-category'])
num_classes = len(label_encoder.classes_)

# Split the data
train_data, test_data = train_test_split(train_df, test_size=0.2, random_state=42)

# Tokenizer setup
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
max_length = 128

# Custom Dataset class
class HazardDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['combined_text']
        label = self.data.iloc[idx]['label']

        # Tokenize the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create data loaders
train_dataset = HazardDataset(train_data, tokenizer, max_length)
test_dataset = HazardDataset(test_data, tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


# FFNN

In [23]:
# Define the FFNN model
class FFNNModel(nn.Module):
    def __init__(self, hidden_size, num_classes):
        super(FFNNModel, self).__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.fc1 = nn.Linear(self.distilbert.config.hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask):
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # [CLS] token output
        x = self.fc1(pooled_output)
        x = self.relu(x)
        x = self.dropout(x)
        return self.fc2(x)

# Initialize the model, loss function, and optimizer with L2 regularization
model = FFNNModel(hidden_size=64, num_classes=num_classes)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Convert class weights to float32
class_weights = torch.tensor([1.0 / count for count in train_df['hazard-category'].value_counts().values], dtype=torch.float32)
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))

# Define L2 regularization strength (e.g., 1e-4)
l2_lambda = 1e-4
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=l2_lambda)

# Early stopping parameters
patience = 2
best_loss = float('inf')
early_stop_counter = 0
best_model_state = copy.deepcopy(model.state_dict())

# Training function
def train_model(model, data_loader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask).float()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(data_loader)

# Evaluation function
def eval_model(model, data_loader):
    model.eval()
    total_loss = 0
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    return total_loss / len(data_loader), predictions, true_labels

# Train and evaluate with early stopping
epochs = 10
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, criterion, optimizer)
    val_loss, _, _ = eval_model(model, test_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

    # Check for early stopping
    if val_loss < best_loss:
        best_loss = val_loss
        best_model_state = copy.deepcopy(model.state_dict())
        early_stop_counter = 0
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered")
            break

# Load the best model state before evaluation
model.load_state_dict(best_model_state)

# Get predictions and evaluate
_, preds, true_labels = eval_model(model, test_loader)
print(classification_report(true_labels, preds, target_names=label_encoder.classes_))


Epoch 1/10, Train Loss: 2.0276, Validation Loss: 1.6833
Epoch 2/10, Train Loss: 1.3438, Validation Loss: 1.2465
Epoch 3/10, Train Loss: 0.9927, Validation Loss: 1.1016
Epoch 4/10, Train Loss: 0.8280, Validation Loss: 0.9541
Epoch 5/10, Train Loss: 0.6748, Validation Loss: 1.1575
Epoch 6/10, Train Loss: 0.6062, Validation Loss: 1.0606
Early stopping triggered
                                precision    recall  f1-score   support

                     allergens       0.92      0.89      0.90       377
                    biological       0.99      0.87      0.93       398
                      chemical       0.88      0.50      0.63       107
food additives and flavourings       0.00      0.00      0.00         7
                foreign bodies       0.98      0.74      0.84       166
                         fraud       0.58      0.48      0.52        77
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       0.75      0.23      0.35 