<a href="https://colab.research.google.com/github/SushovitNanda/SemEval-Food-Hazards/blob/main/BERT_Lr_edits.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [17]:
# Load the dataset
data = pd.read_csv('incidents_labelled.csv')

# Combine 'title' and 'text' columns for richer input data
data['combined_text'] = data['title'] + " " + data['text']

# Encode hazard-category to numerical labels
label_mapping = {label: idx for idx, label in enumerate(data['hazard-category'].unique())}
data['label'] = data['hazard-category'].map(label_mapping)

# Stratified train-test split
train_df, val_df = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=42)

# Define tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_mapping))



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Custom Dataset Class for PyTorch
class HazardDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create DataLoaders
train_dataset = HazardDataset(train_df['combined_text'].tolist(), train_df['label'].tolist(), tokenizer)
val_dataset = HazardDataset(val_df['combined_text'].tolist(), val_df['label'].tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=1e-4, correct_bias=False)
total_steps = len(train_loader) * 5  # Assuming 5 epochs (was *3 initially for 3 epochs)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


In [23]:
# Training function
def train_model(model, train_loader, val_loader, optimizer, scheduler, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_train_loss = 0
        for batch in tqdm(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            model.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs} - Training Loss: {avg_train_loss}")

        # Validation after each epoch
        evaluate_model(model, val_loader)

# Evaluation function
def evaluate_model(model, val_loader):
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).flatten()

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    print("Validation Classification Report:")
    print(classification_report(true_labels, predictions, target_names=label_mapping.keys()))


In [24]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

#with lr = 1e-4

# Train and evaluate the model
train_model(model, train_loader, val_loader, optimizer, scheduler)

100%|██████████| 300/300 [02:18<00:00,  2.17it/s]


Epoch 1/5 - Training Loss: 1.6196963859846194
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.34      1.00      0.50       404
                foreign bodies       0.00      0.00      0.00       154
                      chemical       0.00      0.00      0.00       100
                         fraud       0.00      0.00      0.00        82
          organoleptic aspects       0.00      0.00      0.00        13
                     allergens       0.00      0.00      0.00       391
              packaging defect       0.00      0.00      0.00        16
                  other hazard       0.00      0.00      0.00        29
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.34      1197
                     macro avg       0.03      0.10   

100%|██████████| 300/300 [02:13<00:00,  2.24it/s]


Epoch 2/5 - Training Loss: 1.638235467672348
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.34      1.00      0.50       404
                foreign bodies       0.00      0.00      0.00       154
                      chemical       0.00      0.00      0.00       100
                         fraud       0.00      0.00      0.00        82
          organoleptic aspects       0.00      0.00      0.00        13
                     allergens       0.00      0.00      0.00       391
              packaging defect       0.00      0.00      0.00        16
                  other hazard       0.00      0.00      0.00        29
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.34      1197
                     macro avg       0.03      0.10    

100%|██████████| 300/300 [02:17<00:00,  2.18it/s]


Epoch 3/5 - Training Loss: 1.6310232178370159
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.34      1.00      0.50       404
                foreign bodies       0.00      0.00      0.00       154
                      chemical       0.00      0.00      0.00       100
                         fraud       0.00      0.00      0.00        82
          organoleptic aspects       0.00      0.00      0.00        13
                     allergens       0.00      0.00      0.00       391
              packaging defect       0.00      0.00      0.00        16
                  other hazard       0.00      0.00      0.00        29
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.34      1197
                     macro avg       0.03      0.10   

100%|██████████| 300/300 [02:14<00:00,  2.24it/s]


Epoch 4/5 - Training Loss: 1.626112804412842
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.34      1.00      0.50       404
                foreign bodies       0.00      0.00      0.00       154
                      chemical       0.00      0.00      0.00       100
                         fraud       0.00      0.00      0.00        82
          organoleptic aspects       0.00      0.00      0.00        13
                     allergens       0.00      0.00      0.00       391
              packaging defect       0.00      0.00      0.00        16
                  other hazard       0.00      0.00      0.00        29
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.34      1197
                     macro avg       0.03      0.10    

100%|██████████| 300/300 [02:13<00:00,  2.24it/s]


Epoch 5/5 - Training Loss: 1.6262320601940154
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.34      1.00      0.50       404
                foreign bodies       0.00      0.00      0.00       154
                      chemical       0.00      0.00      0.00       100
                         fraud       0.00      0.00      0.00        82
          organoleptic aspects       0.00      0.00      0.00        13
                     allergens       0.00      0.00      0.00       391
              packaging defect       0.00      0.00      0.00        16
                  other hazard       0.00      0.00      0.00        29
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.34      1197
                     macro avg       0.03      0.10   

In [21]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

#with lr = 2e-5

# Train and evaluate the model
train_model(model, train_loader, val_loader, optimizer, scheduler)

100%|██████████| 300/300 [02:23<00:00,  2.09it/s]


Epoch 1/5 - Training Loss: 0.7113679797016084
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.97      0.92      0.95       404
                foreign bodies       0.80      0.91      0.85       154
                      chemical       0.72      0.89      0.80       100
                         fraud       0.77      0.54      0.63        82
          organoleptic aspects       0.40      0.62      0.48        13
                     allergens       0.89      0.97      0.92       391
              packaging defect       0.00      0.00      0.00        16
                  other hazard       0.15      0.07      0.10        29
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.86      1197
                     macro avg       0.47      0.49   

100%|██████████| 300/300 [02:17<00:00,  2.18it/s]


Epoch 2/5 - Training Loss: 0.33219497340420884
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.97      0.94      0.95       404
                foreign bodies       0.79      0.95      0.87       154
                      chemical       0.80      0.90      0.85       100
                         fraud       0.80      0.57      0.67        82
          organoleptic aspects       1.00      0.38      0.56        13
                     allergens       0.92      0.96      0.94       391
              packaging defect       0.70      0.44      0.54        16
                  other hazard       0.67      0.62      0.64        29
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.89      1197
                     macro avg       0.66      0.58  

100%|██████████| 300/300 [02:16<00:00,  2.20it/s]


Epoch 3/5 - Training Loss: 0.21671192827634514
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.96      0.95      0.95       404
                foreign bodies       0.90      0.90      0.90       154
                      chemical       0.80      0.89      0.84       100
                         fraud       0.76      0.59      0.66        82
          organoleptic aspects       0.80      0.62      0.70        13
                     allergens       0.90      0.97      0.94       391
              packaging defect       0.67      0.50      0.57        16
                  other hazard       0.65      0.59      0.62        29
food additives and flavourings       1.00      0.40      0.57         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.90      1197
                     macro avg       0.74      0.64  

100%|██████████| 300/300 [02:24<00:00,  2.07it/s]


Epoch 4/5 - Training Loss: 0.14096784167923032
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.93      0.96      0.95       404
                foreign bodies       0.89      0.88      0.89       154
                      chemical       0.83      0.89      0.86       100
                         fraud       0.75      0.60      0.67        82
          organoleptic aspects       0.86      0.46      0.60        13
                     allergens       0.92      0.96      0.94       391
              packaging defect       0.64      0.56      0.60        16
                  other hazard       0.73      0.66      0.69        29
food additives and flavourings       1.00      0.40      0.57         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.90      1197
                     macro avg       0.76      0.64  

100%|██████████| 300/300 [02:16<00:00,  2.19it/s]


Epoch 5/5 - Training Loss: 0.08474937721310805
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.96      0.96      0.96       404
                foreign bodies       0.89      0.88      0.89       154
                      chemical       0.81      0.89      0.85       100
                         fraud       0.72      0.65      0.68        82
          organoleptic aspects       0.88      0.54      0.67        13
                     allergens       0.93      0.95      0.94       391
              packaging defect       0.53      0.56      0.55        16
                  other hazard       0.73      0.66      0.69        29
food additives and flavourings       0.67      0.40      0.50         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.90      1197
                     macro avg       0.71      0.65  

In [5]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

#with lr = 1e-6

# Train and evaluate the model
train_model(model, train_loader, val_loader, optimizer, scheduler)

100%|██████████| 300/300 [02:27<00:00,  2.03it/s]


Epoch 1/5 - Training Loss: 1.5788312151034674
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.93      0.89      0.91       404
                foreign bodies       0.55      0.12      0.19       154
                      chemical       0.00      0.00      0.00       100
                         fraud       0.00      0.00      0.00        82
          organoleptic aspects       0.00      0.00      0.00        13
                     allergens       0.50      0.99      0.66       391
              packaging defect       0.00      0.00      0.00        16
                  other hazard       0.00      0.00      0.00        29
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.64      1197
                     macro avg       0.20      0.20   

100%|██████████| 300/300 [02:18<00:00,  2.17it/s]


Epoch 2/5 - Training Loss: 1.0569116338094076
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.95      0.90      0.92       404
                foreign bodies       0.48      0.93      0.63       154
                      chemical       0.00      0.00      0.00       100
                         fraud       0.00      0.00      0.00        82
          organoleptic aspects       0.00      0.00      0.00        13
                     allergens       0.73      0.96      0.83       391
              packaging defect       0.00      0.00      0.00        16
                  other hazard       0.00      0.00      0.00        29
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.74      1197
                     macro avg       0.22      0.28   

100%|██████████| 300/300 [02:15<00:00,  2.21it/s]


Epoch 3/5 - Training Loss: 0.8908883117636045
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.94      0.90      0.92       404
                foreign bodies       0.47      0.95      0.63       154
                      chemical       0.90      0.19      0.31       100
                         fraud       0.00      0.00      0.00        82
          organoleptic aspects       0.00      0.00      0.00        13
                     allergens       0.80      0.97      0.88       391
              packaging defect       0.00      0.00      0.00        16
                  other hazard       0.00      0.00      0.00        29
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.76      1197
                     macro avg       0.31      0.30   

100%|██████████| 300/300 [02:15<00:00,  2.21it/s]


Epoch 4/5 - Training Loss: 0.819423645734787
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.94      0.91      0.92       404
                foreign bodies       0.47      0.94      0.62       154
                      chemical       0.88      0.42      0.57       100
                         fraud       0.00      0.00      0.00        82
          organoleptic aspects       0.00      0.00      0.00        13
                     allergens       0.84      0.97      0.90       391
              packaging defect       0.00      0.00      0.00        16
                  other hazard       0.00      0.00      0.00        29
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.78      1197
                     macro avg       0.31      0.32    

100%|██████████| 300/300 [02:15<00:00,  2.21it/s]


Epoch 5/5 - Training Loss: 0.7870138581593832
Validation Classification Report:
                                precision    recall  f1-score   support

                    biological       0.93      0.91      0.92       404
                foreign bodies       0.49      0.94      0.64       154
                      chemical       0.86      0.50      0.63       100
                         fraud       0.00      0.00      0.00        82
          organoleptic aspects       0.00      0.00      0.00        13
                     allergens       0.84      0.97      0.90       391
              packaging defect       0.00      0.00      0.00        16
                  other hazard       0.00      0.00      0.00        29
food additives and flavourings       0.00      0.00      0.00         5
                     migration       0.00      0.00      0.00         3

                      accuracy                           0.79      1197
                     macro avg       0.31      0.33   