In [6]:
%%capture
!pip install datasets
import torch
import pandas as pd
import numpy as np
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_scheduler, DataCollatorWithPadding
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tqdm.auto import tqdm
from torch.utils.data import DataLoader # Keep DataLoader import from torch
from datasets import Dataset # Import Dataset from 'datasets' instead of 'torch.utils.data'
import pandas as pd
from sklearn.model_selection import train_test_split


In [8]:
# Step 1: Load your datasets
train = pd.read_csv('incidents_labelled.csv')  # Training dataset with labels
test = pd.read_csv('incidents_val.csv')  # Test dataset without labels

# Step 2: Define Tokenization function using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['title'], padding='max_length', truncation=True, max_length=128)

# Step 3: Label Encoding only for 'hazard-category' label
label_encoder = LabelEncoder()
train['hazard-category'] = label_encoder.fit_transform(train['hazard-category'])

# Step 4: Train-Test Split
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test)

# Step 5: Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Step 6: Create DataCollator for Padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Step 7: Set dataset format, including the 'hazard-category' label
train_dataset = train_dataset.map(lambda examples: {'labels': examples['hazard-category']})
val_dataset = val_dataset.map(lambda examples: {'labels': examples['hazard-category']})
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# DataLoader objects
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

# Step 8: Define Custom Model for Single-Label Classification on 'hazard-category'
class CustomBERTForHazardCategory(BertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = 1  # Single binary label

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits.squeeze(-1)  # Adjust logits for single label
        loss = None
        if labels is not None:
            loss = torch.nn.BCEWithLogitsLoss()(logits, labels.float())
        return {'loss': loss, 'logits': logits} if loss is not None else {'logits': logits}

# Initialize model and set to GPU if available
model = CustomBERTForHazardCategory.from_pretrained('bert-base-uncased', num_labels=1)
model.to('cuda')

# Step 9: Set optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Step 10: Training Loop for Custom Model
model.train()
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_dataloader:
        labels = batch.pop('labels').float().to('cuda')
        batch = {k: v.to('cuda') for k, v in batch.items()}

        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=labels)
        loss = outputs['loss']
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# Step 11: Evaluation on Validation Set for 'hazard-category' Label
model.eval()
val_preds = []
val_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        labels = batch.pop('labels').cpu().numpy()
        val_labels.append(labels)

        batch = {k: v.to('cuda') for k, v in batch.items()}
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        preds = torch.sigmoid(outputs['logits']).cpu().numpy() > 0.5  # Apply thresholding

        val_preds.append(preds)

# Convert predictions and labels to numpy arrays
val_preds = np.concatenate(val_preds).astype(int)
val_labels = np.concatenate(val_labels).astype(int)

# Classification Report for 'hazard-category'
print("Classification report for 'hazard-category':")
print(classification_report(val_labels, val_preds))




Map:   0%|          | 0/4787 [00:00<?, ? examples/s]

Map:   0%|          | 0/1197 [00:00<?, ? examples/s]

Map:   0%|          | 0/565 [00:00<?, ? examples/s]

Map:   0%|          | 0/4787 [00:00<?, ? examples/s]

Map:   0%|          | 0/1197 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of CustomBERTForHazardCategory were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1797 [00:00<?, ?it/s]

Classification report for 'hazard-category':
              precision    recall  f1-score   support

           0       0.94      0.63      0.75       377
           1       0.42      1.00      0.59       398
           2       0.00      0.00      0.00       107
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00       166
           5       0.00      0.00      0.00        77
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00        13
           8       0.00      0.00      0.00        33
           9       0.00      0.00      0.00        18

    accuracy                           0.53      1197
   macro avg       0.14      0.16      0.13      1197
weighted avg       0.44      0.53      0.43      1197



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
