In [None]:
%%capture
!pip install datasets
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler, DataCollatorWithPadding


In [None]:
# Step 1: Read the incident CSVs into DataFrames
TRAIN_CSV = 'incidents_labelled.csv'  # rows carrying the target columns used for training
TEST_CSV = 'incidents_val.csv'  # rows that the prediction columns are later added to
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Step 2: Load the pretrained BERT tokenizer that the tokenization function relies on
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)




In [None]:
def tokenize_function(examples, text_column='title', max_length=128):
    """Tokenize one batch of examples with the module-level ``tokenizer``.

    Intended to be passed to ``Dataset.map(..., batched=True)``; the two
    keyword parameters default to the values that were previously hard-coded,
    so existing ``map(tokenize_function, batched=True)`` calls are unaffected.

    Args:
        examples: Mapping from column name to the list of values for the batch.
        text_column: Name of the column whose text is encoded.
        max_length: Forwarded to the tokenizer as ``max_length`` together with
            ``padding='max_length'`` and ``truncation=True``.

    Returns:
        The tokenizer's output for the batch, unchanged.

    Raises:
        KeyError: If ``text_column`` is not present in ``examples``.
    """
    return tokenizer(
        examples[text_column],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

# Step 3: Label Encoding for each label (Ensure there are 4 labels)
labels = ['hazard-category', 'product-category', 'hazard', 'product']
label_encoders = {label: LabelEncoder() for label in labels}

# Fit one encoder per target column and keep the integer codes in a separate
# frame. The raw target columns of `train` are deliberately left untouched so
# that re-running this cell re-fits the encoders on the original values rather
# than on already-encoded integers (which would lose the class names needed
# by inverse_transform later).
encoded_targets = pd.DataFrame(
    {label: label_encoders[label].fit_transform(train[label]) for label in labels},
    index=train.index,
)

# Combine the encoded columns into a single column of lists: one list of
# integer class ids per row, ordered as in `labels`.
train['labels'] = encoded_targets[labels].values.tolist()

# Step 4: Train-Test Split without stratification (80%-20%)
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)

# Convert DataFrame to Hugging Face Dataset.
# preserve_index=False avoids carrying the shuffled pandas index along as an
# extra dataset column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test, preserve_index=False)

# Step 5: Tokenize the datasets using BERT tokenizer
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Step 6: Create DataCollator for Padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Step 7: Set dataset format, including the 'labels' column
# (the test set has no 'labels' column, so only the model inputs are exposed)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Create DataLoader objects (only the training loader is shuffled)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

# Step 8: Initialize the BERT model with one output per target column
# NOTE(review): each entry of the 'labels' column is an integer class id
# produced by LabelEncoder (the validation report shows ids 0-9 for the first
# target), yet the model is configured for multi-label classification with
# one logit per target column. That setup presumably expects 0/1 indicator
# targets -- confirm against the Transformers docs. A per-target multi-class
# head (or one model per target) is probably what is wanted here.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(labels),
    problem_type="multi_label_classification",
)

# Use the GPU when one is present and fall back to the CPU otherwise, instead
# of failing on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


Map:   0%|          | 0/4787 [00:00<?, ? examples/s]

Map:   0%|          | 0/1197 [00:00<?, ? examples/s]

Map:   0%|          | 0/565 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Step 9: Set optimizer and scheduler
# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# NOTE(review): eps=1e-6 and weight_decay=0.0 are meant to mirror the defaults
# of the transformers implementation so the optimisation is unchanged --
# confirm against the installed transformers version.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)  # one scheduler step per batch
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Step 10: Training Loop
# Send batches to whichever device the model already lives on rather than
# assuming CUDA.
device = next(model.parameters()).device
model.train()
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_dataloader:
        # The per-batch targets get their own name: rebinding `labels` here
        # would overwrite the module-level list of target column names that
        # the evaluation and decoding cells iterate over.
        batch_labels = batch.pop('labels').float().to(device)
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass; passing labels makes the model return the loss
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch_labels,
        )

        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)




  0%|          | 0/1797 [00:00<?, ?it/s]

In [None]:
# Step 11: Evaluation on Validation Set
# Run on whichever device the model already lives on rather than assuming CUDA.
device = next(model.parameters()).device

# Target column names come from the encoder dict, whose keys were inserted in
# the same order as the columns packed into the 'labels' lists. The global
# `labels` name is avoided because it is easy to shadow with per-batch label
# arrays, which previously made the report loop iterate over batch rows and
# fail with an IndexError.
label_names = list(label_encoders)

model.eval()
val_preds = []
val_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        batch_labels = batch.pop('labels').cpu().numpy()  # ground truth stays on the CPU
        val_labels.append(batch_labels)

        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        preds = torch.sigmoid(outputs.logits).cpu().numpy() > 0.5  # Apply thresholding

        val_preds.append(preds)

# Stack the per-batch arrays into (num_examples, num_targets) arrays
val_preds = np.vstack(val_preds)
val_labels = np.vstack(val_labels)

# Ensure there is one prediction column per target
assert val_preds.shape[1] == len(label_names), (
    f"Expected {len(label_names)} labels but got {val_preds.shape[1]}."
)

# NOTE(review): val_preds holds thresholded booleans while val_labels holds
# integer class ids from LabelEncoder, so these reports compare 0/1 against
# multi-class ids -- the numbers are only meaningful once the model predicts
# a class id per target.
for i, label in enumerate(label_names):
    print(f"Classification report for {label}:")
    # zero_division=0 reports 0.0 for classes that are never predicted
    # without emitting a warning per metric.
    print(classification_report(val_labels[:, i], val_preds[:, i], zero_division=0))


Classification report for [  0  14  89 264]:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       377
           1       0.33      1.00      0.50       398
           2       0.00      0.00      0.00       107
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00       166
           5       0.00      0.00      0.00        77
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00        13
           8       0.00      0.00      0.00        33
           9       0.00      0.00      0.00        18

    accuracy                           0.33      1197
   macro avg       0.03      0.10      0.05      1197
weighted avg       0.11      0.33      0.17      1197

Classification report for [  0  13 138 744]:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        14
           1       0.12      1.00      0.2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

IndexError: index 4 is out of bounds for axis 1 with size 4

In [None]:
# Step 12: Prediction on the Test Set (Decoding the Predictions)
# Run on whichever device the model already lives on rather than assuming CUDA.
device = next(model.parameters()).device

model.eval()
test_predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        preds = torch.sigmoid(outputs.logits).cpu().numpy() > 0.5  # Apply thresholding

        test_predictions.append(preds)

# Stack the per-batch boolean arrays into one (num_examples, num_targets)
# array, the same way the validation predictions are assembled.
test_predictions = np.vstack(test_predictions)


In [None]:
# Step 13: Decoding the test set predictions to their original labels using LabelEncoder
# Iterate over the encoder dict itself: its keys are the target column names
# in the order the prediction columns were produced. The global `labels` name
# is avoided because it is easy to shadow with per-batch label arrays, which
# previously caused "TypeError: unhashable type: 'numpy.ndarray'" here.
# NOTE(review): test_predictions holds thresholded booleans, so astype(int)
# yields only 0 or 1 and each target can decode to at most its first two
# classes -- revisit once the model predicts a class id per target.
decoded_predictions = {
    label: encoder.inverse_transform(test_predictions[:, i].astype(int))
    for i, (label, encoder) in enumerate(label_encoders.items())
}

# Step 14: Add predictions to the test dataframe, one 'predicted_<target>' column per target
for label, decoded in decoded_predictions.items():
    test[f'predicted_{label}'] = decoded


TypeError: unhashable type: 'numpy.ndarray'