In [51]:
%%capture
!pip install datasets

In [52]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from transformers import DataCollatorWithPadding
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import classification_report, precision_recall_curve
from torch.nn import BCEWithLogitsLoss
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords for cleaning text
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Step 1: Load your datasets
train = pd.read_csv('incidents_labelled.csv')  # Training dataset with labels
test = pd.read_csv('incidents_val.csv')  # Test dataset without labels


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
# Preprocessing function to clean text data
def preprocess_text(text):
    # Remove special characters, numbers, and extra whitespace
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower().strip()  # Lowercase and strip whitespaces

    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])

    return text

# Apply text preprocessing on the 'text' column
train['text'] = train['text'].apply(preprocess_text)
test['text'] = test['text'].apply(preprocess_text)

# Step 3: Define Tokenization function using BERT tokenizer
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [54]:
# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

# Step 4: Label Encoding and Oversampling for Class Imbalance
labels = ['hazard-category', 'product-category', 'hazard', 'product']
label_encoders = {label: LabelEncoder() for label in labels}

# Apply label encoding
for label in labels:
    train[label] = label_encoders[label].fit_transform(train[label])

# Combine all label columns into a single column of lists (multi-label format)
train['labels'] = train[labels].values.tolist()

# Perform oversampling on the minority classes in the training set
from sklearn.utils import resample


In [55]:

def oversample_minority(train_df, labels):
    # Separate majority and minority classes
    majority_df = train_df[train_df[labels[0]] == train_df[labels[0]].mode().values[0]]

    # Resample each minority class to balance with the majority
    resampled_dfs = [majority_df]
    for label in labels:
        minority_df = train_df[train_df[label] != train_df[label].mode().values[0]]
        resampled_df = resample(minority_df, replace=True, n_samples=len(majority_df), random_state=42)
        resampled_dfs.append(resampled_df)

    # Combine the majority and oversampled minority classes
    train_df_oversampled = pd.concat(resampled_dfs)
    return train_df_oversampled


In [56]:

# Apply oversampling on the training dataset
train_df_oversampled = oversample_minority(train, labels)

# Step 5: Train-Test Split
# Split train data into training and validation sets (80-20 split)
train_df, val_df = train_test_split(train_df_oversampled, test_size=0.2, random_state=42)

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test)

# Step 6: Tokenize the datasets using BERT tokenizer
# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove columns not needed for training
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

Map:   0%|          | 0/8080 [00:00<?, ? examples/s]

Map:   0%|          | 0/2020 [00:00<?, ? examples/s]

Map:   0%|          | 0/565 [00:00<?, ? examples/s]

In [57]:
# Step 7: Create DataCollator for Padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create DataLoader objects
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

# Step 8: Handle Class Imbalance with Class Weights
# Calculate class weights based on label distribution in the training set
label_counts = train[labels].sum(axis=0)
class_weights = 1.0 / label_counts
class_weights_tensor = torch.tensor(class_weights.values).float().to('cuda')

# Custom loss function with class weights for multi-label classification
loss_fn = BCEWithLogitsLoss(pos_weight=class_weights_tensor)


In [58]:
# Step 9: Initialize the BERT Model for Multi-Label Classification
# Load BERT model for sequence classification with 4 labels (multi-label classification)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4, problem_type="multi_label_classification")
model.to('cuda')  # Move model to GPU if available

# Step 10: Set Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [59]:
# Step 11: Training Loop with Class Weights
model.train()
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    for batch in train_dataloader:
        labels = batch.pop('labels').float().to('cuda')
        batch = {k: v.to('cuda') for k, v in batch.items()}

        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=labels)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


  8%|▊         | 536/6315 [06:44<1:12:45,  1.32it/s]
100%|██████████| 5050/5050 [16:12<00:00,  5.19it/s]

In [None]:
# Step 12: Dynamic Threshold Tuning for Label-Wise Prediction
# Dynamic thresholding function for finding the best threshold for each label
def find_best_thresholds(y_true, y_pred):
    thresholds = []
    for i in range(y_true.shape[1]):  # Iterate through each label
        precision, recall, thresholds_ = precision_recall_curve(y_true[:, i], y_pred[:, i])
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)  # Avoid division by zero
        best_idx = np.argmax(f1_scores)  # Find index of best F1 score
        thresholds.append(thresholds_[best_idx])
    return thresholds

# Evaluate model and calculate optimal thresholds on validation set
model.eval()
val_preds = []
val_labels = []

In [None]:
with torch.no_grad():
    for batch in val_dataloader:
        labels = batch.pop('labels').cpu().numpy()
        val_labels.append(labels)

        batch = {k: v.to('cuda') for k, v in batch.items()}

        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        preds = torch.sigmoid(outputs.logits).cpu().numpy()  # Sigmoid for multi-label classification
        val_preds.append(preds)

val_preds = np.vstack(val_preds)  # Stack predictions
val_labels = np.vstack(val_labels)  # Stack true labels


In [None]:
# Step 13: Find the best thresholds for each label
best_thresholds = find_best_thresholds(val_labels, val_preds)

# Apply thresholds on validation set predictions
final_preds = (val_preds > np.array(best_thresholds)).astype(int)

# Step 14: Label-Wise Evaluation Using Tuned Thresholds
# Label-wise evaluation using the tuned thresholds
for i, label_name in enumerate(['hazard-category', 'product-category', 'hazard', 'product']):
    print(f"Classification report for {label_name}:")
    print(classification_report(val_labels[:, i], final_preds[:, i]))
