In [None]:
!pip install transformers
!pip install torch
!pip install tiktoken
!pip install sentencepiece==0.1.96
!pip install focal-loss-torch

In [3]:
import torch
torch.cuda.empty_cache()

import gc
gc.collect()
torch.cuda.empty_cache()

In [14]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, DebertaV2Tokenizer, DebertaV2Model,DebertaV2ForSequenceClassification
from sklearn.model_selection import StratifiedKFold
import tiktoken
from transformers import DebertaV2Tokenizer
import sentencepiece
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from focal_loss import FocalLoss


# Load your dataset (adjust file path)
df = pd.read_csv("Data/incidents_train.csv", encoding='utf-8')  # Replace with your file path

# Drop missing values
df = df.dropna(subset=["title", "product-category", "hazard-category"])

In [15]:
# Convert labels to integers (for classification)
df["product-category"] = df["product-category"].astype("category").cat.codes
df["hazard-category"] = df["hazard-category"].astype("category").cat.codes


# Create stratification label by combining product and hazard categories
df["stratify_label"] = df["product-category"].astype(str) + "_" + df["hazard-category"].astype(str)

# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Get train-test split indices
for train_index, test_index in skf.split(df["title"], df["stratify_label"]):
    train_df = df.iloc[train_index]
    test_df = df.iloc[test_index]
    break  # Use the first fold as train-test split

# Extract final train and test sets
X_train, X_test = train_df["title"], test_df["title"]
y_train_product, y_test_product = train_df["product-category"], test_df["product-category"]
y_train_hazard, y_test_hazard = train_df["hazard-category"], test_df["hazard-category"]
# Check class distribution in the training set before applying weights
print("Class distribution before applying class weights (Product Category):")
print(y_train_product.value_counts())

print("\nClass distribution before applying class weights (Hazard Category):")
print(y_train_hazard.value_counts())

MODEL_NAME = "microsoft/deberta-v3-base"  # Change to DeBERTa Large
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)
model = DebertaV2Model.from_pretrained(MODEL_NAME)

# Move tokenized input to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def tokenize_text(texts):
    encodings = tokenizer(list(texts), padding=True, truncation=True, max_length=128, return_tensors="pt")
    return encodings.to(device)

train_encodings = tokenize_text(X_train)
test_encodings = tokenize_text(X_test)

# Convert labels to tensors
train_labels_product = torch.tensor(y_train_product.values)
train_labels_hazard = torch.tensor(y_train_hazard.values)
test_labels_product = torch.tensor(y_test_product.values)
test_labels_hazard = torch.tensor(y_test_hazard.values)

# Compute Class Weights for the resampled data
product_class_weights = compute_class_weight("balanced", classes=np.unique(y_train_product), y=y_train_product)
hazard_class_weights = compute_class_weight("balanced", classes=np.unique(y_train_hazard), y=y_train_hazard)

# Convert class weights to tensors and move them to GPU if available
product_class_weights = torch.tensor(product_class_weights, dtype=torch.float).to(device)
hazard_class_weights = torch.tensor(hazard_class_weights, dtype=torch.float).to(device)
# Convert Pandas Series to NumPy arrays for proper indexing
y_train_product_np = y_train_product.to_numpy()
y_train_hazard_np = y_train_hazard.to_numpy()



Class distribution before applying class weights (Product Category):
13    1149
1      536
9      428
18     375
19     214
20     211
15     210
12     178
2      168
3      136
14     106
4      104
10     103
0       47
16      44
5       15
17      15
7        7
11       6
6        5
8        4
21       4
Name: product-category, dtype: int64

Class distribution before applying class weights (Hazard Category):
0    1485
1    1390
4     449
5     298
2     232
8     106
9      43
7      41
3      19
6       2
Name: hazard-category, dtype: int64


In [16]:
import torch.nn as nn
from transformers import AutoModel
import torch.utils.checkpoint as checkpoint

class ProductDeBERTa(nn.Module):
    def __init__(self, model_name, num_product_labels):
        super(ProductDeBERTa, self).__init__()
        self.deberta = DebertaV2Model.from_pretrained(model_name)
        self.product_classifier = nn.Linear(self.deberta.config.hidden_size, num_product_labels)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask):
        # DeBERTa outputs a BaseModelOutput object
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # Extract last hidden state
        cls_hidden_state = hidden_states[:, 0, :]  # Get [CLS] token representation
        cls_hidden_state = self.dropout(cls_hidden_state)  # Apply dropout
        product_logits = self.product_classifier(cls_hidden_state)  # Apply classifier on [CLS] token
        return product_logits  # Return the logits directly
        
class HazardDeBERTa(nn.Module):
    def __init__(self, model_name, num_hazard_labels):
        super(HazardDeBERTa, self).__init__()
        self.deberta = DebertaV2Model.from_pretrained(model_name)
        self.hazard_classifier = nn.Linear(self.deberta.config.hidden_size, num_hazard_labels)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask):
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        cls_hidden_state = hidden_states[:, 0, :]  # [CLS] token
        cls_hidden_state = self.dropout(cls_hidden_state)
        hazard_logits = self.hazard_classifier(cls_hidden_state)
        return hazard_logits
# Get number of classes for each task
num_product_labels = len(df["product-category"].unique())
num_hazard_labels = len(df["hazard-category"].unique())

# Initialize both models separately
product_model = ProductDeBERTa(MODEL_NAME, num_product_labels)
hazard_model = HazardDeBERTa(MODEL_NAME, num_hazard_labels)


In [17]:
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from torch.utils.data import WeightedRandomSampler

# Create Dataset for product task
class ProductDataset(Dataset):
    def __init__(self, encodings, labels_product):
        self.encodings = encodings
        self.labels_product = labels_product

    def __len__(self):
        return len(self.labels_product)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels_product"] = self.labels_product[idx]
        return item

# Create Dataset for hazard task
class HazardDataset(Dataset):
    def __init__(self, encodings, labels_hazard):
        self.encodings = encodings
        self.labels_hazard = labels_hazard

    def __len__(self):
        return len(self.labels_hazard)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels_hazard"] = self.labels_hazard[idx]
        return item

# Create Dataset for product classification task
product_train_dataset = ProductDataset(train_encodings, train_labels_product)
product_test_dataset = ProductDataset(test_encodings, test_labels_product)

# Create Dataset for hazard classification task
hazard_train_dataset = HazardDataset(train_encodings, train_labels_hazard)
hazard_test_dataset = HazardDataset(test_encodings, test_labels_hazard)

# Create DataLoader for the product classification task
product_train_loader = DataLoader(product_train_dataset, batch_size=16, shuffle=True)
product_test_loader = DataLoader(product_test_dataset, batch_size=16, shuffle=False)

# Create DataLoader for the hazard classification task
hazard_train_loader = DataLoader(hazard_train_dataset, batch_size=16, shuffle=True)
hazard_test_loader = DataLoader(hazard_test_dataset, batch_size=16, shuffle=False)


In [18]:
from transformers import get_scheduler
import torch.nn as nn
from torch.optim import AdamW
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F

# Assuming product_class_weights and hazard_class_weights are tensors
loss_fn_product = FocalLoss(gamma=2, reduction='mean')
loss_fn_hazard = FocalLoss(gamma=2, reduction='mean')

# Separate optimizers for product and hazard models
optimizer_product = AdamW(product_model.parameters(), lr=2e-5, weight_decay=0.01)
optimizer_hazard = AdamW(hazard_model.parameters(), lr=2e-5, weight_decay=0.01)

# Learning rate scheduler for both models (if necessary)
num_training_steps_product = len(product_train_loader) * 5  # 5 epochs for product model
num_training_steps_hazard = len(hazard_train_loader) * 5  # 5 epochs for hazard model
lr_scheduler_product = get_scheduler("linear", optimizer=optimizer_product, num_warmup_steps=0, num_training_steps=num_training_steps_product)
lr_scheduler_hazard = get_scheduler("linear", optimizer=optimizer_hazard, num_warmup_steps=0, num_training_steps=num_training_steps_hazard)

product_model = product_model.to(device)
hazard_model = hazard_model.to(device)

# Training Loop for both models
epochs = 5
for epoch in range(epochs):
    # Train Product Model
    product_model.train()
    total_loss_product = 0
    for batch in product_train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}  # Move batch items to the correct device

        optimizer_product.zero_grad()  # Zero gradients for product model

        # Forward pass for product model
        product_logits = product_model(batch["input_ids"], batch["attention_mask"])

        product_probs = F.softmax(product_logits, dim=-1)  # For product classification

        # Ensure labels are in LongTensor format
        batch["labels_product"] = batch["labels_product"].to(device).long()

        # Compute loss for product model
        loss_product = loss_fn_product(product_probs, batch["labels_product"])

        # Backpropagation for product model
        loss_product.backward()
        optimizer_product.step()

        # Step learning rate scheduler
        lr_scheduler_product.step()

        total_loss_product += loss_product.item()

    # Average loss for product model after each epoch
    avg_loss_product = total_loss_product / len(product_train_loader)
    print(f"Epoch {epoch+1} | Product Model Loss: {avg_loss_product:.4f}")

    # Train Hazard Model
    hazard_model.train()
    total_loss_hazard = 0
    for batch in hazard_train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}  # Move batch items to the correct device

        optimizer_hazard.zero_grad()  # Zero gradients for hazard model

        # Forward pass for hazard model
        hazard_logits = hazard_model(batch["input_ids"], batch["attention_mask"])

        hazard_probs = F.softmax(hazard_logits, dim=-1)    # For hazard classification

        # Ensure labels are in LongTensor format
        batch["labels_hazard"] = batch["labels_hazard"].to(device).long()

        # Compute loss for hazard model
        loss_hazard = loss_fn_hazard(hazard_probs, batch["labels_hazard"])

        # Backpropagation for hazard model
        loss_hazard.backward()
        optimizer_hazard.step()

        # Step learning rate scheduler
        lr_scheduler_hazard.step()

        total_loss_hazard += loss_hazard.item()

    # Average loss for hazard model after each epoch
    avg_loss_hazard = total_loss_hazard / len(hazard_train_loader)
    print(f"Epoch {epoch+1} | Hazard Model Loss: {avg_loss_hazard:.4f}")

print("Training complete!")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1 | Product Model Loss: 1.8739


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1 | Hazard Model Loss: 0.7347
Epoch 2 | Product Model Loss: 1.0483
Epoch 2 | Hazard Model Loss: 0.3920
Epoch 3 | Product Model Loss: 0.7004
Epoch 3 | Hazard Model Loss: 0.2956
Epoch 4 | Product Model Loss: 0.5175
Epoch 4 | Hazard Model Loss: 0.2291
Epoch 5 | Product Model Loss: 0.4358
Epoch 5 | Hazard Model Loss: 0.1813
Training complete!


In [6]:
from sklearn.metrics import classification_report

def evaluate_product_model(model, dataloader):
    model.eval()
    all_preds_product = []
    all_labels_product = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            product_logits = model(batch["input_ids"], batch["attention_mask"])
            preds_product = torch.argmax(product_logits, dim=1).cpu().numpy()
            all_preds_product.extend(preds_product)
            all_labels_product.extend(batch["labels_product"].cpu().numpy())
    
    print("Product Category Classification Report:")
    print(classification_report(all_labels_product, all_preds_product))
    return all_labels_product, all_preds_product

def evaluate_hazard_model(model, dataloader):
    model.eval()
    all_preds_hazard = []
    all_labels_hazard = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            hazard_logits = model(batch["input_ids"], batch["attention_mask"])
            preds_hazard = torch.argmax(hazard_logits, dim=1).cpu().numpy()
            all_preds_hazard.extend(preds_hazard)
            all_labels_hazard.extend(batch["labels_hazard"].cpu().numpy())
    
    print("Hazard Category Classification Report:")
    print(classification_report(all_labels_hazard, all_preds_hazard))
    return all_labels_hazard, all_preds_hazard

# Evaluate the product and hazard models separately
y_test_product, all_preds_product = evaluate_product_model(product_model, product_test_loader)
y_test_hazard, all_preds_hazard = evaluate_hazard_model(hazard_model, hazard_test_loader)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Product Category Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.60      0.55        15
           1       0.70      0.75      0.72       155
           2       0.61      0.79      0.69        48
           3       0.54      0.41      0.46        37
           4       0.53      0.51      0.52        35
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         4
           9       0.68      0.73      0.70       133
          10       0.50      0.63      0.56        35
          11       0.00      0.00      0.00         1
          12       0.90      0.80      0.85        46
          13       0.86      0.91      0.88       339
          14       0.81      0.61      0.70        36
          15       0.68      0.77      0.72        65
          16       0.00      0.00      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Hazard Category Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       392
           1       0.81      0.87      0.84       403
           2       0.65      0.64      0.65       100
           3       0.00      0.00      0.00         6
           4       0.61      0.66      0.64       155
           5       0.81      0.57      0.67        82
           6       0.00      0.00      0.00         3
           7       0.80      0.36      0.50        11
           8       0.45      0.34      0.39        29
           9       1.00      0.25      0.40        16

    accuracy                           0.78      1197
   macro avg       0.60      0.46      0.49      1197
weighted avg       0.77      0.78      0.77      1197



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
from sklearn.metrics import f1_score
import numpy as np

def compute_score(hazards_true, products_true, hazards_pred, products_pred):
    # Convert to NumPy arrays for Boolean indexing
    hazards_true = np.array(hazards_true)
    products_true = np.array(products_true)
    hazards_pred = np.array(hazards_pred)
    products_pred = np.array(products_pred)

    # Compute F1-score for hazards
    f1_hazards = f1_score(hazards_true, hazards_pred, average='macro')

    # Compute F1-score for products (only where hazard prediction is correct)
    correct_hazard_mask = hazards_pred == hazards_true
    f1_products = f1_score(
        products_true[correct_hazard_mask], 
        products_pred[correct_hazard_mask], 
        average='macro'
    ) if np.any(correct_hazard_mask) else 0  # Handle case where no correct hazards

    # Compute final score
    return (f1_hazards + f1_products) / 2.

score = compute_score(y_test_hazard, y_test_product, all_preds_hazard, all_preds_product)
print(f"Custom F1 Score: {score:.4f}")

Custom F1 Score: 0.5079


In [10]:
# Load new test file (adjust file path)
new_test_df = pd.read_csv("Data/incidents_test.csv")  # Replace with your actual file

# Ensure it has the same preprocessing
new_test_df = new_test_df.dropna(subset=["title", "product-category", "hazard-category"])
new_test_df["product-category"] = new_test_df["product-category"].astype("category").cat.codes
new_test_df["hazard-category"] = new_test_df["hazard-category"].astype("category").cat.codes

# Tokenize new test set
new_test_encodings = tokenize_text(new_test_df["title"])

# Convert labels to tensors
new_test_labels_product = torch.tensor(new_test_df["product-category"].values)
new_test_labels_hazard = torch.tensor(new_test_df["hazard-category"].values)

# Create Dataset for product classification task
product_new_test_dataset = ProductDataset(new_test_encodings, new_test_labels_product)

# Create Dataset for hazard classification task
hazard_new_test_dataset = HazardDataset(new_test_encodings, new_test_labels_hazard)

# Create DataLoader for the product classification task
product_new_test_loader = DataLoader(product_new_test_dataset, batch_size=16, shuffle=False)

# Create DataLoader for the hazard classification task
hazard_new_test_loader = DataLoader(hazard_new_test_dataset, batch_size=16, shuffle=False)

# Evaluate the product model
def evaluate_product_model(model, dataloader):
    model.eval()
    all_preds_product = []
    all_labels_product = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            product_logits = model(batch["input_ids"], batch["attention_mask"])
            preds_product = torch.argmax(product_logits, dim=1).cpu().numpy()
            all_preds_product.extend(preds_product)
            all_labels_product.extend(batch["labels_product"].cpu().numpy())
    
    return all_labels_product, all_preds_product

# Evaluate the hazard model
def evaluate_hazard_model(model, dataloader):
    model.eval()
    all_preds_hazard = []
    all_labels_hazard = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            hazard_logits = model(batch["input_ids"], batch["attention_mask"])
            preds_hazard = torch.argmax(hazard_logits, dim=1).cpu().numpy()
            all_preds_hazard.extend(preds_hazard)
            all_labels_hazard.extend(batch["labels_hazard"].cpu().numpy())
    
    return all_labels_hazard, all_preds_hazard

# Evaluate product and hazard models separately
y_new_test_product, all_preds_product = evaluate_product_model(product_model, product_new_test_loader)
y_new_test_hazard, all_preds_hazard = evaluate_hazard_model(hazard_model, hazard_new_test_loader)

# Compute custom F1 score
new_score = compute_score(y_new_test_hazard, y_new_test_product, all_preds_hazard, all_preds_product)
print(f"Custom F1 Score on New Test Data: {new_score:.4f}")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Custom F1 Score on New Test Data: 0.4036
