# 3rd assignment-ST1

In case you don't have the libraries below installed, please install them and afterwards make the commands comments to avoid reinstallation.

In [13]:
!pip install transformers
!pip install torch
!pip install tiktoken
!pip install sentencepiece==0.1.96
!pip install focal-loss-torch



Below we are importing all the libraries that are needed for the development. Afterwards, we are importing 2 files, one for training and one for validation.

In [14]:
# Basic libraries
import random
import numpy as np
import pandas as pd
from collections import Counter

# Scikit-learn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, classification_report

# NLTK (Natural Language Toolkit)
import nltk
from nltk.corpus import wordnet

# PyTorch & Deep Learning Libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch.optim import AdamW
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

# Transformers (Hugging Face)
from transformers import BertTokenizer, BertModel, DebertaV2Tokenizer
from transformers import get_scheduler

# Tokenization & Preprocessing Libraries
import tiktoken
import sentencepiece

# Custom Loss Function
from focal_loss import FocalLoss


# Load train and validation datasets )
df_train = pd.read_csv("Data/incidents_train.csv", encoding='utf-8')
df_valid = pd.read_csv("Data/incidents_valid.csv", encoding='utf-8')

# Prepare the training data
df_train = df_train.dropna(subset=["title", "product-category", "hazard-category"])
df_train["product-category"] = df_train["product-category"].astype("category").cat.codes
df_train["hazard-category"] = df_train["hazard-category"].astype("category").cat.codes

# Prepare the validation data
df_valid = df_valid.dropna(subset=["title", "product-category", "hazard-category"])
df_valid["product-category"] = df_valid["product-category"].astype("category").cat.codes
df_valid["hazard-category"] = df_valid["hazard-category"].astype("category").cat.codes

# Extract features and labels for training and validation sets
X_train = df_train["title"]
y_train_product = df_train["product-category"]
y_train_hazard = df_train["hazard-category"]

X_test = df_valid["title"]
y_test_product = df_valid["product-category"]
y_test_hazard = df_valid["hazard-category"]

Below, there are some methods for augmentation using synonyms

In [15]:
# Download necessary NLTK data (run once) Afterwards, add them as comment
nltk.download("wordnet")
nltk.download("omw-1.4")

# Synonym replacement function using WordNet
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    if word in synonyms:
        synonyms.remove(word)
    return list(synonyms)

# Function for synonym augmentation
def synonym_augmentation(text, aug_p=0.1):
    words = text.split()
    augmented_text = words[:]
    
    for i, word in enumerate(words):
        if random.random() < aug_p:  # Probability of replacing the word
            synonyms = get_synonyms(word)
            if synonyms:
                augmented_text[i] = random.choice(synonyms)
    
    return " ".join(augmented_text)
class_counts_products = df_train['product-category'].value_counts() 
class_counts_hazards = df_train['hazard-category'].value_counts() 

minor_product =200
minor_hazard=54
under_represented_classes_product = class_counts_products[class_counts_products < minor_product].index.tolist() 
under_represented_classes_hazard = class_counts_hazards[class_counts_hazards < minor_hazard].index.tolist() 

minority_df_products = df_train[df_train["product-category"].isin(under_represented_classes_product)] 
minority_df_hazards = df_train[df_train["hazard-category"].isin(under_represented_classes_hazard)] 
max_class_size_product = 1434 
max_class_size_hazard = 1854

def compute_augmentation_factor(class_size, max_class_size): 
    return min(int(np.log2(max_class_size / class_size) + 1), 15)

augmentation_factors_products={ 
    category: compute_augmentation_factor(count, max_class_size_product) 
    for category, count in class_counts_products.items() 
}
augmentation_factors_hazards={ 
    category: compute_augmentation_factor(count, max_class_size_hazard) 
    for category, count in class_counts_hazards.items()
}

augmented_data_products = []
for index, row in minority_df_products.iterrows(): 
    original_title = row['title'] 
    product_category = row['product-category'] 
    num_augmentations = augmentation_factors_products[product_category] 
    for _ in range(num_augmentations): 
        augmented_title=synonym_augmentation(original_title) 
        augmented_data_products.append({'title': augmented_title, 'product-category': product_category}) 
augmented_product_df = pd.DataFrame(augmented_data_products) 
augmented_product_df=augmented_product_df.dropna(subset=['title']) 

# Merge original and augmented product data
df_train_product_augmented = pd.concat([df_train, augmented_product_df], ignore_index=True)

# Extract updated X_train_product and y_train_product
X_train_product = df_train_product_augmented["title"]

augmented_data_hazards = []
for index, row in minority_df_hazards.iterrows(): 
    original_title = row['title'] 
    hazard_category = row['hazard-category'] 
    num_augmentations = augmentation_factors_hazards[hazard_category] 
    for _ in range(num_augmentations): 
        augmented_title=synonym_augmentation(original_title) 
        augmented_data_hazards.append({'title': augmented_title, 'hazard-category': hazard_category}) 
augmented_hazard_df = pd.DataFrame(augmented_data_hazards) 
augmented_hazard_df=augmented_hazard_df.dropna(subset=['title']) 

# Merge original and augmented product data
df_train_hazard_augmented = pd.concat([df_train, augmented_hazard_df], ignore_index=True)

# Extract updated X_train_product and y_train_product
X_train_hazard = df_train_hazard_augmented["title"]


[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


This code sets up a BERT model for both product and hazard prediction, tokenizes the text, and prepares labels as tensors for training and evaluation.

In [16]:

# Model Name (using BERT)
MODEL_NAME = "bert-base-uncased"  # Change to BERT (you can use bert-large-uncased if you want a larger model)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

# Move tokenized input to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def tokenize_text(texts):
    encodings = tokenizer(list(texts), padding=True, truncation=True, max_length=128, return_tensors="pt")
    return encodings.to(device)

# Tokenize texts for both models (product and hazard)
train_encodings_product = tokenize_text(X_train_product)  # Tokenized inputs for product model
train_encodings_hazard = tokenize_text(X_train_hazard)
test_encodings = tokenize_text(X_test)

# Convert labels to tensors
train_labels_product = torch.tensor(y_train_product.values)
train_labels_hazard = torch.tensor(y_train_hazard.values)
test_labels_product = torch.tensor(y_test_product.values)
test_labels_hazard = torch.tensor(y_test_hazard.values)


This code defines two separate BERT-based classification models—one for product classification and one for hazard classification—using PyTorch and Hugging Face’s BERT.

In [17]:
class ProductBERT(nn.Module):
    def __init__(self, model_name, num_product_labels):
        super(ProductBERT, self).__init__()
        # Change to BERT
        self.bert = BertModel.from_pretrained(model_name)
        self.product_classifier = nn.Linear(self.bert.config.hidden_size, num_product_labels)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask):
        # BERT outputs a BaseModelOutput object
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # Extract last hidden state
        cls_hidden_state = hidden_states[:, 0, :]  # Get [CLS] token representation
        cls_hidden_state = self.dropout(cls_hidden_state)  # Apply dropout
        product_logits = self.product_classifier(cls_hidden_state)  # Apply classifier on [CLS] token
        return product_logits  # Return the logits directly
        
class HazardBERT(nn.Module):
    def __init__(self, model_name, num_hazard_labels):
        super(HazardBERT, self).__init__()
        # Change to BERT
        self.bert = BertModel.from_pretrained(model_name)
        self.hazard_classifier = nn.Linear(self.bert.config.hidden_size, num_hazard_labels)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        cls_hidden_state = hidden_states[:, 0, :]  # [CLS] token
        cls_hidden_state = self.dropout(cls_hidden_state)
        hazard_logits = self.hazard_classifier(cls_hidden_state)
        return hazard_logits

# Get number of classes for each task
num_product_labels = len(df_train["product-category"].unique())
num_hazard_labels = len(df_train["hazard-category"].unique())

# Initialize both models separately using BERT instead of DeBERTa
product_model = ProductBERT("bert-base-uncased", num_product_labels)  # Use BERT model
hazard_model = HazardBERT("bert-base-uncased", num_hazard_labels)  # Use BERT model


This code prepares the dataset and data loaders for training and evaluating the BERT-based product and hazard classification models using PyTorch’s Dataset and DataLoader classes. 

In [18]:
# Create Dataset for product task
class ProductDataset(Dataset):
    def __init__(self, encodings, labels_product):
        self.encodings = encodings
        self.labels_product = labels_product

    def __len__(self):
        return len(self.labels_product)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels_product"] = self.labels_product[idx]
        return item

# Create Dataset for hazard task
class HazardDataset(Dataset):
    def __init__(self, encodings, labels_hazard):
        self.encodings = encodings
        self.labels_hazard = labels_hazard

    def __len__(self):
        return len(self.labels_hazard)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels_hazard"] = self.labels_hazard[idx]
        return item

# Create WeightedRandomSampler for the product category
#weighted_sampler_product = WeightedRandomSampler(weights=sample_weights_product, num_samples=len(sample_weights_product), replacement=True)

# Create WeightedRandomSampler for the hazard category (optional to apply to both)
#weighted_sampler_hazard = WeightedRandomSampler(weights=sample_weights_hazard, num_samples=len(sample_weights_hazard), replacement=True)

# Create Dataset for product classification task
product_train_dataset = ProductDataset(train_encodings_product, train_labels_product)
product_test_dataset = ProductDataset(test_encodings, test_labels_product)

# Create Dataset for hazard classification task
hazard_train_dataset = HazardDataset(train_encodings_hazard, train_labels_hazard)
hazard_test_dataset = HazardDataset(test_encodings, test_labels_hazard)

# Create DataLoader for the product classification task
product_train_loader = DataLoader(product_train_dataset, batch_size=16, shuffle=True)
product_test_loader = DataLoader(product_test_dataset, batch_size=16, shuffle=False)

# Create DataLoader for the hazard classification task
hazard_train_loader = DataLoader(hazard_train_dataset, batch_size=16,  shuffle=True)
hazard_test_loader = DataLoader(hazard_test_dataset, batch_size=16, shuffle=False)


This code trains two separate BERT-based models—one for product classification and one for hazard classification—using a custom training loop

In [None]:
# Assuming product_class_weights and hazard_class_weights are tensors
loss_fn_product = FocalLoss(gamma=2, reduction='mean')
loss_fn_hazard = FocalLoss(gamma=2, reduction='mean')

# Separate optimizers for product and hazard models
optimizer_product = AdamW(product_model.parameters(), lr=2e-5, weight_decay=0.01)
optimizer_hazard = AdamW(hazard_model.parameters(), lr=2e-5, weight_decay=0.01)

# Learning rate scheduler for both models (if necessary)
num_training_steps_product = len(product_train_loader) * 5  # 5 epochs for product model
num_training_steps_hazard = len(hazard_train_loader) * 5  # 5 epochs for hazard model
lr_scheduler_product = get_scheduler("linear", optimizer=optimizer_product, num_warmup_steps=0, num_training_steps=num_training_steps_product)
lr_scheduler_hazard = get_scheduler("linear", optimizer=optimizer_hazard, num_warmup_steps=0, num_training_steps=num_training_steps_hazard)

product_model = product_model.to(device)
hazard_model = hazard_model.to(device)

# Training Loop for both models
epochs = 6
for epoch in range(epochs):
    # Train Product Model
    product_model.train()
    total_loss_product = 6
    for batch in product_train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}  # Move batch items to the correct device

        optimizer_product.zero_grad()  # Zero gradients for product model

        # Forward pass for product model
        product_logits = product_model(batch["input_ids"], batch["attention_mask"])

        product_probs = F.softmax(product_logits, dim=-1)  # For product classification

        # Ensure labels are in LongTensor format
        batch["labels_product"] = batch["labels_product"].to(device).long()

        # Compute loss for product model
        loss_product = loss_fn_product(product_probs, batch["labels_product"])

        # Backpropagation for product model
        loss_product.backward()
        optimizer_product.step()

        # Step learning rate scheduler
        lr_scheduler_product.step()

        total_loss_product += loss_product.item()

    # Average loss for product model after each epoch
    avg_loss_product = total_loss_product / len(product_train_loader)
    print(f"Epoch {epoch+1} | Product Model Loss: {avg_loss_product:.4f}")

    # Train Hazard Model
    hazard_model.train()
    total_loss_hazard = 0
    for batch in hazard_train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}  # Move batch items to the correct device

        optimizer_hazard.zero_grad()  # Zero gradients for hazard model

        # Forward pass for hazard model
        hazard_logits = hazard_model(batch["input_ids"], batch["attention_mask"])

        hazard_probs = F.softmax(hazard_logits, dim=-1)    # For hazard classification

        # Ensure labels are in LongTensor format
        batch["labels_hazard"] = batch["labels_hazard"].to(device).long()

        # Compute loss for hazard model
        loss_hazard = loss_fn_hazard(hazard_probs, batch["labels_hazard"])

        # Backpropagation for hazard model
        loss_hazard.backward()
        optimizer_hazard.step()

        # Step learning rate scheduler
        lr_scheduler_hazard.step()

        total_loss_hazard += loss_hazard.item()

    # Average loss for hazard model after each epoch
    avg_loss_hazard = total_loss_hazard / len(hazard_train_loader)
    print(f"Epoch {epoch+1} | Hazard Model Loss: {avg_loss_hazard:.4f}")

print("Training complete!")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1 | Product Model Loss: 1.3546


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1 | Hazard Model Loss: 0.5448
Epoch 2 | Product Model Loss: 0.6378


This part evaluates the model based on the validation dataset. It evaluates for production model and hazard model separately

In [None]:
def evaluate_product_model(model, dataloader):
    model.eval()
    all_preds_product = []
    all_labels_product = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            product_logits = model(batch["input_ids"], batch["attention_mask"])
            preds_product = torch.argmax(product_logits, dim=1).cpu().numpy()
            all_preds_product.extend(preds_product)
            all_labels_product.extend(batch["labels_product"].cpu().numpy())
    
    print("Product Category Classification Report:")
    print(classification_report(all_labels_product, all_preds_product))
    return all_labels_product, all_preds_product

def evaluate_hazard_model(model, dataloader):
    model.eval()
    all_preds_hazard = []
    all_labels_hazard = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            hazard_logits = model(batch["input_ids"], batch["attention_mask"])
            preds_hazard = torch.argmax(hazard_logits, dim=1).cpu().numpy()
            all_preds_hazard.extend(preds_hazard)
            all_labels_hazard.extend(batch["labels_hazard"].cpu().numpy())
    
    print("Hazard Category Classification Report:")
    print(classification_report(all_labels_hazard, all_preds_hazard))
    return all_labels_hazard, all_preds_hazard

# Evaluate the product and hazard models separately
y_test_product, all_preds_product = evaluate_product_model(product_model, product_test_loader)
y_test_hazard, all_preds_hazard = evaluate_hazard_model(hazard_model, hazard_test_loader)


This part calculates f1 score according to the assignments evaluation model.

In [None]:
def compute_score(hazards_true, products_true, hazards_pred, products_pred):
    # Convert to NumPy arrays for Boolean indexing
    hazards_true = np.array(hazards_true)
    products_true = np.array(products_true)
    hazards_pred = np.array(hazards_pred)
    products_pred = np.array(products_pred)

    # Compute F1-score for hazards
    f1_hazards = f1_score(hazards_true, hazards_pred, average='macro')

    # Compute F1-score for products (only where hazard prediction is correct)
    correct_hazard_mask = hazards_pred == hazards_true
    f1_products = f1_score(
        products_true[correct_hazard_mask], 
        products_pred[correct_hazard_mask], 
        average='macro'
    ) if np.any(correct_hazard_mask) else 0  # Handle case where no correct hazards

    # Compute final score
    return (f1_hazards + f1_products) / 2.

score = compute_score(y_test_hazard, y_test_product, all_preds_hazard, all_preds_product)
print(f"Custom F1 Score: {score:.4f}")

This is the evaluation of the test dataset and the f1 score

In [None]:
# Load new test file (adjust file path)
new_test_df = pd.read_csv("Data/incidents_test.csv")  # Replace with your actual file

# Ensure it has the same preprocessing
new_test_df = new_test_df.dropna(subset=["title", "product-category", "hazard-category"])
new_test_df["product-category"] = new_test_df["product-category"].astype("category").cat.codes
new_test_df["hazard-category"] = new_test_df["hazard-category"].astype("category").cat.codes

# Tokenize new test set
new_test_encodings = tokenize_text(new_test_df["title"])

# Convert labels to tensors
new_test_labels_product = torch.tensor(new_test_df["product-category"].values)
new_test_labels_hazard = torch.tensor(new_test_df["hazard-category"].values)

# Create Dataset for product classification task
product_new_test_dataset = ProductDataset(new_test_encodings, new_test_labels_product)

# Create Dataset for hazard classification task
hazard_new_test_dataset = HazardDataset(new_test_encodings, new_test_labels_hazard)

# Create DataLoader for the product classification task
product_new_test_loader = DataLoader(product_new_test_dataset, batch_size=16, shuffle=False)

# Create DataLoader for the hazard classification task
hazard_new_test_loader = DataLoader(hazard_new_test_dataset, batch_size=16, shuffle=False)

# Evaluate the product model
def evaluate_product_model(model, dataloader):
    model.eval()
    all_preds_product = []
    all_labels_product = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            product_logits = model(batch["input_ids"], batch["attention_mask"])
            preds_product = torch.argmax(product_logits, dim=1).cpu().numpy()
            all_preds_product.extend(preds_product)
            all_labels_product.extend(batch["labels_product"].cpu().numpy())

    print("Product Category Classification Report:")
    print(classification_report(all_labels_product, all_preds_product))
    return all_labels_product, all_preds_product

    
    return all_labels_product, all_preds_product

# Evaluate the hazard model
def evaluate_hazard_model(model, dataloader):
    model.eval()
    all_preds_hazard = []
    all_labels_hazard = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            hazard_logits = model(batch["input_ids"], batch["attention_mask"])
            preds_hazard = torch.argmax(hazard_logits, dim=1).cpu().numpy()
            all_preds_hazard.extend(preds_hazard)
            all_labels_hazard.extend(batch["labels_hazard"].cpu().numpy())
    
    print("Hazard Category Classification Report:")
    print(classification_report(all_labels_hazard, all_preds_hazard))
    return all_labels_hazard, all_preds_hazard

    
    return all_labels_hazard, all_preds_hazard

# Evaluate product and hazard models separately
y_new_test_product, all_preds_product = evaluate_product_model(product_model, product_new_test_loader)
y_new_test_hazard, all_preds_hazard = evaluate_hazard_model(hazard_model, hazard_new_test_loader)

# Compute custom F1 score
new_score = compute_score(y_new_test_hazard, y_new_test_product, all_preds_hazard, all_preds_product)
print(f"Custom F1 Score on New Test Data: {new_score:.4f}")