In [29]:
from sklearn.metrics import hamming_loss
from sklearn.metrics import jaccard_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import metrics
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
import seaborn as sns
import shutil, sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import torch
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_scheduler
from tqdm.auto import tqdm
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import f1_score, jaccard_score


# Pre-split train / validation / test frames stored as pickles in the Kaggle input dataset.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
DATA_DIR = '/kaggle/input/stacksample'
train_df = pd.read_pickle(f'{DATA_DIR}/train_df.pkl')
val_df = pd.read_pickle(f'{DATA_DIR}/val_df.pkl')
test_df = pd.read_pickle(f'{DATA_DIR}/test_df.pkl')

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using {device} as device \n')

# One checkpoint id for both the tokenizer and the model, so the vocabulary and the
# weights can never drift apart when the checkpoint is changed.
MODEL_CHECKPOINT = 'distilbert/distilbert-base-uncased'
# Size of the classification head.
# NOTE(review): presumably the number of distinct tags kept in the label matrix; confirm against the 'Tag' encoding.
NUM_LABELS = 100

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
    # Records the multi-label intent on the config. The training loop in this notebook
    # computes its own BCEWithLogitsLoss, so this only matters if `labels=` is ever
    # passed to the model directly.
    problem_type='multi_label_classification',
)
# print(model)

Using cuda as device 



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
def _split_batch(batch, device):
    """Move one collated batch to ``device`` and separate model inputs from targets.

    Every entry except ``'Tag'`` is forwarded to the model as a keyword argument.
    The ``'Tag'`` entry is cast to float because ``BCEWithLogitsLoss`` rejects
    integer targets.
    """
    inputs = {k: v.to(device) for k, v in batch.items() if k != 'Tag'}
    labels = batch['Tag'].to(device).float()
    return inputs, labels


def train_model(model, train_dataset, val_dataset, num_epochs=5, batch_size=36, learning_rate=1e-5, patience=2, device='cuda', checkpoint_path='best_model.pt'):
    """Fine-tune ``model`` with BCE-with-logits loss and early stopping on validation loss.

    Args:
        model: Module called as ``model(**inputs)`` whose output exposes ``.logits``.
        train_dataset: Dataset whose collated batches are dicts of tensors that
            include a ``'Tag'`` entry holding the targets.
        val_dataset: Dataset with the same batch layout, used for validation.
        num_epochs: Maximum number of passes over ``train_dataset``.
        batch_size: Batch size for both loaders.
        learning_rate: Learning rate given to AdamW.
        patience: Number of consecutive epochs without a validation-loss
            improvement after which training stops.
        device: Device the model and the batches are moved to.
        checkpoint_path: File that receives ``model.state_dict()`` each time the
            validation loss improves.

    Returns:
        None. The best weights are written to ``checkpoint_path``; the model
        itself is left holding the weights of the last epoch that ran.

    Raises:
        ValueError: If either dataset produces no batches.
    """
    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Fail before training starts instead of dividing by zero when averaging the
    # losses at the end of the first epoch.
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError('train_dataset and val_dataset must each yield at least one batch.')

    # Move model to the correct device
    model.to(device)

    # Define the optimizer and loss function
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    loss_fn = BCEWithLogitsLoss()

    # Initialize variables for early stopping
    best_valid_loss = float('inf')
    trials = 0

    num_training_steps = num_epochs * len(train_loader)
    progress_bar = tqdm(total=num_training_steps, desc="Training")

    # try/finally so the bar is released even when a step raises (e.g. CUDA OOM).
    try:
        for epoch in range(num_epochs):
            model.train()  # Set model to training mode
            train_loss = 0

            for batch in train_loader:
                optimizer.zero_grad()
                inputs, labels = _split_batch(batch, device)

                outputs = model(**inputs)
                loss = loss_fn(outputs.logits, labels)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                progress_bar.update(1)

            train_loss /= len(train_loader)

            # Validation phase
            model.eval()  # Set model to evaluation mode
            valid_loss = 0

            with torch.no_grad():
                for batch in val_loader:
                    inputs, labels = _split_batch(batch, device)

                    outputs = model(**inputs)
                    loss = loss_fn(outputs.logits, labels)
                    valid_loss += loss.item()

            valid_loss /= len(val_loader)
            print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}')

            # Check for early stopping
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(), checkpoint_path)
                trials = 0
            else:
                trials += 1
                if trials >= patience:
                    print(f'Early stopping triggered at epoch {epoch+1}.')
                    break
    finally:
        progress_bar.close()

# Fine-tune the model: at most 25 epochs, stopping early after 2 epochs
# without a validation-loss improvement.
train_model(
    model,
    train_dataset,
    val_dataset,
    num_epochs=25,
    batch_size=36,
    learning_rate=2e-5,
    patience=2,
    device=device,
)


Training:   0%|          | 0/3475 [00:00<?, ?it/s]

Epoch 1, Train Loss: 0.0157, Valid Loss: 0.0333
Epoch 2, Train Loss: 0.0140, Valid Loss: 0.0329
Epoch 3, Train Loss: 0.0126, Valid Loss: 0.0329
Epoch 4, Train Loss: 0.0113, Valid Loss: 0.0331
Early stopping triggered at epoch 4.


In [39]:
# Function to evaluate the model
def evaluate_model(model, data_loader, device, threshold=0.5):
    """Score ``model`` on ``data_loader`` with sample-averaged Jaccard and micro F1.

    Args:
        model: Module called as ``model(**inputs)`` whose output exposes ``.logits``.
            It is switched to eval mode but not moved; it must already be on ``device``.
        data_loader: Iterable of dict batches; every entry except ``'Tag'`` is
            passed to the model and ``'Tag'`` holds the targets.
        device: Device the model inputs are moved to.
        threshold: Probability above which a label counts as predicted.

    Returns:
        Tuple ``(jaccard, micro_f1)``.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch in data_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'Tag'}
            logits = model(**inputs).logits
            # Sigmoid rather than softmax: each label is an independent binary decision,
            # matching the BCEWithLogitsLoss used for training.
            pred = torch.sigmoid(logits) > threshold
            predictions.append(pred.cpu().numpy())
            # The targets are only needed as NumPy arrays, so they are never copied to `device`.
            actuals.append(batch['Tag'].cpu().numpy())

    if not predictions:
        raise ValueError('data_loader yielded no batches; nothing to evaluate.')

    # Concatenate all batches
    all_predictions = np.vstack(predictions)
    all_actuals = np.vstack(actuals)

    # Calculate Jaccard Score and Micro F1 Score
    jaccard = jaccard_score(all_actuals, all_predictions, average='samples')
    micro_f1 = f1_score(all_actuals, all_predictions, average='micro')

    return jaccard, micro_f1

# Load the best model
# map_location keeps the checkpoint loadable when it was saved from a GPU run but the
# current session only has a CPU (and vice versa).
best_state = torch.load('best_model.pt', map_location=device)
model.load_state_dict(best_state)
# evaluate_model moves the batches to `device` but not the model, so place it there here.
model.to(device)

# Evaluate the model
jaccard, micro_f1 = evaluate_model(model, test_loader, device)

print(f"Jaccard Score on Test Set: {jaccard:.2f}")
print(f"Micro F1 Score on Test Set: {micro_f1:.2f}")

Jaccard Score on Test Set: 0.58
Micro F1 Score on Test Set: 0.65


XGBoost performance on the Test Dataset:

    Micro F1 Score: 0.31

    Jaccard Score: 0.21

DistilBERT performance on the Test Dataset:

    Micro F1 Score: 0.65

    Jaccard Score: 0.58
    
    
Performance Overview:
        XGBoost scored lower, with a Micro F1 Score of 0.31 and a Jaccard Score of 0.21, meaning it struggled both to balance precision and recall across labels and to reproduce each sample's full label set. DistilBERT significantly outperformed XGBoost, achieving a Micro F1 Score of 0.65 and a Jaccard Score of 0.58, roughly double the Micro F1 and nearly triple the Jaccard score.

Model Insights:
        XGBoost is well suited to tabular data but struggled in this context, possibly because the inputs are free text with complex patterns. DistilBERT is designed for natural language processing and captures semantic relationships effectively, which explains its higher performance. It is also a distilled (smaller) version of the BERT model, so a larger transformer could likely push performance further if needed.

Choosing the Right Model:
        The choice between XGBoost and DistilBERT should be based on the data type (structured vs. text) and the available computational resources, since DistilBERT requires more compute. If the priority is predictive performance on text, the go-to choice should be the transformer model.
