In [1]:
from sklearn.metrics import hamming_loss
from sklearn.metrics import jaccard_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn import metrics
from sklearn.svm import LinearSVC
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
import seaborn as sns
import shutil, sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import torch
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_scheduler
from tqdm.auto import tqdm
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import f1_score, jaccard_score


# Configuration for this notebook, gathered in one place so it is easy to tune.
DATA_DIR = '/kaggle/input/stacksample'
MODEL_CHECKPOINT = 'distilbert/distilbert-base-uncased'
NUM_LABELS = 91  # number of output logits requested from the classification head

# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
train_df = pd.read_pickle(f'{DATA_DIR}/train_df.pkl')
val_df = pd.read_pickle(f'{DATA_DIR}/val_df.pkl')
test_df = pd.read_pickle(f'{DATA_DIR}/test_df.pkl')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using {device} as device \n')

# Tokenizer and model are loaded from the same checkpoint id so the vocabulary
# and the weights cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
    # Record the task type in the model config (independent sigmoid per label).
    problem_type="multi_label_classification",
)

Using cuda as device 



tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Number of examples per batch for the DataLoaders
batch_size = 36
# Token budget per example, used for both padding and truncation
MAX_LENGTH = 512

# Tokenization
def tokenize_function(examples):
    """Run the module-level `tokenizer` over `examples`.

    Each sequence is padded to, and truncated at, MAX_LENGTH tokens, and the
    tokenizer is asked to return PyTorch tensors.
    """
    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )
    return tokenizer(examples, **tokenizer_options)

# Encode the 'input' text column of each split, in train / val / test order
train_encodings, val_encodings, test_encodings = (
    tokenize_function(split_df['input'].tolist())
    for split_df in (train_df, val_df, test_df)
)

# Define custom dataset
class CustomDataset(Dataset):
    """Pairs tokenizer encodings with their label rows.

    `encodings` must expose `.items()` yielding (name, indexable) pairs;
    `labels` must support `len()` and integer indexing.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of label rows
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example: every encoding field at `idx` plus its 'Tag' labels."""
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        # Labels are stored as float because BCE-style losses expect float targets
        sample['Tag'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Training split: batches are drawn in shuffled order
train_dataset = CustomDataset(encodings=train_encodings, labels=train_df['Tag'].values)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Validation split: fixed order
val_dataset = CustomDataset(encodings=val_encodings, labels=val_df['Tag'].values)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

# Test split: fixed order
test_dataset = CustomDataset(encodings=test_encodings, labels=test_df['Tag'].values)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Epochs / early-stopping state.
# A previous `num_epochs = 15` was immediately overwritten by 2 and therefore
# had no effect; only the effective value is kept. Raise it for a longer run.
num_epochs = 2
best_valid_loss = float('inf')
# patience: epochs to wait for a validation-loss improvement before stopping
# trials:   consecutive epochs seen so far without improvement
patience, trials = 2, 0

# Linear learning-rate decay over every optimizer step of the run, no warmup
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Send the model to the selected device before training
model.to(device)

# Loss for multi-label classification: takes raw logits and float targets
loss_fn = BCEWithLogitsLoss()
# One tick per training batch across all epochs
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()  # training mode (dropout active)
    train_loss = 0.0

    for batch in train_loader:
        # Split the batch into labels and model inputs, both moved to the device
        labels = batch['Tag'].to(device)
        inputs = {name: tensor.to(device) for name, tensor in batch.items() if name != 'Tag'}

        # Forward pass and loss on the raw logits
        outputs = model(**inputs)
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        # Clear stale gradients, backpropagate, then advance optimizer and schedule
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        train_loss += loss.item()
        progress_bar.update(1)

    # Mean loss per training batch
    train_loss /= len(train_loader)

    # ---- Validation pass ----
    model.eval()  # evaluation mode (disables dropout)
    valid_loss = 0
    with torch.no_grad():  # no gradient tracking needed for validation
        for batch in val_loader:
            labels = batch['Tag'].to(device)
            inputs = {name: tensor.to(device) for name, tensor in batch.items() if name != 'Tag'}
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, labels.float())
            valid_loss += loss.item()

    # Mean loss per validation batch
    valid_loss /= len(val_loader)

    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}')

    # ---- Early stopping ----
    if not valid_loss < best_valid_loss:
        # No improvement this epoch: count it and stop once patience is used up
        trials += 1
        if trials >= patience:
            print(f'Early stopping on epoch {epoch+1}.')
            break
    else:
        # New best validation loss: remember it, save the weights, reset the counter
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pt')
        trials = 0

    # Optional: a full checkpoint (epoch, model and optimizer state) could be saved here
    # save_ckp({'epoch': epoch+1, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}, False, checkpoint_path)


  0%|          | 0/176 [00:00<?, ?it/s]

Epoch 1, Train Loss: 0.5564, Valid Loss: 0.4586
Epoch 2, Train Loss: 0.4311, Valid Loss: 0.4075


In [3]:
# Switch to evaluation mode before scoring the test split
model.eval()

# Per-batch binary predictions and ground-truth labels
predictions, true_labels = [], []

# No gradients are needed for inference
with torch.no_grad():
    for batch in test_loader:
        labels = batch['Tag'].to(device)
        inputs = {name: tensor.to(device) for name, tensor in batch.items() if name != 'Tag'}

        # Forward pass to obtain the raw logits
        outputs = model(**inputs)
        logits = outputs.logits

        # Sigmoid maps logits into [0, 1]; values above 0.5 count as a positive label
        probs = torch.sigmoid(logits).cpu().numpy()
        batch_preds = (probs > 0.5).astype(int)

        predictions.append(batch_preds)
        true_labels.append(labels.cpu().numpy())

# Stack the per-batch arrays into single matrices
predictions = np.vstack(predictions)
true_labels = np.vstack(true_labels)

# Sample-averaged Jaccard score, then micro-averaged F1
jaccard = jaccard_score(true_labels, predictions, average='samples')
micro_f1 = f1_score(true_labels, predictions, average='micro')

print(f'Jaccard Score: {jaccard}')
print(f'Micro-averaged F1 Score: {micro_f1}')


Jaccard Score: 0.0
Micro-averaged F1 Score: 0.0


In [None]:
# Function to evaluate the model
def evaluate_model(model, data_loader, device, threshold=0.5):
    """Score a multi-label model on every batch produced by `data_loader`.

    Args:
        model: callable accepting the batch fields as keyword arguments and
            returning an object with a `.logits` tensor; must provide `.eval()`.
        data_loader: iterable of dict batches; the 'Tag' entry holds the labels
            and every other entry is forwarded to the model.
        device: device the model inputs are moved to.
        threshold: probability above which a label is predicted positive.
            Defaults to 0.5, the previously hard-coded value.

    Returns:
        Tuple `(jaccard, micro_f1)`: sample-averaged Jaccard score and
        micro-averaged F1 score.

    Raises:
        ValueError: if `data_loader` yields no batches.
    """
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch in data_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'Tag'}
            logits = model(**inputs).logits
            # Sigmoid turns logits into per-label probabilities; cast to int so
            # predictions are 0/1 indicators rather than booleans.
            pred = (torch.sigmoid(logits) > threshold).int()
            predictions.append(pred.cpu().numpy())
            # Labels are only needed as NumPy arrays, so they are not sent to `device`.
            actuals.append(batch['Tag'].cpu().numpy())

    if not predictions:
        raise ValueError("data_loader produced no batches; nothing to evaluate")

    # Concatenate all batches
    all_predictions = np.vstack(predictions)
    all_actuals = np.vstack(actuals)

    # Calculate Jaccard Score and Micro F1 Score
    jaccard = jaccard_score(all_actuals, all_predictions, average='samples')
    micro_f1 = f1_score(all_actuals, all_predictions, average='micro')

    return jaccard, micro_f1

# Load the best checkpoint written by the training loop. map_location remaps the
# stored tensors onto the current device, so a checkpoint saved on GPU can still
# be restored on a CPU-only runtime.
model.load_state_dict(torch.load('best_model.pt', map_location=device))

# Evaluate the model
jaccard, micro_f1 = evaluate_model(model, test_loader, device)

# Scores lie in [0, 1]; four decimals keep differences visible that one decimal would hide
print(f"Jaccard Score on Test Set: {jaccard:.4f}")
print(f"Micro F1 Score on Test Set: {micro_f1:.4f}")