In [2]:
%pip install torch transformers scikit-learn pandas tqdm





[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: C:\Users\tsk\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score
from tqdm.auto import tqdm

# ---------------------------------------------------------------------------
# Configuration: every value a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# Reproducibility
RANDOM_SEED = 42

# Pretrained checkpoint used for both the tokenizer and the encoder
MODEL_NAME = "distilbert-base-uncased"

# Tokenization / optimisation hyper-parameters
MAX_LEN = 128
BATCH_SIZE = 32
EPOCHS = 3

# One binary target per toxicity category (multi-label problem)
TARGET_COLUMNS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
NUM_LABELS = len(TARGET_COLUMNS)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


Load preprocessed data and tokenizer

In [4]:
# Seed torch (CPU and every CUDA device) and NumPy so repeated runs start
# from the same random state.
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

print(f"Using device: {DEVICE}")

DATA_DIR = '../data'
MODELS_DIR = '../models'

# Load the data split from the preprocessing notebook.
# NOTE(security): allow_pickle=True makes np.load unpickle the file, which can
# execute arbitrary code. Presumably it is required because the comment arrays
# were saved with dtype=object; only load files produced by our own pipeline.
X_train = np.load(os.path.join(DATA_DIR, 'X_train.npy'), allow_pickle=True)
X_val = np.load(os.path.join(DATA_DIR, 'X_val.npy'), allow_pickle=True)
y_train = np.load(os.path.join(DATA_DIR, 'y_train.npy'))
y_val = np.load(os.path.join(DATA_DIR, 'y_val.npy'))

# Fail fast if comments and labels are misaligned instead of hitting an
# IndexError (or silently skipping rows) in the middle of training.
assert len(X_train) == len(y_train), "X_train and y_train have different lengths"
assert len(X_val) == len(y_val), "X_val and y_val have different lengths"

print(f"Training comments: {len(X_train)}, Validation comments: {len(X_val)}")

# Load the tokenizer that matches the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Save the tokenizer for deployment (Crucial Step!)
tokenizer_output_dir = os.path.join(MODELS_DIR, 'tokenizer')
# exist_ok=True makes the cell safely re-runnable without a separate exists() check
os.makedirs(tokenizer_output_dir, exist_ok=True)
tokenizer.save_pretrained(tokenizer_output_dir)

print(f"Tokenizer saved to {tokenizer_output_dir}")


Using device: cpu
Training comments: 143613, Validation comments: 15958


Tokenizer saved to ../models\tokenizer


Dataset and DataLoader

In [5]:
class ToxicCommentsDataset(Dataset):
    """Map-style dataset that tokenizes one comment per lookup.

    Args:
        comments: Indexable collection of comment texts; each entry is
            converted with ``str()`` before tokenization.
        targets: Indexable collection of label vectors aligned with
            ``comments`` (one entry per comment).
        tokenizer: Callable Hugging Face-style tokenizer.
        max_len: Length every comment is padded or truncated to.
    """

    def __init__(self, comments, targets, tokenizer, max_len):
        self.comments = comments
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments."""
        return len(self.comments)

    def __getitem__(self, item):
        """Tokenize the comment at index ``item``.

        Returns:
            dict with keys ``comment_text`` (str), ``input_ids`` and
            ``attention_mask`` (flattened tensors from the tokenizer) and
            ``targets`` (float tensor built from the label vector).
        """
        comment = str(self.comments[item])
        target = self.targets[item]

        # Call the tokenizer directly: ``encode_plus`` is deprecated in favour
        # of ``__call__``, which takes the same keyword arguments.
        encoding = self.tokenizer(
            comment,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'comment_text': comment,
            # return_tensors='pt' adds a leading batch axis; flatten it away so
            # the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.float)
        }

In [6]:
# Create Dataset objects
train_dataset = ToxicCommentsDataset(X_train, y_train, tokenizer, MAX_LEN)
val_dataset = ToxicCommentsDataset(X_val, y_val, tokenizer, MAX_LEN)

# On Windows, DataLoader workers are spawned as fresh processes that must
# unpickle the dataset. A Dataset class defined in a notebook cell is not
# importable there, so num_workers > 0 stalls or crashes on the first batch.
# Load in the main process on Windows and keep two workers elsewhere.
NUM_WORKERS = 0 if os.name == "nt" else 2

# Create DataLoader objects (only the training data is shuffled)
train_data_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS
)
val_data_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS
)
print("Dataset and DataLoaders created.")

Dataset and DataLoaders created.


In [7]:

class ToxicClassifier(nn.Module):
    """Pretrained transformer encoder with a dropout + linear multi-label head.

    Args:
        model_name: Checkpoint name handed to ``AutoModel.from_pretrained``.
        num_labels: Number of output logits (one per label).
        dropout_rate: Dropout probability applied before the linear head.
    """

    def __init__(self, model_name="distilbert-base-uncased", num_labels=6, dropout_rate=0.1):
        super().__init__()
        # Backbone encoder
        self.bert = AutoModel.from_pretrained(model_name)
        # Regularisation applied to the sentence representation
        self.dropout = nn.Dropout(dropout_rate)
        # Linear head mapping the encoder's hidden size to one logit per label
        encoder_width = self.bert.config.hidden_size
        self.classifier = nn.Linear(encoder_width, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits for a batch of tokenized comments."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at sequence position 0 (the [CLS] token) serves as
        # the representation of the whole comment.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(first_token_state))

# Build the classifier for the configured checkpoint and label count,
# then place its parameters on the selected device.
model = ToxicClassifier(model_name=MODEL_NAME, num_labels=NUM_LABELS).to(DEVICE)
print("Model initialized.")

Model initialized.


In [8]:
!pip install hf_xet




[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: C:\Users\tsk\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


### 5. Training Setup: Optimizer, Scheduler, and LOSS (with Class Weights!) ###

In [9]:
# 5.1. Calculate Class Weights for Imbalanced Data
# Per-label positive / negative example counts over the training labels
positive_counts = y_train.sum(axis=0)
negative_counts = len(y_train) - positive_counts

# pos_weight for BCEWithLogitsLoss: weight_for_positive = negative_count / positive_count.
# A label with no positive examples would divide by zero and give an infinite
# weight (and a NaN loss), so the denominator is clamped to at least 1.
# Labels with one or more positives get exactly the same weight as before.
pos_weight = torch.tensor(
    negative_counts / np.maximum(positive_counts, 1),
    dtype=torch.float
).to(DEVICE)

# Loss Function: BCEWithLogitsLoss (ideal for multi-label classification)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Scheduler: linear decay over every optimizer step of the run, no warm-up
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
print("Training components initialized.")

Training components initialized.


Training and Evaluation Functions

In [10]:
def train_epoch(model, data_loader, criterion, optimizer, device, scheduler):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        Mean of the per-batch loss values.
    """
    model.train()
    losses = []

    # Drop gradients left over from an interrupted or failed earlier run so
    # they cannot leak into this epoch's first update.
    optimizer.zero_grad()

    for d in tqdm(data_loader, desc="Training"):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        loss = criterion(outputs, targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # Clip gradients to prevent exploding gradients
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return np.mean(losses)

def eval_model(model, data_loader, criterion, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        criterion: Loss called as ``criterion(outputs, targets)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, roc_auc, f1_macro)``: the mean per-batch loss, the
        macro-averaged ROC AUC of the sigmoid probabilities, and the
        macro-averaged F1 of the probabilities thresholded at 0.5.
    """
    model.eval()

    batch_losses = []
    target_rows = []
    proba_rows = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluation"):
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["targets"]

            logits = model(input_ids=ids, attention_mask=mask)

            batch_loss = criterion(logits, labels.to(device))
            batch_losses.append(batch_loss.item())

            # Metrics are computed with NumPy / scikit-learn, so gather
            # everything on the CPU one row per example.
            target_rows.extend(labels.cpu().numpy())
            proba_rows.extend(torch.sigmoid(logits).cpu().numpy())

    y_true = np.array(target_rows)
    y_pred_proba = np.array(proba_rows)

    # Threshold-free ranking metric, macro-averaged over the labels
    roc_auc = roc_auc_score(y_true, y_pred_proba, average='macro')

    # F1 needs hard decisions: a probability of 0.5 or more counts as positive
    y_pred_binary = (y_pred_proba >= 0.5).astype(int)
    f1_macro = f1_score(y_true, y_pred_binary, average='macro')

    return np.mean(batch_losses), roc_auc, f1_macro

In [None]:
# Per-epoch metrics are collected as plain records and turned into a DataFrame,
# rather than growing a DataFrame row by row with .loc.
HISTORY_COLUMNS = ['epoch', 'train_loss', 'val_loss', 'val_roc_auc', 'val_f1_macro']
history_records = []
history = pd.DataFrame(columns=HISTORY_COLUMNS)
best_roc_auc = 0
MODEL_PATH = os.path.join(MODELS_DIR, 'toxic_classifier.pth')
# Make sure the checkpoint directory exists before the first torch.save
os.makedirs(MODELS_DIR, exist_ok=True)

for epoch in range(EPOCHS):
    print(f"\n--- Epoch {epoch + 1}/{EPOCHS} ---")

    # Training
    train_loss = train_epoch(
        model,
        train_data_loader,
        criterion,
        optimizer,
        DEVICE,
        scheduler
    )

    # Evaluation
    val_loss, val_roc_auc, val_f1_macro = eval_model(
        model,
        val_data_loader,
        criterion,
        DEVICE
    )

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_loss:.4f}")
    print(f"Val ROC AUC (Macro): {val_roc_auc:.4f}")
    print(f"Val F1 Score (Macro): {val_f1_macro:.4f}")

    # Save the best model based on validation ROC AUC
    if val_roc_auc > best_roc_auc:
        torch.save(model.state_dict(), MODEL_PATH)
        best_roc_auc = val_roc_auc
        print(f"*** Model saved! New best ROC AUC: {best_roc_auc:.4f} ***")

    # Record history; rebuilding the (tiny) frame each epoch keeps `history`
    # up to date even if the run is interrupted part-way through.
    history_records.append({
        'epoch': epoch + 1,
        'train_loss': train_loss,
        'val_loss': val_loss,
        'val_roc_auc': val_roc_auc,
        'val_f1_macro': val_f1_macro,
    })
    history = pd.DataFrame(history_records, columns=HISTORY_COLUMNS)


--- Epoch 1/3 ---


Training:   0%|          | 0/4488 [00:00<?, ?it/s]

In [None]:
print("\n--- Training Complete ---")
print(f"Best Validation ROC AUC achieved: {best_roc_auc:.4f}")
print(f"Trained model state_dict saved to: {MODEL_PATH}")
print(f"Tokenizer saved to: {tokenizer_output_dir}")

# Display training history
print("\nTraining History:")
display(history)

# Final check: Load the best model and evaluate.
# map_location puts the weights on the current device even if the checkpoint
# was written on a different one; weights_only=True restricts unpickling to
# tensors and plain containers, which is all a state_dict contains.
best_state_dict = torch.load(MODEL_PATH, map_location=DEVICE, weights_only=True)
model.load_state_dict(best_state_dict)
final_loss, final_roc_auc, final_f1_macro = eval_model(model, val_data_loader, criterion, DEVICE)

print("\n--- Final Evaluation with Best Model ---")
print(f"Final Val ROC AUC: {final_roc_auc:.4f}")
print(f"Final Val F1 Macro: {final_f1_macro:.4f}")

# Push artifacts to GitHub if repository is cloned (Manual step in Colab)
# print("\nRun the following commands in Colab to commit artifacts:")
# print(f"!git add {MODEL_PATH}")
# print(f"!git add {tokenizer_output_dir}/*")
# print(f"!git add notebooks/02_PyTorch_Training.ipynb")
# print('!git commit -m "FEAT: Trained new PyTorch toxic classifier"')
# print('!git push origin main')