In [2]:
# === 02_intent_classifier.ipynb ===

# Imports
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import os
import pickle
import logging

# Logging for better tracking
logging.basicConfig(level=logging.INFO)

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load preprocessed data
df = pd.read_csv("../data/processed/clean_emails.csv")
df = df[["clean_body", "label"]].dropna()
print(f"Loaded {len(df)} emails.")

# Reduce dataset to 5% for fast CPU training
df = df.sample(frac=0.05, random_state=42).reset_index(drop=True)

# Encode labels
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label"])
num_labels = len(label_encoder.classes_)
print(f"Detected {num_labels} unique intent labels.")

# Split into train/test
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["clean_body"].tolist(),
    df["label_id"].tolist(),
    test_size=0.2,
    stratify=df["label_id"],
    random_state=42
)

# Tokenization
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Dataset class
class EmailDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = EmailDataset(train_encodings, train_labels)
val_dataset = EmailDataset(val_encodings, val_labels)

# Load model
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels
).to(device)

# Training args for transformers==4.52.4
training_args = TrainingArguments(
    output_dir="../models/intent_classifier",
    logging_dir="../logs",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=1000,
    save_total_limit=1,
    do_eval=True
)

# Evaluation metrics
def compute_metrics(pred):
    preds = pred.predictions.argmax(-1)
    labels = pred.label_ids
    report = classification_report(labels, preds, output_dict=True, zero_division=0)
    return {
        "accuracy": report["accuracy"],
        "precision": report["weighted avg"]["precision"],
        "recall": report["weighted avg"]["recall"],
        "f1": report["weighted avg"]["f1-score"]
    }

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train model
trainer.train()

# Final Evaluation
metrics = trainer.evaluate()
print(metrics)

# Save model, tokenizer, and label encoder
model.save_pretrained("../models/intent_classifier")
tokenizer.save_pretrained("../models/intent_classifier")
with open("../models/intent_classifier/label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

print("✅ Model and tokenizer saved to /models/intent_classifier")


Using device: cpu
Loaded 425552 emails.
Detected 16 unique intent labels.


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
10,2.492
20,1.9807
30,2.0252
40,2.0407
50,1.9779
60,1.9591
70,1.9319
80,1.8736
90,1.9233
100,1.8891




{'eval_loss': 0.3541795611381531, 'eval_accuracy': 0.9069548872180451, 'eval_precision': 0.9032811778854439, 'eval_recall': 0.9069548872180451, 'eval_f1': 0.9028847807322551, 'eval_runtime': 241.9951, 'eval_samples_per_second': 17.587, 'eval_steps_per_second': 1.099, 'epoch': 2.0}
✅ Model and tokenizer saved to /models/intent_classifier
