### Imports & Setup

In [1]:
# 03_finetune_distilbert.ipynb
# ---------------------------------
# Week 3: DistilBERT Fine-tuning 
# Purpose: fine-tune distilbert-base-uncased on the cleaned ticket dataset, evaluate, save artifacts,
# and log everything to MLflow (with safety checks for active runs).

import os
import json
from datetime import datetime
import numpy as np
import pandas as pd


import torch
from torch.utils.data import Dataset

import transformers


from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix


import mlflow
import joblib

  from .autonotebook import tqdm as notebook_tqdm


# Paths and config

In [2]:
CLEANED_DATA_PATH = "../data/file-for-fineTuning.csv" 
MODEL_DIR = "../models/distilbert-ticket-classifier"
ARTIFACTS_DIR = "../artifacts"
RESULTS_DIR = "../results"
MLFLOW_TRACKING_URI = "file:../mlruns"
EXPERIMENT_NAME = "transformer_finetuning_experiment"


# Training hyperparameters
NUM_EPOCHS = 3
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
MAX_LENGTH = 128
SEED = 42


os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(ARTIFACTS_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)


# MLflow setup
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)


# Ensure deterministic behavior
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x241f112bab0>

#### Cell 3: Load data

In [3]:
print("Loading cleaned dataset:", CLEANED_DATA_PATH)
df = pd.read_csv(CLEANED_DATA_PATH)
df = df.dropna().reset_index(drop=True)


# Update these names if your columns differ
TEXT_COL = "text"
LABEL_COL = "label"


if TEXT_COL not in df.columns or LABEL_COL not in df.columns:
    print("Dataset columns:", df.columns.tolist())
    raise ValueError(f"Expected columns '{TEXT_COL}' and '{LABEL_COL}' in the cleaned CSV")


# Convert labels to integer ids and keep mapping
df[LABEL_COL] = df[LABEL_COL].astype('category')
label2id = {c: i for i, c in enumerate(df[LABEL_COL].cat.categories)}
id2label = {i: c for c, i in label2id.items()}
df['label_id'] = df[LABEL_COL].cat.codes


print("Classes:", label2id)

Loading cleaned dataset: ../data/file-for-fineTuning.csv
Classes: {'account': 0, 'billing': 1, 'other': 2, 'tech_support': 3}


#### Train/validation split

In [4]:
from sklearn.model_selection import train_test_split


texts = df[TEXT_COL].astype(str).tolist()
labels = df['label_id'].tolist()


train_texts, val_texts, train_labels, val_labels = train_test_split(
texts, labels, test_size=0.2, stratify=labels, random_state=SEED
)


print(f"Train size: {len(train_texts)}, Val size: {len(val_texts)}")

Train size: 396, Val size: 99


### Tokenizer & encodings

In [5]:
from transformers import (
DistilBertTokenizerFast,
DistilBertForSequenceClassification,
TrainingArguments,
)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=MAX_LENGTH)

### Dataset class

In [6]:
class TicketDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item


train_dataset = TicketDataset(train_encodings, train_labels)
val_dataset = TicketDataset(val_encodings, val_labels)

### Model & TrainingArguments

In [7]:
from transformers import Trainer, TrainingArguments
print("Transformers working!")



Transformers working!


In [8]:
import transformers
print(transformers.__version__)

4.57.1


In [None]:
num_labels = len(label2id)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)


training_args = TrainingArguments(
    output_dir="./trainer_results",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    seed=SEED,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Metrics function

In [11]:
import numpy as np


def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1_weighted': f1,
        'precision_weighted': precision,
        'recall_weighted': recall
    }

### Trainer & Training (with MLflow logging)

In [12]:
from transformers import TrainerCallback

#To ensure no active MLflow run exists before starting a new one
if mlflow.active_run():
    mlflow.end_run()

run_name = f"DistilBERT_Run_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

with mlflow.start_run(run_name=run_name):
    # Log basic params
    mlflow.log_param('model', 'distilbert-base-uncased')
    mlflow.log_param('num_epochs', NUM_EPOCHS)
    mlflow.log_param('batch_size', BATCH_SIZE)
    mlflow.log_param('max_length', MAX_LENGTH)
    mlflow.log_param('learning_rate', LEARNING_RATE)


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    train_result = trainer.train()
    trainer.save_model(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)


    # Evaluate on validation set
    eval_result = trainer.evaluate(eval_dataset=val_dataset)
    print("Eval result:", eval_result)


    # Predictions for metrics and confusion matrix
    predictions = trainer.predict(val_dataset)
    y_pred = np.argmax(predictions.predictions, axis=1)
    y_true = predictions.label_ids


    # Compute sklearn metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')

    metrics = {
        'accuracy': float(accuracy),
        'f1_weighted': float(f1),
        'precision_weighted': float(precision),
        'recall_weighted': float(recall),
        'label2id': label2id,
        'id2label': id2label,
        'train_size': len(train_dataset),
        'val_size': len(val_dataset)
    }


    # Save metrics to artifacts
    metrics_path = os.path.join(ARTIFACTS_DIR, 'transformer_metrics.json')
    with open(metrics_path, 'w') as f:
        json.dump(metrics, f, indent=4)


    # Log metrics & artifacts to MLflow
    mlflow.log_metrics({k: v for k, v in metrics.items() if isinstance(v, (int, float))})
    mlflow.log_artifacts(ARTIFACTS_DIR)
    mlflow.log_artifacts(MODEL_DIR)

    # Save predictions for later analysis
    preds_df = pd.DataFrame({
        'text': val_texts,
        'y_true': [id2label[i] for i in y_true],
        'y_pred': [id2label[i] for i in y_pred]
    })
    preds_df.to_csv(os.path.join(ARTIFACTS_DIR, 'transformer_val_predictions.csv'), index=False)
    mlflow.log_artifact(os.path.join(ARTIFACTS_DIR, 'transformer_val_predictions.csv'))


print("Training and MLflow logging completed. Model saved to:", MODEL_DIR)
print("Metrics saved to:", metrics_path)


  trainer = Trainer(
2025/11/06 12:06:59 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id 7e2db463f34e4f9a865afbf26f97a8a5: Failed to log run data: Exception: Changing param values is not allowed. Param with key='max_length' was already logged with value='128' for run ID='7e2db463f34e4f9a865afbf26f97a8a5'. Attempted logging new value '20'.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,Precision Weighted,Recall Weighted
1,1.1112,0.7244,0.858586,0.853244,0.857326,0.858586
2,0.5655,0.372956,0.939394,0.938692,0.947975,0.939394
3,0.3751,0.313393,0.939394,0.938692,0.947975,0.939394




Eval result: {'eval_loss': 0.3133927583694458, 'eval_accuracy': 0.9393939393939394, 'eval_f1_weighted': 0.9386916786916787, 'eval_precision_weighted': 0.9479747305834262, 'eval_recall_weighted': 0.9393939393939394, 'eval_runtime': 1.8063, 'eval_samples_per_second': 54.809, 'eval_steps_per_second': 7.197, 'epoch': 3.0}




Training and MLflow logging completed. Model saved to: ../models/distilbert-ticket-classifier
Metrics saved to: ../artifacts\transformer_metrics.json


### Quick evaluation plots (confusion matrix)

In [None]:
report_text = (
    f"DistilBERT fine-tuning report\n"
    f"===============================\n"
    f"Train size: {len(train_dataset)}\n"
    f"Val size: {len(val_dataset)}\n"
    f"Accuracy: {metrics['accuracy']:.4f}\n"
    f"F1 (weighted): {metrics['f1_weighted']:.4f}\n"
    f"Precision (weighted): {metrics['precision_weighted']:.4f}\n"
    f"Recall (weighted): {metrics['recall_weighted']:.4f}\n"
)
with open(os.path.join(RESULTS_DIR, 'distilbert_evaluation_report.txt'), 'w') as f:
    f.write(report_text)


print('Evaluation report saved to', os.path.join(RESULTS_DIR, 'distilbert_evaluation_report.txt'))