In [19]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix
)
from tqdm import tqdm
import os
import json

# ================== Configuration ==================
class Config:
    DATA_PATH = "customer_support_tickets_clean_500 (3).csv"
    MODEL_NAME = "distilbert-base-uncased"
    SAVE_DIR = "models/distilbert-ticket-classifier"
    MAX_LENGTH = 128
    BATCH_SIZE = 16
    EPOCHS = 5
    LEARNING_RATE = 2e-5
    TEST_SIZE = 0.2
    RANDOM_STATE = 42
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ================== Dataset Class ==================
class TicketDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [2]:

# ================== Data Loading & Preprocessing ==================
def load_and_preprocess_data(config):
    print("üìÇ Loading dataset...")
    df = pd.read_csv(config.DATA_PATH)
    
    # Remove rows with 'unknown' in text column (if they exist)
    df = df[df['text'] != 'unknown'].reset_index(drop=True)
    
    print(f"‚úì Loaded {len(df)} tickets")
    print(f"\nClass distribution:")
    print(df['label'].value_counts())
    
    # Create label mapping
    label_list = sorted(df['label'].unique())
    label2id = {label: idx for idx, label in enumerate(label_list)}
    id2label = {idx: label for label, idx in label2id.items()}
    
    print(f"\nLabel mapping: {label2id}")
    df['label'] = df['label'].map(label2id)
    df['label_id'] = df['label'].map(label2id)
    return df, label2id, id2label


class Config:
    DATA_PATH = "customer_support_tickets_clean_500 (3).csv"

config = Config()


df, label2id, id2label = load_and_preprocess_data(config)


üìÇ Loading dataset...
‚úì Loaded 495 tickets

Class distribution:
label
Other        125
Technical    119
Account      115
Billing      113
unknown       23
Name: count, dtype: int64

Label mapping: {'Account': 0, 'Billing': 1, 'Other': 2, 'Technical': 3, 'unknown': 4}


In [3]:
 # Convert labels to IDs
# If labels are strings, map them to ids; if they're already integers, leave as is.
if df['label'].dtype == object:
	df['label'] = df['label'].map(label2id)

# Ensure label_id column exists and contains integer label ids
df['label_id'] = df['label'].astype(int)


print("‚úì label_id column populated")

‚úì label_id column populated


In [4]:
from sklearn.model_selection import train_test_split

def split_dataset(df, label2id, id2label, config):
    """
    Splits the dataset into training and testing sets based on configuration.
    
    Args:
        df (pd.DataFrame): The preprocessed dataframe containing 'text' and 'label_id'.
        label2id (dict): Mapping from label string to label id.
        id2label (dict): Mapping from label id to label string.
        config (object): Configuration object with TEST_SIZE and RANDOM_STATE attributes.
    
    Returns:
        tuple: train_texts, test_texts, train_labels, test_labels, label2id, id2label
    """

    # --- Validation checks ---
    if not hasattr(config, "TEST_SIZE") or not hasattr(config, "RANDOM_STATE"):
        raise AttributeError("‚ùå Config object must define TEST_SIZE and RANDOM_STATE.")

    if "text" not in df.columns or "label_id" not in df.columns:
        raise KeyError("‚ùå DataFrame must contain 'text' and 'label_id' columns.")

    # --- Split dataset ---
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df["text"].values,
        df["label_id"].values,
        test_size=config.TEST_SIZE,
        random_state=config.RANDOM_STATE,
        stratify=df["label_id"].values
    )

    print(f"\n‚úì Split completed: {len(train_texts)} train samples, {len(test_texts)} test samples")

    return train_texts, test_texts, train_labels, test_labels, label2id, id2label




In [5]:
 #================== Model Training ==================
def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    predictions = []
    true_labels = []
    
    progress_bar = tqdm(dataloader, desc="Training")
    
    for batch in progress_bar:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
        
        progress_bar.set_postfix({'loss': loss.item()})
    
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    
    return avg_loss, accuracy


In [6]:
# ================== Model Evaluation ==================
def evaluate(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []
    total_loss = 0
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            total_loss += outputs.loss.item()
            
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='macro'
    )
    
    return avg_loss, accuracy, precision, recall, f1, predictions, true_labels



In [7]:
# ================== Main Training Pipeline ==================
def main():
    config = Config()
    
    print("="*60)
    print("üöÄ DistilBERT Ticket Classifier Training")
    print("="*60)
    print(f"Device: {config.DEVICE}")
    print(f"Epochs: {config.EPOCHS}")
    print(f"Batch Size: {config.BATCH_SIZE}")
    print(f"Learning Rate: {config.LEARNING_RATE}\n")
    
    # Load data
    train_texts, test_texts, train_labels, test_labels, label2id, id2label = \
        load_and_preprocess_data(config)
    
    # Load tokenizer and model
    print("\nü§ñ Loading DistilBERT model...")
    tokenizer = DistilBertTokenizer.from_pretrained(config.MODEL_NAME)
    model = DistilBertForSequenceClassification.from_pretrained(
        config.MODEL_NAME,
        num_labels=len(label2id)
    )
    model.to(config.DEVICE)
    print("‚úì Model loaded successfully")
    
    

In [8]:
# ======================================
# ‚úÖ FULL DATA PREPARATION PIPELINE
# ======================================

# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Config class ---
class Config:
    DATA_PATH = "customer_support_tickets_clean_500 (3).csv"
    TEST_SIZE = 0.2
    RANDOM_STATE = 42
    MAX_LENGTH = 128
    BATCH_SIZE = 8
    EPOCHS = 3
    LEARNING_RATE = 3e-5

config = Config()

# --- Load and preprocess data ---
print("üìÇ Loading dataset...")
df = pd.read_csv(config.DATA_PATH)
df = df[df['text'] != 'unknown'].reset_index(drop=True)

print(f"‚úì Loaded {len(df)} samples")
print("\nClass distribution:")
print(df['label'].value_counts())

# --- Create label mappings ---
label_list = sorted(df['label'].unique())
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

df['label_id'] = df['label'].map(label2id)

# --- Split dataset ---
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].values,
    df['label_id'].values,
    test_size=config.TEST_SIZE,
    random_state=config.RANDOM_STATE,
    stratify=df['label_id'].values
)

print(f"\n‚úÖ Split completed: {len(train_texts)} train samples, {len(test_texts)} test samples")


üìÇ Loading dataset...
‚úì Loaded 495 samples

Class distribution:
label
Other        125
Technical    119
Account      115
Billing      113
unknown       23
Name: count, dtype: int64

‚úÖ Split completed: 396 train samples, 99 test samples


In [9]:
from transformers import DistilBertTokenizerFast

# Load pretrained DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

print("‚úÖ Tokenizer loaded successfully!")


‚úÖ Tokenizer loaded successfully!


In [12]:
 # Create datasets
train_dataset = TicketDataset(train_texts, train_labels, tokenizer, config.MAX_LENGTH)
test_dataset = TicketDataset(test_texts, test_labels, tokenizer, config.MAX_LENGTH)
    
train_loader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config.BATCH_SIZE)
    

In [16]:
from transformers import DistilBertForSequenceClassification

# Create DistilBERT model for text classification
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id)  # number of unique labels
)

print("‚úÖ Model loaded successfully!")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model loaded successfully!


In [17]:
# Setup optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE)
total_steps = len(train_loader) * config.EPOCHS
scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

In [14]:
# Training loop
print("\n" + "="*60)
print("üìä Starting Training")
print("="*60)
    
best_f1 = 0
training_history = []
    
for epoch in range(config.EPOCHS):
        print(f"\n{'='*60}")
        print(f"Epoch {epoch + 1}/{config.EPOCHS}")
        print('='*60)


üìä Starting Training

Epoch 1/3

Epoch 2/3

Epoch 3/3


In [22]:
 # Train
train_loss, train_acc = train_epoch(
            model, train_loader, optimizer, scheduler, config.DEVICE
        )
        
print(f"\n‚úì Training Loss: {train_loss:.4f}")
print(f"‚úì Training Accuracy: {train_acc:.4f}")
        
        # Evaluate
val_loss, val_acc, val_precision, val_recall, val_f1, _, _ = evaluate(
            model, test_loader, config.DEVICE
        )
        
print(f"\nüìà Validation Results:")
print(f"   Loss: {val_loss:.4f}")
print(f"   Accuracy: {val_acc:.4f}")
print(f"   Precision: {val_precision:.4f}")
print(f"   Recall: {val_recall:.4f}")
print(f"   F1-Score: {val_f1:.4f}")
        
training_history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'val_precision': val_precision,
            'val_recall': val_recall,
            'val_f1': val_f1
        })
        
        # Save best model
if val_f1 > best_f1:
            best_f1 = val_f1
            print(f"\nüíæ New best F1! Saving model...")

Training:   0%|          | 0/50 [00:00<?, ?it/s]

Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [04:38<00:00,  5.58s/it, loss=1.05] 



‚úì Training Loss: 1.0526
‚úì Training Accuracy: 0.7601


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13/13 [00:15<00:00,  1.19s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



üìà Validation Results:
   Loss: 0.4998
   Accuracy: 0.9293
   Precision: 0.7448
   Recall: 0.7826
   F1-Score: 0.7630

üíæ New best F1! Saving model...


In [21]:
config = Config()
print("‚úÖ Training device:", config.DEVICE)


‚úÖ Training device: cpu


In [23]:
# Final evaluation
print("\n" + "="*60)
print("üéØ Final Evaluation on Test Set")
print("="*60)
    
test_loss, test_acc, test_precision, test_recall, test_f1, predictions, true_labels = \
        evaluate(model, test_loader, config.DEVICE)
    
print(f"\nüìä Final Test Metrics:")
print(f"   Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")
print(f"   Precision: {test_precision:.4f} ({test_precision*100:.2f}%)")
print(f"   Recall: {test_recall:.4f} ({test_recall*100:.2f}%)")
print(f"   F1-Score: {test_f1:.4f} ({test_f1*100:.2f}%)")
    
    # Detailed classification report
print("\n" + "="*60)
print("üìã Detailed Classification Report")
print("="*60)
    
target_names = [id2label[i] for i in range(len(id2label))]
print(classification_report(true_labels, predictions, target_names=target_names))
    
    # Confusion matrix
cm = confusion_matrix(true_labels, predictions)
print("\nüìä Confusion Matrix:")
print(cm)
    
    # Save model
print("\n" + "="*60)
print("üíæ Saving Model")
print("="*60)
    
os.makedirs(config.SAVE_DIR, exist_ok=True)
    
model.save_pretrained(config.SAVE_DIR)
tokenizer.save_pretrained(config.SAVE_DIR)
    
    # Save label mappings
with open(os.path.join(config.SAVE_DIR, 'label2id.json'), 'w') as f:
        json.dump(label2id, f, indent=2)
    
with open(os.path.join(config.SAVE_DIR, 'id2label.json'), 'w') as f:
        json.dump(id2label, f, indent=2)
    
    # Save training history
with open(os.path.join(config.SAVE_DIR, 'training_history.json'), 'w') as f:
        json.dump(training_history, f, indent=2)
    
    # Save final metrics
final_metrics = {
        'test_accuracy': float(test_acc),
        'test_precision': float(test_precision),
        'test_recall': float(test_recall),
        'test_f1': float(test_f1),
        'confusion_matrix': cm.tolist(),
        'classification_report': classification_report(
            true_labels, predictions, target_names=target_names, output_dict=True
        )
    }
    
with open(os.path.join(config.SAVE_DIR, 'metrics.json'), 'w') as f:
        json.dump(final_metrics, f, indent=2)
    
print(f"\n‚úì Model saved to: {config.SAVE_DIR}")
print("‚úì Files saved:")
print("   - pytorch_model.bin")
print("   - config.json")
print("   - vocab.txt")
print("   - tokenizer files")
print("   - label2id.json")
print("   - id2label.json")
print("   - training_history.json")
print("   - metrics.json")
    
print("\n" + "="*60)
print("üéâ Training Complete!")
print("="*60)

if __name__ == "__main__":
    main()


üéØ Final Evaluation on Test Set


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13/13 [00:16<00:00,  1.23s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



üìä Final Test Metrics:
   Accuracy: 0.9293 (92.93%)
   Precision: 0.7448 (74.48%)
   Recall: 0.7826 (78.26%)
   F1-Score: 0.7630 (76.30%)

üìã Detailed Classification Report
              precision    recall  f1-score   support

     Account       0.88      0.91      0.89        23
     Billing       1.00      1.00      1.00        22
       Other       0.93      1.00      0.96        25
   Technical       0.92      1.00      0.96        24
     unknown       0.00      0.00      0.00         5

    accuracy                           0.93        99
   macro avg       0.74      0.78      0.76        99
weighted avg       0.88      0.93      0.91        99


üìä Confusion Matrix:
[[21  0  2  0  0]
 [ 0 22  0  0  0]
 [ 0  0 25  0  0]
 [ 0  0  0 24  0]
 [ 3  0  0  2  0]]

üíæ Saving Model


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



‚úì Model saved to: models/distilbert-ticket-classifier
‚úì Files saved:
   - pytorch_model.bin
   - config.json
   - vocab.txt
   - tokenizer files
   - label2id.json
   - id2label.json
   - training_history.json
   - metrics.json

üéâ Training Complete!
üöÄ DistilBERT Ticket Classifier Training
Device: cpu
Epochs: 5
Batch Size: 16
Learning Rate: 2e-05

üìÇ Loading dataset...
‚úì Loaded 495 tickets

Class distribution:
label
Other        125
Technical    119
Account      115
Billing      113
unknown       23
Name: count, dtype: int64

Label mapping: {'Account': 0, 'Billing': 1, 'Other': 2, 'Technical': 3, 'unknown': 4}


ValueError: not enough values to unpack (expected 6, got 3)