In [42]:
import numpy as np
from datasets import load_from_disk
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
import math
import accelerate


In [34]:

"""
Loads the processed data and fine-tunes a DistilBERT model.
"""
print("--- 1. Loading Processed Dataset ---")
try:
    tokenized_datasets = load_from_disk("banking77-processed")
except FileNotFoundError:
    print("\nERROR: Could not find the 'banking77-processed' directory.")
    print("Please make sure you have successfully run the Phase 2 script first.")
    # Re-raise so the cell stops here. Without this, execution falls through
    # and the next statement fails with a misleading NameError because
    # `tokenized_datasets` was never bound.
    raise

# Size of the training split before filtering, used to report how many rows
# the label filter removes.
initial_train_size = len(tokenized_datasets["train"])

# --- Drop rows with missing labels ---
# A missing label can show up either as None or as NaN, so the filter has to
# reject both; an `is not None` check on its own lets NaN through.
def is_label_valid(example):
    """Return True when the example's 'labels' value is neither None nor NaN.

    Args:
        example: Mapping with a 'labels' entry (one dataset row).

    Returns:
        bool: False for a None or NaN label, True otherwise.
    """
    value = example['labels']
    # None has to be ruled out first: math.isnan() does not accept it.
    if value is None:
        return False
    return not math.isnan(value)

# Keep only the examples whose label passes is_label_valid.
tokenized_datasets = tokenized_datasets.filter(is_label_valid)
# -----------------------

# Training-split size after filtering; compared with the pre-filter size below.
filtered_train_size = len(tokenized_datasets["train"])

# Report the number of removed rows. Only the "train" split is counted here.
if initial_train_size > filtered_train_size:
    print(f"INFO: Filtered out {initial_train_size - filtered_train_size} rows with missing labels from the dataset.")

# Now that the data is clean, we can safely cast the labels to integers.
def cast_labels_to_int(example):
    """Convert the example's 'labels' value to a Python int, in place.

    Args:
        example: Mutable mapping with a 'labels' entry that int() accepts.

    Returns:
        The same mapping object, with 'labels' replaced by its int value.
    """
    as_int = int(example['labels'])
    example['labels'] = as_int
    return example
    
# Run the int cast over every example of the filtered dataset.
tokenized_datasets = tokenized_datasets.map(cast_labels_to_int)

# Split handles passed to the Trainer later on: "train" is used for training,
# "test" for evaluation.
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]
print("Dataset loaded, cleaned, and labels are correctly formatted.")

--- 1. Loading Processed Dataset ---


Filter:   0%|          | 0/10003 [00:00<?, ? examples/s]

Filter: 100%|██████████| 10003/10003 [00:00<00:00, 13088.53 examples/s]
Filter: 100%|██████████| 3080/3080 [00:00<00:00, 13818.05 examples/s]


INFO: Filtered out 1283 rows with missing labels from the dataset.


Map: 100%|██████████| 8720/8720 [00:05<00:00, 1604.69 examples/s]
Map: 100%|██████████| 2680/2680 [00:01<00:00, 1687.34 examples/s]

Dataset loaded, cleaned, and labels are correctly formatted.





In [4]:
# NOTE(review): these two assignments repeat the ones at the end of the
# dataset-loading cell, so this cell looks redundant — confirm and consider
# removing it.
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]
print("Dataset loaded successfully.")

Dataset loaded successfully.


In [35]:
print("\n--- 2. Defining Evaluation Metrics ---")
def compute_metrics(pred):
    """Score a batch of predictions with accuracy and weighted F1.

    Args:
        pred: Object exposing `predictions` (per-class scores; the arg-max
            over the last axis is taken as the predicted class id) and
            `label_ids` (the reference class ids).

    Returns:
        dict: {"accuracy": <accuracy_score>, "f1": <weighted f1_score>}.
    """
    predicted_ids = np.argmax(pred.predictions, axis=-1)
    true_ids = pred.label_ids
    return {
        "accuracy": accuracy_score(true_ids, predicted_ids),
        # 'weighted' averaging weights each class's F1 by its support, which
        # accounts for class imbalance if there is any.
        "f1": f1_score(true_ids, predicted_ids, average="weighted"),
    }


--- 2. Defining Evaluation Metrics ---


In [36]:
print("\n--- 3. Loading Pre-trained Model and Tokenizer ---")
# Checkpoint name shared by the tokenizer here and the classification model
# loaded in a later cell.
model_checkpoint = "distilbert-base-uncased"
# Tokenizer for that checkpoint; it is handed to the Trainer further down.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


--- 3. Loading Pre-trained Model and Tokenizer ---


In [37]:
# Label mappings handed to the model so it can translate between class ids
# and human-readable names.
id2label = {
    0: 'Billing Question',
    1: 'Technical Issue',
    2: 'General Inquiry',
}
# Inverse mapping, built from id2label so the two always agree.
label2id = {name: idx for idx, name in id2label.items()}


In [38]:
# Load the pre-trained checkpoint as a sequence classifier with our label
# mappings. The class count is taken from id2label rather than hard-coded, so
# the head size, the mappings and the log message below cannot drift apart.
num_labels = len(id2label)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
print(f"Model '{model_checkpoint}' loaded and configured for {num_labels}-class classification.")



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model 'distilbert-base-uncased' loaded and configured for 3-class classification.


In [39]:
# Directory passed to TrainingArguments as `output_dir` (presumably where the
# per-epoch checkpoints end up). The final model is saved to a separate path
# by the last cell of the notebook.
output_directory = "my-ticket-classifier"

In [40]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=output_directory,
    # --- Optimisation ---
    num_train_epochs=3,              # total passes over the training split
    learning_rate=2e-5,              # step size used by the optimiser
    per_device_train_batch_size=16,  # samples per training batch, per device
    per_device_eval_batch_size=16,   # samples per evaluation batch, per device
    weight_decay=0.01,               # regularisation; helps prevent overfitting
    # --- Evaluation / checkpointing ---
    eval_strategy="epoch",           # evaluate at the end of every epoch
    save_strategy="epoch",           # checkpoint at the end of every epoch
    load_best_model_at_end=True,     # reload the best checkpoint when training ends
    metric_for_best_model="f1",      # "best" is judged by the F1 score
    # --- Logging ---
    logging_dir='./logs',
    logging_steps=100,               # log training progress every 100 steps
)

# Wire the model, training configuration, datasets and metric function
# together. The tokenizer is passed as `processing_class`: the older
# `tokenizer=` keyword is deprecated in recent transformers releases and emits
# a FutureWarning on this call.
# NOTE(review): `processing_class` needs transformers >= 4.46 — confirm the
# pinned version before merging.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [43]:
print("\n--- 5. Starting Training ---")
# Run fine-tuning with the Trainer configured above; this is the long-running
# step of the notebook.
trainer.train()


--- 5. Starting Training ---




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
print("\n--- 6. Saving the Final Model ---")
# Checkpoints were already written during training; save the model once more
# to a dedicated final directory.
# NOTE(review): run this only after trainer.train() has finished — the message
# below reports success unconditionally.
final_model_path = "my-ticket-classifier-final"
trainer.save_model(final_model_path)
print(f"\nTraining complete! The best model has been saved to '{final_model_path}'.")
    