In [9]:
import pandas as pd
df = pd.read_csv('file-for-fineTuning.csv')
df.head()

Unnamed: 0,text,label,ticket_id,agent_name,text_length
0,Where can I find API rate limits documentation?,other,1000,Dana,47
1,Can I migrate data from Freshdesk?,other,1001,Bob,34
2,Cannot update billing address; page keeps relo...,billing,1002,Charlie,52
3,Looking for a product roadmap or upcoming feat...,other,1003,Dana,51
4,Dark mode toggled but UI stays light.,tech_support,1004,Alice,37


In [10]:
texts = df["text"].tolist()
labels = df["label"].astype("category").cat.codes.tolist()

Tokenize the Text Data

In [11]:
### Using the DistilBERT tokenizer:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
encodings = tokenizer(texts, truncation = True, padding = True, max_length = 128)



Creating Dataset for Training

In [12]:
from torch.utils.data import Dataset
import torch

class TicketDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            key: torch.tensor(val[idx]) 
            for key, val in self.encodings.items()
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)  # ← FIX: Add dtype
        return item

    def __len__(self):
        return len(self.labels)



In [13]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labes = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

train_dataset = TicketDataset(train_encodings, train_labels)
val_dataset = TicketDataset(val_encodings, val_labes)

Load Pre-Trained DistilBERT and Fine-Tune

In [14]:
# Run this to check if all packages are installed correctly
import torch
import transformers
import accelerate

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")

# Check if GPU is available
print(f"GPU available: {torch.cuda.is_available()}")

PyTorch version: 2.9.0+cpu
Transformers version: 4.30.2
Accelerate version: 1.11.0
GPU available: False


In [15]:
# Run this after restarting kernel
import accelerate
print(f"Accelerate version: {accelerate.__version__}")

from transformers import Trainer, TrainingArguments
print("✅ Trainer imported successfully!")

Accelerate version: 1.11.0
✅ Trainer imported successfully!


In [16]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

# Make sure you have the right number of labels
num_labels = len(set(labels))  # Should be 4 for your categories

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', 
    num_labels=num_labels
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,  # Added for better training
    metric_for_best_model="eval_loss",  # Added for better training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Now train the model
trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.

{'loss': 1.2554, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}


                                      
  0%|          | 0/75 [05:41<?, ?it/s]         

{'loss': 0.8481, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}



[A
[A
[A
[A
                                      
[A                                            

  0%|          | 0/75 [05:48<?, ?it/s]       
[A
[A

{'eval_loss': 0.5088710188865662, 'eval_runtime': 1.2092, 'eval_samples_per_second': 81.874, 'eval_steps_per_second': 5.789, 'epoch': 1.0}


                                      
  0%|          | 0/75 [05:55<?, ?it/s]         

{'loss': 0.5981, 'learning_rate': 3e-05, 'epoch': 1.2}


                                      
  0%|          | 0/75 [06:06<?, ?it/s]         

{'loss': 0.4359, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}


                                      
  0%|          | 0/75 [06:16<?, ?it/s]         

{'loss': 0.3196, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}



[A
[A
[A
[A
[A
                                      
[A                                            

  0%|          | 0/75 [06:18<?, ?it/s]       
[A
[A

{'eval_loss': 0.25896069407463074, 'eval_runtime': 1.2795, 'eval_samples_per_second': 77.373, 'eval_steps_per_second': 5.471, 'epoch': 2.0}


                                      
  0%|          | 0/75 [06:30<?, ?it/s]         

{'loss': 0.259, 'learning_rate': 1e-05, 'epoch': 2.4}


                                      
  0%|          | 0/75 [06:41<?, ?it/s]         

{'loss': 0.3136, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}



[A
[A
[A
[A
                                      
[A                                            

  0%|          | 0/75 [06:48<?, ?it/s]       
[A
[A

{'eval_loss': 0.24440349638462067, 'eval_runtime': 1.2443, 'eval_samples_per_second': 79.564, 'eval_steps_per_second': 5.626, 'epoch': 3.0}


                                      
100%|██████████| 75/75 [01:32<00:00,  1.24s/it]

{'train_runtime': 93.8475, 'train_samples_per_second': 12.659, 'train_steps_per_second': 0.799, 'train_loss': 0.5553516308466594, 'epoch': 3.0}





TrainOutput(global_step=75, training_loss=0.5553516308466594, metrics={'train_runtime': 93.8475, 'train_samples_per_second': 12.659, 'train_steps_per_second': 0.799, 'train_loss': 0.5553516308466594, 'epoch': 3.0})