In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the data
df = pd.read_excel(r"C:\Users\purvesh.mule_jadeglo\Desktop\Project\it_-_last_3_month_total_tickets_12556686_2025_01_20_.xlsx")

# Combine subject and description
df["Full_description"] = df["Subject"] + " " + df["Description"].fillna('')
df = df.dropna(subset=["Full_description"])

# Encode the target labels
le = LabelEncoder()
df["label"] = le.fit_transform(df["Sub-Category"])

texts = df["Full_description"].tolist()
labels = df["label"].tolist()

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42)

In [3]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenize the inputs
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

In [4]:
class TicketDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = TicketDataset(train_encodings, train_labels)
test_dataset = TicketDataset(test_encodings, test_labels)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(le.classes_))
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
#train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(5):
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        loop.set_postfix(loss=loss.item())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 67/67 [13:49<00:00, 12.38s/it, loss=0.936]
Epoch 2: 100%|██████████| 67/67 [13:15<00:00, 11.87s/it, loss=1.01] 
Epoch 3: 100%|██████████| 67/67 [12:42<00:00, 11.37s/it, loss=0.403]
Epoch 4: 100%|██████████| 67/67 [14:07<00:00, 12.65s/it, loss=0.902] 
Epoch 5: 100%|██████████| 67/67 [13:26<00:00, 12.04s/it, loss=0.336]


In [6]:
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

print(classification_report(all_labels, all_preds, target_names=le.classes_))

100%|██████████| 34/34 [00:53<00:00,  1.57s/it]


                                 precision    recall  f1-score   support

                 Access Request       0.00      0.00      0.00         4
                            GWS       0.29      0.50      0.37        10
HR Systems & Information Update       0.00      0.00      0.00         4
               Hardware Related       0.62      0.83      0.71        36
            IT SYS - Admin Task       0.00      0.00      0.00         7
                      Messaging       1.00      0.50      0.67         4
                Network Related       0.60      0.43      0.50         7
       Onboarding / Offboarding       0.97      0.94      0.95        63
                         Others       0.00      0.00      0.00         5
      Password & Access related       0.64      0.41      0.50        17
            Security Monitoring       1.00      1.00      1.00        14
                Service Request       0.67      0.86      0.75         7
           Software/Application       0.80      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
import torch.nn.functional as F

def predict_ticket_roberta(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = F.softmax(logits, dim=1)  # Convert logits to probabilities
        pred_label_id = torch.argmax(probs, dim=1).item()
        confidence = probs[0][pred_label_id].item()  # Confidence score of predicted label

    if confidence < 0.50:
        return "Helpdesk", confidence
    else:
        predicted_label = le.inverse_transform([pred_label_id])[0]
        return predicted_label, confidence

# Example
label, confidence = predict_ticket_roberta("wifi is not working")
print(f"Assign to: {label} (Confidence: {confidence:.2f})")

Assign to: Network Related (Confidence: 0.61)


In [None]:
model.save_pretrained("roberta_ticket_model/")
tokenizer.save_pretrained("roberta_ticket_model/")
import joblib
joblib.dump(model, "roberta_model.pkl")
joblib.dump(le, "label_encoder_roberta.pkl")