In [1]:
!pip install transformers[torch] datasets pandas scikit-learn



In [2]:
!pip install -q transformers datasets spacy scikit-learn scipy
!python -m spacy download en_core_web_sm


  if entities is not ():
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.8/12.8 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m‚ö† Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import torch
if torch.cuda.is_available():
    print(f"GPU is active: {torch.cuda.get_device_name(0)}")
else:
    print("GPU NOT FOUND. Go to Runtime > Change Runtime Type > T4 GPU")

!pip install transformers[torch] datasets pandas scikit-learn

GPU is active: Tesla T4


In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import scipy.special as sp

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import spacy

from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)


2025-12-19 11:55:00.598135: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766145300.804027      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766145300.863006      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766145301.342203      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766145301.342241      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766145301.342244      24 computation_placer.cc:177] computation placer alr

In [5]:
PATH = "/kaggle/input/all-tickets-prepared-for-model/all_tickets_prepared_for_model.csv"
df = pd.read_csv(PATH)

print(df.head())

texts = df["clean_text"].astype(str).tolist()

MERGE_MAP = {
    3: 1,
    5: 1   # optional but recommended for 92%+
}

df["Topic_ID_merged"] = df["Topic_ID"].replace(MERGE_MAP)

# üî• STEP 2: Encode merged labels
le = LabelEncoder()
labels = le.fit_transform(df["Topic_ID_merged"])
num_labels = len(np.unique(labels))

print("New number of labels:", num_labels)

X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.15,
    random_state=42,
    stratify=labels
)


                                          clean_text    Topic_group Priority  \
0  connection icon icon dear please setup icon pe...       Hardware   Medium   
1  work experience user work experience user hi w...         Access   Medium   
2  requesting meeting requesting meeting hi pleas...       Hardware   Medium   
3  reset passwords external accounts expire days ...         Access   Medium   

   Topic_ID  Priority_ID  
0         3            2  
1         0            2  
2         3            2  
3         0            2  
4         5            2  
New number of labels: 6


In [6]:
nlp = spacy.load("en_core_web_sm", disable=["parser","tagger"])

def add_entities(text):
    doc = nlp(text)
    ents = " ".join(ent.text for ent in doc.ents)
    return text + " " + ents

X_train_ner = [add_entities(t) for t in X_train]
X_val_ner   = [add_entities(t) for t in X_val]

tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=150_000,
    min_df=2,
    max_df=0.9
)

X_train_tfidf = tfidf.fit_transform(X_train_ner)
X_val_tfidf   = tfidf.transform(X_val_ner)

svm = CalibratedClassifierCV(LinearSVC(C=1.0))
svm.fit(X_train_tfidf, y_train)

svm_probs = svm.predict_proba(X_val_tfidf)
svm_preds = np.argmax(svm_probs, axis=1)

print("TF-IDF + NER + SVM accuracy:",
      accuracy_score(y_val, svm_preds))




TF-IDF + NER + SVM accuracy: 0.8961457603363701


In [7]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize(texts):
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=96
    )

train_enc = tokenize(X_train)
val_enc   = tokenize(X_val)

class TicketDataset(Dataset):
    def __init__(self, enc, labels):
        self.enc = enc
        self.labels = labels
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k,v in self.enc.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_ds = TicketDataset(train_enc, y_train)
val_ds   = TicketDataset(val_enc, y_val)

model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=num_labels
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"accuracy": accuracy_score(labels, preds)}

args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    fp16=True,
    logging_steps=100,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(2)]
)

trainer.train()
pred_out = trainer.predict(val_ds)

roberta_probs = sp.softmax(pred_out.predictions, axis=1)
print("RoBERTa accuracy:",
      accuracy_score(y_val, np.argmax(roberta_probs, axis=1)))


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3385,0.30584,0.89124
2,0.2648,0.303489,0.898388
3,0.2023,0.280264,0.90904
4,0.1529,0.30363,0.907078
5,0.1172,0.322564,0.910862




RoBERTa accuracy: 0.9108619481429573


In [8]:
# Build vocabulary
vocab = {w:i+1 for i,w in enumerate(set(" ".join(X_train).split()))}

def text_to_indices(texts, max_len=50):
    seqs = []
    for t in texts:
        s = [vocab.get(w,0) for w in t.split()]
        s = s[:max_len] + [0]*(max_len-len(s))
        seqs.append(s)
    return torch.tensor(seqs)

X_train_idx = text_to_indices(X_train)
X_val_idx   = text_to_indices(X_val)

class LSTMDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = torch.tensor(y)
    def __len__(self): return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(
    LSTMDataset(X_train_idx, y_train),
    batch_size=256,
    shuffle=True
)
val_loader = DataLoader(
    LSTMDataset(X_val_idx, y_val),
    batch_size=256
)

class GRUClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
    def forward(self, x):
        emb = self.embedding(x)
        _, h = self.gru(emb)
        return self.fc(h.squeeze(0))

gru_model = GRUClassifier(
    vocab_size=len(vocab)+1,
    embed_dim=50,
    hidden_dim=64,
    num_classes=num_labels
).to("cuda")

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gru_model.parameters(), lr=1e-3)

for epoch in range(3):
    gru_model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to("cuda"), yb.to("cuda")
        optimizer.zero_grad()
        loss = criterion(gru_model(xb), yb)
        loss.backward()
        optimizer.step()

    gru_model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to("cuda"), yb.to("cuda")
            preds = torch.argmax(gru_model(xb), dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)

    print(f"GRU Epoch {epoch+1} ‚Üí Val Acc: {correct/total:.4f}")


GRU Epoch 1 ‚Üí Val Acc: 0.4772
GRU Epoch 2 ‚Üí Val Acc: 0.7449
GRU Epoch 3 ‚Üí Val Acc: 0.8353


In [9]:
gru_model.eval()
gru_probs = []

with torch.no_grad():
    for xb, _ in val_loader:
        xb = xb.to("cuda")
        probs = torch.softmax(gru_model(xb), dim=1)
        gru_probs.append(probs.cpu().numpy())

gru_probs = np.vstack(gru_probs)


In [10]:
stack_X = np.hstack([
    roberta_probs,
    svm_probs,
    gru_probs
])

stacker = LogisticRegression(
    max_iter=4000,
    class_weight="balanced",
    n_jobs=-1
)

stacker.fit(stack_X, y_val)
stack_preds = stacker.predict(stack_X)

print("üèÜ FINAL STACKING ACCURACY:",
      accuracy_score(y_val, stack_preds))


üèÜ FINAL STACKING ACCURACY: 0.9118430273300631


In [11]:
import torch

# Assume 'model' is your trained PyTorch model
MODEL_PATH = "/kaggle/working/my_model.pth"

# Save the model's state_dict
torch.save(model.state_dict(), MODEL_PATH)

# Later, to load the model
# model.load_state_dict(torch.load(MODEL_PATH))
# model.eval()
