In [1]:
import pandas as pd

In [2]:
# Location of the merged ticket dataset. The original machine-specific path is
# kept as the default; set TICKET_DATASET_PATH to run the notebook on another
# machine without editing this cell.
import os

DATASET_PATH = os.environ.get(
    "TICKET_DATASET_PATH",
    "D:\\ticket_system\\ML_model\\datasets\\merged_dataset.csv",
)
df = pd.read_csv(DATASET_PATH)

In [3]:
# Build one free-text field per ticket: subject, a single space, then body.
# Missing subjects/bodies are replaced by '' first so they do not propagate as NaN.
df["text"] = df["subject"].fillna('').add(" ").add(df["body"].fillna(''))

In [4]:
# The combined "text" column is built in the preceding cell; this cell used to
# repeat that identical assignment. Validate the result instead of recomputing it.
assert "text" in df.columns, 'expected the combined "text" column to exist'
assert df["text"].notna().all(), '"text" must not contain missing values'

In [5]:
# Preview the first five rows, including the newly added "text" column.
df.head(n=5)

Unnamed: 0,subject,body,answer,type,priority,tags,text
0,Wesentlicher Sicherheitsvorfall,"Sehr geehrtes Support-Team,\n\nich möchte eine...",Vielen Dank für die Meldung des kritischen Sic...,Incident,high,"['Security', 'Outage', 'Disruption', 'Data Bre...",Wesentlicher Sicherheitsvorfall Sehr geehrtes ...
1,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...","Thank you for reaching out, <name>. We are awa...",Incident,high,"['Account', 'Disruption', 'Outage', 'IT', 'Tec...","Account Disruption Dear Customer Support Team,..."
2,Query About Smart Home System Integration Feat...,"Dear Customer Support Team,\n\nI hope this mes...",Thank you for your inquiry. Our products suppo...,Request,medium,"['Product', 'Feature', 'Tech Support']",Query About Smart Home System Integration Feat...
3,Inquiry Regarding Invoice Details,"Dear Customer Support Team,\n\nI hope this mes...",We appreciate you reaching out with your billi...,Request,low,"['Billing', 'Payment', 'Account', 'Documentati...",Inquiry Regarding Invoice Details Dear Custome...
4,Question About Marketing Agency Software Compa...,"Dear Support Team,\n\nI hope this message reac...",Thank you for your inquiry. Our product suppor...,Problem,medium,"['Product', 'Feature', 'Feedback', 'Tech Suppo...",Question About Marketing Agency Software Compa...


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [7]:
# One encoder per target column so each keeps its own class list.
le_type, le_priority = LabelEncoder(), LabelEncoder()

In [8]:
# Fit each encoder on its own column and store the encoded labels alongside
# the original string columns.
df["type_label"] = le_type.fit(df["type"]).transform(df["type"])
df["priority_label"] = le_priority.fit(df["priority"]).transform(df["priority"])

In [9]:
# Hold out 20% of the tickets. Text and both label columns go through a single
# call so the three splits stay row-aligned; random_state pins the shuffle.
(
    train_texts,
    test_texts,
    train_type,
    test_type,
    train_priority,
    test_priority,
) = train_test_split(
    df["text"],
    df["type_label"],
    df["priority_label"],
    test_size=0.2,
    random_state=42,
)

In [10]:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Tokenizer for the "bert-base-uncased" checkpoint; TicketDataset reads this
# notebook-level name when encoding texts.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)

In [12]:

class TicketDataset(Dataset):
    """Tokenized ticket texts paired with two label sets (one per task).

    Args:
        texts: Ticket texts; a pandas Series or any iterable of strings.
        labels1: Labels for the first task, aligned row-for-row with ``texts``.
        labels2: Labels for the second task, aligned row-for-row with ``texts``.
        text_tokenizer: Callable used to encode the texts. When omitted, the
            notebook-level ``tokenizer`` is used.
        max_length: Truncation length forwarded to the tokenizer.

    Raises:
        ValueError: If the texts and the two label collections differ in length.
    """

    def __init__(self, texts, labels1, labels2, text_tokenizer=None, max_length=256):
        # Only fall back to the notebook-level tokenizer when none was supplied.
        tok = tokenizer if text_tokenizer is None else text_tokenizer
        text_list = texts.tolist() if hasattr(texts, "tolist") else list(texts)

        self.labels1 = self._as_label_tensor(labels1)
        self.labels2 = self._as_label_tensor(labels2)
        if not (len(text_list) == len(self.labels1) == len(self.labels2)):
            raise ValueError(
                "texts, labels1 and labels2 must have the same length; got "
                f"{len(text_list)}, {len(self.labels1)} and {len(self.labels2)}"
            )

        self.encodings = tok(text_list, truncation=True, padding=True, max_length=max_length)

    @staticmethod
    def _as_label_tensor(labels):
        """Convert labels to a positional int64 tensor.

        Going through ``to_numpy()`` drops any pandas index, so ``labels[idx]``
        is always positional (a Series that went through a shuffled split would
        otherwise be looked up by index label). The int64 cast matches the
        class-index target dtype expected by ``nn.CrossEntropyLoss``, even when
        the incoming array is int32.
        """
        values = labels.to_numpy() if hasattr(labels, "to_numpy") else labels
        return torch.as_tensor(values, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the encoded fields plus ``labels1``/``labels2`` for row ``idx``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels1"] = self.labels1[idx]
        item["labels2"] = self.labels2[idx]
        return item

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.labels1)

In [13]:
# Wrap each split in a TicketDataset. Labels are handed over as plain NumPy
# arrays so they are indexed by position rather than by the shuffled pandas index.
train_dataset = TicketDataset(
    texts=train_texts,
    labels1=train_type.to_numpy(),
    labels2=train_priority.to_numpy(),
)
test_dataset = TicketDataset(
    texts=test_texts,
    labels1=test_type.to_numpy(),
    labels2=test_priority.to_numpy(),
)

In [14]:
from torch import nn
from transformers import BertModel

In [15]:
class MultiTaskBERT(nn.Module):
    """BERT encoder with two independent linear classification heads.

    Both heads read the same pooled encoder output after dropout.

    Args:
        num_labels1: Number of classes for the first head.
        num_labels2: Number of classes for the second head.
        model_name: Checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to ``"bert-base-uncased"``.
        dropout: Dropout probability applied to the pooled output.
            Defaults to ``0.3``.
    """

    def __init__(self, num_labels1, num_labels2, model_name="bert-base-uncased", dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        hidden_size = self.bert.config.hidden_size
        self.classifier1 = nn.Linear(hidden_size, num_labels1)
        self.classifier2 = nn.Linear(hidden_size, num_labels2)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels1=None, labels2=None):
        """Run the encoder and both heads.

        ``labels1`` and ``labels2`` are accepted but not used here; this
        method never computes a loss.

        Returns:
            dict with ``"logits1"`` and ``"logits2"``, one entry per head.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = self.dropout(outputs.pooler_output)
        return {
            "logits1": self.classifier1(pooled_output),
            "logits2": self.classifier2(pooled_output),
        }

In [16]:
# Size each classification head from the number of classes its encoder learned.
num_type_classes = len(le_type.classes_)
num_priority_classes = len(le_priority.classes_)
model = MultiTaskBERT(
    num_labels1=num_type_classes,
    num_labels2=num_priority_classes,
)


In [17]:
from transformers import Trainer, TrainingArguments

In [23]:
class CustomTrainer(Trainer):
    """Trainer whose loss is the unweighted sum of both heads' cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the combined loss for one batch.

        ``labels1`` and ``labels2`` are popped from ``inputs`` (the dict is
        mutated) before the forward pass. ``num_items_in_batch`` is accepted
        but not used.

        Returns:
            The summed loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets1 = inputs.pop("labels1")
        targets2 = inputs.pop("labels2")
        outputs = model(**inputs)

        # Default (mean-reduced) cross-entropy per head, then a plain sum.
        head1_loss = nn.functional.cross_entropy(outputs["logits1"], targets1)
        head2_loss = nn.functional.cross_entropy(outputs["logits2"], targets2)
        total_loss = head1_loss + head2_loss

        if return_outputs:
            return (total_loss, outputs)
        return total_loss


In [24]:
# Training configuration: 3 epochs, batches of 8 per device, a checkpoint at
# the end of every epoch, and a log entry every 100 steps.
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir="./results",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
)

# CustomTrainer supplies the two-head loss; the held-out split is attached as
# the evaluation dataset.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [25]:
# Start fine-tuning from scratch (no checkpoint to resume from).
trainer.train(resume_from_checkpoint=None)

Step,Training Loss


KeyboardInterrupt: 

In [None]:
import pickle

# Persist the trained model under model_path.
model_path = "./ticket_classifier_model"
trainer.save_model(output_dir=model_path)

# Store both fitted encoders together as a (type, priority) tuple so predicted
# class ids can be mapped back to their original labels.
with open("label_encoders.pkl", mode="wb") as f:
    pickle.dump((le_type, le_priority), f)