In [25]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    os.chdir(project_root)

In [27]:
from dotenv import load_dotenv

load_dotenv()

False

In [28]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU")

print(f"Using device: {device}")

GPU available: NVIDIA GeForce RTX 3060 Laptop GPU
Number of GPUs: 1
Using device: cuda


#### Data loading

In [30]:
from datasets import load_dataset

from helpers.const import HF_HUB_DATASET_ID

ds = load_dataset(HF_HUB_DATASET_ID)

df = ds["train"].to_pandas()

df.head()

Unnamed: 0,subject,body,department
0,Blinking lights on Emberton II,"Hey, so my Marshall Emberton II is doing this ...",Tech Support
1,My Sony speaker isn't working properly?,"Dear Support Team,\n\nI hope this email finds ...",Tech Support
2,Speaker keeps disconnecting,"Hi, I have a Sony SRS-XG300 and it keeps disco...",Tech Support
3,Hello there! Quick question about waterproofin...,"Dear wonderful support team,\n\nI hope this em...",Tech Support
4,AirPods Pro not working after update?,"Hi there, I have the AirPods Pro 2nd gen and a...",Tech Support


#### Prepare dataset

In [6]:
import pandas as pd

llm_df = pd.DataFrame(
    {
        "text": df["subject"].fillna("") + "\n\n" + df["body"].fillna(""),
        "label": df["department"],
    }
)

llm_df.head()

Unnamed: 0,text,label
0,"Blinking lights on Emberton II\n\nHey, so my M...",Tech Support
1,My Sony speaker isn't working properly?\n\nDea...,Tech Support
2,"Speaker keeps disconnecting\n\nHi, I have a So...",Tech Support
3,Hello there! Quick question about waterproofin...,Tech Support
4,AirPods Pro not working after update?\n\nHi th...,Tech Support


In [7]:
llm_df.shape

(1000, 2)

In [8]:
print("Missing values percentage:")
print(llm_df.isna().sum() / len(llm_df) * 100)

Missing values percentage:
text     0.0
label    0.0
dtype: float64


In [9]:
llm_df["label"].value_counts()

label
Tech Support     300
Billing          200
Shipping         200
Sales            150
Legal            100
Customer Care     50
Name: count, dtype: int64

#### Split data

In [10]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    llm_df, test_size=0.2, random_state=42, stratify=llm_df["label"]
)

#### Label Encoder

In [11]:
label2id = {label: i for i, label in enumerate(train_df["label"].unique())}
id2label = {i: label for label, i in label2id.items()}

train_df["label"] = train_df["label"].map(label2id)
test_df["label"] = test_df["label"].map(label2id)

#### Clasification metrics helpers

In [12]:
from functools import partial

from helpers.classification_metrics import (
    get_per_class_metrics_df,
    get_train_test_metrics_df,
)

get_per_class_metrics_df = partial(get_per_class_metrics_df, id2label=id2label)

#### Dummy model 

In [13]:
from sklearn.dummy import DummyClassifier

dummy_classifier = DummyClassifier(strategy="most_frequent", random_state=42)

dummy_classifier.fit(train_df["text"], train_df["label"])

y_train_pred_dummy = dummy_classifier.predict(train_df["text"])
y_test_pred_dummy = dummy_classifier.predict(test_df["text"])

dummy_metrics = get_train_test_metrics_df(
    test_df["label"], y_test_pred_dummy, "Dummy", train_df["label"], y_train_pred_dummy
)

dummy_per_class = get_per_class_metrics_df(test_df["label"], y_test_pred_dummy, "Dummy")

In [14]:
dummy_metrics

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,accuracy,precision,recall,f1-score,accuracy,precision,recall,f1-score
Dummy,0.3,0.09,0.3,0.138462,0.3,0.09,0.3,0.138462


In [15]:
dummy_per_class

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1-score,support
model,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Dummy,Legal,0.0,0.0,0.0,20.0
Dummy,Shipping,0.0,0.0,0.0,40.0
Dummy,Tech Support,0.3,1.0,0.461538,60.0
Dummy,Sales,0.0,0.0,0.0,30.0
Dummy,Billing,0.0,0.0,0.0,40.0
Dummy,Customer Care,0.0,0.0,0.0,10.0
Dummy,macro avg,0.05,0.166667,0.076923,200.0
Dummy,weighted avg,0.09,0.3,0.138462,200.0


#### Finetuned encoder

In [16]:
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

from paths import MODELS_DIR, TRAINING_LOGS_DIR

LLM_MODEL_NAME = "distilbert-base-uncased"
HF_HUB_MODEL_ID = "h3en1x/distilbert-support-tickets-classifier"

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)


def tokenizer_fn(examples: dict) -> dict:
    return tokenizer(examples["text"], truncation=True, padding="max_length")


tokenized_train_ds = train_ds.map(tokenizer_fn, batched=True)
tokenized_test_ds = test_ds.map(tokenizer_fn, batched=True)

tokenized_train_ds.set_format(
    type="torch", columns=["input_ids", "attention_mask", "label"]
)
tokenized_test_ds.set_format(
    type="torch", columns=["input_ids", "attention_mask", "label"]
)

model = AutoModelForSequenceClassification.from_pretrained(
    LLM_MODEL_NAME, num_labels=len(label2id), id2label=id2label, label2id=label2id
)

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=train_df["label"].unique(),
    y=train_df["label"].values,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)


training_args = TrainingArguments(
    num_train_epochs=5,
    eval_strategy="epoch",
    logging_strategy="epoch",
    report_to="tensorboard",
    logging_dir=TRAINING_LOGS_DIR / LLM_MODEL_NAME,
    output_dir=MODELS_DIR / LLM_MODEL_NAME,
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    weight_decay=0.05,
    learning_rate=1e-5,
    fp16=True,
    push_to_hub=True,
    hub_model_id=HF_HUB_MODEL_ID,
    hub_strategy="checkpoint",
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }


class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = torch.nn.CrossEntropyLoss(
            weight=class_weights_tensor.to(logits.device)
        )
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss


trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

train_result = trainer.train()

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5944,1.199398,0.895,0.87159
2,0.8844,0.617126,0.94,0.917895
3,0.4691,0.434729,0.94,0.916815
4,0.3141,0.337282,0.955,0.947643
5,0.2586,0.329803,0.96,0.951358


In [17]:
llm_train_predictions = trainer.predict(tokenized_train_ds)
llm_y_pred_train = llm_train_predictions.predictions.argmax(axis=-1)
llm_y_true_train = tokenized_train_ds["label"]

llm_test_predictions = trainer.predict(tokenized_test_ds)
llm_y_pred_test = llm_test_predictions.predictions.argmax(axis=-1)
llm_y_true_test = tokenized_test_ds["label"]

llm_metrics_df = get_train_test_metrics_df(
    llm_y_true_test,
    llm_y_pred_test,
    "DistilBERT",
    y_train=llm_y_true_train,
    y_train_pred=llm_y_pred_train,
)

llm_per_class_metrics_df = get_per_class_metrics_df(
    llm_y_true_test, llm_y_pred_test, "DistilBERT"
)

In [18]:
llm_metrics_df

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,accuracy,precision,recall,f1-score,accuracy,precision,recall,f1-score
DistilBERT,0.97375,0.976893,0.97375,0.969928,0.96,0.962554,0.96,0.951358


In [19]:
llm_per_class_metrics_df

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1-score,support
model,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DistilBERT,Legal,0.863636,0.95,0.904762,20.0
DistilBERT,Shipping,0.952381,1.0,0.97561,40.0
DistilBERT,Tech Support,0.952381,1.0,0.97561,60.0
DistilBERT,Sales,1.0,1.0,1.0,30.0
DistilBERT,Billing,1.0,1.0,1.0,40.0
DistilBERT,Customer Care,1.0,0.3,0.461538,10.0
DistilBERT,macro avg,0.9614,0.875,0.886253,200.0
DistilBERT,weighted avg,0.962554,0.96,0.951358,200.0


#### Summary

In [20]:
all_metrics_df = pd.concat([dummy_metrics, llm_metrics_df], axis=0)

all_metrics_df

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,accuracy,precision,recall,f1-score,accuracy,precision,recall,f1-score
Dummy,0.3,0.09,0.3,0.138462,0.3,0.09,0.3,0.138462
DistilBERT,0.97375,0.976893,0.97375,0.969928,0.96,0.962554,0.96,0.951358


#### Push to hub

In [21]:
from dotenv import load_dotenv

load_dotenv()

True

In [None]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/h3en1x/distilbert-support-tickets-classifier/commit/68b1ca31003a0fa4f35330e16101813f9d252844', commit_message='End of training', commit_description='', oid='68b1ca31003a0fa4f35330e16101813f9d252844', pr_url=None, repo_url=RepoUrl('https://huggingface.co/h3en1x/distilbert-support-tickets-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='h3en1x/distilbert-support-tickets-classifier'), pr_revision=None, pr_num=None)

#### Test predictions

In [23]:
df[df["department"] == "Customer Care"].iloc[0]["body"]

'hey so ive been trying to reach u guys for days now and nothing. this week has been hell and i just need help with my thing but no one answers??? like whats going on i just want to talk to someone pls help'

In [24]:
custom_prompt = "Website down. Hi, just wanted to let u know that there is an issue with your website. Its down at the moment. "
custom_dataset = Dataset.from_dict({"text": [custom_prompt]})
tokenized_custom = custom_dataset.map(tokenizer_fn, batched=True)
tokenized_custom.set_format(type="torch", columns=["input_ids", "attention_mask"])

predictions = trainer.predict(tokenized_custom)
predicted_class = predictions.predictions.argmax(axis=-1)[0]
confidence = predictions.predictions.max(axis=-1)[0]

predicted_department = id2label[predicted_class]

print("\nPrediction Results:")
print(f"Prompt: {custom_prompt}")
print(f"Predicted Department: {predicted_department}")
print(f"Confidence: {confidence:.3f}")


Map:   0%|          | 0/1 [00:00<?, ? examples/s]


Prediction Results:
Prompt: Website down. Hi, just wanted to let u know that there is an issue with your website. Its down at the moment. 
Predicted Department: Legal
Confidence: 1.819
