In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    os.chdir(project_root)

In [3]:
from dotenv import load_dotenv

load_dotenv()

True

In [4]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU")

print(f"Using device: {device}")

GPU available: NVIDIA GeForce RTX 3060 Laptop GPU
Number of GPUs: 1
Using device: cuda


#### Data loading

In [5]:
from datasets import load_dataset

from helpers.const import HF_HUB_DATASET_ID

ds = load_dataset(HF_HUB_DATASET_ID)

ds

#### Prepare dataset

In [7]:
def merge_subject_body(example):
    return {
        "text": (example.get("subject") or "") + "\n\n" + (example.get("body") or ""),
        "label": example["department"],
    }


llm_ds = ds.map(merge_subject_body, remove_columns=ds["train"].column_names)

In [8]:
llm_df = llm_ds["train"].to_pandas()

In [9]:
print("Missing values percentage:")
print(llm_df.isna().sum() / len(llm_df) * 100)

Missing values percentage:
text     0.0
label    0.0
dtype: float64


In [10]:
llm_df["label"].value_counts()

label
Tech Support     300
Billing          200
Shipping         200
Sales            150
Legal            100
Customer Care     50
Name: count, dtype: int64

#### Split data

In [11]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    llm_df, test_size=0.2, random_state=42, stratify=llm_df["label"]
)

#### Label Encoder

In [12]:
label2id = {label: i for i, label in enumerate(train_df["label"].unique())}
id2label = {i: label for label, i in label2id.items()}

train_df["label"] = train_df["label"].map(label2id)
test_df["label"] = test_df["label"].map(label2id)

#### Clasification metrics helpers

In [13]:
from functools import partial

from helpers.classification_metrics import (
    get_per_class_metrics_df,
    get_train_test_metrics_df,
)

get_per_class_metrics_df = partial(get_per_class_metrics_df, id2label=id2label)

#### Dummy model 

In [14]:
from sklearn.dummy import DummyClassifier

dummy_classifier = DummyClassifier(strategy="most_frequent", random_state=42)

dummy_classifier.fit(train_df["text"], train_df["label"])

y_train_pred_dummy = dummy_classifier.predict(train_df["text"])
y_test_pred_dummy = dummy_classifier.predict(test_df["text"])

dummy_metrics = get_train_test_metrics_df(
    test_df["label"], y_test_pred_dummy, "Dummy", train_df["label"], y_train_pred_dummy
)

dummy_per_class = get_per_class_metrics_df(test_df["label"], y_test_pred_dummy, "Dummy")

In [15]:
dummy_metrics

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,accuracy,precision,recall,f1-score,accuracy,precision,recall,f1-score
Dummy,0.3,0.09,0.3,0.138462,0.3,0.09,0.3,0.138462


In [16]:
dummy_per_class

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1-score,support
model,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Dummy,Legal,0.0,0.0,0.0,20.0
Dummy,Shipping,0.0,0.0,0.0,40.0
Dummy,Tech Support,0.3,1.0,0.461538,60.0
Dummy,Sales,0.0,0.0,0.0,30.0
Dummy,Billing,0.0,0.0,0.0,40.0
Dummy,Customer Care,0.0,0.0,0.0,10.0
Dummy,macro avg,0.05,0.166667,0.076923,200.0
Dummy,weighted avg,0.09,0.3,0.138462,200.0


#### Finetuned encoder

In [17]:
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

from helpers.const import HF_HUB_MODEL_ID
from paths import MODELS_DIR, TRAINING_LOGS_DIR

BASE_MODEL_NAME = "distilbert-base-uncased"

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)


def tokenizer_fn(examples: dict) -> dict:
    return tokenizer(examples["text"], truncation=True, padding="max_length")


tokenized_train_ds = train_ds.map(tokenizer_fn, batched=True)
tokenized_test_ds = test_ds.map(tokenizer_fn, batched=True)

tokenized_train_ds.set_format(
    type="torch", columns=["input_ids", "attention_mask", "label"]
)
tokenized_test_ds.set_format(
    type="torch", columns=["input_ids", "attention_mask", "label"]
)

model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL_NAME, num_labels=len(label2id), id2label=id2label, label2id=label2id
)

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=train_df["label"].unique(),
    y=train_df["label"].values,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)


training_args = TrainingArguments(
    num_train_epochs=5,
    eval_strategy="epoch",
    logging_strategy="epoch",
    report_to="tensorboard",
    logging_dir=TRAINING_LOGS_DIR / BASE_MODEL_NAME,
    output_dir=MODELS_DIR / BASE_MODEL_NAME,
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    weight_decay=0.05,
    learning_rate=1e-5,
    fp16=True,
    push_to_hub=True,
    hub_model_id=HF_HUB_MODEL_ID,
    hub_strategy="checkpoint",
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }


class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = torch.nn.CrossEntropyLoss(
            weight=class_weights_tensor.to(logits.device)
        )
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss


trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

train_result = trainer.train()

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.6155,1.222697,0.88,0.854422
2,0.8703,0.621945,0.935,0.911403
3,0.4462,0.419261,0.945,0.927667
4,0.2973,0.31934,0.965,0.959252
5,0.2401,0.305443,0.96,0.951358


In [18]:
llm_train_predictions = trainer.predict(tokenized_train_ds)
llm_y_pred_train = llm_train_predictions.predictions.argmax(axis=-1)
llm_y_true_train = tokenized_train_ds["label"]

llm_test_predictions = trainer.predict(tokenized_test_ds)
llm_y_pred_test = llm_test_predictions.predictions.argmax(axis=-1)
llm_y_true_test = tokenized_test_ds["label"]

llm_metrics_df = get_train_test_metrics_df(
    llm_y_true_test,
    llm_y_pred_test,
    "DistilBERT",
    y_train=llm_y_true_train,
    y_train_pred=llm_y_pred_train,
)

llm_per_class_metrics_df = get_per_class_metrics_df(
    llm_y_true_test, llm_y_pred_test, "DistilBERT"
)

In [19]:
llm_metrics_df

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,accuracy,precision,recall,f1-score,accuracy,precision,recall,f1-score
DistilBERT,0.9825,0.983346,0.9825,0.980861,0.96,0.962554,0.96,0.951358


In [20]:
llm_per_class_metrics_df

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1-score,support
model,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DistilBERT,Legal,0.863636,0.95,0.904762,20.0
DistilBERT,Shipping,0.952381,1.0,0.97561,40.0
DistilBERT,Tech Support,0.952381,1.0,0.97561,60.0
DistilBERT,Sales,1.0,1.0,1.0,30.0
DistilBERT,Billing,1.0,1.0,1.0,40.0
DistilBERT,Customer Care,1.0,0.3,0.461538,10.0
DistilBERT,macro avg,0.9614,0.875,0.886253,200.0
DistilBERT,weighted avg,0.962554,0.96,0.951358,200.0


#### Summary

In [21]:
import pandas as pd

all_metrics_df = pd.concat([dummy_metrics, llm_metrics_df], axis=0)

all_metrics_df

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,accuracy,precision,recall,f1-score,accuracy,precision,recall,f1-score
Dummy,0.3,0.09,0.3,0.138462,0.3,0.09,0.3,0.138462
DistilBERT,0.9825,0.983346,0.9825,0.980861,0.96,0.962554,0.96,0.951358


#### Push to hub

In [22]:
trainer.push_to_hub(dataset=HF_HUB_DATASET_ID)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/h3en1x/distilbert-support-tickets-classifier/commit/c8591632c5d4fc26134d3d18a024572d7971dc41', commit_message='End of training', commit_description='', oid='c8591632c5d4fc26134d3d18a024572d7971dc41', pr_url=None, repo_url=RepoUrl('https://huggingface.co/h3en1x/distilbert-support-tickets-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='h3en1x/distilbert-support-tickets-classifier'), pr_revision=None, pr_num=None)

In [23]:
tokenizer.push_to_hub(HF_HUB_MODEL_ID)

README.md: 0.00B [00:00, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/h3en1x/distilbert-support-tickets-classifier/commit/c8591632c5d4fc26134d3d18a024572d7971dc41', commit_message='Upload tokenizer', commit_description='', oid='c8591632c5d4fc26134d3d18a024572d7971dc41', pr_url=None, repo_url=RepoUrl('https://huggingface.co/h3en1x/distilbert-support-tickets-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='h3en1x/distilbert-support-tickets-classifier'), pr_revision=None, pr_num=None)

#### Test predictions

In [24]:
from transformers import pipeline

from helpers.const import HF_HUB_MODEL_ID

classifier = pipeline("text-classification", model=HF_HUB_MODEL_ID)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Device set to use cuda:0


In [26]:
subject = "Website down"

body = "Hey, I've notices your website is down. Can you please check it?"

prompt = f"{subject}\n\n{body}"

classifier.predict(prompt)

[{'label': 'Legal', 'score': 0.48047029972076416}]