In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    os.chdir(project_root)

In [22]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU")

print(f"Using device: {device}")

GPU available: NVIDIA GeForce RTX 3060 Laptop GPU
Number of GPUs: 1
Using device: cuda


#### Data loading

In [23]:
import pandas as pd

from paths import DATA_DIR

df = pd.read_csv(DATA_DIR / "customer_support_emails_dataset.csv", index_col=0)

df.head()

Unnamed: 0,subject,body,department
0,Could you kindly advise on the best product fo...,"Dear Support Team, I hope this message finds y...",Tech Support
1,BassHub Pro mic issue?,"Hey! Love my BassHub Pro, but the mic isn't wo...",Tech Support
2,PulseDock not producing sound,"Hello, I'm having an issue with my PulseDock w...",Tech Support
3,WavePanel Touch firmware update issue - Order ...,"I purchased the WavePanel Touch on March 15, 2...",Tech Support
4,Firmware Update Issue with StreamLink Hub (Ord...,"Dear Support Team, I hope this message finds y...",Tech Support


#### Prepare dataset

In [24]:
import pandas as pd

llm_df = pd.DataFrame(
    {
        "text": df["subject"].fillna("") + "\n\n" + df["body"].fillna(""),
        "label": df["department"],
    }
)

llm_df.head()

Unnamed: 0,text,label
0,Could you kindly advise on the best product fo...,Tech Support
1,BassHub Pro mic issue?\n\nHey! Love my BassHub...,Tech Support
2,"PulseDock not producing sound\n\nHello, I'm ha...",Tech Support
3,WavePanel Touch firmware update issue - Order ...,Tech Support
4,Firmware Update Issue with StreamLink Hub (Ord...,Tech Support


In [25]:
llm_df.shape

(2000, 2)

In [26]:
print("Missing values percentage:")
print(llm_df.isna().sum() / len(llm_df) * 100)

Missing values percentage:
text     0.0
label    0.0
dtype: float64


In [27]:
llm_df["label"].value_counts()

label
Tech Support    600
Shipping        500
Billing         400
Sales           300
Legal           200
Name: count, dtype: int64

#### Split data

In [28]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    llm_df, test_size=0.2, random_state=42, stratify=llm_df["label"]
)

#### Label Encoder

In [29]:
label2id = {label: i for i, label in enumerate(train_df["label"].unique())}
id2label = {i: label for label, i in label2id.items()}

train_df["label"] = train_df["label"].map(label2id)
test_df["label"] = test_df["label"].map(label2id)

#### Clasification metrics helpers

In [30]:
from functools import partial

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report


def get_classification_metrics_df(y_true, y_pred, model_name, average="weighted"):
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": report[f"{average} avg"]["precision"],
        "recall": report[f"{average} avg"]["recall"],
        "f1-score": report[f"{average} avg"]["f1-score"],
    }
    return pd.DataFrame(metrics, index=[model_name])


def get_train_test_metrics_df(
    y_test, y_test_pred, model_name, y_train, y_train_pred, average="weighted"
):
    train_df = get_classification_metrics_df(
        y_train, y_train_pred, model_name, average=average
    )
    train_df.columns = pd.MultiIndex.from_product([["train"], train_df.columns])

    test_df = get_classification_metrics_df(
        y_test, y_test_pred, model_name, average=average
    )
    test_df.columns = pd.MultiIndex.from_product([["test"], test_df.columns])

    return pd.concat([train_df, test_df], axis=1)


def get_per_class_metrics_df(y_true, y_pred, model_name, id2label):
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    df = pd.DataFrame(report).drop(columns=["accuracy"]).T
    df.index = [i if not str(i).isdigit() else id2label[int(i)] for i in df.index]
    df.index = pd.MultiIndex.from_product(
        [[model_name], df.index], names=["model", "class"]
    )
    return df


get_per_class_metrics_df = partial(get_per_class_metrics_df, id2label=id2label)

#### Dummy model 

In [31]:
from sklearn.dummy import DummyClassifier

dummy_classifier = DummyClassifier(strategy="most_frequent", random_state=42)

dummy_classifier.fit(train_df["text"], train_df["label"])

y_train_pred_dummy = dummy_classifier.predict(train_df["text"])
y_test_pred_dummy = dummy_classifier.predict(test_df["text"])

dummy_metrics = get_train_test_metrics_df(
    test_df["label"], y_test_pred_dummy, "Dummy", train_df["label"], y_train_pred_dummy
)

dummy_per_class = get_per_class_metrics_df(test_df["label"], y_test_pred_dummy, "Dummy")

In [32]:
dummy_metrics

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,accuracy,precision,recall,f1-score,accuracy,precision,recall,f1-score
Dummy,0.3,0.09,0.3,0.138462,0.3,0.09,0.3,0.138462


In [33]:
dummy_per_class

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1-score,support
model,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Dummy,Sales,0.0,0.0,0.0,60.0
Dummy,Billing,0.0,0.0,0.0,80.0
Dummy,Legal,0.0,0.0,0.0,40.0
Dummy,Tech Support,0.3,1.0,0.461538,120.0
Dummy,Shipping,0.0,0.0,0.0,100.0
Dummy,macro avg,0.06,0.2,0.092308,400.0
Dummy,weighted avg,0.09,0.3,0.138462,400.0


#### Finetuned encoder

In [34]:
import torch
from datasets import Dataset
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

LLM_MODEL_NAME = "distilbert-base-uncased"

train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)


def tokenizer_fn(examples: dict) -> dict:
    return tokenizer(examples["text"], truncation=True, padding="max_length")


tokenized_train_ds = train_ds.map(tokenizer_fn, batched=True)
tokenized_test_ds = test_ds.map(tokenizer_fn, batched=True)

tokenized_train_ds.set_format(
    type="torch", columns=["input_ids", "attention_mask", "label"]
)
tokenized_test_ds.set_format(
    type="torch", columns=["input_ids", "attention_mask", "label"]
)

model = AutoModelForSequenceClassification.from_pretrained(
    LLM_MODEL_NAME, num_labels=len(label2id), id2label=id2label, label2id=label2id
)

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=train_df["label"].unique(),
    y=train_df["label"].values,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)


training_args = TrainingArguments(
    eval_strategy="epoch",
    logging_strategy="epoch",
    report_to="tensorboard",
    logging_dir="./logs",
    save_strategy="epoch",
    run_name="distilbert-baseline",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    weight_decay=0.05,
    learning_rate=1e-5,
    fp16=True,
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }


class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = torch.nn.CrossEntropyLoss(
            weight=class_weights_tensor.to(logits.device)
        )
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss


trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

train_result = trainer.train()

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8438,0.240746,0.9725,0.972196
2,0.103,0.072424,0.985,0.984917
3,0.0267,0.072616,0.985,0.984793


In [35]:
llm_train_predictions = trainer.predict(tokenized_train_ds)
llm_y_pred_train = llm_train_predictions.predictions.argmax(axis=-1)
llm_y_true_train = tokenized_train_ds["label"]

llm_test_predictions = trainer.predict(tokenized_test_ds)
llm_y_pred_test = llm_test_predictions.predictions.argmax(axis=-1)
llm_y_true_test = tokenized_test_ds["label"]

llm_metrics_df = get_train_test_metrics_df(
    llm_y_true_test,
    llm_y_pred_test,
    "DistilBERT",
    y_train=llm_y_true_train,
    y_train_pred=llm_y_pred_train,
)

llm_per_class_metrics_df = get_per_class_metrics_df(
    llm_y_true_test, llm_y_pred_test, "DistilBERT"
)

In [36]:
llm_metrics_df

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,accuracy,precision,recall,f1-score,accuracy,precision,recall,f1-score
DistilBERT,0.99875,0.998751,0.99875,0.99875,0.985,0.985259,0.985,0.984917


In [37]:
llm_per_class_metrics_df

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1-score,support
model,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DistilBERT,Sales,0.967742,1.0,0.983607,60.0
DistilBERT,Billing,1.0,1.0,1.0,80.0
DistilBERT,Legal,1.0,0.925,0.961039,40.0
DistilBERT,Tech Support,0.97541,0.991667,0.983471,120.0
DistilBERT,Shipping,0.989899,0.98,0.984925,100.0
DistilBERT,macro avg,0.98661,0.979333,0.982608,400.0
DistilBERT,weighted avg,0.985259,0.985,0.984917,400.0


#### Summary

In [38]:
all_metrics_df = pd.concat([dummy_metrics, llm_metrics_df], axis=0)

all_metrics_df

Unnamed: 0_level_0,train,train,train,train,test,test,test,test
Unnamed: 0_level_1,accuracy,precision,recall,f1-score,accuracy,precision,recall,f1-score
Dummy,0.3,0.09,0.3,0.138462,0.3,0.09,0.3,0.138462
DistilBERT,0.99875,0.998751,0.99875,0.99875,0.985,0.985259,0.985,0.984917


In [42]:
custom_prompt = (
    "Hey i have an issue with my order. The headset doesnt work. I want to return it. "
)
custom_dataset = Dataset.from_dict({"text": [custom_prompt]})
tokenized_custom = custom_dataset.map(tokenizer_fn, batched=True)
tokenized_custom.set_format(type="torch", columns=["input_ids", "attention_mask"])

predictions = trainer.predict(tokenized_custom)
predicted_class = predictions.predictions.argmax(axis=-1)[0]
confidence = predictions.predictions.max(axis=-1)[0]

predicted_department = id2label[predicted_class]

print("\nPrediction Results:")
print(f"Prompt: {custom_prompt}")
print(f"Predicted Department: {predicted_department}")
print(f"Confidence: {confidence:.3f}")


Map:   0%|          | 0/1 [00:00<?, ? examples/s]


Prediction Results:
Prompt: Hey i have an issue with my order. The headset doesnt work. I want to return it. 
Predicted Department: Shipping
Confidence: 3.928
