In [2]:
import json
import torch
import numpy as np
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
with open("../nlu_engine/intents.json", "r", encoding="utf-8") as f:
    data = json.load(f)

texts = []
labels = []

label2id = {intent["name"]: i for i, intent in enumerate(data["intents"])}
id2label = {i: name for name, i in label2id.items()}

for intent in data["intents"]:
    for example in intent["examples"]:
        texts.append(example)
        labels.append(label2id[intent["name"]])

In [4]:
print(f"Total examples: {len(texts)}")
display(texts[:5])


Total examples: 92


['check my balance',
 'check the balance in account 1234566',
 'i want to check my balance',
 "What's my account balance?",
 'Show my savings balance']

In [5]:
print(f"Total labels: {len(set(labels))}")

Total labels: 5


In [6]:
shown_labels = len(label2id)
print(f"Shown labels: {shown_labels}")
print(f"Label to ID mapping: {label2id}")
print(f"ID to Label mapping: {id2label}")

Shown labels: 5
Label to ID mapping: {'check_balance': 0, 'tranfer_money': 1, 'card_block': 2, 'find_atm': 3, 'llm': 4}
ID to Label mapping: {0: 'check_balance', 1: 'tranfer_money', 2: 'card_block', 3: 'find_atm', 4: 'llm'}


In [8]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts,labels,test_size=0.2,random_state=42)

In [9]:
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [10]:
def tokenize(texts, labels):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )
    encodings["labels"] = torch.tensor(labels)
    return encodings

In [11]:
class IntentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["labels"])

    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.encodings.items()}

In [12]:
train_encodings = tokenize(train_texts, train_labels)
val_encodings = tokenize(val_texts, val_labels)

train_dataset = IntentDataset(train_encodings)
val_dataset = IntentDataset(val_encodings)

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id)
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="../models/intent_model_2",
    num_train_epochs=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,  # 0.00002
    logging_dir="../models/intent_model_2/logs",
    do_eval=True,
    save_steps=500
)

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted")
    }

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [29]:
trainer.train()


Step,Training Loss
500,0.0
1000,0.0




TrainOutput(global_step=1000, training_loss=2.4083217605948448e-05, metrics={'train_runtime': 447.0354, 'train_samples_per_second': 16.33, 'train_steps_per_second': 2.237, 'total_flos': 38151099463500.0, 'train_loss': 2.4083217605948448e-05, 'epoch': 100.0})

In [None]:
def predict_intent(text):
    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.softmax(outputs.logits, dim=1)
    pred_id = torch.argmax(probs).item()

    return {
        "intent": id2label[pred_id],
        "confidence": probs[0][pred_id].item()
    }

In [20]:
predict_intent("I want to block my debit card")

{'intent': 'card_block', 'confidence': 0.998481810092926}

In [21]:
predict_intent("Please transfer $250 to my friend")

{'intent': 'tranfer_money', 'confidence': 0.9968141913414001}

In [24]:
predict_intent("check my account balance")

{'intent': 'check_balance', 'confidence': 0.997870683670044}

In [23]:
predict_intent("transfer 500 dollars to john")

{'intent': 'tranfer_money', 'confidence': 0.9975382089614868}