Let's import the necessary libraries

In [1]:
! pip install transformers==4.37.2 pip install git+https://github.com/huggingface/peft accelerate dataclass_csv bitsandbytes datasets

Collecting git+https://github.com/huggingface/peft
  Cloning https://github.com/huggingface/peft to /tmp/pip-req-build-ylp7ajb8
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft /tmp/pip-req-build-ylp7ajb8
  Resolved https://github.com/huggingface/peft to commit 9119b780ebac7859db5753ebad50d94ba803c99c
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting transformers==4.37.2
  Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
Collecting install
  Downloading install-1.3.5-py3-none-any.whl (3.2 kB)
Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[

In [1]:

import os
import transformers
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    default_data_collator,
)
import torch
from dataclasses import dataclass, field
from typing import Optional
from dataclass_csv import DataclassReader
from torch.utils.data import Dataset, DataLoader

from enum import Enum

## Prepare Model and Tokenizer

In [2]:
def replace_with_xml_tags(token_inp):
    model_inp = {}
    token  = token_inp["target"]
    token = token.replace("<|begintarget|>", "<target>")
    token = token.replace("<|endtarget|>", "</target>")
    token = token.replace("<|begincontext|>", "<context>")
    token = token.replace("<|endcontext|>", "</context>")
    token = token.replace("<|beginlastuserutterance|>", "<lastuserutterance>")
    token = token.replace("<|endlastuserutterance|>", "</lastuserutterance>")
    token = token.replace("<|begindsts|>", "<dsts>")
    token = token.replace("<|enddsts|>", "</dsts>")
    token = token.replace("<|begindst|>", "<dst>")
    token = token.replace("<|enddst|>", "</dst>")
    token = token.replace("<|beginbelief|>", "<belief>")
    token = token.replace("<|endbelief|>", "</belief>")
    token = token.replace("<|beginresponse|>", "<response>")
    token = token.replace("<|endresponse|>", "</response>")
    token = token.replace("<|beginaction|>", "<action>")
    token = token.replace("<|endaction|>", "</action>")
    token = token.replace("<|beginuseraction|>", "<useraction>")
    token = token.replace("<|enduseraction|>", "</useraction>")
    token = token.replace("<|sysactions|>", "<sysactions>")
    token = token.replace("<|beginintent|>", "<intent>")
    token = token.replace("<|endintent|>", "</intent>")
    token = token.replace("<|beginrequestedslots|>", "<requestedslots>")
    token = token.replace("<|endrequestedslots|>", "</requestedslots>")
    token = token.replace("<|pad|>", "<pad>")
    token = token.replace("<|startoftext|>", "<startoftext>")
    model_inp["context"] = token_inp["context"]
    model_inp["target"] = token_inp["target"]
    model_inp["new_target"] = [token]
    return model_inp

Now, we will be adding 27 new tokens as well as replace the existing pad, bos and eos tokens of the model.

In [3]:
class SpecialTokens(str, Enum):
    target = "<target>"
    end_target = "</target>"
    context = "<context>"
    end_context = "</context>"
    system = "<system>"
    user = "<user>"
    last_user_utterance = "<lastuserutterance>"
    end_last_user_utterance = "</lastuserutterance>"
    dsts = "<dsts>"
    end_dsts = "</dsts>"
    dst = "<dst>"
    end_dst = "</dst>"
    belief = "<belief>"
    end_belief = "</belief>"
    response = "<response>"
    end_response = "</response>"
    action = "<action>"
    end_action = "</action>"
    user_action = "<useraction>"
    end_user_action = "</useraction>"
    sys_actions = "<sysactions>"
    intent = "<intent>"
    end_intent = "</intent>"
    requested_slots = "<requestedslots>"
    end_requested_slots = "</requestedslots>"
    pad_token = "<pad>"
    bos_token = "<startoftext>"

    @classmethod
    def list(cls):
        return [c.value for c in cls]

In [4]:
from datasets import load_dataset

dataset = load_dataset("smangrul/assistant_chatbot_dataset")
dataset = dataset["train"].train_test_split(0.2)
print(dataset["train"][0])

text_column = "context"
label_column = "new_target"
max_length = 512

dataset = dataset.map(
    replace_with_xml_tags,
    batched=False,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

print(dataset["train"][0])

{'dialog_id': '1_00066', 'turn_id': 1, 'context': '<|begincontext|><|beginlastuserutterance|>Do you know a place to eat?<|endlastuserutterance|><|endcontext|>', 'target': '<|begintarget|><|begindsts|><|begindst|><|beginintent|>FindRestaurants<|endintent|><|beginbelief|><|endbelief|><|enddst|><|enddsts|><|beginuseraction|>INFORM_INTENT->Restaurants^intent~FindRestaurants<|enduseraction|><|beginaction|>REQUEST->Restaurants^city~<|endaction|><|beginresponse|>Which city are you looking?<|endresponse|><|endtarget|>'}


Running tokenizer on dataset:   0%|          | 0/986 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/247 [00:00<?, ? examples/s]

{'context': '<|begincontext|><|beginlastuserutterance|>Do you know a place to eat?<|endlastuserutterance|><|endcontext|>', 'target': '<|begintarget|><|begindsts|><|begindst|><|beginintent|>FindRestaurants<|endintent|><|beginbelief|><|endbelief|><|enddst|><|enddsts|><|beginuseraction|>INFORM_INTENT->Restaurants^intent~FindRestaurants<|enduseraction|><|beginaction|>REQUEST->Restaurants^city~<|endaction|><|beginresponse|>Which city are you looking?<|endresponse|><|endtarget|>', 'new_target': ['<target><dsts><dst><intent>FindRestaurants</intent><belief></belief></dst></dsts><useraction>INFORM_INTENT->Restaurants^intent~FindRestaurants</useraction><action>REQUEST->Restaurants^city~</action><response>Which city are you looking?</response></target>']}


We will be finetuning Mistral-7B model. Let's load the tokenizer and add the special tokens followed by loading the base model and resizzing the embedding layers to accomodate the newly added tokens.

In [7]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    pad_token=SpecialTokens.pad_token.value,
    bos_token=SpecialTokens.bos_token.value,
    eos_token=SpecialTokens.end_target.value,
    additional_special_tokens=SpecialTokens.list(),
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    load_in_4bit=True
    # use_flash_attention_2=True, # leading to an error
)
model.resize_token_embeddings(len(tokenizer))
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Apply LoRA

In [None]:
config = LoraConfig(
    r=32, lora_alpha=32, lora_dropout=0.05, target_modules=["embed_tokens", "lm_head", "q_proj", "v_proj", "k_proj", "up_proj", "down_proj", "gate_proj"]
)
model = get_peft_model(model, config)
print(model.print_trainable_parameters())

trainable params: 77,809,344 || all params: 7,319,762,624 || trainable%: 1.0630036518517572
None


## Preapre Dataset

In [None]:
def preprocess_function(examples):
    batch_size = len(examples[text_column])
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(examples[text_column])
    labels = tokenizer(targets, add_special_tokens=False)  # don't add bos token because we concatenate with inputs
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]
        # print(i, sample_input_ids, label_input_ids)
        model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
    # print(model_inputs)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids
        model_inputs["input_ids"][i] = model_inputs["input_ids"][i][:max_length]
        model_inputs["attention_mask"][i] = model_inputs["attention_mask"][i][:max_length]
        labels["input_ids"][i] = labels["input_ids"][i][:max_length]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]

Running tokenizer on dataset:   0%|          | 0/986 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/247 [00:00<?, ? examples/s]

In [None]:
print(train_dataset[0])

{'input_ids': [32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002, 32002

In [None]:
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=8, pin_memory=True
)

# Train the model

In [None]:
training_args = TrainingArguments(
    output_dir="intent_recognition",
    num_train_epochs=2,
    save_total_limit=1,
    per_device_train_batch_size=2,
    warmup_steps=10,
    weight_decay=0.0001,
    dataloader_drop_last=True,
    fp16=True,
    logging_steps=10,
    learning_rate=1e-5,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    remove_unused_columns=False,
    hub_model_id="Vasanth/intent_recognition",
    push_to_hub=True,
    # hub_private_repo=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=default_data_collator,
)
model.config.use_cache = False
trainer.train()



Step,Training Loss
10,7.2646
20,7.5532
30,5.5472
40,4.5932
50,4.0138
60,2.8528
70,2.5428
80,1.7702
90,1.7515
100,1.2425




TrainOutput(global_step=986, training_loss=0.6239259507786672, metrics={'train_runtime': 4809.1156, 'train_samples_per_second': 0.41, 'train_steps_per_second': 0.205, 'total_flos': 4.354830281657549e+16, 'train_loss': 0.6239259507786672, 'epoch': 2.0})

# Save the Adapter model

When the lora layers are applied to embedding layers, the corresponding base model embedding layers are also saved.

In [None]:
trainer.push_to_hub()
trainer.model.push_to_hub(training_args.output_dir)
tokenizer.push_to_hub(training_args.output_dir)



adapter_model.safetensors:   0%|          | 0.00/1.36G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1709205793.3cc069af542e.8272.0:   0%|          | 0.00/20.5k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Vasanth/intent_recognition/commit/9f561a3f0b8e99314ec151b87403ffc72382c035', commit_message='Upload tokenizer', commit_description='', oid='9f561a3f0b8e99314ec151b87403ffc72382c035', pr_url=None, pr_revision=None, pr_num=None)

In [5]:
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

i = random.randint(0, len(dataset["test"]))
context = dataset["test"][i]["context"]
tokenizer = AutoTokenizer.from_pretrained("Vasanth/intent_recognition")
batch = tokenizer(context, return_tensors="pt")
batch = {k: v.to("cuda") for k, v in batch.items()}
inference_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    low_cpu_mem_usage=True,
    load_in_4bit=True
    # use_flash_attention_2=True,
)
inference_model.resize_token_embeddings(len(tokenizer))
inference_model = PeftModel.from_pretrained(inference_model, "Vasanth/intent_recognition")
inference_model.to("cuda")
inference_model.eval()

tokenizer_config.json:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/751 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

PeftModel(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): lora.Embedding(
          (base_layer): Embedding(32027, 4096)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.05, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.cuda.HalfTensor of size 32x32027 (cuda:0)])
          (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.cuda.HalfTensor of size 4096x32 (cuda:0)])
        )
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
    

In [6]:
output_tokens = inference_model.generate(
    **batch,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
    top_k=50,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False)
print(f"{context=} \n\n {target_predicted=} \n\n")



context="<|begincontext|><|user|>I would like to find a restaurant please.<|system|>Which city would you like to eat in and what kind of food would youn like to eat?<|user|>I would like a restaurant that serves alcohol in San Francisco. I'd like to eat in a Burgers restaurant please.<|system|>There were 8 matches to your inquiry. 5a5 Steak Lounge is a nice place that's in San Francisco.<|beginlastuserutterance|>Can you please make another suggestion?<|endlastuserutterance|><|endcontext|>" 

 target_predicted="<startoftext> <|begincontext|><|user|>I would like to find a restaurant please.<|system|>Which city would you like to eat in and what kind of food would youn like to eat?<|user|>I would like a restaurant that serves alcohol in San Francisco. I'd like to eat in a Burgers restaurant please.<|system|>There were 8 matches to your inquiry. 5a5 Steak Lounge is a nice place that's in San Francisco.<|beginlastuserutterance|>Can you please make another suggestion?<|endlastuserutterance|><|

# Inference

In [11]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
! pip install xmltodict

Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0


In [12]:
import xmltodict
import json

# Function to convert XML string to JSON string
def convert_xml_string_to_json_string(xml_string):
    # Convert XML string to Python dictionary
    dict_data = xmltodict.parse(xml_string)
    return dict_data

In [24]:
target_predicted = target_predicted.replace(context, "").replace("[", "").replace("<startoftext>  '", "")
target_predicted

'<target><dsts><dst><intent> FindRestaurants</intent><belief> Restaurants^city->San Francisco|Restaurants^cuisine->Burgers|Restaurants^serves_alcohol->True</belief></dst></dsts><useraction> REQUEST_ALTS->Restaurants^~</useraction><action> OFFER->Restaurants^restaurant_name~A16 Restaurant|OFFER->Restaurants^city~San Francisco</action><response> A16 Restaurant is a nice restaurant in San Francisco.</response></target>'

In [25]:
convert_xml_string_to_json_string(target_predicted)

{'target': {'dsts': {'dst': {'intent': 'FindRestaurants',
    'belief': 'Restaurants^city->San Francisco|Restaurants^cuisine->Burgers|Restaurants^serves_alcohol->True'}},
  'useraction': 'REQUEST_ALTS->Restaurants^~',
  'action': 'OFFER->Restaurants^restaurant_name~A16 Restaurant|OFFER->Restaurants^city~San Francisco',
  'response': 'A16 Restaurant is a nice restaurant in San Francisco.'}}