# Customer Support Chat Bot with RAG

In [None]:
# Install PyTorch via a pip shell escape; the next cell calls torch.cuda to pick the remaining packages.
!pip install torch

First we check the GPU version available in the environment and install specific dependencies that are compatible with the detected GPU to prevent version conflicts.

In [None]:
%%capture
major_version, minor_version = torch.cuda.get_device_capability()
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes

## Imports

In [None]:
import os
import time

import torch
from unsloth import FastLanguageModel
from datasets import load_dataset, Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from transformers import TextStreamer
from evaluate import load

Next, we load a 4-bit quantized language model. Several pre-quantized checkpoints are available (see the commented list in the cell below); here we use Llama-3 8B, which was pre-trained on roughly 15 trillion tokens, with 4-bit quantization to reduce memory usage.


In [None]:
# Loader settings. max_seq_length is reused later when the trainer is built.
max_seq_length = 4096  # Choose any! Llama 3 is up to 8k
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
dtype = None  # forwarded unchanged to from_pretrained (presumably lets the loader pick — confirm in the Unsloth docs)

# Alternative pre-quantized 4-bit checkpoints that can be substituted for model_name:
#   unsloth/mistral-7b-bnb-4bit
#   unsloth/mistral-7b-instruct-v0.2-bnb-4bit
#   unsloth/llama-2-7b-bnb-4bit
#   unsloth/gemma-7b-bnb-4bit
#   unsloth/gemma-7b-it-bnb-4bit
#   unsloth/gemma-2b-bnb-4bit
#   unsloth/gemma-2b-it-bnb-4bit
#   unsloth/llama-3-8b-bnb-4bit

model_load_kwargs = dict(
    model_name = "unsloth/llama-3-8b-bnb-4bit",  # Llama-3 70b also works (just change the model name)
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Returns the base model together with its tokenizer.
initial_model, tokenizer = FastLanguageModel.from_pretrained(**model_load_kwargs)



---



Next, we integrate LoRA adapters into our model, which allows us to efficiently update just a fraction of the model's parameters, enhancing training speed and reducing computational load.

In [None]:
# LoRA hyper-parameters gathered in one mapping so they are easy to tweak.
lora_settings = {
    "r": 16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    "target_modules": [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    "lora_alpha": 16,
    "lora_dropout": 0,
    "bias": "none",
    "use_gradient_checkpointing": "unsloth",
    "random_state": 3407,
    "use_rslora": False,
    "loftq_config": None,
}

# NOTE(review): PEFT documents that get_peft_model modifies the base model in
# place, so `initial_model` presumably carries the adapter layers afterwards —
# confirm before using it as an untouched baseline.
final_model = FastLanguageModel.get_peft_model(initial_model, **lora_settings)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Data"></a>
### Data Preparation
We now use the [Bitext customer support dataset](https://huggingface.co/datasets/bitext/Bitext-customer-support-llm-chatbot-training-dataset), which contains more than 26K examples.

Then, we define a prompt template that formats each example into an instruction, input, intent, flags, category and response, and apply it to the dataset to prepare the training text for the model, appending an EOS token to signal completion.


In [None]:
# Prompt template with six positional `{}` slots. Callers in this notebook fill
# them in this order: instruction, input, intent, flags, category, response.
# For inference the response slot is filled with "" so the model completes it.
custom_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Intent:
{}

### Flags:
{}

### Category:
{}

### Response:
{}"""


# Retrieve the EOS token from the tokenizer; it is appended to every formatted
# training example so each example ends with an explicit end-of-sequence marker.
EOS_TOKEN = tokenizer.eos_token

# Batched formatter used with Dataset.map to build the training text column
def formatting_prompts_func(examples):
    """Render a batch of dataset rows into prompt strings.

    Args:
        examples: Batched mapping that provides the "instruction", "intent",
            "flags", "category" and "response" columns as parallel sequences.

    Returns:
        A dict with a single "text" key holding one rendered prompt per row.
        The template's input slot is always left empty and EOS_TOKEN is
        appended to every prompt.
    """
    column_names = ("instruction", "intent", "flags", "category", "response")
    rows = zip(*(examples[name] for name in column_names))
    return {
        "text": [
            custom_prompt.format(instruction, "", intent, flag, category, response) + EOS_TOKEN
            for instruction, intent, flag, category, response in rows
        ]
    }



# Load the Bitext customer support dataset
dataset = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset", split = "train")

# Apply the formatting function to the dataset (adds the "text" column)
dataset = dataset.map(formatting_prompts_func, batched = True)

# Slicing a Dataset returns a plain dict of column lists. Keep the slices as
# dicts: the trainer cell rebuilds Dataset objects with Dataset.from_dict(),
# whereas calling the Dataset constructor on such a dict raises because the
# constructor expects an Arrow table.
TRAIN_ROWS = 23000
train_dataset = dataset[:TRAIN_ROWS]
eval_dataset = dataset[TRAIN_ROWS:]

<a name="Train"></a>
### Train the model
- We train for only 100 steps (`max_steps=100`) to speed things up. For a full run, remove `max_steps` (or set it to `-1`) and set `num_train_epochs=1`; while `max_steps` is positive it overrides `num_train_epochs`.
- At this stage, we're configuring our model's training setup, where we define things like batch size and learning rate, to teach our model effectively with the data we have prepared.

In [None]:
# Supervised fine-tuning setup. train_dataset / eval_dataset are dicts of column
# lists at this point, so they are wrapped back into Dataset objects here.
trainer = SFTTrainer(
    model = final_model,
    tokenizer = tokenizer,
    train_dataset = Dataset.from_dict(train_dataset), # Convert back to Dataset
    eval_dataset = Dataset.from_dict(eval_dataset), # Convert back to Dataset
    dataset_text_field = "text", # column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # 2 x 4 = 8 examples per optimizer step on each device
        warmup_steps = 5,
        max_steps = 100, # increase this for better model performance
        # A positive max_steps overrides num_train_epochs, so setting both is
        # misleading. For a full run remove max_steps and enable the line below.
        # num_train_epochs = 1,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "responses",
    ),
)

In [None]:
# Snapshot GPU memory before training; start_gpu_memory and max_memory are
# reused by the post-training report.
bytes_per_gib = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

## Start training the model

In [None]:
# Run fine-tuning. The returned object exposes `.metrics`; the next cell reads
# its 'train_runtime' entry to report the training time.
trainer_stats = trainer.train()

In [None]:
# Report training time and peak reserved GPU memory, in absolute terms and
# relative to the pre-training snapshot (start_gpu_memory / max_memory).
def _percent_of_total(gigabytes):
    """Express a GB amount as a percentage of the GPU's total memory."""
    return round(gigabytes / max_memory * 100, 3)

used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = _percent_of_total(used_memory)
lora_percentage = _percent_of_total(used_memory_for_lora)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

## Inference
Let's run the model! You can change the instruction and input - leave the output blank!

In [None]:
# Ensure the model is ready for inference
FastLanguageModel.for_inference(final_model)

# Prepare the custom prompt. Only the instruction is provided; the remaining
# template fields are left blank.
instruction = "i need help cancelling puchase of the order number 14526?"
input_text = ""  # Leave input blank (named input_text so the `input` builtin is not shadowed)
intent = ""  # intent
flags = ""  # flags
category = ""  # category

# Create the prompt instance; the empty last slot is the response the model completes
custom_prompt_instance = custom_prompt.format(instruction, input_text, intent, flags, category, "")

# Tokenize the custom prompt
inputs = tokenizer([custom_prompt_instance], return_tensors="pt").to("cuda")

# Generate the response using the model
outputs = final_model.generate(**inputs, max_new_tokens=1024, use_cache=True)

# Decode the generated output (the decoded text contains the prompt followed by the completion)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

print("Generated Response:", response)

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
# Ensure the model is ready for inference
FastLanguageModel.for_inference(final_model)

# Prepare the custom prompt
instruction = "I ordered a mobile phone using your website, Now I want to cancell it, how can I do that? order number 123403"
input_text = ""  # Leave input blank
intent = ""  # Example intent
flags = ""  # Example flags
category = ""  # Example category

# Create the prompt instance; the empty last slot is the response the model completes
custom_prompt_instance = custom_prompt.format(instruction, input_text, intent, flags, category, "")

# Tokenize the custom prompt
inputs = tokenizer([custom_prompt_instance], return_tensors="pt").to("cuda")

# Stream tokens to stdout as they are generated. skip_prompt hides the echoed
# prompt; skip_special_tokens is forwarded to the tokenizer's decode call.
text_streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

print("Generated Response:")
outputs = final_model.generate(**inputs, streamer=text_streamer, max_new_tokens=128, use_cache=True)

# Keep the full decoded text (prompt + completion) available for later inspection
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Local save of the LoRA adapters
final_model.save_pretrained("customer_support_fine_tunned_4_bit_llm")

# Online saving (Better option). The token is read from the HF_TOKEN environment
# variable instead of being hard-coded in the notebook; the push is skipped when
# it is not set so that "Run All" does not stop here with an authentication error.
# Replace "MODEL_NAME" with your Hub repository id before pushing.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    final_model.push_to_hub("MODEL_NAME", token = hf_token)
else:
    print("HF_TOKEN is not set; skipping push_to_hub.")

## Evaluating the LLM

In [None]:
# Reload the Bitext customer support dataset and add the formatted "text"
# column in a single chained expression.
dataset = load_dataset(
    "bitext/Bitext-customer-support-llm-chatbot-training-dataset",
    split="train",
).map(formatting_prompts_func, batched=True)

In [None]:
# Evaluate on the last 1000 rows. Slicing a Dataset yields a dict of column lists.
eval_dataset = dataset[-1000:]
predictions_initial, predictions_final, references = [], [], []

eval_pairs = zip(eval_dataset["instruction"], eval_dataset["response"])
for i, (instruction, reference) in enumerate(eval_pairs):
    # Only the instruction is provided; the other template slots stay empty.
    custom_prompt_instance = custom_prompt.format(instruction, "", "", "", "", "")
    inputs = tokenizer([custom_prompt_instance], return_tensors='pt').to("cuda")
    # generate() returns prompt + completion; remember where the completion starts
    # so that the prompt text is not scored against the reference.
    prompt_length = inputs["input_ids"].shape[1]

    # get_peft_model modifies the base model in place, so initial_model carries the
    # trained LoRA layers too. Disable the adapter to get the real pre-trained baseline.
    with final_model.disable_adapter():
        initial_outputs = initial_model.generate(**inputs, max_new_tokens=128, use_cache=True)
    initial_response = tokenizer.batch_decode(
        initial_outputs[:, prompt_length:], skip_special_tokens=True
    )[0].strip()

    final_outputs = final_model.generate(**inputs, max_new_tokens=128, use_cache=True)
    final_response = tokenizer.batch_decode(
        final_outputs[:, prompt_length:], skip_special_tokens=True
    )[0].strip()

    predictions_initial.append(initial_response)
    predictions_final.append(final_response)
    references.append(reference)  # Append the actual response for comparison
    print("Iteration Done:", i)


print("Sample generated response:", predictions_final[0])

### Initial Testing

In [None]:
# Load the four text-generation metrics in one pass
bleu, rouge, meteor, bertscore = (
    load(metric_name) for metric_name in ("bleu", "rouge", "meteor", "bertscore")
)

# References must be a flat list of strings. If each entry is wrapped in a
# list (e.g. [["response1"], ["response2"]]), unwrap the first element.
if isinstance(references[0], list):
    references = [nested[0] for nested in references]

# Keyword arguments shared by every metric, one set per model
initial_inputs = {"predictions": predictions_initial, "references": references}
final_inputs = {"predictions": predictions_final, "references": references}

# BLEU
bleu_initial = bleu.compute(**initial_inputs)
bleu_final = bleu.compute(**final_inputs)

print(f"\nBLEU Score (Initial Model): {bleu_initial['bleu']:.4f}")
print(f"BLEU Score (Final Model):   {bleu_final['bleu']:.4f}")

# ROUGE (one line per ROUGE variant returned by the metric)
rouge_initial = rouge.compute(**initial_inputs)
rouge_final = rouge.compute(**final_inputs)

print("\nROUGE Scores (Initial Model):")
for key, value in rouge_initial.items():
    print(f"  {key}: {value:.4f}")

print("\nROUGE Scores (Final Model):")
for key, value in rouge_final.items():
    print(f"  {key}: {value:.4f}")

# METEOR
meteor_initial = meteor.compute(**initial_inputs)
meteor_final = meteor.compute(**final_inputs)

print(f"\nMETEOR Score (Initial Model): {meteor_initial['meteor']:.4f}")
print(f"METEOR Score (Final Model):   {meteor_final['meteor']:.4f}")

# BERTScore returns one F1 value per sample; report the mean
bertscore_initial = bertscore.compute(**initial_inputs, lang="en")
bertscore_final = bertscore.compute(**final_inputs, lang="en")

f1_initial = bertscore_initial["f1"]
f1_final = bertscore_final["f1"]
avg_f1_initial = sum(f1_initial) / len(f1_initial)
avg_f1_final = sum(f1_final) / len(f1_final)

print(f"\nBERTScore F1 (Initial Model): {avg_f1_initial:.4f}")
print(f"BERTScore F1 (Final Model):   {avg_f1_final:.4f}")


### A/B Testing

- A - Initial pre-trained model
- B - Fine-tuned model

In [None]:
# -------------------------
# Initialize lists
# -------------------------
predictions_initial, predictions_final, references = [], [], []
latency_initial, latency_final = [], []

# -------------------------
# Load metrics
# -------------------------
bleu = load("bleu")
rouge = load("rouge")
meteor = load("meteor")
bertscore = load("bertscore")

# -------------------------
# Loop through evaluation samples
# -------------------------
# eval_dataset is a dict of column lists, so len(eval_dataset) would count
# columns; take the number of rows from one of the columns instead.
num_eval_samples = len(eval_dataset["instruction"])

for i in range(num_eval_samples):
    instruction = eval_dataset["instruction"][i]
    reference = eval_dataset["response"][i]

    # Create prompt (only the instruction slot is filled)
    custom_prompt_instance = custom_prompt.format(instruction, "", "", "", "", "")

    # Tokenize input
    inputs = tokenizer([custom_prompt_instance], return_tensors='pt').to("cuda")
    # generate() returns prompt + completion; only the completion is scored.
    prompt_length = inputs["input_ids"].shape[1]

    # --- Model A: Initial (Pre-trained) ---
    # get_peft_model modifies the base model in place, so the adapter has to be
    # disabled to measure the real pre-trained baseline. The context manager is
    # entered before the timer starts so toggling is not counted as latency.
    with final_model.disable_adapter(), torch.no_grad():
        start_time = time.perf_counter()
        outputs_initial = initial_model.generate(**inputs, max_new_tokens=128, use_cache=True)
        latency_initial.append(time.perf_counter() - start_time)
    response_initial = tokenizer.batch_decode(
        outputs_initial[:, prompt_length:], skip_special_tokens=True
    )[0].strip()

    # --- Model B: Final (Fine-tuned) ---
    # Must call final_model here; calling initial_model again would compare the
    # baseline against itself.
    with torch.no_grad():
        start_time = time.perf_counter()
        outputs_final = final_model.generate(**inputs, max_new_tokens=128, use_cache=True)
        latency_final.append(time.perf_counter() - start_time)
    response_final = tokenizer.batch_decode(
        outputs_final[:, prompt_length:], skip_special_tokens=True
    )[0].strip()

    # Store results
    predictions_initial.append(response_initial)
    predictions_final.append(response_final)
    references.append(reference)

    # Progress logging
    print(f"Iteration Done: {i+1}/{num_eval_samples}")

# -------------------------
# Compute metrics (same as before)
# -------------------------
# Unwrap references if each one is wrapped in a single-element list
if isinstance(references[0], list):
    references = [r[0] for r in references]

# BLEU
bleu_initial = bleu.compute(predictions=predictions_initial, references=references)
bleu_final = bleu.compute(predictions=predictions_final, references=references)

# ROUGE
rouge_initial = rouge.compute(predictions=predictions_initial, references=references)
rouge_final = rouge.compute(predictions=predictions_final, references=references)

# METEOR
meteor_initial = meteor.compute(predictions=predictions_initial, references=references)
meteor_final = meteor.compute(predictions=predictions_final, references=references)

# BERTScore (one F1 value per sample; averaged below)
bertscore_initial = bertscore.compute(predictions=predictions_initial, references=references, lang="en")
bertscore_final = bertscore.compute(predictions=predictions_final, references=references, lang="en")

avg_f1_initial = sum(bertscore_initial["f1"]) / len(bertscore_initial["f1"])
avg_f1_final = sum(bertscore_final["f1"]) / len(bertscore_final["f1"])

# -------------------------
# Compute average latency
# -------------------------
avg_latency_initial = sum(latency_initial) / len(latency_initial)
avg_latency_final = sum(latency_final) / len(latency_final)

# -------------------------
# Print results
# -------------------------
print("\n📊 --- A/B Testing Results ---")
print(f"BLEU Score (Initial Model): {bleu_initial['bleu']:.4f}")
print(f"BLEU Score (Final Model):   {bleu_final['bleu']:.4f}")

print("\nROUGE Scores (Initial Model):", rouge_initial)
print("ROUGE Scores (Final Model):", rouge_final)

print(f"\nMETEOR Score (Initial Model): {meteor_initial['meteor']:.4f}")
print(f"METEOR Score (Final Model):   {meteor_final['meteor']:.4f}")

print(f"\nBERTScore F1 (Initial Model): {avg_f1_initial:.4f}")
print(f"BERTScore F1 (Final Model):   {avg_f1_final:.4f}")

print(f"\nAverage Latency (Initial Model): {avg_latency_initial:.3f} sec per response")
print(f"Average Latency (Final Model):   {avg_latency_final:.3f} sec per response")