<a href="https://colab.research.google.com/github/cs1090218/conv/blob/main/Finetuning_w_Llama3_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup  & Install libraries

Tested this notebook with A100 GPU

In [1]:
base_model_name = "meta-llama/Meta-Llama-3.1-8B"
new_model_name = "llama-3-1-8b-kid-friendly-chatbot"
dataset_name = "shashankverma590/empathetic-dialogues-clean"
# dataset_name = "ruslanmv/ai-medical-chatbot"

In [2]:
# Install Pytorch & other libraries
%pip install "torch==2.4.0" torchvision torchaudio tensorboard

# Install Hugging Face libraries
%pip install  --upgrade \
  "transformers==4.44.2" \
  "datasets==2.21.0" \
  "accelerate==0.33.0" \
  "evaluate==0.4.2" \
  "bitsandbytes==0.43.3" \
  "trl==0.9.6" \
  "peft==0.12.0"

Collecting torch==2.4.0
  Downloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.4.0)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.4.0)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.4.0)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-many

In [1]:
import torch
print(torch.__version__)
import accelerate
print(accelerate.__version__)
import bitsandbytes
print(bitsandbytes.__version__)
import datasets
print(datasets.__version__)
import evaluate
print(evaluate.__version__)
import peft
print (peft.__version__)
import transformers
print(transformers.__version__)
import trl
print(trl.__version__)
import torchvision
print(torchvision.__version__)
torchvision.ops.nms


# 2.5.0+cu121
# 1.1.0
# 0.42.0
# 3.1.0
# 0.4.3
# 0.13.2
# 4.46.1
# 0.12.0
# 0.20.0+cu121

2.4.0+cu121
0.33.0
0.43.3
2.21.0
0.4.2
0.12.0
4.44.2
0.9.6
0.19.0+cu121


In [2]:
import torch

use_flash_attention = False
if torch.cuda.get_device_capability()[0] >= 8:
  use_flash_attention = True
  !pip install ninja packaging
  !MAX_JOBS=4 pip install flash-attn --no-build-isolation




In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from huggingface_hub import login
from google.colab import userdata

login(
  token=userdata.get('HF_TOKEN'),
  add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Prepare Dataset

In [7]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset(dataset_name, split="train")
df = pd.DataFrame(dataset)
df.head()

Downloading readme:   0%|          | 0.00/447 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.78M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/64300 [00:00<?, ? examples/s]

Unnamed: 0,Situation,emotion,empathetic_dialogues,labels,__index_level_0__
0,I remember going to the fireworks with my best...,sentimental,Customer :I remember going to see the firework...,"Was this a friend you were in love with, or ju...",0
1,I remember going to the fireworks with my best...,sentimental,Customer :This was a best friend. I miss her.\...,Where has she gone?,1
2,I remember going to the fireworks with my best...,sentimental,Customer :We no longer talk.\nAgent :,Oh was this something that happened because of...,2
3,I remember going to the fireworks with my best...,sentimental,Customer :Was this a friend you were in love w...,This was a best friend. I miss her.,3
4,I remember going to the fireworks with my best...,sentimental,Customer :Where has she gone?\nAgent :,We no longer talk.,4


In [8]:
# Conversations have repeated sitations, so can't group by situation.
data = []

prev_situation = ""
prev_emotion = ""
running_conv = []

for idx, row in df.iterrows():
    situation = row['Situation'].strip()
    emotion = row['emotion'].strip()
    user_prompt = row['empathetic_dialogues'][10:-8].strip()  # [10:-9] is to remove "Customer :" prefix and "\nAgent :" suffix
    agent_response = row['labels'].strip()

    if situation != prev_situation:
        if prev_situation:
            data.append((prev_situation, prev_emotion, running_conv))
        prev_situation = situation
        prev_emotion = emotion
        running_conv = []

    # To remove the loopback conversation present in this dataset, where the next line is somehow 3 lines ago
    if user_prompt not in running_conv:
        running_conv.append(user_prompt)
    if agent_response not in running_conv:
        running_conv.append(agent_response)
data.append((prev_situation, prev_emotion, running_conv))

print ("Number of conversations:", len(data))

Number of conversations: 19436


In [9]:
messages = []

system_prompt = """You are a helpful chatbot for conversing with kids under the age of 7.
You should be empathetic, encouraging and positive minded in general.
The current mood of the user is "{emotion}", you should reply accordingly."""

for situation, emotion, conv in data:
    m = {
      "messages": [
        {"role": "system", "content": system_prompt.format(emotion=emotion)}
      ]
    }
    # Remove last conversation if there's odd number of conversational sides since we want the last response to be from Agent and first is Customer
    conv_length = len(conv) if len(conv) % 2 == 0 else len(conv) - 1
    for i in range(0, conv_length, 2):
      m["messages"].append({"role": "user", "content": conv[i]})
      m["messages"].append({"role": "assistant", "content": conv[i+1]})
    messages.append(m)


In [10]:
from datasets import Dataset
df = pd.DataFrame(messages, columns=['messages'])
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=2000/len(dataset))

print(dataset["train"][345]["messages"])

# save datasets to disk
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

[{'content': 'You are a helpful chatbot for conversing with kids under the age of 7.\nYou should be empathetic, encouraging and positive minded in general.\nThe current mood of the user is "joyful", you should reply accordingly.', 'role': 'system'}, {'content': 'I had the best weekend! My husband brought a puppy home and seeing him with our son for the first time was so cute!', 'role': 'user'}, {'content': 'That sounds adorable! What type of puppy is it?', 'role': 'assistant'}, {'content': 'He is an 8 week old Cane Corso. My son has had such a great time learning how to play with him and teach him things.', 'role': 'user'}, {'content': "That sounds like a great time. I'm sure it's going to be cool to see the puppy grow up along with your son and how their relationship builds.", 'role': 'assistant'}]


Creating json from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

1335085

In [11]:
from datasets import load_dataset

# Load jsonl data from disk
dataset = load_dataset("json", data_files="train_dataset.json", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

# Training Setup - Model and params

In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
if use_flash_attention:
  model = AutoModelForCausalLM.from_pretrained(
      base_model_name,
      device_map="auto",
      attn_implementation="flash_attention_2",
      torch_dtype=torch.bfloat16,
      quantization_config=bnb_config
  )
else:
  model = AutoModelForCausalLM.from_pretrained(
      base_model_name,
      device_map="auto",
      # attn_implementation="flash_attention_2",
      torch_dtype=torch.bfloat16,
      quantization_config=bnb_config
  )
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.padding_side = 'right' # to prevent warnings

# set chat template to OAI chatML, remove if you start from a fine-tuned model
model, tokenizer = setup_chat_format(model, tokenizer)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [13]:
from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

In [14]:
adapter_model_dir = "./" + new_model_name

In [15]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir=adapter_model_dir, # directory to save and repository id
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=1,          # batch size per device during training
    gradient_accumulation_steps=8,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=False,                      # dont push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)

In [None]:
from trl import SFTTrainer

max_seq_length = 2048 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

In [None]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

In [18]:
# Save the adapter model files locally
trainer.save_model(adapter_model_dir)

# Optionally also save it to drive
# trainer.save_model("/content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql")



In [54]:
# free the memory again
del model
del trainer
torch.cuda.empty_cache()

# Cmd for copying model adapter files to or from drive

In [33]:
# !cp ./code-llama-3-1-8b-text-to-sql/adapter_config.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/adapter_model.safetensors /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/config.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/generation_config.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/model.safetensors.index.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/special_tokens_map.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/tokenizer.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/tokenizer_config.json /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql
# !cp ./code-llama-3-1-8b-text-to-sql/training_args.bin /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql

In [7]:
# !mkdir code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/adapter_config.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/adapter_model.safetensors ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/config.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/generation_config.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/model.safetensors.index.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/special_tokens_map.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/tokenizer.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/tokenizer_config.json ./code-llama-3-1-8b-text-to-sql
# !cp /content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/training_args.bin ./code-llama-3-1-8b-text-to-sql

cp: cannot stat '/content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/config.json': No such file or directory
cp: cannot stat '/content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/generation_config.json': No such file or directory
cp: cannot stat '/content/drive/MyDrive/temptest/code-llama-3-1-8b-text-to-sql/model.safetensors.index.json': No such file or directory


# Inference

In [20]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Load PEFT model on CPU
model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_model_dir,  # "./code-llama-3-1-8b-text-to-sql"
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(adapter_model_dir)

# Merge LoRA and base model and save
merged_model = model.merge_and_unload()
merged_model.resize_token_embeddings(len(tokenizer))
model = merged_model

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# If the local dir doesn't have adapter files and we need to get the full model from hub

# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("shashankverma590/" + new_model_name)
# model = AutoModelForCausalLM.from_pretrained("shashankverma590/" + new_model_name,
#                                              torch_dtype=torch.float16, device_map="auto")

config.json:   0%|          | 0.00/871 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

In [11]:
from datasets import load_dataset
from random import randint

# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
rand_idx = randint(1, len(eval_dataset))

Generating train split: 0 examples [00:00, ? examples/s]

In [18]:
# Inference style #1 where we see the special tokens our model generates.

prompt = tokenizer.apply_chat_template(
    eval_dataset[rand_idx]["messages"][:-1],
    tokenize=False,
    add_generation_prompt=True
)

# Tokenize with add_special_tokens=False
input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids.to(model.device)

# Generate response with special tokens
outputs = model.generate(
    input_ids,
    max_new_tokens=256,
    do_sample=False,
    temperature=0.1,
    top_k=50,
    top_p=0.1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id
)

# Decode the output while skipping special tokens
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

print(f"Query:\n{eval_dataset[rand_idx]['messages'][1:-1]}")
print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][-1]['content']}")
print(f"\nGenerated Answer:\n{generated_text}")

Query:
[{'content': 'Every time my best friend comes back in town, we go eat at the old restaurant that we used to work at when we met.', 'role': 'user'}, {'content': 'I bet that brings back a lot of great memories! Is the food good?', 'role': 'assistant'}, {'content': "It sure does! The food is great and the place hasn't changed a bit. It feels like we went back in time.", 'role': 'user'}]
Original Answer:
What type of food do they serve?

Generated Answer:
<|im_start|>system
You are a helpful chatbot for conversing with kids under the age of 7.
You should be empathetic, encouraging and positive minded in general.
The current mood of the user is "nostalgic", you should reply accordingly.<|im_end|>
<|im_start|>user
Every time my best friend comes back in town, we go eat at the old restaurant that we used to work at when we met.<|im_end|>
<|im_start|>assistant
I bet that brings back a lot of great memories! Is the food good?<|im_end|>
<|im_start|>user
It sure does! The food is great and

In [25]:
# Inference style #2 where we see the cleaned up final output.

from transformers import pipeline

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)

print(f"Query:")
for item in eval_dataset[rand_idx]['messages'][1:-1]:
  print (f"{item['role']}: {item['content']}")

print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][-1]['content']}")
print(f"\nGenerated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")



Query:
user: Every time my best friend comes back in town, we go eat at the old restaurant that we used to work at when we met.
assistant: I bet that brings back a lot of great memories! Is the food good?
user: It sure does! The food is great and the place hasn't changed a bit. It feels like we went back in time.
Original Answer:
What type of food do they serve?

Generated Answer:
That's so sweet.  I love going back to old places like that.  It always brings back so many memories.


# Upload the model to HuggingFace Hub

In [38]:
model.push_to_hub("shashankverma590/" + new_model_name, check_pr=True)
tokenizer.push_to_hub("shashankverma590/" + new_model_name,check_pr=True)

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shashankverma590/llama-3-1-8b-kid-friendly-chatbot/commit/8b243232442a0afc53e0345d2e5f3ea1fea5bc6f', commit_message='Upload tokenizer', commit_description='', oid='8b243232442a0afc53e0345d2e5f3ea1fea5bc6f', pr_url=None, pr_revision=None, pr_num=None)