In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datasets import load_dataset, DatasetDict
from huggingface_hub import login
import re
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'
if torch.cuda.is_available():
    print("GPU available:", torch.cuda.get_device_name(0))
    device = "cuda"
else:
    print("No GPU detected")
    device = "cpu"

GPU available: NVIDIA GeForce RTX 4060 Laptop GPU


## Daily_dialog

In [7]:
dataset = load_dataset("daily_dialog", trust_remote_code=True)

dataset

Downloading data: 100%|██████████| 4.48M/4.48M [00:00<00:00, 5.57MB/s]
Generating train split: 100%|██████████| 11118/11118 [00:00<00:00, 12958.66 examples/s]
Generating validation split: 100%|██████████| 1000/1000 [00:00<00:00, 10390.58 examples/s]
Generating test split: 100%|██████████| 1000/1000 [00:00<00:00, 10409.38 examples/s]


DatasetDict({
    train: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 11118
    })
    validation: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
})

In [8]:
dataset["train"][0]

{'dialog': ['Say , Jim , how about going for a few beers after dinner ? ',
  ' You know that is tempting but is really not good for our fitness . ',
  ' What do you mean ? It will help us to relax . ',
  " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ",
  " I guess you are right.But what shall we do ? I don't feel like sitting at home . ",
  ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ',
  " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ",
  ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ',
  " Good.Let ' s go now . ",
  ' All right . '],
 'act': [3, 4, 2, 2, 2, 3, 4, 1, 3, 4],
 'emotion': [0, 0, 0, 0, 0, 0, 4, 4, 4, 4]}

## Empathetic_dialogue

In [None]:
df = pd.read_csv("empathetic-dialogues-facebook-ai/emotion-emotion_69k.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Situation,emotion,empathetic_dialogues,labels,Unnamed: 5,Unnamed: 6
0,0,I remember going to the fireworks with my best...,sentimental,Customer :I remember going to see the firework...,"Was this a friend you were in love with, or ju...",,
1,1,I remember going to the fireworks with my best...,sentimental,Customer :This was a best friend. I miss her.\...,Where has she gone?,,
2,2,I remember going to the fireworks with my best...,sentimental,Customer :We no longer talk.\nAgent :,Oh was this something that happened because of...,,
3,3,I remember going to the fireworks with my best...,sentimental,Customer :Was this a friend you were in love w...,This was a best friend. I miss her.,,
4,4,I remember going to the fireworks with my best...,sentimental,Customer :Where has she gone?\nAgent :,We no longer talk.,,


In [71]:
# removing the Customer:, and agent: part from the 'empethetic_dialogues' column
df['extracted_text'] = df['empathetic_dialogues'].apply(lambda x: re.findall(r':\s*(.*?)\n', x)[0] if re.findall(r':\s*(.*?)\n', x) else None)
# keeping only the extracted_text, and labels columns
cleaned_df = df[["extracted_text", "labels"]].copy()

In [72]:
# these rows have parsing problems
cleaned_df[cleaned_df.isna().any(axis=1)].index
# these indexes are special case : [23485, 23486, 23487, 23488]

Index([ 1419,  1420,  1421,  1422,  2546,  2547,  2548,  3722,  3723,  3724,
        3725,  3726, 20933, 20934, 20935, 20936, 23485, 23486, 23487, 23488,
       28674, 28675, 28676, 31743, 31744, 31745, 35867, 35868, 35869, 35870,
       35871, 40194, 40195, 40196, 40197, 42140, 42141, 42142, 53778, 53779,
       53780, 53781, 64217, 64218, 64219],
      dtype='int64')

In [73]:
# Drop the rows with the parsing problem
cleaned_df.drop(index=cleaned_df[cleaned_df.isna().any(axis=1)].index, inplace=True, axis=0)

In [74]:
# This code is for the correction of the first subrows with parsing problems
df_v1 = df.iloc[[1419,  1420,  1421,  1422,  2546,  2547,  2548,  3722,  3723,  3724, 3725,  3726, 20933, 20934, 20935, 20936, 28674, 28675, 
                 28676, 31743, 31744, 31745, 35867, 35868, 35869, 35870, 35871, 40194, 40195, 40196, 40197, 42140, 42141, 42142, 53778, 53779,
                 53780, 53781, 64217, 64218, 64219]].copy()
df_v1['extracted_text'] = df_v1['labels'].apply(lambda x: re.findall(r':\s*(.*?)\n', x)[0] if re.findall(r':\s*(.*?)\n', x) else None)
df_v1['labels'] = df_v1["Unnamed: 5"]
# And this is for the seconde subrows
df_v2 = df.iloc[[23485, 23486, 23487, 23488]].copy()
df_v2['extracted_text'] = df_v2['Unnamed: 5'].apply(lambda x: re.findall(r':\s*(.*?)\n', x)[0] if re.findall(r':\s*(.*?)\n', x) else None)
df_v2['labels'] = df_v2["Unnamed: 6"]
# Then we reconcat all of the data
cleaned_df = pd.concat([cleaned_df,df_v1[["extracted_text", "labels"]], df_v2[["extracted_text", "labels"]]], axis=0)


In [75]:
# And now we check. Voila!
cleaned_df.iloc[[ 1419,  1420,  1421,  1422,  2546,  2547,  2548,  3722,  3723,  3724,
        3725,  3726, 20933, 20934, 20935, 20936, 23485, 23486, 23487, 23488,
       28674, 28675, 28676, 31743, 31744, 31745, 35867, 35868, 35869, 35870,
       35871, 40194, 40195, 40196, 40197, 42140, 42141, 42142, 53778, 53779,
       53780, 53781, 64217, 64218, 64219]]


Unnamed: 0,extracted_text,labels
1423,"One night, i hugged my wife and told her i lov...",Aww! That's so sweet! How long have you been m...
1424,I went to work and got a big raise. I rushed h...,That's very exciting! Congratulations on your ...
1425,Aww! That's so sweet! How long have you been m...,I went to work and got a big raise. I rushed h...
1426,That's very exciting! Congratulations on your ...,"When i got home, all of the furniture, my wife..."
2553,"High school sweethearts, that's so special. Do...","No not at all, I feel so so lucky that I met h..."
2554,Your outlook on love is really refreshing. I w...,"Thank you very much, I'm wishing/praying the b..."
2555,How can people be stupid enough to step on you...,That is a disgrace. Some people just do not ge...
3734,It had it's ups and downs which is why I wonde...,Do you have kids?
3735,"I do, which is one of the reasons I'm so excited.",It's never to late to start new. You guys will...
3736,Was it a nice one?,It had it's ups and downs which is why I wonde...


## Preprocessing

In [None]:
login(
  token="--Add your token here--", # ADD YOUR TOKEN HERE
  # add_to_git_credential=True
)
 

In [None]:
# Convert dataset to OAI messages
system_message = """You are an emotional conversationalist. Users will talk to you in English and you will generate a response."""
 
def create_conversation(sample):
  return {
    "messages": [
      {"role": "system", "content": system_message},
      {"role": "user", "content": sample["extracted_text"]},
      {"role": "assistant", "content": sample["labels"]}
    ]
  }
 
# Load dataset from the hub
dataset = load_dataset("csv", data_files="Data/cleaned_empethatic_dataset.csv", split="train")
#dataset = dataset.shuffle().select(range(12500))
 
# Convert dataset to OAI messages
dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)
# split dataset into 10,000 training samples and 2,500 test samples
dataset_1 = dataset.train_test_split(test_size=0.2)
dataset_2 = dataset_1["test"].train_test_split(test_size=0.5)

split_dataset = {
    'train': dataset_1['train'],
    'validation': dataset_2['train'],
    'test': dataset_2['test']
}

new_dataset = DatasetDict(split_dataset)

 
print(new_dataset["train"][345]["messages"])
 
# save datasets to disk
new_dataset["train"].to_json("Data/train_dataset.json", orient="records")
new_dataset["validation"].to_json("Data/validation_dataset.json", orient="records")
new_dataset["test"].to_json("Data/test_dataset.json", orient="records")

[{'content': 'You are an emotional conversationalist. Users will talk to you in English and you will generate a response.', 'role': 'system'}, {'content': "Yep I'm really lucky. He's always been very trustworthy and never takes advantage when I let him use it.", 'role': 'user'}, {'content': "That's so rare in teenagers! Sounds like you did a great job raising him.", 'role': 'assistant'}]


Creating json from Arrow format: 100%|██████████| 52/52 [00:01<00:00, 36.33ba/s]
Creating json from Arrow format: 100%|██████████| 7/7 [00:00<00:00, 40.79ba/s]
Creating json from Arrow format: 100%|██████████| 7/7 [00:00<00:00, 40.76ba/s]


2241077

## Training

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

model_id = "Qwen/Qwen1.5-0.5B-Chat" # or `mistralai/Mistral-7B-v0.1`
 
# BitsAndBytesConfig int-4 config
""" bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_use_double_quant=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=torch.bfloat16
) """
 
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    #attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    #quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right' # to prevent warnings
 
# # set chat template to OAI chatML, remove if you start from a fine-tuned model
#model, tokenizer = setup_chat_format(model, tokenizer)

In [None]:
# Load jsonl data from disk
dataset = load_dataset("json", data_files="Data/train_dataset.json", split="train")

In [6]:
from peft import LoraConfig
 
# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

In [8]:
from transformers import TrainingArguments
 
args = TrainingArguments(
    output_dir="code-Qwen1.5", # directory to save and repository id
    num_train_epochs=1,                     # number of training epochs
    per_device_train_batch_size=3,          # batch size per device during training
    gradient_accumulation_steps=3,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    # push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)

In [9]:
from trl import SFTTrainer
 
max_seq_length = 512 # max sequence length for model and packing of the dataset
 
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)





Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Generating train split: 6957 examples [00:09, 708.76 examples/s] 


In [10]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

  0%|          | 0/773 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  1%|▏         | 10/773 [00:15<18:31,  1.46s/it]

{'loss': 2.1828, 'grad_norm': 0.6152185797691345, 'learning_rate': 0.0002, 'epoch': 0.01}


  3%|▎         | 20/773 [00:29<18:08,  1.45s/it]

{'loss': 1.6085, 'grad_norm': 0.2858924865722656, 'learning_rate': 0.0002, 'epoch': 0.03}


  4%|▍         | 30/773 [00:43<17:14,  1.39s/it]

{'loss': 1.567, 'grad_norm': 0.28052735328674316, 'learning_rate': 0.0002, 'epoch': 0.04}


  5%|▌         | 40/773 [01:00<20:33,  1.68s/it]

{'loss': 1.5208, 'grad_norm': 0.2459777444601059, 'learning_rate': 0.0002, 'epoch': 0.05}


  6%|▋         | 50/773 [01:15<17:14,  1.43s/it]

{'loss': 1.5054, 'grad_norm': 0.27517563104629517, 'learning_rate': 0.0002, 'epoch': 0.06}


  8%|▊         | 60/773 [01:29<16:56,  1.43s/it]

{'loss': 1.5229, 'grad_norm': 0.2119239717721939, 'learning_rate': 0.0002, 'epoch': 0.08}


  9%|▉         | 70/773 [01:43<16:29,  1.41s/it]

{'loss': 1.4709, 'grad_norm': 0.23544570803642273, 'learning_rate': 0.0002, 'epoch': 0.09}


 10%|█         | 80/773 [01:57<16:37,  1.44s/it]

{'loss': 1.4936, 'grad_norm': 0.23373468220233917, 'learning_rate': 0.0002, 'epoch': 0.1}


 12%|█▏        | 90/773 [02:12<16:06,  1.41s/it]

{'loss': 1.4826, 'grad_norm': 0.2512829303741455, 'learning_rate': 0.0002, 'epoch': 0.12}


 13%|█▎        | 100/773 [02:26<15:45,  1.41s/it]

{'loss': 1.4952, 'grad_norm': 0.22206951677799225, 'learning_rate': 0.0002, 'epoch': 0.13}


 14%|█▍        | 110/773 [02:40<15:39,  1.42s/it]

{'loss': 1.4691, 'grad_norm': 0.21872612833976746, 'learning_rate': 0.0002, 'epoch': 0.14}


 16%|█▌        | 120/773 [02:54<15:17,  1.41s/it]

{'loss': 1.4437, 'grad_norm': 0.23098516464233398, 'learning_rate': 0.0002, 'epoch': 0.16}


 17%|█▋        | 130/773 [03:08<15:00,  1.40s/it]

{'loss': 1.5009, 'grad_norm': 0.26446661353111267, 'learning_rate': 0.0002, 'epoch': 0.17}


 18%|█▊        | 140/773 [03:22<14:48,  1.40s/it]

{'loss': 1.485, 'grad_norm': 0.2837963104248047, 'learning_rate': 0.0002, 'epoch': 0.18}


 19%|█▉        | 150/773 [03:36<14:32,  1.40s/it]

{'loss': 1.479, 'grad_norm': 0.2397577464580536, 'learning_rate': 0.0002, 'epoch': 0.19}


 21%|██        | 160/773 [03:50<14:25,  1.41s/it]

{'loss': 1.4371, 'grad_norm': 0.2557121515274048, 'learning_rate': 0.0002, 'epoch': 0.21}


 22%|██▏       | 170/773 [04:04<14:01,  1.40s/it]

{'loss': 1.4508, 'grad_norm': 0.2323865443468094, 'learning_rate': 0.0002, 'epoch': 0.22}


 23%|██▎       | 180/773 [04:18<13:53,  1.40s/it]

{'loss': 1.4878, 'grad_norm': 0.22693663835525513, 'learning_rate': 0.0002, 'epoch': 0.23}


 25%|██▍       | 190/773 [04:32<13:34,  1.40s/it]

{'loss': 1.456, 'grad_norm': 0.21417443454265594, 'learning_rate': 0.0002, 'epoch': 0.25}


 26%|██▌       | 200/773 [04:46<13:20,  1.40s/it]

{'loss': 1.4609, 'grad_norm': 0.24707169830799103, 'learning_rate': 0.0002, 'epoch': 0.26}


 27%|██▋       | 210/773 [05:00<13:02,  1.39s/it]

{'loss': 1.4428, 'grad_norm': 0.223281592130661, 'learning_rate': 0.0002, 'epoch': 0.27}


 28%|██▊       | 220/773 [05:14<12:57,  1.41s/it]

{'loss': 1.4426, 'grad_norm': 0.23084834218025208, 'learning_rate': 0.0002, 'epoch': 0.28}


 30%|██▉       | 230/773 [05:28<12:36,  1.39s/it]

{'loss': 1.438, 'grad_norm': 0.29445716738700867, 'learning_rate': 0.0002, 'epoch': 0.3}


 31%|███       | 240/773 [05:42<12:28,  1.40s/it]

{'loss': 1.444, 'grad_norm': 0.21922005712985992, 'learning_rate': 0.0002, 'epoch': 0.31}


 32%|███▏      | 250/773 [05:56<12:16,  1.41s/it]

{'loss': 1.4548, 'grad_norm': 0.26009950041770935, 'learning_rate': 0.0002, 'epoch': 0.32}


 34%|███▎      | 260/773 [06:11<12:08,  1.42s/it]

{'loss': 1.4483, 'grad_norm': 0.2541850805282593, 'learning_rate': 0.0002, 'epoch': 0.34}


 35%|███▍      | 270/773 [06:25<11:41,  1.39s/it]

{'loss': 1.4364, 'grad_norm': 0.24903592467308044, 'learning_rate': 0.0002, 'epoch': 0.35}


 36%|███▌      | 280/773 [06:39<11:39,  1.42s/it]

{'loss': 1.4562, 'grad_norm': 0.2506919503211975, 'learning_rate': 0.0002, 'epoch': 0.36}


 38%|███▊      | 290/773 [06:53<11:18,  1.40s/it]

{'loss': 1.452, 'grad_norm': 0.2379612922668457, 'learning_rate': 0.0002, 'epoch': 0.38}


 39%|███▉      | 300/773 [07:07<11:01,  1.40s/it]

{'loss': 1.4332, 'grad_norm': 0.2601514458656311, 'learning_rate': 0.0002, 'epoch': 0.39}


 40%|████      | 310/773 [07:21<10:45,  1.39s/it]

{'loss': 1.4677, 'grad_norm': 0.26151296496391296, 'learning_rate': 0.0002, 'epoch': 0.4}


 41%|████▏     | 320/773 [07:35<10:34,  1.40s/it]

{'loss': 1.4286, 'grad_norm': 0.23084235191345215, 'learning_rate': 0.0002, 'epoch': 0.41}


 43%|████▎     | 330/773 [07:49<10:22,  1.41s/it]

{'loss': 1.4615, 'grad_norm': 0.27504971623420715, 'learning_rate': 0.0002, 'epoch': 0.43}


 44%|████▍     | 340/773 [08:06<13:37,  1.89s/it]

{'loss': 1.4276, 'grad_norm': 1.728877305984497, 'learning_rate': 0.0002, 'epoch': 0.44}


 45%|████▌     | 350/773 [08:26<13:28,  1.91s/it]

{'loss': 1.4871, 'grad_norm': 0.23188602924346924, 'learning_rate': 0.0002, 'epoch': 0.45}


 47%|████▋     | 360/773 [08:45<13:25,  1.95s/it]

{'loss': 1.4446, 'grad_norm': 0.2195141315460205, 'learning_rate': 0.0002, 'epoch': 0.47}


 48%|████▊     | 370/773 [09:04<12:57,  1.93s/it]

{'loss': 1.4154, 'grad_norm': 0.2531881332397461, 'learning_rate': 0.0002, 'epoch': 0.48}


 49%|████▉     | 380/773 [09:24<12:49,  1.96s/it]

{'loss': 1.4283, 'grad_norm': 0.2221730500459671, 'learning_rate': 0.0002, 'epoch': 0.49}


 50%|█████     | 390/773 [09:44<12:23,  1.94s/it]

{'loss': 1.4645, 'grad_norm': 0.22982165217399597, 'learning_rate': 0.0002, 'epoch': 0.5}


 52%|█████▏    | 400/773 [10:03<11:58,  1.93s/it]

{'loss': 1.4124, 'grad_norm': 0.23969420790672302, 'learning_rate': 0.0002, 'epoch': 0.52}


 53%|█████▎    | 410/773 [10:23<12:05,  2.00s/it]

{'loss': 1.423, 'grad_norm': 0.22634534537792206, 'learning_rate': 0.0002, 'epoch': 0.53}


 54%|█████▍    | 420/773 [10:43<11:30,  1.96s/it]

{'loss': 1.4172, 'grad_norm': 0.23348179459571838, 'learning_rate': 0.0002, 'epoch': 0.54}


 56%|█████▌    | 430/773 [11:03<11:18,  1.98s/it]

{'loss': 1.4004, 'grad_norm': 0.2898707091808319, 'learning_rate': 0.0002, 'epoch': 0.56}


 57%|█████▋    | 440/773 [11:22<10:41,  1.93s/it]

{'loss': 1.4128, 'grad_norm': 0.21427759528160095, 'learning_rate': 0.0002, 'epoch': 0.57}


 58%|█████▊    | 450/773 [11:38<07:56,  1.48s/it]

{'loss': 1.3849, 'grad_norm': 0.22090360522270203, 'learning_rate': 0.0002, 'epoch': 0.58}


 60%|█████▉    | 460/773 [11:52<07:22,  1.41s/it]

{'loss': 1.4104, 'grad_norm': 0.2277952879667282, 'learning_rate': 0.0002, 'epoch': 0.6}


 61%|██████    | 470/773 [12:06<07:05,  1.40s/it]

{'loss': 1.3993, 'grad_norm': 0.6363450288772583, 'learning_rate': 0.0002, 'epoch': 0.61}


 62%|██████▏   | 480/773 [12:20<06:54,  1.41s/it]

{'loss': 1.4298, 'grad_norm': 0.24585872888565063, 'learning_rate': 0.0002, 'epoch': 0.62}


 63%|██████▎   | 490/773 [12:35<06:38,  1.41s/it]

{'loss': 1.3822, 'grad_norm': 0.24537914991378784, 'learning_rate': 0.0002, 'epoch': 0.63}


 65%|██████▍   | 500/773 [12:49<06:23,  1.41s/it]

{'loss': 1.4125, 'grad_norm': 0.24289533495903015, 'learning_rate': 0.0002, 'epoch': 0.65}


 66%|██████▌   | 510/773 [13:03<06:07,  1.40s/it]

{'loss': 1.3833, 'grad_norm': 0.22519031167030334, 'learning_rate': 0.0002, 'epoch': 0.66}


 67%|██████▋   | 520/773 [13:17<05:51,  1.39s/it]

{'loss': 1.4259, 'grad_norm': 0.24794025719165802, 'learning_rate': 0.0002, 'epoch': 0.67}


 69%|██████▊   | 530/773 [13:31<05:36,  1.39s/it]

{'loss': 1.4122, 'grad_norm': 0.22548390924930573, 'learning_rate': 0.0002, 'epoch': 0.69}


 70%|██████▉   | 540/773 [13:45<05:22,  1.39s/it]

{'loss': 1.4221, 'grad_norm': 0.299169659614563, 'learning_rate': 0.0002, 'epoch': 0.7}


 71%|███████   | 550/773 [13:58<05:11,  1.40s/it]

{'loss': 1.395, 'grad_norm': 0.22573430836200714, 'learning_rate': 0.0002, 'epoch': 0.71}


 72%|███████▏  | 560/773 [14:12<05:00,  1.41s/it]

{'loss': 1.4075, 'grad_norm': 0.32023245096206665, 'learning_rate': 0.0002, 'epoch': 0.72}


 74%|███████▎  | 570/773 [14:26<04:41,  1.39s/it]

{'loss': 1.418, 'grad_norm': 0.603272557258606, 'learning_rate': 0.0002, 'epoch': 0.74}


 75%|███████▌  | 580/773 [14:40<04:29,  1.40s/it]

{'loss': 1.3923, 'grad_norm': 0.2487013339996338, 'learning_rate': 0.0002, 'epoch': 0.75}


 76%|███████▋  | 590/773 [14:54<04:13,  1.39s/it]

{'loss': 1.4169, 'grad_norm': 0.38712307810783386, 'learning_rate': 0.0002, 'epoch': 0.76}


 78%|███████▊  | 600/773 [15:08<04:04,  1.41s/it]

{'loss': 1.3872, 'grad_norm': 0.24488097429275513, 'learning_rate': 0.0002, 'epoch': 0.78}


 79%|███████▉  | 610/773 [15:22<03:49,  1.41s/it]

{'loss': 1.3677, 'grad_norm': 0.23598997294902802, 'learning_rate': 0.0002, 'epoch': 0.79}


 80%|████████  | 620/773 [15:36<03:34,  1.40s/it]

{'loss': 1.4042, 'grad_norm': 0.24547520279884338, 'learning_rate': 0.0002, 'epoch': 0.8}


 82%|████████▏ | 630/773 [15:51<03:23,  1.42s/it]

{'loss': 1.3698, 'grad_norm': 0.23522484302520752, 'learning_rate': 0.0002, 'epoch': 0.82}


 83%|████████▎ | 640/773 [16:05<03:07,  1.41s/it]

{'loss': 1.4039, 'grad_norm': 0.3357525169849396, 'learning_rate': 0.0002, 'epoch': 0.83}


 84%|████████▍ | 650/773 [16:19<02:51,  1.40s/it]

{'loss': 1.3653, 'grad_norm': 0.28011077642440796, 'learning_rate': 0.0002, 'epoch': 0.84}


 85%|████████▌ | 660/773 [16:33<02:37,  1.40s/it]

{'loss': 1.4006, 'grad_norm': 0.2414141744375229, 'learning_rate': 0.0002, 'epoch': 0.85}


 87%|████████▋ | 670/773 [16:47<02:23,  1.40s/it]

{'loss': 1.3748, 'grad_norm': 1.5034081935882568, 'learning_rate': 0.0002, 'epoch': 0.87}


 88%|████████▊ | 680/773 [17:01<02:09,  1.39s/it]

{'loss': 1.3946, 'grad_norm': 0.24030402302742004, 'learning_rate': 0.0002, 'epoch': 0.88}


 89%|████████▉ | 690/773 [17:15<01:56,  1.40s/it]

{'loss': 1.383, 'grad_norm': 0.2691127061843872, 'learning_rate': 0.0002, 'epoch': 0.89}


 91%|█████████ | 700/773 [17:29<01:42,  1.41s/it]

{'loss': 1.4104, 'grad_norm': 0.4148443937301636, 'learning_rate': 0.0002, 'epoch': 0.91}


 92%|█████████▏| 710/773 [17:43<01:29,  1.42s/it]

{'loss': 1.3939, 'grad_norm': 0.23629221320152283, 'learning_rate': 0.0002, 'epoch': 0.92}


 93%|█████████▎| 720/773 [17:57<01:14,  1.40s/it]

{'loss': 1.3968, 'grad_norm': 0.3357713222503662, 'learning_rate': 0.0002, 'epoch': 0.93}


 94%|█████████▍| 730/773 [18:11<01:00,  1.42s/it]

{'loss': 1.3584, 'grad_norm': 0.2280924916267395, 'learning_rate': 0.0002, 'epoch': 0.94}


 96%|█████████▌| 740/773 [18:25<00:46,  1.40s/it]

{'loss': 1.4074, 'grad_norm': 0.24406524002552032, 'learning_rate': 0.0002, 'epoch': 0.96}


 97%|█████████▋| 750/773 [18:39<00:32,  1.40s/it]

{'loss': 1.3682, 'grad_norm': 0.22515447437763214, 'learning_rate': 0.0002, 'epoch': 0.97}


 98%|█████████▊| 760/773 [18:53<00:18,  1.39s/it]

{'loss': 1.3828, 'grad_norm': 0.24991323053836823, 'learning_rate': 0.0002, 'epoch': 0.98}


100%|█████████▉| 770/773 [19:07<00:04,  1.40s/it]

{'loss': 1.3981, 'grad_norm': 0.26145321130752563, 'learning_rate': 0.0002, 'epoch': 1.0}


100%|██████████| 773/773 [19:21<00:00,  1.50s/it]

{'train_runtime': 1161.4729, 'train_samples_per_second': 5.99, 'train_steps_per_second': 0.666, 'train_loss': 1.4427231576433466, 'epoch': 1.0}





TrainOutput(global_step=773, training_loss=1.4427231576433466, metrics={'train_runtime': 1161.4729, 'train_samples_per_second': 5.99, 'train_steps_per_second': 0.666, 'total_flos': 9179569931157504.0, 'train_loss': 1.4427231576433466, 'epoch': 1.0})

In [12]:
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_tokenizer")


('./trained_tokenizer\\tokenizer_config.json',
 './trained_tokenizer\\special_tokens_map.json',
 './trained_tokenizer\\vocab.json',
 './trained_tokenizer\\merges.txt',
 './trained_tokenizer\\added_tokens.json',
 './trained_tokenizer\\tokenizer.json')

In [15]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline
 
""" peft_model_id = "./trained_model"
# peft_model_id = args.output_dir
 
# Load Model with PEFT adapter
model_2 = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="auto",
  torch_dtype=torch.float16
)
tokenizer_2 = AutoTokenizer.from_pretrained("./trained_tokenizer")
# load into pipeline """
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
from datasets import load_dataset
from random import randint
 
 
# Load our test dataset
eval_dataset = load_dataset("json", data_files="Data/test_dataset.json", split="train")
rand_idx = randint(0, len(eval_dataset))
 
# Test on sample
prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
 
print(f"Query:\n{eval_dataset[rand_idx]['messages'][1]['content']}")
print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][2]['content']}")
print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")

Generating train split: 6464 examples [00:00, 177561.09 examples/s]


Query:
Just got back from downstairs, what a gross scene.
Original Answer:
Wow, what did you see?
Generated Answer:
Oh no! I hope it wasn't too much of a hassle for you?


In [None]:
# Load our test dataset
eval_dataset = load_dataset("json", data_files="Data/test_dataset.json", split="train")
rand_idx = randint(0, len(eval_dataset))
 
# Test on sample
prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
 
print(f"Query:\n{eval_dataset[rand_idx]['messages'][1]['content']}")
print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][2]['content']}")
print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")

Query:
I stepped on my dog's tail accidentally earlier and I felt really bad about it.
Original Answer:
was he hurt?
Generated Answer:
Oh no! Did he get hurt?
