In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import re

from datasets import load_dataset, DatasetDict
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TrainingArguments
from peft import PeftModel, LoraConfig
from random import randint
from trl import SFTTrainer

In [2]:
assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'
if torch.cuda.is_available():
    print("GPU available:", torch.cuda.get_device_name(0))
    device = "cuda"
else:
    print("No GPU detected")
    device = "cpu"

GPU available: NVIDIA GeForce RTX 4060 Laptop GPU


## Daily_dialog

In [7]:
dataset = load_dataset("daily_dialog", trust_remote_code=True)

dataset

Downloading data: 100%|██████████| 4.48M/4.48M [00:00<00:00, 5.57MB/s]
Generating train split: 100%|██████████| 11118/11118 [00:00<00:00, 12958.66 examples/s]
Generating validation split: 100%|██████████| 1000/1000 [00:00<00:00, 10390.58 examples/s]
Generating test split: 100%|██████████| 1000/1000 [00:00<00:00, 10409.38 examples/s]


DatasetDict({
    train: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 11118
    })
    validation: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
})

In [8]:
dataset["train"][0]

{'dialog': ['Say , Jim , how about going for a few beers after dinner ? ',
  ' You know that is tempting but is really not good for our fitness . ',
  ' What do you mean ? It will help us to relax . ',
  " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ",
  " I guess you are right.But what shall we do ? I don't feel like sitting at home . ",
  ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ',
  " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ",
  ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ',
  " Good.Let ' s go now . ",
  ' All right . '],
 'act': [3, 4, 2, 2, 2, 3, 4, 1, 3, 4],
 'emotion': [0, 0, 0, 0, 0, 0, 4, 4, 4, 4]}

## Empathetic_dialogue

In [None]:
df = pd.read_csv("empathetic-dialogues-facebook-ai/emotion-emotion_69k.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Situation,emotion,empathetic_dialogues,labels,Unnamed: 5,Unnamed: 6
0,0,I remember going to the fireworks with my best...,sentimental,Customer :I remember going to see the firework...,"Was this a friend you were in love with, or ju...",,
1,1,I remember going to the fireworks with my best...,sentimental,Customer :This was a best friend. I miss her.\...,Where has she gone?,,
2,2,I remember going to the fireworks with my best...,sentimental,Customer :We no longer talk.\nAgent :,Oh was this something that happened because of...,,
3,3,I remember going to the fireworks with my best...,sentimental,Customer :Was this a friend you were in love w...,This was a best friend. I miss her.,,
4,4,I remember going to the fireworks with my best...,sentimental,Customer :Where has she gone?\nAgent :,We no longer talk.,,


In [71]:
# removing the Customer:, and agent: part from the 'empethetic_dialogues' column
df['extracted_text'] = df['empathetic_dialogues'].apply(lambda x: re.findall(r':\s*(.*?)\n', x)[0] if re.findall(r':\s*(.*?)\n', x) else None)
# keeping only the extracted_text, and labels columns
cleaned_df = df[["extracted_text", "labels"]].copy()

In [72]:
# these rows have parsing problems
cleaned_df[cleaned_df.isna().any(axis=1)].index
# these indexes are special case : [23485, 23486, 23487, 23488]

Index([ 1419,  1420,  1421,  1422,  2546,  2547,  2548,  3722,  3723,  3724,
        3725,  3726, 20933, 20934, 20935, 20936, 23485, 23486, 23487, 23488,
       28674, 28675, 28676, 31743, 31744, 31745, 35867, 35868, 35869, 35870,
       35871, 40194, 40195, 40196, 40197, 42140, 42141, 42142, 53778, 53779,
       53780, 53781, 64217, 64218, 64219],
      dtype='int64')

In [73]:
# Drop the rows with the parsing problem
cleaned_df.drop(index=cleaned_df[cleaned_df.isna().any(axis=1)].index, inplace=True, axis=0)

In [74]:
# This code is for the correction of the first subrows with parsing problems
df_v1 = df.iloc[[1419,  1420,  1421,  1422,  2546,  2547,  2548,  3722,  3723,  3724, 3725,  3726, 20933, 20934, 20935, 20936, 28674, 28675, 
                 28676, 31743, 31744, 31745, 35867, 35868, 35869, 35870, 35871, 40194, 40195, 40196, 40197, 42140, 42141, 42142, 53778, 53779,
                 53780, 53781, 64217, 64218, 64219]].copy()
df_v1['extracted_text'] = df_v1['labels'].apply(lambda x: re.findall(r':\s*(.*?)\n', x)[0] if re.findall(r':\s*(.*?)\n', x) else None)
df_v1['labels'] = df_v1["Unnamed: 5"]
# And this is for the seconde subrows
df_v2 = df.iloc[[23485, 23486, 23487, 23488]].copy()
df_v2['extracted_text'] = df_v2['Unnamed: 5'].apply(lambda x: re.findall(r':\s*(.*?)\n', x)[0] if re.findall(r':\s*(.*?)\n', x) else None)
df_v2['labels'] = df_v2["Unnamed: 6"]
# Then we reconcat all of the data
cleaned_df = pd.concat([cleaned_df,df_v1[["extracted_text", "labels"]], df_v2[["extracted_text", "labels"]]], axis=0)


In [75]:
# And now we check. Voila!
cleaned_df.iloc[[ 1419,  1420,  1421,  1422,  2546,  2547,  2548,  3722,  3723,  3724,
        3725,  3726, 20933, 20934, 20935, 20936, 23485, 23486, 23487, 23488,
       28674, 28675, 28676, 31743, 31744, 31745, 35867, 35868, 35869, 35870,
       35871, 40194, 40195, 40196, 40197, 42140, 42141, 42142, 53778, 53779,
       53780, 53781, 64217, 64218, 64219]]


Unnamed: 0,extracted_text,labels
1423,"One night, i hugged my wife and told her i lov...",Aww! That's so sweet! How long have you been m...
1424,I went to work and got a big raise. I rushed h...,That's very exciting! Congratulations on your ...
1425,Aww! That's so sweet! How long have you been m...,I went to work and got a big raise. I rushed h...
1426,That's very exciting! Congratulations on your ...,"When i got home, all of the furniture, my wife..."
2553,"High school sweethearts, that's so special. Do...","No not at all, I feel so so lucky that I met h..."
2554,Your outlook on love is really refreshing. I w...,"Thank you very much, I'm wishing/praying the b..."
2555,How can people be stupid enough to step on you...,That is a disgrace. Some people just do not ge...
3734,It had it's ups and downs which is why I wonde...,Do you have kids?
3735,"I do, which is one of the reasons I'm so excited.",It's never to late to start new. You guys will...
3736,Was it a nice one?,It had it's ups and downs which is why I wonde...


## Preprocessing

In [None]:
login(
  token="--add your token here--", # ADD YOUR TOKEN HERE
  # add_to_git_credential=True
)

In [4]:
# Convert dataset to OAI messages
system_message = """You are an emotional conversationalist. Users will talk to you in English and you will generate a response."""
 
def create_conversation(sample):
  return {
    "messages": [
      {"role": "system", "content": system_message},
      {"role": "user", "content": sample["extracted_text"]},
      {"role": "assistant", "content": sample["labels"]}
    ]
  }
 
# Load dataset from the hub
dataset = load_dataset("csv", data_files="Data/cleaned_empethatic_dataset.csv", split="train")
#dataset = dataset.shuffle().select(range(12500))
 
# Convert dataset to OAI messages
dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)
# split dataset into 10,000 training samples and 2,500 test samples
dataset_1 = dataset.train_test_split(test_size=0.2)
dataset_2 = dataset_1["test"].train_test_split(test_size=0.5)

split_dataset = {
    'train': dataset_1['train'],
    'validation': dataset_2['train'],
    'test': dataset_2['test']
}

new_dataset = DatasetDict(split_dataset)

 
print(new_dataset["train"][345]["messages"])
 
# save datasets to disk
new_dataset["train"].to_json("Data/train_dataset.json", orient="records")
new_dataset["validation"].to_json("Data/validation_dataset.json", orient="records")
new_dataset["test"].to_json("Data/test_dataset.json", orient="records")

[{'content': 'You are an emotional conversationalist. Users will talk to you in English and you will generate a response.', 'role': 'system'}, {'content': 'For real!  It is worse for me because no one else is here. :(', 'role': 'user'}, {'content': 'Yea, then you hae to do it all yourself', 'role': 'assistant'}]


Creating json from Arrow format: 100%|██████████| 52/52 [00:01<00:00, 43.84ba/s]
Creating json from Arrow format: 100%|██████████| 7/7 [00:00<00:00, 53.34ba/s]
Creating json from Arrow format: 100%|██████████| 7/7 [00:00<00:00, 52.27ba/s]


2241570

## Training

In [5]:
model_id = "Qwen/Qwen1.5-0.5B-Chat" # or `mistralai/Mistral-7B-v0.1`
 
# BitsAndBytesConfig int-4 config
""" bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_use_double_quant=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=torch.bfloat16
) """
 
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    #attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    #quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right' # to prevent warnings

In [6]:
# Load jsonl data from disk
dataset = load_dataset("json", data_files="Data/train_dataset.json", split="train")

Generating train split: 51708 examples [00:00, 389835.39 examples/s]


In [None]:
# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

In [None]:
args = TrainingArguments(
    output_dir="code-Qwen1.5", # directory to save and repository id
    num_train_epochs=1,                     # number of training epochs
    per_device_train_batch_size=3,          # batch size per device during training
    gradient_accumulation_steps=3,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    # push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)

In [10]:
max_seq_length = 512 # max sequence length for model and packing of the dataset
 
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)





Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Generating train split: 6954 examples [00:09, 720.67 examples/s]


In [None]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

## Inference

In [None]:
peft_model_id = "./model"
 
# Load Model with PEFT adapter
model_2 = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="auto",
  torch_dtype=torch.float16
)
tokenizer_2 = AutoTokenizer.from_pretrained("./model")
# load into pipeline 
pipe = pipeline("text-generation", model=model_2, tokenizer=tokenizer_2)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausa

In [None]:
# Load our test dataset
eval_dataset = load_dataset("json", data_files="Data/test_dataset.json", split="train")
rand_idx = randint(0, len(eval_dataset))
 
# Test on sample
prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
 
print(f"Query:\n{eval_dataset[rand_idx]['messages'][1]['content']}")
print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][2]['content']}")
print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")

Generating train split: 6464 examples [00:00, 345393.15 examples/s]


Query:
Are you sure? I think its bad
Original Answer:
No, only one extra day isn't bad.
Generated Answer:
I don't know, but I'm not sure if it's really bad or just a bad feeling.  I feel like I should be able to do it now, but I'm worried about the consequences.
