In [1]:
import os
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import json

In [7]:
# Checkpoint directory: <parent of the current working directory>/models/checkpoint-150.
project_root = os.path.dirname(os.getcwd())
output_dir = os.path.join(project_root, 'models', 'checkpoint-150')

In [8]:
# Load the model and tokenizer from the checkpoint directory.
# Pick the device at runtime so the cell still runs (slowly) on a CPU-only
# machine instead of failing on a hard-coded 'cuda' target.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForCausalLM.from_pretrained(output_dir).to(device)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# System instruction attached to every generated sample.
_SYSTEM_PROMPT = "Your task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion."


def _build_samples(records):
    """Convert raw conversation records into chat-style samples.

    Each record must provide the keys "previous_context", "topic", "language"
    and "conversation" (a list of turns). Every adjacent pair of turns where
    "David" speaks and "Choi" answers yields one sample.

    Args:
        records (list[dict]): Parsed JSON records.

    Returns:
        list[dict]: One ``{"messages": [...]}`` entry per David -> Choi pair,
        with a system, a user and an assistant message.

    Raises:
        KeyError: If a record or a matching turn lacks an expected key.
    """
    samples = []
    for item in records:
        previous_context = item["previous_context"]
        topic = item["topic"]
        language = item["language"]
        conversation = item["conversation"]

        # Look at each turn together with the turn that follows it.
        for david_turn, choi_turn in zip(conversation, conversation[1:]):
            if david_turn["speaker"] != "David" or choi_turn["speaker"] != "Choi":
                continue

            # User message: the conditioning metadata for the exchange.
            input_message = (f"Previous context: {previous_context}\n"
                             f"Topic: {topic}\n"
                             f"Language: {language}\n"
                             f"David's Emotion: {david_turn['emotion']}\n"
                             f"Choi's Role: {choi_turn['role']}\n"
                             f"Choi's Emotion: {choi_turn['emotion']}\n")

            # Assistant message: both lines of the exchange (David, then Choi).
            output_message = (f"David's Text: {david_turn['text']}\n"
                              f"Choi's Text: {choi_turn['text']}")

            samples.append({
                "messages": [
                    {"role": "system", "content": _SYSTEM_PROMPT},
                    {"role": "user", "content": input_message},
                    {"role": "assistant", "content": output_message},
                ]
            })
    return samples


def create_dataset_from_files(train_json_path, test_json_path):
    """
    Read the train and test JSON files and build a DatasetDict whose rows
    follow the chat "messages" structure with roles "system", "user" and
    "assistant".

    Args:
        train_json_path (str): Path to the JSON file holding the training data.
        test_json_path (str): Path to the JSON file holding the test data.

    Returns:
        DatasetDict: Contains the "train" and "test" splits as Hugging Face Datasets.
    """
    # Step 1: read both JSON files.
    with open(train_json_path, 'r', encoding='utf-8') as f:
        train_data = json.load(f)

    with open(test_json_path, 'r', encoding='utf-8') as f:
        test_data = json.load(f)

    # Step 2: both splits go through the same David -> Choi pair extraction.
    train_samples = _build_samples(train_data)
    test_samples = _build_samples(test_data)

    # Step 3: wrap the two sample lists in a DatasetDict.
    dataset_dict = DatasetDict({
        "train": Dataset.from_list(train_samples),
        "test": Dataset.from_list(test_samples)
    })

    return dataset_dict


In [10]:
# Train/test JSON files live in <parent of the current working directory>/data.
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')
data_path = os.path.join(data_dir, 'train_data.json')
test_path = os.path.join(data_dir, 'test_data.json')

In [11]:
# Build the train/test splits from the two JSON files and show their summary.
dataset = create_dataset_from_files(
    train_json_path=data_path,
    test_json_path=test_path,
)
dataset

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 400
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 30
    })
})

In [18]:
# Preview the chat messages of the first training sample.
dataset['train'][0]['messages']

[{'content': 'Your task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion.',
  'role': 'system'},
 {'content': "Previous context: David was confused about his role at his old job in the U.S. Choi was reminding him about the time he managed a successful project there.\nTopic: David's old job in the U.S.\nLanguage: Mixed English-Korean\nDavid's Emotion: humorous\nChoi's Role: Son\nChoi's Emotion: patient\n",
  'role': 'user'},
 {'content': "David's Text: Ah, that job! I was the big boss, right?  I think I even had a fancy office with a view.  Maybe I was even the CEO?  Or was that just in my dreams?\nChoi's Text: You were a manager, Dad.  You were in charge of a big team, and you did a great job.  Remember that project you led?  The one with the… what was it called again?  The…  Ah, the ‘Blue Sky’ project!  You were the one who came up with the ide

In [24]:
# Build the inference prompt from the non-assistant turns only, then let the
# chat template append the assistant generation header. Rendering the full
# conversation and slicing off a fixed number of lines afterwards is fragile:
# it only works when the reference answer spans an exact number of lines.
prompt_messages = [
    message
    for message in dataset['train'][0]['messages']
    if message['role'] != 'assistant'
]
test_text = tokenizer.apply_chat_template(
    prompt_messages,
    tokenize=False,
    add_generation_prompt=True,
)
test_text

"<|im_start|>system\nYour task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion.<|im_end|>\n<|im_start|>user\nPrevious context: David was confused about his role at his old job in the U.S. Choi was reminding him about the time he managed a successful project there.\nTopic: David's old job in the U.S.\nLanguage: Mixed English-Korean\nDavid's Emotion: humorous\nChoi's Role: Son\nChoi's Emotion: patient\n<|im_end|>\n<|im_start|>assistant\n"

In [25]:
# The prompt text already contains the chat template's special tokens, so ask
# the tokenizer not to add its own on top (avoids a duplicated BOS/EOS with
# tokenizers that insert them by default).
tokenizer_test_text = tokenizer(
    test_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=1024,
    add_special_tokens=False,
)

In [26]:
# Put the encoded prompt on the same device as the model; the returned
# encoding is shown as the cell output.
target_device = model.device
tokenizer_test_text.to(target_device)

{'input_ids': tensor([[151644,   8948,    198,   7771,   3383,    374,    311,   6923,    264,
          10435,   1948,   6798,    323,  86573,     13,   6798,    686,   2548,
           4755,    476,   3061,   3118,    389,    806,  21261,    323,   2266,
             11,    323,  86573,    686,   5889,  34901,   4092,    311,    806,
           3476,    323,  19772,     13, 151645,    198, 151644,    872,    198,
          21291,   2266,     25,   6798,    572,  21815,    911,    806,   3476,
            518,    806,   2310,   2618,    304,    279,    547,    808,     13,
          86573,    572,  62942,   1435,    911,    279,    882,    566,   8975,
            264,   6849,   2390,   1052,    624,  26406,     25,   6798,    594,
           2310,   2618,    304,    279,    547,    808,    624,  13806,     25,
          50168,   6364,  15843,  45195,    198,  22286,    594,   5748,   5956,
             25,  69846,    198,   1143,   6728,    594,  15404,     25,  11840,
            19

In [27]:
# Run generation on the encoded prompt.
# NOTE(review): per the transformers docs, `max_length` counts the prompt
# tokens as well as the generated ones — confirm this is intended rather than
# `max_new_tokens`.
max_total_tokens = 1024
output = model.generate(max_length=max_total_tokens, **tokenizer_test_text)

In [28]:
# Decode the first generated sequence back to text, dropping special tokens,
# and print it.
generated_ids = output[0]
decoded_output = tokenizer.decode(generated_ids, skip_special_tokens=True)

print(f"Output: {decoded_output}")

Output: system
Your task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion.
user
Previous context: David was confused about his role at his old job in the U.S. Choi was reminding him about the time he managed a successful project there.
Topic: David's old job in the U.S.
Language: Mixed English-Korean
David's Emotion: humorous
Choi's Role: Son
Choi's Emotion: patient

assistant
David's Text: Oh yeah, I remember that project! We were like, 'We can do this! We can make it happen!' But then, we hit a wall with those pesky regulations. It was like trying to build a house with all these weird rules, you know? It was a real mess!
Choi's Text: Yeah, Dad, those regulations were definitely a challenge. But you handled them so well. Remember how you convinced everyone to use those new software tools to streamline everything? That saved us a lot of time and