In [1]:
import os
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import json

In [2]:
output_dir = os.path.join(os.path.dirname(os.getcwd()), 'models', 'checkpoint-36')

In [3]:
# Tải model và tokenizer
model = AutoModelForCausalLM.from_pretrained(output_dir).to('cuda')
tokenizer = AutoTokenizer.from_pretrained(output_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def create_dataset_from_files(train_json_path, test_json_path):
    """
    Hàm nhận vào đường dẫn đến các tệp JSON chứa train và test, sau đó tạo DatasetDict
    theo cấu trúc messages với role "system", "user", và "assistant".
    
    Args:
        train_json_path (str): Đường dẫn đến tệp JSON chứa dữ liệu huấn luyện.
        test_json_path (str): Đường dẫn đến tệp JSON chứa dữ liệu kiểm tra.
        
    Returns:
        DatasetDict: Bao gồm tập train và test dưới dạng Hugging Face Dataset.
    """
    # Bước 1: Đọc tệp JSON train và test
    with open(train_json_path, 'r', encoding='utf-8') as f:
        train_data = json.load(f)
    
    with open(test_json_path, 'r', encoding='utf-8') as f:
        test_data = json.load(f)

    # Khởi tạo danh sách train_data và test_data để lưu cặp input-output
    train_samples = []
    test_samples = []

    # Bước 2: Tiền xử lý và tách các đoạn hội thoại thành các cặp input-output cho tập train
    for item in train_data:
        previous_context = item["previous_context"]
        topic = item["topic"]
        language = item["language"]
        conversation = item["conversation"]

        for i in range(len(conversation) - 1):
            if conversation[i]["speaker"] == "David" and conversation[i + 1]["speaker"] == "Choi":
                # Input: Tạo đoạn hội thoại phù hợp với "user"
                input_message = (f"Previous context: {previous_context}\n"
                                 f"Topic: {topic}\n"
                                 f"Language: {language}\n"
                                 f"David's Emotion: {conversation[i]['emotion']}\n"
                                 f"Choi's Role: {conversation[i+1]['role']}\n"
                                 f"Choi's Emotion: {conversation[i+1]['emotion']}\n")

                # Output: Cả hội thoại giữa David và Choi
                output_message = (f"David's Text: {conversation[i]['text']}\n"
                                  f"Choi's Text: {conversation[i+1]['text']}")

                # Tạo cấu trúc messages
                conversation_messages = [
                    {"role": "system", "content": "Your task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion."},
                    {"role": "user", "content": input_message},
                    {"role": "assistant", "content": output_message}
                ]

                # Thêm cặp input-output vào train_data
                train_samples.append({
                    "messages": conversation_messages
                })

    # Tương tự cho tập test
    for item in test_data:
        previous_context = item["previous_context"]
        topic = item["topic"]
        language = item["language"]
        conversation = item["conversation"]

        for i in range(len(conversation) - 1):
            if conversation[i]["speaker"] == "David" and conversation[i + 1]["speaker"] == "Choi":
                # Input: Tạo đoạn hội thoại phù hợp với "user"
                input_message = (f"Previous context: {previous_context}\n"
                                 f"Topic: {topic}\n"
                                 f"Language: {language}\n"
                                 f"David's Emotion: {conversation[i]['emotion']}\n"
                                 f"Choi's Role: {conversation[i+1]['role']}\n"
                                 f"Choi's Emotion: {conversation[i+1]['emotion']}\n")

                # Output: Cả hội thoại giữa David và Choi
                output_message = (f"David's Text: {conversation[i]['text']}\n"
                                  f"Choi's Text: {conversation[i+1]['text']}")

                # Tạo cấu trúc messages
                conversation_messages = [
                    {"role": "system", "content": "Your task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion."},
                    {"role": "user", "content": input_message},
                    {"role": "assistant", "content": output_message}
                ]

                # Thêm cặp input-output vào test_data
                test_samples.append({
                    "messages": conversation_messages
                })

    # Bước 3: Tạo Dataset cho train và test bằng DatasetDict
    dataset_dict = DatasetDict({
        "train": Dataset.from_list(train_samples),
        "test": Dataset.from_list(test_samples)
    })

    return dataset_dict


In [5]:
data_path = (os.path.join(os.path.dirname(os.getcwd()), 'data', 'train_data.json'))
test_path = (os.path.join(os.path.dirname(os.getcwd()), 'data', 'test_data.json'))

In [6]:
dataset = create_dataset_from_files(data_path, test_path)
dataset

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 100
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 10
    })
})

In [32]:
test_text = text = tokenizer.apply_chat_template(
    dataset['train']['messages'][2],
    tokenize=False,
    add_generation_prompt=True,
)
test_text = '\n'.join(test_text.split("\n")[:-4]) + '\n'
test_text

"<|im_start|>system\nYour task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion.<|im_end|>\n<|im_start|>user\nPrevious context: David was reminiscing about his early days in Korea, mentioning how he felt a bit lost at first but eventually found his footing.\nTopic: David’s early days in Korea\nLanguage: Korean\nDavid's Emotion: warm\nChoi's Role: Motivational speaker\nChoi's Emotion: comforting\n<|im_end|>\n<|im_start|>assistant\n"

In [33]:
tokenizer_test_text = tokenizer(test_text, return_tensors="pt", padding=True, truncation=True, max_length=1024)

In [34]:
tokenizer_test_text.to(model.device)

{'input_ids': tensor([[151644,   8948,    198,   7771,   3383,    374,    311,   6923,    264,
          10435,   1948,   6798,    323,  86573,     13,   6798,    686,   2548,
           4755,    476,   3061,   3118,    389,    806,  21261,    323,   2266,
             11,    323,  86573,    686,   5889,  34901,   4092,    311,    806,
           3476,    323,  19772,     13, 151645,    198, 151644,    872,    198,
          21291,   2266,     25,   6798,    572,  42550,  52754,    911,    806,
           4124,   2849,    304,  11862,     11,  44291,   1246,    566,   6476,
            264,   2699,   5558,    518,   1156,    714,   9583,   1730,    806,
          73403,    624,  26406,     25,   6798,    748,   4124,   2849,    304,
          11862,    198,  13806,     25,  16134,    198,  22286,    594,   5748,
           5956,     25,   8205,    198,   1143,   6728,    594,  15404,     25,
          18977,    344,   1663,  18601,    198,   1143,   6728,    594,   5748,
           595

In [35]:
output = model.generate(**tokenizer_test_text, max_length=1024)

In [36]:
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print("Output:", decoded_output)

Output: system
Your task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion.
user
Previous context: David was reminiscing about his early days in Korea, mentioning how he felt a bit lost at first but eventually found his footing.
Topic: David’s early days in Korea
Language: Korean
David's Emotion: warm
Choi's Role: Motivational speaker
Choi's Emotion: comforting

assistant
David: "한국에서 처음부터 시작했을 때는 좀 허무하고 막막했어요. 그때부터 점점 적응하고 있더라고요."
Choi: "그런 어려움을 겪은 당신이지만, 그것이 결국 당신의 성장을 만들어냈어요. 저는 여러분에게도 그런 경험을 추천하고 싶어요. 당신이 지금 느끼는 허무함과 막막함이 당신의 성장의 단계를 지나치게 빨리 간주하지 말고, 그 흔적들이 당신의 성장을 통해 큰 가치를 가질 수 있다는 것을 기억하세요."
