In [1]:
# !pip install peft accelerate bitsandbytes trl transformers datasets

In [2]:
import os
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import json

In [3]:
config_path = os.path.join(os.path.dirname(os.getcwd()), 'config', 'train_model_config.json')
with open(config_path, 'r') as file:
    config = json.load(file)

In [4]:
torch.cuda.is_available()

True

In [5]:
config

{'model_name': 'Qwen/Qwen2.5-3B-Instruct',
 'new_model_name': 'Qwen/Qwen2.5-3B-Instruct-quan',
 'lora_r': 64,
 'lora_alpha': 16,
 'lora_dropout': 0.1,
 'use_4bit': 1,
 'bnb_4bit_compute_dtype': 'float16',
 'bnb_4bit_quant_type': 'nf4',
 'use_nested_quant': 0,
 'num_train_epochs': 3,
 'per_device_train_batch_size': 4,
 'per_device_eval_batch_size': 4,
 'gradient_accumulation_steps': 2,
 'gradient_checkpointing': 1,
 'max_grad_norm': 0.3,
 'learning_rate': 0.0002,
 'weight_decay': 0.001,
 'optim': 'paged_adamw_32bit',
 'lr_scheduler_type': 'cosine',
 'max_steps': -1,
 'warmup_ratio': 0.03,
 'group_by_length': 1,
 'save_steps': 0,
 'logging_steps': 25,
 'max_seq_length': 1024,
 'packing': 0,
 'device_map': {'': 0}}

In [6]:
model_name = config['model_name']

# Fine-tuned model name
new_model = config['new_model_name']


In [7]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = config['lora_r']

# Alpha parameter for LoRA scaling
lora_alpha = config['lora_alpha']

# Dropout probability for LoRA layers
lora_dropout = config['lora_dropout']

In [8]:
################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = bool(config['use_4bit'])

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = config['bnb_4bit_compute_dtype']

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = config['bnb_4bit_quant_type']

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = bool(config['use_nested_quant'])


In [9]:
print(os.path.join(os.path.dirname(os.getcwd()), 'model'))

D:\illuminus_bot\model


In [10]:
################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = os.path.join(os.path.dirname(os.getcwd()), 'models')

# Number of training epochs
num_train_epochs = config['num_train_epochs']

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = config['per_device_train_batch_size']

# Batch size per GPU for evaluation
per_device_eval_batch_size = config['per_device_eval_batch_size']

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = config['gradient_accumulation_steps']

# Enable gradient checkpointing
gradient_checkpointing = bool(config['gradient_checkpointing'])

# Maximum gradient normal (gradient clipping)
max_grad_norm = config['max_grad_norm']

# Initial learning rate (AdamW optimizer)
learning_rate = config['learning_rate']

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = config['weight_decay']

# Optimizer to use
optim = config['optim']

# Learning rate schedule
lr_scheduler_type = config['lr_scheduler_type']

# Number of training steps (overrides num_train_epochs)
max_steps = config['max_steps']

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = config['warmup_ratio']

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = bool(config['group_by_length'])

# Save checkpoint every X updates steps
save_steps = config['save_steps']

# Log every X updates steps
logging_steps = config['logging_steps']

In [11]:
################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = config['max_seq_length']

# Pack multiple short examples in the same input sequence to increase efficiency
packing = bool(config['packing'])

# Load the entire model on the GPU 0
device_map = config['device_map']

In [12]:
data_path = (os.path.join(os.path.dirname(os.getcwd()), 'data', 'train_data.json'))
print(data_path)

D:\illuminus_bot\data\train_data.json


In [13]:
# Load dataset (D:\illuminus_bot\data\train_data.json)

# Đọc nội dung file train JSON từ hệ thống
with open(data_path, 'r', encoding='utf-8') as file:
    data_train = json.load(file)

In [14]:
len(data_train), data_train[0]

(400,
 {'previous_context': 'David was confused about his role at his old job in the U.S. Choi was reminding him about the time he managed a successful project there.',
  'topic': "David's old job in the U.S.",
  'language': 'Mixed English-Korean',
  'conversation': [{'speaker': 'David',
    'emotion': 'humorous',
    'text': 'Ah, that job! I was the big boss, right?  I think I even had a fancy office with a view.  Maybe I was even the CEO?  Or was that just in my dreams?'},
   {'speaker': 'Choi',
    'role': 'Son',
    'emotion': 'patient',
    'text': 'You were a manager, Dad.  You were in charge of a big team, and you did a great job.  Remember that project you led?  The one with the… what was it called again?  The…  Ah, the ‘Blue Sky’ project!  You were the one who came up with the idea, remember?'}]})

In [15]:
# Đọc nội dung file test JSON từ hệ thống
test_path = (os.path.join(os.path.dirname(os.getcwd()), 'data', 'test_data.json'))
with open(test_path, 'r', encoding='utf-8') as file:
    data_test = json.load(file)

In [16]:
len(data_test), data_test[0]

(30,
 {'previous_context': 'David was reminiscing about his early days in Korea, mentioning how he struggled to learn the language.',
  'topic': 'David’s early days in Korea',
  'language': 'English',
  'conversation': [{'speaker': 'David',
    'emotion': 'humorous',
    'text': "You know, I used to think 'kimchi' was a type of animal! I'd ask people, 'Where can I find the best kimchi farm?' They'd just stare at me like I was crazy."},
   {'speaker': 'Choi',
    'role': 'Doctor',
    'emotion': 'patient',
    'text': "That's a funny story, Dad. I can imagine how confusing it must have been at first. But you learned quickly, and now you're a true Korean."}]})

In [17]:
def create_dataset_from_files(train_json_path, test_json_path):
    """
    Hàm nhận vào đường dẫn đến các tệp JSON chứa train và test, sau đó tạo DatasetDict
    theo cấu trúc messages với role "system", "user", và "assistant".
    
    Args:
        train_json_path (str): Đường dẫn đến tệp JSON chứa dữ liệu huấn luyện.
        test_json_path (str): Đường dẫn đến tệp JSON chứa dữ liệu kiểm tra.
        
    Returns:
        DatasetDict: Bao gồm tập train và test dưới dạng Hugging Face Dataset.
    """
    # Bước 1: Đọc tệp JSON train và test
    with open(train_json_path, 'r', encoding='utf-8') as f:
        train_data = json.load(f)
    
    with open(test_json_path, 'r', encoding='utf-8') as f:
        test_data = json.load(f)

    # Khởi tạo danh sách train_data và test_data để lưu cặp input-output
    train_samples = []
    test_samples = []

    # Bước 2: Tiền xử lý và tách các đoạn hội thoại thành các cặp input-output cho tập train
    for item in train_data:
        previous_context = item["previous_context"]
        topic = item["topic"]
        language = item["language"]
        conversation = item["conversation"]

        for i in range(len(conversation) - 1):
            if conversation[i]["speaker"] == "David" and conversation[i + 1]["speaker"] == "Choi":
                # Input: Tạo đoạn hội thoại phù hợp với "user"
                input_message = (f"Previous context: {previous_context}\n"
                                 f"Topic: {topic}\n"
                                 f"Language: {language}\n"
                                 f"David's Emotion: {conversation[i]['emotion']}\n"
                                 f"Choi's Role: {conversation[i+1]['role']}\n"
                                 f"Choi's Emotion: {conversation[i+1]['emotion']}\n")

                # Output: Cả hội thoại giữa David và Choi
                output_message = (f"David's Text: {conversation[i]['text']}\n"
                                  f"Choi's Text: {conversation[i+1]['text']}")

                # Tạo cấu trúc messages
                conversation_messages = [
                    {"role": "system", "content": "Your task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion."},
                    {"role": "user", "content": input_message},
                    {"role": "assistant", "content": output_message}
                ]

                # Thêm cặp input-output vào train_data
                train_samples.append({
                    "messages": conversation_messages
                })

    # Tương tự cho tập test
    for item in test_data:
        previous_context = item["previous_context"]
        topic = item["topic"]
        language = item["language"]
        conversation = item["conversation"]

        for i in range(len(conversation) - 1):
            if conversation[i]["speaker"] == "David" and conversation[i + 1]["speaker"] == "Choi":
                # Input: Tạo đoạn hội thoại phù hợp với "user"
                input_message = (f"Previous context: {previous_context}\n"
                                 f"Topic: {topic}\n"
                                 f"Language: {language}\n"
                                 f"David's Emotion: {conversation[i]['emotion']}\n"
                                 f"Choi's Role: {conversation[i+1]['role']}\n"
                                 f"Choi's Emotion: {conversation[i+1]['emotion']}\n")

                # Output: Cả hội thoại giữa David và Choi
                output_message = (f"David's Text: {conversation[i]['text']}\n"
                                  f"Choi's Text: {conversation[i+1]['text']}")

                # Tạo cấu trúc messages
                conversation_messages = [
                    {"role": "system", "content": "Your task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion."},
                    {"role": "user", "content": input_message},
                    {"role": "assistant", "content": output_message}
                ]

                # Thêm cặp input-output vào test_data
                test_samples.append({
                    "messages": conversation_messages
                })

    # Bước 3: Tạo Dataset cho train và test bằng DatasetDict
    dataset_dict = DatasetDict({
        "train": Dataset.from_list(train_samples),
        "test": Dataset.from_list(test_samples)
    })

    return dataset_dict


In [18]:
dataset = create_dataset_from_files(data_path, test_path)
dataset

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 400
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 30
    })
})

In [19]:
dataset['train']['messages'][0]

[{'content': 'Your task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion.',
  'role': 'system'},
 {'content': "Previous context: David was confused about his role at his old job in the U.S. Choi was reminding him about the time he managed a successful project there.\nTopic: David's old job in the U.S.\nLanguage: Mixed English-Korean\nDavid's Emotion: humorous\nChoi's Role: Son\nChoi's Emotion: patient\n",
  'role': 'user'},
 {'content': "David's Text: Ah, that job! I was the big boss, right?  I think I even had a fancy office with a view.  Maybe I was even the CEO?  Or was that just in my dreams?\nChoi's Text: You were a manager, Dad.  You were in charge of a big team, and you did a great job.  Remember that project you led?  The one with the… what was it called again?  The…  Ah, the ‘Blue Sky’ project!  You were the one who came up with the ide

In [20]:
dataset['test']['messages'][0]

[{'content': 'Your task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion.',
  'role': 'system'},
 {'content': "Previous context: David was reminiscing about his early days in Korea, mentioning how he struggled to learn the language.\nTopic: David’s early days in Korea\nLanguage: English\nDavid's Emotion: humorous\nChoi's Role: Doctor\nChoi's Emotion: patient\n",
  'role': 'user'},
 {'content': "David's Text: You know, I used to think 'kimchi' was a type of animal! I'd ask people, 'Where can I find the best kimchi farm?' They'd just stare at me like I was crazy.\nChoi's Text: That's a funny story, Dad. I can imagine how confusing it must have been at first. But you learned quickly, and now you're a true Korean.",
  'role': 'assistant'}]

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [22]:
text = tokenizer.apply_chat_template(
    dataset['train']['messages'][0],
    tokenize=False,
    add_generation_prompt=True,
)
print('\n'.join(text.split("\n")[:-2]))

<|im_start|>system
Your task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion.<|im_end|>
<|im_start|>user
Previous context: David was confused about his role at his old job in the U.S. Choi was reminding him about the time he managed a successful project there.
Topic: David's old job in the U.S.
Language: Mixed English-Korean
David's Emotion: humorous
Choi's Role: Son
Choi's Emotion: patient
<|im_end|>
<|im_start|>assistant
David's Text: Ah, that job! I was the big boss, right?  I think I even had a fancy office with a view.  Maybe I was even the CEO?  Or was that just in my dreams?
Choi's Text: You were a manager, Dad.  You were in charge of a big team, and you did a great job.  Remember that project you led?  The one with the… what was it called again?  The…  Ah, the ‘Blue Sky’ project!  You were the one who came up with the idea, remember?<|i

In [23]:
def add_chat_template(messages):
    """
    Áp dụng template hội thoại cho các tin nhắn và trả về văn bản đã xử lý dưới dạng dictionary.
    
    Args:
        messages (list): Danh sách các tin nhắn từ cuộc hội thoại.
        
    Returns:
        dict: Trả về kết quả đã xử lý dưới dạng dictionary.
    """
    # Giả định tokenizer đã được khởi tạo và có phương thức apply_chat_template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    # Loại bỏ hai dòng cuối cùng và trả về dưới dạng dictionary
    return {"prompt": '\n'.join(text.split("\n")[:-2])}


In [24]:
text = add_chat_template(dataset['train']['messages'][2])
print(text)

{'prompt': "<|im_start|>system\nYour task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion.<|im_end|>\n<|im_start|>user\nPrevious context: David was reminiscing about his early days in Korea, mentioning how he felt a bit lost at first but eventually found his footing.\nTopic: David’s early days in Korea\nLanguage: Korean\nDavid's Emotion: warm\nChoi's Role: Motivational speaker\nChoi's Emotion: comforting\n<|im_end|>\n<|im_start|>assistant\nDavid's Text: 처음 한국에 왔을 때는 정말 낯설었어. 모든 것이 새롭고, 말도 잘 통하지 않아서 힘들었지. 하지만 시간이 지나면서 이곳 사람들의 따뜻함에 감동했어. 특히 이웃 사람들이 친절하게 도와주었던 기억이 아직도 생생해.\nChoi's Text: 아버지, 그때 힘든 시간을 이겨내고 한국에 뿌리를 내리신 모습이 정말 대단해요. 아버지의 강인함과 적응력은 저에게도 큰 영감을 주고 있어요. 아버지 덕분에 저도 어려움에 굴하지 않고 꿈을 향해 나아갈 수 있었어요.<|im_end|>"}


In [25]:
dataset

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 400
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 30
    })
})

In [26]:
data_add_prompt = dataset.map(lambda x: add_chat_template(x['messages']))

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [27]:
data_add_prompt

DatasetDict({
    train: Dataset({
        features: ['messages', 'prompt'],
        num_rows: 400
    })
    test: Dataset({
        features: ['messages', 'prompt'],
        num_rows: 30
    })
})

In [28]:
def tokenize_function(example):
    return tokenizer(example['prompt'], return_tensors="pt", padding=True, truncation=True, max_length=1024)

In [29]:
data_add_prompt['train'][0]['prompt']

"<|im_start|>system\nYour task is to generate a conversation between David and Choi. David will ask questions or talk based on his emotions and context, and Choi will respond appropriately according to his role and emotion.<|im_end|>\n<|im_start|>user\nPrevious context: David was confused about his role at his old job in the U.S. Choi was reminding him about the time he managed a successful project there.\nTopic: David's old job in the U.S.\nLanguage: Mixed English-Korean\nDavid's Emotion: humorous\nChoi's Role: Son\nChoi's Emotion: patient\n<|im_end|>\n<|im_start|>assistant\nDavid's Text: Ah, that job! I was the big boss, right?  I think I even had a fancy office with a view.  Maybe I was even the CEO?  Or was that just in my dreams?\nChoi's Text: You were a manager, Dad.  You were in charge of a big team, and you did a great job.  Remember that project you led?  The one with the… what was it called again?  The…  Ah, the ‘Blue Sky’ project!  You were the one who came up with the idea,

In [30]:
example = tokenize_function(data_add_prompt['train'][0])
example.input_ids.shape, example.attention_mask.shape

(torch.Size([1, 245]), torch.Size([1, 245]))

In [31]:
tokenized_datasets = data_add_prompt.map(tokenize_function, batched=True)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [32]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['messages', 'prompt', 'input_ids', 'attention_mask'],
        num_rows: 400
    })
    test: Dataset({
        features: ['messages', 'prompt', 'input_ids', 'attention_mask'],
        num_rows: 30
    })
})

In [33]:
avg_input_ids = sum(len(i['input_ids']) for i in tokenized_datasets['train']) / len(tokenized_datasets['train'])
avg_attention_mask = sum(len(i['attention_mask']) for i in tokenized_datasets['train']) / len(tokenized_datasets['train'])

print(f"Average input_ids length: {avg_input_ids}")
print(f"Average attention_mask length: {avg_attention_mask}")

Average input_ids length: 387.0
Average attention_mask length: 387.0


In [34]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [35]:
# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


In [36]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [37]:
torch.cuda.empty_cache()

In [38]:
# Xem kích thước mô hình sau lượng tử hóa
num_params = sum(p.numel() for p in model.parameters())
print(f"Tham số sau lượng tử hóa: {num_params / 1e9} tỷ tham số")

Tham số sau lượng tử hóa: 1.69867264 tỷ tham số


In [39]:
# Load Qwen tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

In [40]:
embedding_size = model.config.hidden_size
print(f"Embedding size: {embedding_size}")

Embedding size: 2048


In [41]:
# for name, module in model.named_modules(): #cHECK LAYER
#     print(name)

In [42]:
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    # target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

In [43]:
peft_config.target_modules

In [44]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",

    # Evaluation strategy and steps
    evaluation_strategy="steps",  # Đánh giá theo số bước
    eval_steps=10,               # Đánh giá sau mỗi 10 bước
    load_best_model_at_end=True,   # Lưu lại mô hình tốt nhất
)



In [45]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [46]:
# Train model
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss
10,No log,2.298718
20,No log,1.898304
30,2.342000,1.441201
40,2.342000,1.124336
50,1.251200,1.000671
60,1.251200,0.926149
70,1.251200,0.875006
80,0.911200,0.847079
90,0.911200,0.825175
100,0.819400,0.805585


TrainOutput(global_step=150, training_loss=1.1395888392130533, metrics={'train_runtime': 239.2107, 'train_samples_per_second': 5.016, 'train_steps_per_second': 0.627, 'total_flos': 7772716744704000.0, 'train_loss': 1.1395888392130533, 'epoch': 3.0})

In [47]:
# Save trained model
trainer.model.save_pretrained(new_model)

In [48]:
!nvidia-smi

Tue Sep 24 10:36:26 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 561.09                 Driver Version: 561.09         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090      WDDM  |   00000000:09:00.0  On |                  Off |
| 60%   53C    P2             58W /  500W |   23868MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                