In [None]:
!pip install -q unsloth==2025.4.7 datasets==3.5.1

# Process-level environment switches, applied before torch is imported below.
import os

os.environ.update(
    {
        # Expose only the first GPU to this process.
        "CUDA_VISIBLE_DEVICES": "0",
        # Flags the original author set with the intent of turning Triton off.
        "TRITON_DISABLE": "1",
        "TRITON_DISABLE_LINE_INFO": "1",
    }
)

import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import login, create_repo


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m218.5/218.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.4/152.4 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.5/31.5 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0

In [None]:
# Authenticate with the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable instead of
# being hard-coded, so the secret never lives in the notebook source or its
# history. When the variable is unset, login() receives None and falls back
# to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# Base checkpoint that is fine-tuned below.
model_name = "unsloth/Llama-3.1-8B-Instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Quantization settings passed to the model loader below
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,   # load the weights in 4-bit precision
    bnb_4bit_quant_type="nf4", # "normal float" 4-bit quantization type
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16 # computations run in 16-bit float
)

# Load the base model referenced by model_name with the quantization config above
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
    torch_dtype=torch.float16 # tensors are kept as 16-bit floats
)

# LoRA configuration: rank-16 adapters on the listed projection modules
lora_config = LoraConfig(
    r=16,
    lora_alpha=16, # scale factor
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    task_type="CAUSAL_LM",
    use_rslora=True
)
# Wrap the base model with the adapters and report how many parameters train
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Two training sources: a dataset fetched by name and a JSON file on local disk
dataset_name = "5CD-AI/Vietnamese-Multi-turn-Chat-Alpaca"
hf_dataset = load_dataset(dataset_name, split="train")
local_dataset = load_dataset("json", data_files="/kaggle/input/formate-data/formated_data.json", split="train") # local data

# System prompts: one for the general chat data, one for the sign-language
# (VSL) to spoken-language data.
SYS_INSTRUCT_FRIENDLY = "Bạn là một trợ lý AI thân thiện, hãy trả lời bằng tiếng Việt."
SYS_INSTRUCT_EXPERT = "Bạn là một chuyên gia AI, hãy chuyển câu trong ngôn ngữ ký hiệu Việt Nam VSL sang ngôn ngữ nói."


def convert_to_chat_format(conversations, is_expert=False):
    """Turn a list of ``{"from", "value"}`` turns into chat messages.

    Args:
        conversations: Iterable of dicts; each must have a ``"from"`` key and
            a ``"value"`` key.
        is_expert: When true, the expert system prompt is used; otherwise the
            friendly one.

    Returns:
        A list of ``{"role", "content"}`` dicts. The first entry is the system
        message; every turn whose ``"from"`` equals ``"human"`` becomes a
        ``"user"`` message and every other turn becomes ``"assistant"``.
    """
    if is_expert:
        system_prompt = SYS_INSTRUCT_EXPERT
    else:
        system_prompt = SYS_INSTRUCT_FRIENDLY

    system_turn = {"role": "system", "content": system_prompt}
    chat_turns = [
        {
            "role": "user" if turn["from"] == "human" else "assistant",
            "content": turn["value"],
        }
        for turn in conversations
    ]
    return [system_turn, *chat_turns]
    

def format_prompt(example, is_expert=False):
    """Render one dataset example as a single chat-template string.

    Args:
        example: Mapping with a ``"conversations"`` entry, which is handed to
            ``convert_to_chat_format``.
        is_expert: Forwarded to ``convert_to_chat_format`` to select the
            system prompt.

    Returns:
        ``{"text": <rendered string>}`` where the string comes from the
        tokenizer's chat template, untokenized and without a trailing
        generation prompt.
    """
    chat = convert_to_chat_format(example["conversations"], is_expert=is_expert)
    rendered = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

# Render every example to text, giving each source its own system prompt:
# the friendly prompt for the hub dataset, the expert prompt for the local one.
hf_dataset = hf_dataset.map(format_prompt, fn_kwargs={"is_expert": False})
local_dataset = local_dataset.map(format_prompt, fn_kwargs={"is_expert": True})

# Join both sources into one training set once the prompts are applied.
combined_dataset = concatenate_datasets([hf_dataset, local_dataset])


def tokenize(example):
    """Tokenize chat-formatted text and build loss labels.

    Works both for a single example (``example["text"]`` is a string) and for
    a batch as passed by ``Dataset.map(..., batched=True)`` (a list of
    strings).

    Args:
        example: Mapping with a ``"text"`` entry holding one string or a list
            of strings.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels`` (plus any
        other fields the tokenizer emits) as plain Python lists. Every
        sequence is padded or truncated to 256 tokens. ``labels`` copies
        ``input_ids`` except that padded positions are ``-100`` so the loss
        ignores them.
    """
    texts = example["text"]
    is_single = isinstance(texts, str)

    tokens = tokenizer(
        [texts] if is_single else texts,
        truncation=True,
        max_length=256,
        padding="max_length",
        # The text was produced by the chat template, which already inserts
        # the special tokens; adding them again would duplicate the BOS token.
        add_special_tokens=False,
    )

    # Mask by attention_mask rather than by pad-token id: the pad token is the
    # EOS token, and genuine end-of-turn tokens must stay in the labels.
    tokens["labels"] = [
        [tok if keep else -100 for tok, keep in zip(ids, mask)]
        for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]

    # No tensor conversion and no squeeze(): squeezing a (batch, seq) tensor
    # drops the batch dimension whenever a map batch contains one example.
    if is_single:
        return {key: rows[0] for key, rows in tokens.items()}
    return dict(tokens)

# Tokenize in batches and drop every pre-existing column so only the tokenizer outputs remain
combined_dataset = combined_dataset.map(tokenize, batched=True, remove_columns=combined_dataset.column_names)

# Trainer subclass with an explicit next-token loss
class CustomTrainer(Trainer):
    """Trainer that computes the causal language-modeling loss from the logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute next-token cross-entropy for one batch.

        Args:
            model: The model being trained; called with ``input_ids`` and
                ``attention_mask``.
            inputs: Batch dict containing ``input_ids``, ``attention_mask``
                and ``labels``. Label positions equal to ``-100`` are ignored.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Accepted for compatibility with the caller and unused.

        Returns:
            The scalar loss, or a ``(loss, outputs)`` tuple.
        """
        # NOTE(review): some transformers versions pass num_items_in_batch via
        # **kwargs for gradient-accumulation-aware normalisation; it is
        # ignored here, as it was before - confirm for the installed version.
        labels = inputs["labels"]

        # Labels are not forwarded to the model: the loss is computed below,
        # so the model's own loss computation would be discarded work.
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        )
        logits = outputs.logits

        # The logits at position t predict the token at position t + 1, so
        # drop the last logit and the first label to align them. Without this
        # shift the model is trained to echo its current input token.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        loss = torch.nn.functional.cross_entropy(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
            ignore_index=-100,
        )
        return (loss, outputs) if return_outputs else loss

# Training hyper-parameters: 1000 optimizer steps, fp16, checkpoints kept in ./llama3-chat-t4
training_args = TrainingArguments(
    per_device_train_batch_size=2, 
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    warmup_ratio=0.05, # learning-rate warm-up over 0.05 * max_steps = 50 steps
    logging_steps=100, # log every 100 steps
    save_strategy="steps", # save checkpoints on a step schedule
    save_steps=50,
    output_dir="./llama3-chat-t4",
    save_total_limit=2,
    max_steps=1000, 
    report_to="none",
    fp16=True,
    bf16=False, # bf16 stays off (its default value); fp16 is used instead
)
# Build the trainer and run training
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=combined_dataset,
    tokenizer=tokenizer,
)
trainer.train()

# Merge the LoRA adapters into the base model and push to Hugging Face
# NOTE(review): the base model was loaded 4-bit quantized earlier in this
# notebook - confirm that merging into quantized weights gives acceptable precision.
model = model.merge_and_unload()

# Create the target repository on Hugging Face (no error if it already exists)
repo_name = "VyDat/llama3-8b-vietnamese-multi"
create_repo(repo_id=repo_name, private=False, exist_ok=True)

# Push the model and the tokenizer
model.push_to_hub(repo_name, commit_message="Merge & push fine-tuned LLaMA3-8B")
tokenizer.push_to_hub(repo_name, commit_message="Push tokenizer")