In [1]:
%%capture
!pip install -q -U "protobuf==3.20.3" "transformers>=4.51.0" datasets accelerate peft trl wandb sacrebleu

In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from tqdm import tqdm
from pathlib import Path

BASE_MODEL = "Qwen/Qwen3-1.7B"
CKPT_DIR = Path("/kaggle/input/stage-2/checkpoint-774")
MERGED_DIR = "Qwen3-1.7B-medical"

assert os.path.isdir(CKPT_DIR), f"Không thấy checkpoint dir: {CKPT_DIR}"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    dtype=torch.bfloat16,
    attn_implementation="sdpa",
)

peft_model = PeftModel.from_pretrained(base, CKPT_DIR)
merged = peft_model.merge_and_unload()

merged.config.use_cache = True
merged.eval()

merged.save_pretrained(MERGED_DIR, safe_serialization=True)
tokenizer.save_pretrained(MERGED_DIR)

print("Saved merged model to:", MERGED_DIR)

2025-12-17 09:53:59.489637: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765965239.505158     399 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765965239.509794     399 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Saved merged model to: Qwen3-1.7B-medical


In [3]:
import os
import wandb
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["WANDB_PROJECT"] = "Qwen3-1.7B-LoRA"
os.environ["WANDB_LOG_MODEL"] = "false"

wandb.login(key="17cf64fcdf2e849c5b569d29066ba2193798ba02")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmightdung7105[0m ([33mmightdung7105-vietnam-national-university-hanoi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
run = wandb.init(
    project=os.environ["WANDB_PROJECT"],
    name="Qwen3-1.7B-LoRA",
    notes="LoRA SFT VI->EN",
)

In [5]:
import unicodedata
from datasets import Dataset, concatenate_datasets # Import thêm concatenate_datasets

# (Giữ nguyên các hàm clean_line, read_lines, group_lines_into_segments, build_segment_dataset)
# ... [VUI LÒNG GIỮ NGUYÊN HOẶC CHẠY LẠI ĐỊNH NGHĨA CÁC HÀM NÀY TỪ CÁC BƯỚC TRƯỚC] ...
def clean_line(s: str) -> str:
    if s is None:
        return ""
    s = s.replace("\ufeff", "")
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("\u2013", "-").replace("\u2014", "-")
    s = s.replace('\\u200b', '').replace("\u200e", "")
    s = s.replace("“", '\"').replace("”", '\"')
    s = s.replace("‘", "'").replace("’", "'")
    s = s.replace("‟", '\"')
    s = s.replace("‛", "'")
    s = " ".join(s.strip().split())
    return s

def read_lines(path: str):
    with open(path, "r", encoding="utf-8") as f:
        return [clean_line(line.rstrip("\n")) for line in f]

def group_lines_into_segments(lines: list[str], N: int = 10) -> list[str]:
    segments = []
    for i in range(0, len(lines), N):
        chunk = lines[i:i + N]
        segments.append("\n".join(chunk))
    return segments

def build_segment_dataset(en_path: str, vi_path: str, N_lines: int = 10):
    en_lines = read_lines(en_path)
    vi_lines = read_lines(vi_path)
    
    if len(en_lines) != len(vi_lines):
        raise ValueError(f"Line count mismatch: en={len(en_lines)} vi={len(vi_lines)}")
        
    total_lines = len(en_lines)
    aligned_total = (total_lines // N_lines) * N_lines
    
    en_segments = group_lines_into_segments(en_lines[:aligned_total], N=N_lines)
    vi_segments = group_lines_into_segments(vi_lines[:aligned_total], N=N_lines)
    
    dropped_truncation = total_lines - aligned_total 

    rows = []
    dropped_empty = 0
    
    for i, (src, tgt) in enumerate(zip(en_segments, vi_segments)):
        if not src.strip() or not tgt.strip():
            dropped_empty += 1
            continue
        rows.append({"en": src, "vi": tgt, "idx": i * N_lines}) 

    ds = Dataset.from_list(rows)
    return ds, dropped_empty * N_lines + dropped_truncation
# -----------------------------------------------------------------------------


# 1. Tải và tiền xử lý ngữ liệu GỐC (Train/Validation Split Source)
raw_ds_original, dropped_orig = build_segment_dataset(
    "/kaggle/input/train-data/train.en.txt", 
    "/kaggle/input/train-data/train.vi.txt", 
    N_lines=10
)
print("Total Original segments:", len(raw_ds_original), "Dropped lines:", dropped_orig)

split = raw_ds_original.train_test_split(test_size=0.01, seed=42)
train_ds_orig = split["train"]
eval_ds = split["test"]

try:
    raw_ds_add, dropped_add = build_segment_dataset(
        "/kaggle/input/train-add/train_en_add.txt", 
        "/kaggle/input/train-data/train.vi.txt", # CẦN ĐẢM BẢO FILE NÀY TỒN TẠI
        N_lines=10
    )
    print("Total Added segments:", len(raw_ds_add), "Dropped lines:", dropped_add)
except FileNotFoundError:
    print("WARNING: train_vi_add.txt không được tìm thấy. Chỉ sử dụng dữ liệu gốc.")
    raw_ds_add = None

if raw_ds_add:
    train_ds = concatenate_datasets([train_ds_orig, raw_ds_add])
else:
    train_ds = train_ds_orig

print("\n--- FINAL DATASET SIZES ---")
print("Train segments (Original + Added):", len(train_ds))
print("Eval segments (Original only):", len(eval_ds))

Total Original segments: 50000 Dropped lines: 0
Total Added segments: 50000 Dropped lines: 0

--- FINAL DATASET SIZES ---
Train segments (Original + Added): 99500
Eval segments (Original only): 500


In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from pathlib import Path

MODEL_NAME = Path("/kaggle/working/Qwen3-1.7B-medical")

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B", use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    dtype=torch.bfloat16,
    attn_implementation="sdpa",
)

system_prompt = (
    "You are a medical translation engine. Translate from English to Vietnamese. "
    "Rules: Keep abbreviations as-is (e.g., V.A, V.a, PTA, Type B/C/As). "
    "Preserve all numbers, %, ±, ≥, ≤,... parentheses, and punctuation. "
    "Do not add explanations. Output only the Vietnamese translation."
    "Prioritize medical accuracy and use standard Vietnamese medical terminology."
)

In [7]:
def to_prompt_completion(ex):
    system_msg = {"role": "system", "content": system_prompt}
    user_msg   = {"role": "user", "content": f"Translate English to Vietnamese:\n{ex['en']}"}
    asst_msg   = {"role": "assistant", "content": ex["vi"]}

    prompt_text = tokenizer.apply_chat_template(
        [system_msg, user_msg],
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    full_text = tokenizer.apply_chat_template(
        [system_msg, user_msg, asst_msg],
        tokenize=False,
        add_generation_prompt=False,
        enable_thinking=False,
    )

    completion_text = full_text[len(prompt_text):]
    return {"prompt": prompt_text, "completion": completion_text, "en": ex["en"], "vi": ex["vi"]}

columns_to_remove = [col for col in train_ds.column_names if col in ['idx']]

train_pc = train_ds.map(to_prompt_completion, remove_columns=columns_to_remove)
eval_pc  = eval_ds.map(to_prompt_completion,  remove_columns=columns_to_remove)

Map:   0%|          | 0/99500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [8]:
print(train_pc[0]["prompt"][:]) 
print(train_pc[0]["completion"][:])

<|im_start|>system
You are a medical translation engine. Translate from English to Vietnamese. Rules: Keep abbreviations as-is (e.g., V.A, V.a, PTA, Type B/C/As). Preserve all numbers, %, ±, ≥, ≤,... parentheses, and punctuation. Do not add explanations. Output only the Vietnamese translation.Prioritize medical accuracy and use standard Vietnamese medical terminology.<|im_end|>
<|im_start|>user
Translate English to Vietnamese:
Diuretic-induced hyperuricemia without gout does not require treatment or discontinuation of the diuretic.
Diuretics may slightly increase mortality in patients with a history of heart failure who do not have pulmonary congestion, particularly in those who are also taking an ACE inhibitor or angiotensin II receptor blocker and who do not drink at least 1400 mL (48 oz) of fluid daily.
The increased mortality is probably related to diuretic-induced hyponatremia and hypotension.
Adrenergic modifiers Adrenergic modifiers include central alpha-2-agonists, postsynaptic

In [9]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

In [10]:
import math
import random
import torch
import wandb
from transformers import TrainerCallback
from trl import SFTTrainer, SFTConfig
from sacrebleu.metrics import BLEU

bleu_metric = BLEU(tokenize="none", effective_order=True)

class PPLCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if not logs:
            return
        if "loss" in logs and logs["loss"] is not None:
            loss = float(logs["loss"])
            try:
                logs["ppl"] = math.exp(loss)
            except OverflowError:
                logs["ppl"] = float("inf")

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if not metrics:
            return
        if "eval_loss" in metrics and metrics["eval_loss"] is not None:
            loss = float(metrics["eval_loss"])
            try:
                metrics["eval_ppl"] = math.exp(loss)
            except OverflowError:
                metrics["eval_ppl"] = float("inf")

class BLEUCallback(TrainerCallback):
    def __init__(self, tokenizer, eval_ds, system_prompt, n_samples=100):
        self.tokenizer = tokenizer
        self.eval_ds = eval_ds
        self.system_prompt = system_prompt
        self.n_samples = n_samples

    def translate_one(self, model, en: str, max_new_tokens=2048):
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": f"Translate English to Vietnamese:\n{en}"},
        ]
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        inputs = self.tokenizer(prompt, return_tensors="pt").to(model.device)
        
        with torch.no_grad():
            model.eval() 
            out = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=0.2,
                top_p=0.9,
            )
        return self.tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
    
    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if state.is_local_process_zero:
            model = kwargs['model']
            data = random.sample(list(self.eval_ds), k=min(self.n_samples, len(self.eval_ds)))

            refs = []
            hyps = []
            
            unwrapped_model = model.module if hasattr(model, "module") else model

            for ex in data:
                pred = self.translate_one(unwrapped_model, ex["en"])
                hyps.append(pred)
                refs.append(ex["vi"])

            # Tính BLEU score
            score = bleu_metric.corpus_score(hyps, [refs]).score

            # Log kết quả lên W&B
            if args.report_to and "wandb" in args.report_to:
                wandb.log({
                    "eval/bleu": score,
                    "eval/bleu_n_samples": len(data),
                }, step=state.global_step)

            if metrics is not None:
                metrics["eval_bleu"] = score
            print(f"\nBLEU (tokenize=none) on {len(data)} samples: {score:.2f}")

        return control

In [11]:
args = SFTConfig(
    output_dir="Qwen3-1.7B-medical-LoRA",
    max_length=2048,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,

    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    
    learning_rate=1e-4,
    num_train_epochs=1,
    warmup_ratio=0.03,
    logging_steps=5,
    eval_steps=100,
    save_steps=100,
    save_total_limit=4,
    eval_strategy="steps",
    save_strategy="steps",
    

    bf16=True,
    completion_only_loss=True,
    packing=False,
    group_by_length=False,

    report_to=["wandb"],
    run_name=run.name,
)

bleu_callback = BLEUCallback(
    tokenizer=tokenizer,
    eval_ds=eval_pc, 
    system_prompt=system_prompt,
    n_samples=10,
)

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_pc,
    eval_dataset=eval_pc,
    peft_config=lora_config,
    processing_class=tokenizer,
    callbacks=[PPLCallback(), bleu_callback], 
)

trainer.train()

Adding EOS to train dataset:   0%|          | 0/99500 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/99500 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/99500 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy,Ppl,Bleu
100,0.8834,1.041244,1.834494,5670209.0,0.766798,2.832739,21.094631
200,0.8124,0.943802,1.809879,11349846.0,0.784024,2.569732,23.478552
300,0.7602,0.902939,1.761972,17021396.0,0.792294,2.466842,24.671251
400,0.7216,0.874753,1.796619,22732438.0,0.796803,2.398282,37.444362
500,0.7133,0.853596,1.782468,28412857.0,0.800852,2.348075,43.51331
600,0.7023,0.840725,1.77248,34096277.0,0.802966,2.318047,41.977407
700,0.6606,0.826913,1.793714,39789516.0,0.804778,2.28625,19.75565
800,0.6401,0.818832,1.777114,45476834.0,0.807018,2.26785,44.282744
900,0.6708,0.808511,1.798132,51156411.0,0.808014,2.244562,39.913957
1000,0.6845,0.801396,1.816726,56851859.0,0.809115,2.22865,44.424218



BLEU (tokenize=none) on 10 samples: 21.09

BLEU (tokenize=none) on 10 samples: 23.48

BLEU (tokenize=none) on 10 samples: 24.67

BLEU (tokenize=none) on 10 samples: 37.44

BLEU (tokenize=none) on 10 samples: 43.51

BLEU (tokenize=none) on 10 samples: 41.98

BLEU (tokenize=none) on 10 samples: 19.76

BLEU (tokenize=none) on 10 samples: 44.28

BLEU (tokenize=none) on 10 samples: 39.91

BLEU (tokenize=none) on 10 samples: 44.42

BLEU (tokenize=none) on 10 samples: 30.18

BLEU (tokenize=none) on 10 samples: 39.91

BLEU (tokenize=none) on 10 samples: 35.74

BLEU (tokenize=none) on 10 samples: 41.62

BLEU (tokenize=none) on 10 samples: 38.84


TrainOutput(global_step=1555, training_loss=0.7185183158641459, metrics={'train_runtime': 13433.6373, 'train_samples_per_second': 7.407, 'train_steps_per_second': 0.116, 'total_flos': 1.145411217321984e+18, 'train_loss': 0.7185183158641459, 'epoch': 1.0})