In [1]:
%%capture
!pip install -q -U "protobuf==3.20.3" "transformers>=4.51.0" datasets accelerate peft trl wandb sacrebleu

In [2]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from tqdm import tqdm
from pathlib import Path

BASE_MODEL = "Qwen/Qwen3-1.7B"
CKPT_DIR = Path("/kaggle/input/stage-1/checkpoint-774")
MERGED_DIR = "Qwen3-1.7B-en2vi"

assert os.path.isdir(CKPT_DIR), f"Kh√¥ng th·∫•y checkpoint dir: {CKPT_DIR}"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    dtype=torch.bfloat16,
    attn_implementation="sdpa",
)

peft_model = PeftModel.from_pretrained(base, CKPT_DIR)
merged = peft_model.merge_and_unload()

merged.config.use_cache = True
merged.eval()

merged.save_pretrained(MERGED_DIR, safe_serialization=True)
tokenizer.save_pretrained(MERGED_DIR)

print("Saved merged model to:", MERGED_DIR)

2025-12-16 19:54:51.488898: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765914891.605981      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765914891.639547      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/622M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Saved merged model to: Qwen3-1.7B-en2vi


In [3]:
import os
import wandb

os.environ["WANDB_PROJECT"] = "Qwen3-1.7B-LoRA-vi2en"
os.environ["WANDB_LOG_MODEL"] = "false"

wandb.login(key="17cf64fcdf2e849c5b569d29066ba2193798ba02")

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmightdung7105[0m ([33mmightdung7105-vietnam-national-university-hanoi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
run = wandb.init(
    project=os.environ["WANDB_PROJECT"],
    name="Qwen3-1.7B-LoRA-vi2en",
    notes="LoRA SFT VI->EN",
)

[34m[1mwandb[0m: Tracking run with wandb version 0.23.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20251216_195516-lrn3414u[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mQwen3-1.7B-LoRA-vi2en[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/mightdung7105-vietnam-national-university-hanoi/Qwen3-1.7B-LoRA-vi2en[0m
[34m[1mwandb[0m: üöÄ View run at [34m[4mhttps://wandb.ai/mightdung7105-vietnam-national-university-hanoi/Qwen3-1.7B-LoRA-vi2en/runs/lrn3414u[0m


In [5]:
import unicodedata
from datasets import Dataset

def clean_line(s: str) -> str:
    if s is None:
        return ""
    s = s.replace("\ufeff", "")
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("\u2013", "-").replace("\u2014", "-")
    s = s.replace('\u200b', '').replace("\u200e", "")
    s = s.replace("‚Äú", '"').replace("‚Äù", '"')
    s = s.replace("‚Äò", "'").replace("‚Äô", "'")
    s = s.replace("‚Äü", '"')
    s = s.replace("‚Äõ", "'")
    s = " ".join(s.strip().split())
    return s

def read_lines(path: str):
    with open(path, "r", encoding="utf-8") as f:
        return [clean_line(line.rstrip("\n")) for line in f]


def group_lines_into_segments(lines: list[str], N: int = 10) -> list[str]:
    segments = []
    for i in range(0, len(lines), N):
        chunk = lines[i:i + N]
        segments.append("\n".join(chunk))
    return segments

def build_dataset(en_path: str, vi_path: str, N_lines: int = 10):
    en_lines = read_lines(en_path)
    vi_lines = read_lines(vi_path)
    
    if len(en_lines) != len(vi_lines):
        raise ValueError(f"Line count mismatch: en={len(en_lines)} vi={len(vi_lines)}")
        
    total_lines = len(en_lines)
    aligned_total = (total_lines // N_lines) * N_lines
    
    en_segments = group_lines_into_segments(en_lines[:aligned_total], N=N_lines)
    vi_segments = group_lines_into_segments(vi_lines[:aligned_total], N=N_lines)
    
    dropped_truncation = total_lines - aligned_total 

    rows = []
    dropped_empty = 0
    
    for i, (src, tgt) in enumerate(zip(en_segments, vi_segments)):
        if not src.strip() or not tgt.strip():
            dropped_empty += 1
            continue
        rows.append({"en": src, "vi": tgt, "idx": i * N_lines}) 

    ds = Dataset.from_list(rows)
    return ds, dropped_empty * N_lines + dropped_truncation

raw_ds, dropped = build_dataset("/kaggle/input/train-data/train.vi.txt", "/kaggle/input/train-data/train.en.txt", N_lines=10)
print("total_pairs (segments):", len(raw_ds), "dropped_lines:", dropped)

split = raw_ds.train_test_split(test_size=0.01, seed=42)
train_ds, eval_ds = split["train"], split["test"]
len(train_ds), len(eval_ds)

total_pairs (segments): 50000 dropped_lines: 0


(49500, 500)

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from pathlib import Path

MODEL_NAME = Path("/kaggle/working/Qwen3-1.7B-en2vi")

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B", use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    dtype=torch.bfloat16,
    attn_implementation="sdpa",
)

system_prompt = (
    "You are a medical translation engine. Translate from Vietnamese to English. "
    "Rules: Keep abbreviations as-is (e.g., V.A, V.a, PTA, Type B/C/As). "
    "Preserve all numbers, %, ¬±, ‚â•, ‚â§,... parentheses, and punctuation. "
    "Do not add explanations. Output only the English translation."
    "Prioritize medical accuracy and use standard English medical terminology."
)

In [7]:
def to_prompt_completion(ex):
    system_msg = {"role": "system", "content": system_prompt}
    user_msg   = {"role": "user", "content": f"Translate Vietnamese to English:\n{ex['en']}"}
    asst_msg   = {"role": "assistant", "content": ex["vi"]}

    prompt_text = tokenizer.apply_chat_template(
        [system_msg, user_msg],
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    full_text = tokenizer.apply_chat_template(
        [system_msg, user_msg, asst_msg],
        tokenize=False,
        add_generation_prompt=False,
        enable_thinking=False,
    )

    completion_text = full_text[len(prompt_text):]
    return {"prompt": prompt_text, "completion": completion_text, "en": ex["en"], "vi": ex["vi"]}

columns_to_remove = [col for col in train_ds.column_names if col in ['idx']]

train_pc = train_ds.map(to_prompt_completion, remove_columns=columns_to_remove)
eval_pc  = eval_ds.map(to_prompt_completion,  remove_columns=columns_to_remove)

Map:   0%|          | 0/49500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [8]:
print(train_pc[0]["prompt"][:]) 
print(train_pc[0]["completion"][:])

<|im_start|>system
You are a medical translation engine. Translate from Vietnamese to English. Rules: Keep abbreviations as-is (e.g., V.A, V.a, PTA, Type B/C/As). Preserve all numbers, %, ¬±, ‚â•, ‚â§,... parentheses, and punctuation. Do not add explanations. Output only the English translation.Prioritize medical accuracy and use standard English medical terminology.<|im_end|>
<|im_start|>user
Translate Vietnamese to English:
TƒÉng axit uric m√°u do d√πng thu·ªëc l·ª£i ti·ªÉu m√† kh√¥ng g√¢y b·ªánh gout kh√¥ng c·∫ßn ph·∫£i ƒëi·ªÅu tr·ªã ho·∫∑c ng∆∞ng d√πng thu·ªëc l·ª£i ti·ªÉu.
Thu·ªëc l·ª£i ti·ªÉu c√≥ th·ªÉ l√†m tƒÉng nh·∫π t·ª∑ l·ªá t·ª≠ vong ·ªü nh·ªØng b·ªánh nh√¢n c√≥ ti·ªÅn s·ª≠ suy tim kh√¥ng c√≥ ·ª© m√°u ph·ªïi, ƒë·∫∑c bi·ªát ·ªü nh·ªØng ng∆∞·ªùi c≈©ng ƒëang d√πng thu·ªëc ·ª©c ch·∫ø ACE ho·∫∑c thu·ªëc ch·∫πn th·ª• th·ªÉ angiotensin II v√† nh·ªØng ng∆∞·ªùi kh√¥ng u·ªëng √≠t nh·∫•t 1400 mL n∆∞·ªõc (48 oz) m·ªói ng√†y.
T·ª∑ l·ªá t·ª≠ vong tƒÉng l√™n c√≥ th·ªÉ li√™n quan ƒë·∫øn h·∫

In [9]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

In [10]:
import math
import random
import torch
import wandb
from transformers import TrainerCallback
from trl import SFTTrainer, SFTConfig
from sacrebleu.metrics import BLEU

bleu_metric = BLEU(tokenize="none", effective_order=True)

class PPLCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if not logs:
            return
        if "loss" in logs and logs["loss"] is not None:
            loss = float(logs["loss"])
            try:
                logs["ppl"] = math.exp(loss)
            except OverflowError:
                logs["ppl"] = float("inf")

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if not metrics:
            return
        if "eval_loss" in metrics and metrics["eval_loss"] is not None:
            loss = float(metrics["eval_loss"])
            try:
                metrics["eval_ppl"] = math.exp(loss)
            except OverflowError:
                metrics["eval_ppl"] = float("inf")

class BLEUCallback(TrainerCallback):
    def __init__(self, tokenizer, eval_ds, system_prompt, n_samples=100):
        self.tokenizer = tokenizer
        self.eval_ds = eval_ds
        self.system_prompt = system_prompt
        self.n_samples = n_samples

    def translate_one(self, model, en: str, max_new_tokens=2048):
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": f"Translate English to Vietnamese:\n{en}"},
        ]
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        inputs = self.tokenizer(prompt, return_tensors="pt").to(model.device)
        
        with torch.no_grad():
            model.eval() 
            out = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=0.2,
                top_p=0.9,
            )
        return self.tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
    
    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if state.is_local_process_zero:
            model = kwargs['model']
            data = random.sample(list(self.eval_ds), k=min(self.n_samples, len(self.eval_ds)))

            refs = []
            hyps = []
            
            unwrapped_model = model.module if hasattr(model, "module") else model

            for ex in data:
                pred = self.translate_one(unwrapped_model, ex["en"])
                hyps.append(pred)
                refs.append(ex["vi"])

            # T√≠nh BLEU score
            score = bleu_metric.corpus_score(hyps, [refs]).score

            # Log k·∫øt qu·∫£ l√™n W&B
            if args.report_to and "wandb" in args.report_to:
                wandb.log({
                    "eval/bleu": score,
                    "eval/bleu_n_samples": len(data),
                }, step=state.global_step)

            if metrics is not None:
                metrics["eval_bleu"] = score
            print(f"\nBLEU (tokenize=none) on {len(data)} samples: {score:.2f}")

        return control

In [11]:
args = SFTConfig(
    output_dir="Qwen3-1.7B-LoRA",
    max_length=2048,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    
    learning_rate=1e-4,
    num_train_epochs=1,
    warmup_ratio=0.03,
    logging_steps=5,
    eval_steps=50,
    save_steps=50,
    save_total_limit=4,
    eval_strategy="steps",
    save_strategy="steps",

    bf16=True,
    completion_only_loss=True,
    packing=False,
    group_by_length=False,

    report_to=["wandb"],
    run_name=run.name,
)

bleu_callback = BLEUCallback(
    tokenizer=tokenizer,
    eval_ds=eval_pc, 
    system_prompt=system_prompt,
    n_samples=10,
)

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_pc,
    eval_dataset=eval_pc,
    peft_config=lora_config,
    processing_class=tokenizer,
    callbacks=[PPLCallback(), bleu_callback], 
)

trainer.train()

Adding EOS to train dataset:   0%|          | 0/49500 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/49500 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/49500 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy,Ppl,Bleu
50,1.0943,1.083736,2.167039,2788795.0,0.743045,2.955702,26.148978
100,0.9918,1.02923,2.375309,5590198.0,0.755163,2.798911,31.099993
150,1.0051,1.005802,2.499391,8364762.0,0.760333,2.7341,27.715712
200,0.9593,0.991232,2.567763,11165769.0,0.762615,2.694552,26.952598
250,0.9743,0.979975,2.572923,13962517.0,0.764801,2.66439,31.494103
300,1.0014,0.970825,2.534983,16771479.0,0.766429,2.640123,30.484495
350,0.9335,0.964083,2.506672,19558422.0,0.76764,2.622383,29.859529
400,0.9856,0.957601,2.547124,22337984.0,0.768804,2.605438,34.347163
450,0.956,0.952693,2.554838,25126278.0,0.769615,2.592683,29.876734
500,0.945,0.949043,2.545596,27925471.0,0.770881,2.583235,31.419528



BLEU (tokenize=none) on 10 samples: 26.15

BLEU (tokenize=none) on 10 samples: 31.10

BLEU (tokenize=none) on 10 samples: 27.72

BLEU (tokenize=none) on 10 samples: 26.95

BLEU (tokenize=none) on 10 samples: 31.49

BLEU (tokenize=none) on 10 samples: 30.48

BLEU (tokenize=none) on 10 samples: 29.86

BLEU (tokenize=none) on 10 samples: 34.35

BLEU (tokenize=none) on 10 samples: 29.88

BLEU (tokenize=none) on 10 samples: 31.42

BLEU (tokenize=none) on 10 samples: 26.54

BLEU (tokenize=none) on 10 samples: 31.23

BLEU (tokenize=none) on 10 samples: 29.15

BLEU (tokenize=none) on 10 samples: 31.22

BLEU (tokenize=none) on 10 samples: 29.06


TrainOutput(global_step=774, training_loss=0.980014523178416, metrics={'train_runtime': 7190.5407, 'train_samples_per_second': 6.884, 'train_steps_per_second': 0.108, 'total_flos': 5.568648675926016e+17, 'train_loss': 0.980014523178416, 'entropy': 2.58049590247018, 'num_tokens': 43214818.0, 'mean_token_accuracy': 0.7779610923358372, 'epoch': 1.0})