In [4]:
import os
# These variables are set here, ahead of the `import transformers` further down in this cell.
# HF_HOME places the Hugging Face cache under the writable Kaggle working directory.
os.environ["HF_HOME"] = "/kaggle/working/hf"
# Turns off the tokenizers library's own thread parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Intended to keep transformers from loading TensorFlow / Flax back-ends.
# NOTE(review): verify that transformers 4.44.2 honours these two names; its back-end
# selection is usually driven by USE_TF / USE_FLAX / USE_TORCH — TODO confirm.
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"

# Fix protobuf conflict first
# Removes the preinstalled protobuf, pins 4.25.3, then prints the version Python actually imports.
# NOTE(review): the captured pip output of this cell reports that opentelemetry-proto and
# a2a-sdk require a newer protobuf than this pin.
!pip -q uninstall -y protobuf
!pip -q install -q "protobuf==4.25.3"
!python -c "import google.protobuf as pb; print('protobuf =', pb.__version__)"

# Install pinned libs (with deps)
# Every package carries an exact version so reruns resolve the same releases.
!pip -q install -U "transformers==4.44.2" "tokenizers==0.19.1" "peft==0.11.1" \
                 "accelerate==0.33.0" "datasets==2.21.0" "sacrebleu==2.4.2"

# Imported after the installs above so the report below reflects the pinned packages.
# NOTE(review): if any of these modules was already imported in this kernel session, a kernel
# restart is presumably needed before the newly installed versions are picked up — confirm.
import torch, transformers, peft, datasets
# Version / hardware report for the run log.
print("torch", torch.__version__)
print("transformers", transformers.__version__, "| peft", peft.__version__)
print("datasets", datasets.__version__)
print("GPU count:", torch.cuda.device_count())

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 4.25.3 which is incompatible.
a2a-sdk 0.3.10 requires protobuf>=5.29.5, but you have protobuf 4.25.3 which is incompatible.
ray 2.51.1 requires click!=8.3.0,>=7.0, but you have click 8.3.0 which is incompatible.
bigframes 2.12.0 requires rich<14,>=12.4.4, but you have rich 14.2.0 which is incompatible.
pydrive2 1.21.3 requires cryptography<44, but you have cryptography 46.0.3 which is incompatible.
pydrive2 1.21.3 requires pyOpenSSL<=24.2.1,>=19.1.0, but you have pyopenssl 25.3.0 which is incompa

In [5]:
from pathlib import Path
from datasets import Dataset, DatasetDict

# Kaggle input directory with the parallel corpus files; stop immediately if it is not there.
PROC_DIR = Path("/kaggle/input/data-vlsp/processed")
assert PROC_DIR.exists()

# One English (.en) and one Vietnamese (.vi) file per split.
TRAIN_EN = PROC_DIR / "train.en"
TRAIN_VI = PROC_DIR / "train.vi"
VALID_EN = PROC_DIR / "valid.en"
VALID_VI = PROC_DIR / "valid.vi"
TEST_EN = PROC_DIR / "test.en"
TEST_VI = PROC_DIR / "test.vi"

# Writable run directory (created on demand); the assembled dataset is saved beneath it.
OUT_ROOT = Path("/kaggle/working/vlsp_en2vi_run")
OUT_ROOT.mkdir(parents=True, exist_ok=True)
DS_DIR = OUT_ROOT / "dataset_en2vi_raw"

def build_split(src_path: Path, tgt_path: Path):
    """Build a dataset of aligned sentence pairs from two parallel text files.

    Line i of ``src_path`` is paired with line i of ``tgt_path``. Both lines are
    stripped of surrounding whitespace, and a pair is dropped when either side
    is empty after stripping.

    Args:
        src_path: UTF-8 text file with one source sentence per line.
        tgt_path: UTF-8 text file with one target sentence per line.

    Returns:
        Whatever ``Dataset.from_generator`` returns for a generator yielding
        ``{"src": ..., "tgt": ...}`` records.

    Raises:
        ValueError: raised by the record generator when the two files do not
            have the same number of lines. Previously the extra lines of the
            longer file were silently ignored, hiding a misaligned corpus.
            NOTE(review): ``Dataset.from_generator`` may wrap this exception in
            its own error type — confirm against the datasets version in use.
    """
    from itertools import zip_longest  # local import; the cell's import block stays untouched

    def gen():
        with src_path.open("r", encoding="utf-8") as fs, tgt_path.open("r", encoding="utf-8") as ft:
            # zip_longest (rather than zip) makes a length mismatch visible: the
            # exhausted file contributes None, which a real line can never be.
            for line_no, (s, t) in enumerate(zip_longest(fs, ft), start=1):
                if s is None or t is None:
                    short = src_path if s is None else tgt_path
                    raise ValueError(
                        f"Parallel files are misaligned: {short} ended before line {line_no} "
                        f"while its counterpart still has lines."
                    )
                s = s.strip()
                t = t.strip()
                if s and t:
                    yield {"src": s, "tgt": t}
    return Dataset.from_generator(gen)

# Split name -> (English file, Vietnamese file); insertion order fixes the build order.
split_files = {
    "train": (TRAIN_EN, TRAIN_VI),
    "valid": (VALID_EN, VALID_VI),
    "test": (TEST_EN, TEST_VI),
}
raw = DatasetDict(
    {split_name: build_split(en_file, vi_file) for split_name, (en_file, vi_file) in split_files.items()}
)

# Persist all splits under DS_DIR, then echo a summary and the first training pair.
raw.save_to_disk(str(DS_DIR))
print("Saved dataset to:", DS_DIR)
print(raw)
print("Sample:", raw["train"][0])

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/490000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

Saved dataset to: /kaggle/working/vlsp_en2vi_run/dataset_en2vi_raw
DatasetDict({
    train: Dataset({
        features: ['src', 'tgt'],
        num_rows: 490000
    })
    valid: Dataset({
        features: ['src', 'tgt'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['src', 'tgt'],
        num_rows: 3000
    })
})
Sample: {'src': 'Characteristics of patients studied 60 patients with 60 soft-tisue defects in the weight-bearing area of the foot, including 46 male and 14 female.', 'tgt': 'Đặc điểm của nhóm BN nghiên cứu Tổng cộng có 60 BN với 60 KHPM, bao gồm 46 nam và 14 nữ.'}


In [None]:
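# # NOTE: this cell is currently disabled — every line is commented out. When enabled it
# # writes the LoRA fine-tuning script to /kaggle/working/train_qwen_en2vi_lora.py.
# # NOTE(review): the prompt built in this script is ChatML (<|im_start|> ... <|im_end|>), while
# # the evaluation script written later in this notebook prompts with
# # "### English: ... / ### Vietnamese:". Confirm which template the evaluated adapter was trained with.
# from pathlib import Path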

# Path("/kaggle/working/train_qwen_en2vi_lora.py").write_text(r"""
# import os, argparse, glob
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# os.environ["TRANSFORMERS_NO_TF"] = "1"
# os.environ["TRANSFORMERS_NO_FLAX"] = "1"

# import torch
# from datasets import load_from_disk
# from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
# from peft import LoraConfig, get_peft_model

# def make_prompt_en2vi(src_en: str) -> str:
#     # Standard ChatML format for Qwen
#     return (
#         f"<|im_start|>system\nYou are a professional medical translator.<|im_end|>\n"
#         f"<|im_start|>user\nTranslate the following medical text from English to Vietnamese:\n{src_en}<|im_end|>\n"
#         f"<|im_start|>assistant\n"
#     )

# def preprocess_builder(tokenizer, max_len: int):
#     eos_id = tokenizer.eos_token_id
#     def _pp(ex):
#         prompt = make_prompt_en2vi(ex["src"])
#         completion = " " + ex["tgt"]

#         prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
#         comp_ids   = tokenizer(completion, add_special_tokens=False)["input_ids"]
#         if eos_id is not None:
#             comp_ids = comp_ids + [eos_id]

#         input_ids = (prompt_ids + comp_ids)[:max_len]
#         labels    = ([-100] * len(prompt_ids) + comp_ids)[:max_len]
#         attn      = [1] * len(input_ids)
#         return {"input_ids": input_ids, "attention_mask": attn, "labels": labels}
#     return _pp

# class CausalCollator:
#     def __init__(self, pad_id: int):
#         self.pad_id = pad_id
#     def __call__(self, feats):
#         max_len = max(len(f["input_ids"]) for f in feats)
#         def pad(x, v): return x + [v] * (max_len - len(x))
#         return {
#             "input_ids": torch.tensor([pad(f["input_ids"], self.pad_id) for f in feats], dtype=torch.long),
#             "attention_mask": torch.tensor([pad(f["attention_mask"], 0) for f in feats], dtype=torch.long),
#             "labels": torch.tensor([pad(f["labels"], -100) for f in feats], dtype=torch.long),
#         }

# def last_checkpoint(output_dir: str):
#     ckpts = sorted(glob.glob(os.path.join(output_dir, "checkpoint-*")), key=lambda p: int(p.split("-")[-1]))
#     return ckpts[-1] if ckpts else None

# def main():
#     ap = argparse.ArgumentParser()
#     ap.add_argument("--model_id", type=str, default="Qwen/Qwen2.5-1.5B-Instruct")
#     ap.add_argument("--dataset_dir", type=str, required=True)
#     ap.add_argument("--output_dir", type=str, required=True)

#     ap.add_argument("--max_seq_length", type=int, default=512)
#     ap.add_argument("--per_device_train_batch_size", type=int, default=1)
#     ap.add_argument("--per_device_eval_batch_size", type=int, default=1)
#     ap.add_argument("--gradient_accumulation_steps", type=int, default=32)

#     ap.add_argument("--learning_rate", type=float, default=2e-4)
#     ap.add_argument("--max_steps", type=int, default=20000)  # overnight
#     ap.add_argument("--warmup_ratio", type=float, default=0.03)

#     ap.add_argument("--lora_r", type=int, default=16)
#     ap.add_argument("--lora_alpha", type=int, default=32)
#     ap.add_argument("--lora_dropout", type=float, default=0.05)

#     ap.add_argument("--eval_steps", type=int, default=2000)
#     ap.add_argument("--save_steps", type=int, default=2000)
#     ap.add_argument("--logging_steps", type=int, default=50)
#     ap.add_argument("--seed", type=int, default=42)
#     args = ap.parse_args()

#     torch.manual_seed(args.seed)

#     local_rank = int(os.environ.get("LOCAL_RANK", "0"))
#     if torch.cuda.is_available():
#         torch.cuda.set_device(local_rank)

#     dsd = load_from_disk(args.dataset_dir)
#     train_raw = dsd["train"]
#     valid_raw = dsd["valid"]

#     tok = AutoTokenizer.from_pretrained(args.model_id, use_fast=True, trust_remote_code=True)
#     if tok.pad_token is None:
#         tok.pad_token = tok.eos_token
#     tok.padding_side = "right"

#     pp = preprocess_builder(tok, args.max_seq_length)
#     train_ds = train_raw.map(pp, remove_columns=train_raw.column_names, num_proc=4)
#     valid_ds = valid_raw.map(pp, remove_columns=valid_raw.column_names, num_proc=1)

#     model = AutoModelForCausalLM.from_pretrained(
#         args.model_id,
#         torch_dtype=torch.float16,
#         device_map={"": local_rank} if torch.cuda.is_available() else None,
#         low_cpu_mem_usage=True,
#         trust_remote_code=True,
#     )
#     model.config.use_cache = False

#     lora_cfg = LoraConfig(
#         r=args.lora_r, lora_alpha=args.lora_alpha, lora_dropout=args.lora_dropout,
#         bias="none", task_type="CAUSAL_LM",
#         target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
#     )
#     model = get_peft_model(model, lora_cfg)

#     # IMPORTANT: keep trainable (LoRA) params in fp32 to avoid GradScaler "unscale FP16 gradients" error
#     for n, p in model.named_parameters():
#         if p.requires_grad:
#             p.data = p.data.float()


#     targs = TrainingArguments(
#         output_dir=args.output_dir,
#         seed=args.seed,
#         fp16=True, bf16=False,

#         per_device_train_batch_size=args.per_device_train_batch_size,
#         per_device_eval_batch_size=args.per_device_eval_batch_size,
#         gradient_accumulation_steps=args.gradient_accumulation_steps,

#         learning_rate=args.learning_rate,
#         warmup_ratio=args.warmup_ratio,
#         max_steps=args.max_steps,

#         logging_steps=args.logging_steps,
#         evaluation_strategy="steps",
#         eval_steps=args.eval_steps,

#         save_strategy="steps",
#         save_steps=args.save_steps,
#         save_total_limit=3,

#         load_best_model_at_end=True,
#         metric_for_best_model="eval_loss",
#         greater_is_better=False,

#         report_to="none",
#         ddp_find_unused_parameters=False,
#         remove_unused_columns=False,
#         optim="adamw_torch",

#         dataloader_num_workers=4,
#         dataloader_pin_memory=True,
#         group_by_length=True,
#     )

#     trainer = Trainer(
#         model=model,
#         args=targs,
#         train_dataset=train_ds,
#         eval_dataset=valid_ds,
#         data_collator=CausalCollator(tok.pad_token_id),
#     )

#     ckpt = last_checkpoint(args.output_dir)
#     trainer.train(resume_from_checkpoint=ckpt)
#     trainer.save_model(args.output_dir)
#     tok.save_pretrained(args.output_dir)

# if __name__ == "__main__":
#     main()
# """, encoding="utf-8")

# print("Wrote train script.")

In [None]:
# MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
# TRAIN_OUT = str(OUT_ROOT/"lora_en2vi_qwen2.5_1.5b")

# !TOKENIZERS_PARALLELISM=false OMP_NUM_THREADS=1 torchrun --nproc_per_node=2 /kaggle/working/train_qwen_en2vi_lora.py \
#   --model_id "{MODEL_ID}" \
#   --dataset_dir "{DS_DIR}" \
#   --output_dir "{TRAIN_OUT}" \
#   --max_seq_length 320 \
#   --per_device_train_batch_size 4 \
#   --per_device_eval_batch_size 4 \
#   --gradient_accumulation_steps 8 \
#   --learning_rate 2e-4 \
#   --lora_dropout 0.05 \
#   --max_steps 8000 \
#   --eval_steps 800 --save_steps 800 --logging_steps 50

In [1]:
from pathlib import Path

Path("/kaggle/working/eval_test_bleu_en2vi_ddp.py").write_text(r"""
import os, argparse
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"

import torch
import torch.distributed as dist
from datasets import load_from_disk
from sacrebleu.metrics import BLEU
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

def make_prompt_en2vi(src_en: str) -> str:
    '''Return the English-to-Vietnamese translation prompt for one source sentence.

    The prompt is four newline-separated lines: a role statement, a task line,
    the English text, and a trailing "### Vietnamese:" cue with no newline
    after it, so the model's continuation is the translation itself.
    '''
    # NOTE: avoid triple double quotes in this script; it is embedded in a
    # notebook string literal that is delimited by them.
    prompt_lines = [
        "You are a professional medical translator.",
        "### Task: Translate English to Vietnamese (medical domain)",
        f"### English: {src_en}",
        "### Vietnamese:",
    ]
    return "\n".join(prompt_lines)

def ddp_setup():
    '''Initialise torch.distributed when the launcher exported RANK and WORLD_SIZE.

    Returns:
        A tuple (is_ddp, rank, world_size, local_rank). Without both
        environment variables nothing is initialised and the result is
        (False, 0, 1, 0). Otherwise an NCCL process group is created and the
        current CUDA device is set to LOCAL_RANK (taken as 0 when unset).
    '''
    env = os.environ
    launched_distributed = "RANK" in env and "WORLD_SIZE" in env
    if not launched_distributed:
        # Single-process run: act as rank 0 in a world of size 1.
        return False, 0, 1, 0

    dist.init_process_group(backend="nccl")
    proc_rank, world_size = dist.get_rank(), dist.get_world_size()
    gpu_index = int(env.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(gpu_index)
    return True, proc_rank, world_size, gpu_index

@torch.inference_mode()
def main():
    '''Translate the test split with the LoRA-adapted model and report corpus BLEU.

    Steps:
      1. Parse the command line and call ddp_setup() (a no-op outside a
         distributed launch).
      2. Load the dataset from --dataset_dir and take its "test" split.
      3. Load the tokenizer and the fp16 base model, then attach the adapter
         from --adapter_dir.
      4. Each rank translates test indices rank, rank + world, rank + 2*world, ...
         in batches with beam search and keeps (index, hypothesis) pairs.
      5. In a distributed run the pairs are gathered on all ranks; rank 0
         restores dataset order and every other rank returns. Rank 0 (or the
         single process) writes one hypothesis per line to --out_hyp, prints
         the BLEU score with its signature, and prints up to five samples.

    Raises:
        ValueError: if the loaded dataset has no "test" split.
    '''
    # NOTE: avoid triple double quotes in this script; it is embedded in a
    # notebook string literal that is delimited by them.
    ap = argparse.ArgumentParser()
    ap.add_argument("--base_model_id", type=str, default="Qwen/Qwen2.5-1.5B-Instruct")
    ap.add_argument("--adapter_dir", type=str, required=True)
    ap.add_argument("--dataset_dir", type=str, required=True)

    ap.add_argument("--batch_size", type=int, default=8)        # per GPU
    ap.add_argument("--max_prompt_len", type=int, default=512)  # prompt max len
    ap.add_argument("--max_new_tokens", type=int, default=128)  # reduce to avoid rambling
    ap.add_argument("--num_beams", type=int, default=4)
    ap.add_argument("--out_hyp", type=str, required=True)
    args = ap.parse_args()

    is_ddp, rank, world, local_rank = ddp_setup()

    dsd = load_from_disk(args.dataset_dir)
    if "test" not in dsd:
        # Message (Vietnamese): dataset_dir must contain a 'test' split.
        raise ValueError("dataset_dir phải có split 'test' (dsd['test']).")
    test = dsd["test"]
    n = len(test)

    tok = AutoTokenizer.from_pretrained(args.base_model_id, use_fast=True, trust_remote_code=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    # Left padding keeps every prompt flush against the generated continuation.
    tok.padding_side = "left"

    base = AutoModelForCausalLM.from_pretrained(
        args.base_model_id,
        torch_dtype=torch.float16,
        device_map={"": local_rank} if torch.cuda.is_available() else None,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    model = PeftModel.from_pretrained(base, args.adapter_dir)
    model.eval()
    model.config.use_cache = True

    # Strided sharding: this rank handles indices rank, rank + world, ...
    my_idxs = list(range(rank, n, world))
    results = []  # list[(idx, hyp)]

    for start in range(0, len(my_idxs), args.batch_size):
        idxs = my_idxs[start:start+args.batch_size]
        batch_prompts = [make_prompt_en2vi(test[i]["src"]) for i in idxs]

        # NOTE(review): if the tokenizer truncates on the right (presumably its
        # default), a prompt longer than max_prompt_len loses its trailing
        # "### Vietnamese:" cue — confirm no test prompt exceeds the limit.
        enc = tok(
            batch_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=args.max_prompt_len,
        ).to(model.device)

        out = model.generate(
            **enc,
            do_sample=False,
            num_beams=args.num_beams,
            max_new_tokens=args.max_new_tokens,
            eos_token_id=tok.eos_token_id,
            pad_token_id=tok.pad_token_id,
            use_cache=True,
        )

        # Drop the prompt part of each row: with left padding every prompt fills
        # exactly the first `input_len` columns of the generated sequence.
        input_len = enc["input_ids"].shape[1]

        for j, idx in enumerate(idxs):
            gen_ids = out[j, input_len:]
            hyp = tok.decode(gen_ids, skip_special_tokens=True).strip()

            # Optional clean-up: if the model repeats the label, keep only the text after it.
            if "### Vietnamese:" in hyp:
                hyp = hyp.split("### Vietnamese:")[-1].strip()

            results.append((idx, hyp))

    # Collect every rank's (idx, hyp) pairs; only rank 0 merges and reports.
    if is_ddp:
        gathered = [None for _ in range(world)]
        dist.all_gather_object(gathered, results)
        if rank == 0:
            merged = {}
            for part in gathered:
                for idx, hyp in part:
                    merged[idx] = hyp
            hyps = [merged[i] for i in range(n)]
        dist.barrier()
        dist.destroy_process_group()
        if rank != 0:
            return
    else:
        merged = {idx: hyp for idx, hyp in results}
        hyps = [merged[i] for i in range(n)]

    refs = [ex["tgt"] for ex in test]

    # One hypothesis per line; embedded newlines are flattened to keep line alignment.
    with open(args.out_hyp, "w", encoding="utf-8") as f:
        for h in hyps:
            f.write(h.replace("\n", " ") + "\n")

    bleu = BLEU(tokenize="13a")
    score = bleu.corpus_score(hyps, [refs])
    print("TEST BLEU:", score.score)
    # Fix: format() expects the signature text itself; passing True printed "BLEU|True".
    print("Signature:", score.format(signature=str(bleu.get_signature())))

    # Quick debug samples (rank 0 only). Capped at the split size so a test set
    # with fewer than five rows does not raise IndexError.
    for i in range(min(5, n)):
        # Fix: single backslash. This script is written from a raw string, so a
        # doubled backslash reached the file unchanged and printed a literal
        # backslash-n instead of a blank line.
        print("\n--- sample", i, "---")
        print("SRC:", test[i]["src"])
        print("REF:", refs[i])
        print("HYP:", hyps[i])

if __name__ == "__main__":
    main()
""", encoding="utf-8")

print("Wrote FIXED DDP eval script.")

Wrote FIXED DDP eval script.


In [6]:
# Base checkpoint the LoRA adapter is applied to.
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"

# Read from /kaggle/input (read-only)
# NOTE: DS_DIR rebinds the name assigned in the dataset-building cell (there a Path under
# /kaggle/working); from here on it is a plain string pointing at a copy under /kaggle/input.
DS_DIR    = "/kaggle/input/qwenoutput/vlsp_en2vi_run/dataset_en2vi_raw"
TRAIN_OUT = "/kaggle/input/qwenoutput/vlsp_en2vi_run/lora_en2vi_qwen2.5_1.5b"

# Write the BLEU/hypothesis output to /kaggle/working (writable)
HYP_PATH = "/kaggle/working/test_hyp_en2vi_fixed.txt"

# Launch the evaluation script with two processes; the {NAME} placeholders are filled in
# from the Python variables above.
# NOTE(review): --nproc_per_node=2 presumably assumes two GPUs — check the "GPU count"
# printed by the setup cell.
!TOKENIZERS_PARALLELISM=false OMP_NUM_THREADS=1 torchrun --nproc_per_node=2 /kaggle/working/eval_test_bleu_en2vi_ddp.py \
  --base_model_id "{MODEL_ID}" \
  --adapter_dir "{TRAIN_OUT}" \
  --dataset_dir "{DS_DIR}" \
  --batch_size 8 \
  --max_new_tokens 128 \
  --out_hyp "{HYP_PATH}"

tokenizer_config.json: 7.30kB [00:00, 29.0MB/s]
vocab.json: 2.78MB [00:00, 66.8MB/s]
merges.txt: 1.67MB [00:00, 107MB/s]
tokenizer.json: 7.03MB [00:00, 185MB/s]
config.json: 100%|█████████████████████████████| 660/660 [00:00<00:00, 4.42MB/s]
model.safetensors: 100%|████████████████████| 3.09G/3.09G [00:10<00:00, 306MB/s]
generation_config.json: 100%|██████████████████| 242/242 [00:00<00:00, 1.74MB/s]
TEST BLEU: 48.71697059103983
Signature: BLEU|True = 48.72 75.4/57.4/44.6/35.4 (BP = 0.953 ratio = 0.954 hyp_len = 96193 ref_len = 100870)
\n--- sample 0 ---
SRC: Knowledge, practices in public health service utilization among health insurance card’s holders and influencing factors in Vientiane, Lao
REF: Thực trạng kiến thức và thực hành của người có thẻ bảo hiểm y tế trong sử dụng dịch vụ khám chữa bệnh ở các cơ sở y tế công và một số yếu tố ảnh hưởng tại tỉnh Viêng Chăn, CHDCND Lào, năm 2017
HYP: Kiến thức, thực hành sử dụng dịch vụ y tế công cộng của người sử dụng thẻ bảo hiểm y tế và cá