# Config

In [None]:
from getpass import getpass
import requests, json

# Prompt for the GitHub personal access token without echoing it. Whitespace
# picked up while copy-pasting is stripped, because the token is sent in an
# HTTP header here and embedded in the clone URL in the next cell.
token = getpass("Enter GitHub token (hidden): ").strip()

# Sanity-check the token against the GitHub API: a valid token yields 200 and
# the account login; an invalid one yields a non-200 status and a login of None.
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(
    "https://api.github.com/user",
    headers={"Authorization": f"token {token}"},
    timeout=30,
)
print(r.status_code)
print(r.json().get("login"))

In [None]:
username = "Vu-Quoc-Tuan"
repo_name = "BTL-NLP-2526I_INT3406_3"

# Sử dụng biến token để xác thực trực tiếp trong URL
!git clone https://{token}@github.com/{username}/{repo_name}

In [None]:
# 1) vào thư mục repo (thay path nếu khác)
%cd /content/BTL-NLP-2526I_INT3406_3

# 2) kiểm tra hiện tại đang ở đâu và liệt kê file
!pwd
!ls -la
!ls -la vlsp-mt
!test -f vlsp-mt/requirements.txt && echo "requirements found" || echo "requirements NOT found"

# 3) show python interpreter đang dùng
import sys
print("Python executable:", sys.executable)

# 4) dùng chính interpreter đó để cài (an toàn hơn dùng !pip trực tiếp)
!{sys.executable} -m pip install -r vlsp-mt/requirements.txt

In [None]:
!pip install evaluate

In [None]:
%cd vlsp-mt

# Preprocessing

In [None]:
# Run the preprocessing script on the raw English/Vietnamese training files and
# write its results under data/clean. Later cells read data/clean/train.*,
# data/clean/dev.* and data/clean/test.* -- presumably the splits produced here,
# with 1000 lines each for dev and test (TODO confirm against the script).
# --min_len / --max_len / --max_ratio look like length-based filters; see
# scripts/preprocess_vlsp.py for their exact semantics.
!python scripts/preprocess_vlsp.py \
    --src_in data/raw/train.en.txt \
    --tgt_in data/raw/train.vi.txt \
    --out_dir data/clean \
    --min_len 3 \
    --max_len 256 \
    --max_ratio 3.0 \
    --dev_size 1000 \
    --test_size 1000

In [None]:
!!python scripts/dedup_minhash.py \
    --src data/clean/train.en \
    --tgt data/clean/train.vi \
    --out_dir data/dedup \
    --threshold 0.85 \
    --dedup_by both \
    --rep_strategy longest

In [None]:
# 1. Tạo thư mục đích trước
!mkdir -p data/rl_subset

# 2. Chạy lệnh cắt file
!head -n 50000 data/dedup/train.en > data/rl_subset/en.txt
!head -n 50000 data/dedup/train.vi > data/rl_subset/vi.txt

!ls -lh data/rl_subset/

# Train

## en2vi

In [None]:
# Supervised LoRA fine-tuning run for the en2vi direction on the de-duplicated
# training pair, starting from Qwen/Qwen2.5-3B-Instruct, under run id
# `qwen_final`. Later cells load an adapter from
# runs/qwen_final/lora_en2vi_sft -- presumably the output of this run (verify
# against the script's output layout).
!python scripts/train_qwen_lora.py \
    --model_name Qwen/Qwen2.5-3B-Instruct \
    --direction en2vi \
    --src data/dedup/train.en \
    --tgt data/dedup/train.vi \
    --run_id qwen_final \
    --batch_size 32 \
    --grad_accum 1 \
    --epochs 1 \
    --neftune_alpha 3.0 \
    --no_grad_checkpoint \
    --label_smoothing 0

In [None]:
# RL training run (GRPO enabled via --use_grpo) for en2vi under run id
# `qwen_rl_final`. Both --sft_adapter and --init_adapter point at the same SFT
# adapter path, runs/qwen_final/lora_en2vi_sft.
# NOTE(review): the RL data is data/clean/train.*, not the de-duplicated set
# (data/dedup) used for SFT nor the 50000-line data/rl_subset built earlier --
# confirm this is intentional.
!python scripts/rl_train_grpo.py \
    --model_name Qwen/Qwen2.5-3B-Instruct \
    --sft_adapter runs/qwen_final/lora_en2vi_sft \
    --init_adapter runs/qwen_final/lora_en2vi_sft \
    --rl_src data/clean/train.en \
    --rl_tgt data/clean/train.vi \
    --run_id qwen_rl_final \
    --direction en2vi \
    --batch_size 16 \
    --grad_accum_steps 4 \
    --epochs 1 \
    --use_grpo \
    --save_interval 100

In [None]:
# Back-translate the Vietnamese side of the subset (data/rl_subset/vi.txt) with
# a vi2en adapter and write the result to data/augment/bt.en.
# NOTE(review): the adapter path is not a local runs/ directory like the other
# cells -- presumably a remote model location; verify the script resolves it.
# NOTE(review): if --filter drops sentences, bt.en will have fewer lines than
# data/rl_subset/vi.txt and the two will no longer align line by line when they
# are paired in the next cell -- TODO confirm what --filter does.
!python scripts/back_translate.py \
    --model_name Qwen/Qwen2.5-3B-Instruct \
    --adapter_path tuan243/adapter-loRA-vlsp-mt/vi2en/vi2en_v2/lora_vi2en_sft \
    --input data/rl_subset/vi.txt \
    --output data/augment/bt.en \
    --direction vi2en \
    --batch_size 16 \
    --temperature 0.7 \
    --filter

In [None]:
!cat data/clean/test.en data/augment/bt.en > data/augment/train_aug_en2vi.en
!cat data/clean/test.vi data/rl_subset/vi.txt > data/augment/train_aug_en2vi.vi

In [None]:
# Continue en2vi fine-tuning on the augmented pair under run id `en2vi_v2`,
# resuming from the SFT adapter at runs/qwen_final/lora_en2vi_sft and using the
# dev split for validation, with a learning rate of 5e-5 for one epoch.
# NOTE(review): unlike the first training cell, --model_name is not passed, so
# the script's default base model is used -- confirm it matches
# Qwen/Qwen2.5-3B-Instruct, the base the resumed adapter was trained on.
!python scripts/train_qwen_lora.py \
    --direction en2vi \
    --src data/augment/train_aug_en2vi.en \
    --tgt data/augment/train_aug_en2vi.vi \
    --val_src data/clean/dev.en \
    --val_tgt data/clean/dev.vi \
    --run_id en2vi_v2 \
    --resume_from runs/qwen_final/lora_en2vi_sft \
    --epochs 1 \
    --lr 5e-5

# Eval

In [None]:
# test
!python scripts/generate.py \
    --model_name Qwen/Qwen2.5-3B-Instruct \
    --adapter_path runs/qwen_final/lora_en2vi_sft \
    --direction en2vi \
    --input data/clean/test.en \
    --output outputs/test.hyp.vi \
    --batch_size 16 \
    --num_beams 4 \
    --repetition_penalty 1.1

!python scripts/eval_bleu.py \
    --hyp outputs/test.hyp.vi \
    --ref data/clean/test.vi \
    --src data/clean/test.en \
    --no_meteor --gemini --gemini_api_key <API> \
    --gemini_verbose --gemini_samples 100 --gemini_batch_size 10

In [None]:
# Test unseen
!python scripts/generate.py \
    --model_name Qwen/Qwen2.5-3B-Instruct \
    --adapter_path runs/qwen_final/lora_en2vi_sft \
    --direction en2vi \
    --input data/raw/test_unseen_v3.en.txt \
    --output outputs/test_unseen_v3.hyp.vi \
    --batch_size 16 \
    --num_beams 4 \
    --repetition_penalty 1.1

!python scripts/eval_bleu.py \
    --hyp outputs/test_unseen_v3.hyp.vi \
    --ref data/raw/test_unseen_v3.vi.txt \
    --src data/raw/test_unseen_v3.en.txt \
    --no_meteor --gemini --gemini_api_key <API> \
    --gemini_verbose --gemini_samples 100 --gemini_batch_size 10

In [None]:
# Public test
!python scripts/generate.py \
    --model_name Qwen/Qwen2.5-3B-Instruct \
    --adapter_path runs/qwen_final/lora_en2vi_sft \
    --direction en2vi \
    --input data/raw/public_test.en.txt \
    --output outputs/public_test.hyp.vi \
    --batch_size 16 \
    --num_beams 4 \
    --repetition_penalty 1.1

!python scripts/eval_bleu.py \
    --hyp outputs/public_test.hyp.vi \
    --ref data/raw/public_test.vi.txt \
    --src data/raw/public_test.en.txt \
    --no_meteor --gemini --gemini_api_key <API> \
    --gemini_verbose --gemini_samples 100 --gemini_batch_size 10

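Tương tự với 2 model RL và Back translation: chạy lại các cell đánh giá ở trên, thay `--adapter_path` bằng adapter của từng model và đổi `--output` để không ghi đè kết quả cũ.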