In [None]:
%pip install -q torch==2.5.1 transformers==4.45.2 datasets sentence-transformers peft accelerate trl==0.11.4 scikit-learn tensorboard

In [None]:
from pathlib import Path
import torch, os, json, random, ast
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorWithPadding,
    pipeline,
)
from transformers.trainer_utils import get_last_checkpoint
from peft import LoraConfig, get_peft_model, PeftModel
from trl import DPOTrainer, DPOConfig, SFTTrainer, SFTConfig
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import torch.nn.functional as F
from typing import Dict, Optional, List

# Restrict the process to GPU 0; set before the torch.cuda query below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"

# Every dataset, checkpoint and merged model in this notebook is written
# below this directory.
WORKDIR = Path("./doaug_artifacts")
WORKDIR.mkdir(parents=True, exist_ok=True)

# Use .get() so that a *missing* variable reaches the explicit check below
# instead of failing first with a bare KeyError.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Hugging Face Hub 토큰을 'HF_TOKEN' 환경 변수로 설정해주세요.")

# System prompt shared by the SFT data, the DPO prompts and inference.
SYSTEM_MESSAGE = "You are a helpful assistant that only paraphrases."

print(f"Using {DEVICE}")
print(f"Artifacts will be saved to: {WORKDIR}")

  from .autonotebook import tqdm as notebook_tqdm


Using cuda
Artifacts will be saved to: doaug_artifacts_v2


# 1️⃣ Supervised Fine‑Tuning (SFT)
---

In [2]:
print("\n--- 1. SFT Stage ---")

dsft_path = WORKDIR / "DSFT_100k.jsonl"
raw_all = load_dataset("humarin/chatgpt-paraphrases", split="train")

# `rng` and `dsft_sources` are read again by the DPO dataset-building cell,
# so they must exist on every path through this cell -- including the cached
# path, where they were previously left undefined.
rng = random.Random(321)
dsft_sources = set()

if dsft_path.exists():
    print(f"DSFT_100k.jsonl already exists at {dsft_path}. Skipping generation.")
    # Recover the source sentences used for SFT from the cached file so that
    # later stages can still exclude them.
    with open(dsft_path, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                dsft_sources.add(json.loads(line)["sentence"])
else:
    print("(i) Building DSFT_100k dataset...")

    shuffled_ds = raw_all.shuffle(seed=123)
    pairs = []
    pbar = tqdm(total=100_000, desc="Generating 100k SFT pairs")
    for ex in shuffled_ds:
        if len(pairs) >= 100_000:
            break
        orig = ex["text"]
        # Use each source sentence at most once.
        if orig in dsft_sources:
            continue
        dsft_sources.add(orig)
        pars = ex["paraphrases"]
        # The paraphrase list may arrive as its string repr; parse it and
        # skip the row when it is malformed.
        if isinstance(pars, str):
            try:
                pars = ast.literal_eval(pars)
            except (ValueError, SyntaxError):
                continue
        for para in pars:
            if len(pairs) >= 100_000:
                break
            pairs.append({"sentence": orig, "paraphrase": para})
            pbar.update(1)
    pbar.close()
    assert len(pairs) == 100_000

    # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
    with open(dsft_path, "w", encoding="utf-8") as f:
        for p in pairs:
            f.write(json.dumps(p, ensure_ascii=False) + "\n")
    print(f"DSFT saved to {dsft_path}")


--- 1. SFT Stage ---
DSFT_100k.jsonl already exists at doaug_artifacts_v2/DSFT_100k.jsonl. Skipping generation.


In [6]:
print("(ii) Tokenizing dataset for SFT...")
# Fast tokenizer of the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN, use_fast=True)
# Pad on the right, reusing EOS when the tokenizer defines no pad token.
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def format_and_mask_chat(example: Dict) -> Optional[Dict]:
    """Render one (sentence, paraphrase) pair as a chat and mask the prompt.

    The conversation (system, user, assistant) is tokenised twice with the
    module-level ``tokenizer``: once without the assistant turn but with the
    generation prompt, and once in full. The first ``len(prompt_ids)`` label
    positions are set to -100, leaving only the remaining tokens as targets.

    NOTE(review): this assumes the prompt-only token ids are a prefix of the
    full conversation's token ids -- confirm for the chat template in use.

    Args:
        example: Mapping with the keys ``"sentence"`` and ``"paraphrase"``.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels`` (all
        truncated to ``tokenizer.model_max_length``), or ``None`` when the
        prompt alone reaches that limit or no unmasked label remains.
    """
    prompt_messages = [
        {"role": "system", "content": SYSTEM_MESSAGE},
        {
            "role": "user",
            "content": f"You will be given a sentence. Please paraphrase the sentence.\nSentence: {example['sentence']}",
        },
    ]
    full_messages = prompt_messages + [
        {"role": "assistant", "content": example["paraphrase"]}
    ]

    prompt_ids = tokenizer.apply_chat_template(
        prompt_messages, tokenize=True, add_generation_prompt=True
    )
    full_ids = tokenizer.apply_chat_template(
        full_messages, tokenize=True, add_generation_prompt=False
    )

    limit = tokenizer.model_max_length
    n_prompt = len(prompt_ids)
    if n_prompt >= limit:
        # The prompt alone fills the context: nothing left to learn from.
        return None

    input_ids = full_ids[:limit]
    # Prompt positions are ignored by the loss (-100); the rest are targets.
    labels = ([-100] * n_prompt + full_ids[n_prompt:])[:limit]
    if not any(tok != -100 for tok in labels):
        return None

    # `labels` is never shorter than `input_ids` here, so no extra padding
    # of the labels is required.
    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        "labels": labels,
    }


dsft = load_dataset("json", data_files=str(dsft_path))["train"]


def _tokenize_sft_batch(batch: Dict[str, List]) -> Dict[str, List]:
    """Tokenise a batch of rows, omitting those ``format_and_mask_chat`` rejects.

    A non-batched ``Dataset.map`` cannot drop a row by returning ``None``;
    a batched map that returns fewer rows than it received can, so the
    filtering is done here.

    Args:
        batch: Column-wise batch with ``"sentence"`` and ``"paraphrase"`` lists.

    Returns:
        Column-wise dict with ``input_ids``, ``attention_mask`` and ``labels``
        for the rows that were kept.
    """
    kept = {"input_ids": [], "attention_mask": [], "labels": []}
    for sentence, paraphrase in zip(batch["sentence"], batch["paraphrase"]):
        encoded = format_and_mask_chat({"sentence": sentence, "paraphrase": paraphrase})
        if encoded is None:
            continue
        for key in kept:
            kept[key].append(encoded[key])
    return kept


# remove_columns drops the original text columns, which is what allows the
# mapped batch to have a different number of rows than its input.
tokenized_dsft = dsft.map(
    _tokenize_sft_batch, batched=True, remove_columns=dsft.column_names
)
print(
    f"Tokenized SFT dataset created. {len(dsft) - len(tokenized_dsft)} examples were filtered out."
)

(ii) Tokenizing dataset for SFT...


Map: 100%|██████████| 100000/100000 [00:40<00:00, 2441.60 examples/s]

Tokenized SFT dataset created. 0 examples were filtered out.





In [None]:
print("(iii) Preparing model and training SFT LoRA adapter...")


class ChatDataCollator:
    """Collate pre-tokenised chat examples, padding ``labels`` with -100.

    ``DataCollatorWithPadding`` pads the model inputs; the labels are padded
    here to the same width with -100 so padded positions are ignored by the
    loss. Labels are padded on the same side as the tokenizer pads inputs.
    """

    def __init__(self, tokenizer, padding="longest"):
        """Store the input padder and the tokenizer's padding side.

        Args:
            tokenizer: Tokenizer handed to ``DataCollatorWithPadding``.
            padding: Padding strategy forwarded to ``DataCollatorWithPadding``.
        """
        self.pad = DataCollatorWithPadding(tokenizer, padding=padding)
        # Labels must be padded on the same side as the inputs to stay aligned.
        self.pad_left = getattr(tokenizer, "padding_side", "right") == "left"

    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
        """Build one padded batch.

        Args:
            features: Examples holding ``input_ids``, ``attention_mask`` and a
                list-valued ``labels`` entry. The dicts are not modified.

        Returns:
            The padded batch with an added ``labels`` tensor (dtype long) whose
            width equals that of ``batch["input_ids"]``.
        """
        # Copy instead of pop(): the caller's feature dicts stay intact.
        labels = [list(f["labels"]) for f in features]
        inputs = [{k: v for k, v in f.items() if k != "labels"} for f in features]

        batch = self.pad(inputs)

        max_len = batch["input_ids"].size(1)
        pad_left = getattr(self, "pad_left", False)
        padded = []
        for seq in labels:
            filler = [-100] * (max_len - len(seq))
            padded.append(filler + seq if pad_left else seq + filler)
        batch["labels"] = torch.tensor(padded, dtype=torch.long)
        return batch


collator = ChatDataCollator(tokenizer, padding="longest")

# Paper spec (Appendix C): Llama-3.2-1B-Instruct loaded in BF16.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    token=HF_TOKEN,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Paper spec (Appendix C): LoRA rank r = 8, applied to the attention projections.
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()
model.config.use_cache = False

# SFT-stage hyperparameters from Appendix C of the paper.
_sft_hparams = dict(
    output_dir=str(WORKDIR / "sft"),
    # optimisation
    optim="adamw_torch",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    bf16=True,
    max_seq_length=tokenizer.model_max_length,
    # logging and checkpointing
    logging_steps=100,
    save_strategy="epoch",
    save_total_limit=3,
)
sft_cfg = SFTConfig(**_sft_hparams)

trainer = SFTTrainer(
    model=model,
    args=sft_cfg,
    train_dataset=tokenized_dsft,
    data_collator=collator,
    tokenizer=tokenizer,
)

# Continue from the newest checkpoint in the output directory if one is found.
_sft_resume_from = get_last_checkpoint(sft_cfg.output_dir)
trainer.train(resume_from_checkpoint=_sft_resume_from)
print("SFT training finished.")

In [11]:
print("(iv) Merging SFT LoRA adapter and saving the final model...")
# Drop the training objects before loading a fresh copy of the base model.
del trainer, model
torch.cuda.empty_cache()

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    token=HF_TOKEN,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Order the checkpoint directories by the step number after the last "-".
ckpts = list(Path(sft_cfg.output_dir).glob("checkpoint-*"))
ckpts.sort(key=lambda ckpt: int(ckpt.name.rsplit("-", 1)[1]))
if len(ckpts) == 0:
    raise ValueError("No SFT checkpoint found.")
last_checkpoint_path = ckpts[-1]

# Attach the adapter from the newest checkpoint, then fold it into the base weights.
sft_model = PeftModel.from_pretrained(
    base_model, str(last_checkpoint_path)
).merge_and_unload()

# Save the merged model and its tokenizer side by side.
sft_merged_dir = WORKDIR / "sft_merged"
sft_model.save_pretrained(sft_merged_dir)
tokenizer.save_pretrained(sft_merged_dir)
print(f"SFT-merged model saved to: {sft_merged_dir}")

del base_model, sft_model
torch.cuda.empty_cache()

(iv) Merging SFT LoRA adapter and saving the final model...
SFT-merged model saved to: doaug_artifacts_v2/sft_merged


# 2️⃣ Direct Preference Optimization (DPO)
---

In [3]:
print("\n--- 2. DPO Stage ---")
ddpo_path = WORKDIR / "DDPO_50k.jsonl"

if ddpo_path.exists():
    print(f"DDPO_50k already exists at {ddpo_path}. Skipping generation.")
else:
    # NOTE: this branch reads `raw_all`, `dsft_sources`, `rng` and `tokenizer`
    # from earlier cells; they must have been run in this kernel.
    print("(i) Building DDPO_50k dataset...")
    EMB_MODEL = "sentence-transformers/all-MPNet-base-v2"
    embedder = SentenceTransformer(EMB_MODEL, device=DEVICE)
    # Keep only sources that were not used for SFT, then sample 50k of them.
    raw_ddpo_candidates = [ex for ex in raw_all if ex["text"] not in dsft_sources]
    raw_ddpo = rng.sample(raw_ddpo_candidates, 50_000)

    prefs = []
    BATCH = 64
    for i in tqdm(range(0, len(raw_ddpo), BATCH), desc="Building DPO dataset"):
        chunk = raw_ddpo[i : i + BATCH]
        sentences = [ex["text"] for ex in chunk]

        # Parse paraphrase lists stored as string reprs. A malformed row is
        # recorded as None and skipped below (same policy as the SFT build)
        # instead of aborting the whole run.
        paraphrase_lists = []
        for ex in chunk:
            plist = ex["paraphrases"]
            if isinstance(plist, str):
                try:
                    plist = ast.literal_eval(plist)
                except (ValueError, SyntaxError):
                    plist = None
            paraphrase_lists.append(plist)

        # Flatten [source, paraphrase_1, ..., paraphrase_k] for every usable
        # row so the whole chunk is embedded in a single encode() call.
        flat, valid_indices = [], []
        for j, (src, plist) in enumerate(zip(sentences, paraphrase_lists)):
            if isinstance(plist, list) and len(plist) >= 2:
                flat.append(src)
                flat.extend(plist)
                valid_indices.append(j)
        if not flat:
            continue
        # L2-normalised embeddings: the dot product below is cosine similarity.
        embs = F.normalize(
            embedder.encode(flat, convert_to_tensor=True, device=DEVICE), p=2, dim=1
        )
        idx = 0
        for j in valid_indices:
            src, plist = sentences[j], paraphrase_lists[j]
            src_emb = embs[idx]
            par_embs = embs[idx + 1 : idx + 1 + len(plist)]
            idx += 1 + len(plist)
            # Cosine distance of every paraphrase to its source sentence.
            dists = 1 - (par_embs @ src_emb)
            if dists.numel() < 2:
                continue
            # chosen = paraphrase farthest from the source, rejected = closest.
            iw, il = dists.argmax().item(), dists.argmin().item()
            if iw == il:
                continue
            chosen, rejected = plist[iw], plist[il]
            prompt_chat = [
                {"role": "system", "content": SYSTEM_MESSAGE},
                {
                    "role": "user",
                    "content": f"You will be given a sentence. Please paraphrase the sentence.\nSentence: {src}",
                },
            ]
            prompt_str = tokenizer.apply_chat_template(
                prompt_chat, tokenize=False, add_generation_prompt=True
            )
            prefs.append({"prompt": prompt_str, "chosen": chosen, "rejected": rejected})

    with ddpo_path.open("w", encoding="utf-8") as f:
        for p in prefs:
            f.write(json.dumps(p, ensure_ascii=False) + "\n")
    print(f"DPO dataset saved. Size: {len(prefs)}")


--- 2. DPO Stage ---
DDPO_50k already exists at doaug_artifacts_v2/DDPO_50k.jsonl. Skipping generation.


In [6]:
print("(ii) Preparing model for DPO training...")
sft_dir = WORKDIR / "sft_merged"

# Tokenizer saved next to the merged SFT model; same padding setup as for SFT.
tokenizer = AutoTokenizer.from_pretrained(sft_dir, token=HF_TOKEN, use_fast=True)
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# LoRA settings for the DPO stage (rank 8 on the attention projections).
lora_cfg_dpo = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
)

# Policy: the merged SFT model wrapped with a new LoRA adapter.
model = AutoModelForCausalLM.from_pretrained(
    sft_dir,
    token=HF_TOKEN,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model = get_peft_model(model, lora_cfg_dpo)

# Reference: a second copy of the merged SFT model with gradients disabled.
ref_model = AutoModelForCausalLM.from_pretrained(
    sft_dir,
    token=HF_TOKEN,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
ref_model.requires_grad_(False)

(ii) Preparing model for DPO training...


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [7]:
print("(iii) Starting DPO training...")
ddpo_dataset = load_dataset("json", data_files=str(ddpo_path))["train"]

dpo_config = DPOConfig(
    output_dir=str(WORKDIR / "dpo"),
    # DPO-specific settings
    beta=0.1,
    max_prompt_length=128,
    max_length=256,
    # optimisation
    learning_rate=5e-6,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    bf16=True,
    # logging and checkpointing
    logging_steps=100,
    report_to="tensorboard",
    save_strategy="epoch",
    save_total_limit=3,
)

dpo_trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,
    args=dpo_config,
    train_dataset=ddpo_dataset,
    tokenizer=tokenizer,
)

# Continue from the newest checkpoint in the output directory if one is found.
_dpo_resume_from = get_last_checkpoint(dpo_config.output_dir)
dpo_trainer.train(resume_from_checkpoint=_dpo_resume_from)
print("DPO training finished.")

(iii) Starting DPO training...


Tokenizing train dataset: 100%|██████████| 50000/50000 [00:36<00:00, 1388.28 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
100,0.6931
200,0.6925
300,0.6832
400,0.6514
500,0.5682
600,0.4244
700,0.3473
800,0.3413
900,0.3371
1000,0.32


DPO training finished.


In [8]:
print("(iv) Merging DPO adapter and saving the final model...")
# Release the trainer, the policy and the reference model before loading
# another copy of the SFT model; the reference model is not used after training.
del dpo_trainer, model, ref_model
torch.cuda.empty_cache()

# Fail with a clear message when training produced no checkpoint, mirroring
# the check done when merging the SFT adapter.
dpo_checkpoint = get_last_checkpoint(dpo_config.output_dir)
if dpo_checkpoint is None:
    raise ValueError("No DPO checkpoint found.")

# Load the merged SFT model, attach the DPO adapter, and fold it in.
final_model_peft = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        sft_dir, torch_dtype=torch.bfloat16, device_map="auto", token=HF_TOKEN
    ),
    dpo_checkpoint,
)
final_model = final_model_peft.merge_and_unload()
final_dir = WORKDIR / "doaug_paraphraser"
final_model.save_pretrained(final_dir)
tokenizer.save_pretrained(final_dir)
print(f"DPO-finished model saved to {final_dir}")
del final_model, final_model_peft
torch.cuda.empty_cache()

(iv) Merging DPO adapter and saving the final model...
DPO-finished model saved to doaug_artifacts_v2/doaug_paraphraser


# 3️⃣ Quick Inference Check
---

In [9]:
print("\n--- 3. Inference Check ---")
# Text-generation pipeline over the final merged model, on GPU 0 in BF16.
paraphraser = pipeline(
    task="text-generation",
    model=str(final_dir),
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device=0,
)


def paraphrase(sentence: str):
    """Generate one sampled paraphrase of ``sentence`` with the final model.

    The prompt uses the same system message and user instruction as the
    training data, rendered through the tokenizer's chat template.

    Args:
        sentence: The sentence to paraphrase.

    Returns:
        The generated paraphrase with surrounding whitespace removed.
    """
    prompt_chat = [
        {"role": "system", "content": SYSTEM_MESSAGE},
        {
            "role": "user",
            "content": f"You will be given a sentence. Please paraphrase the sentence.\nSentence: {sentence}",
        },
    ]
    prompt = tokenizer.apply_chat_template(
        prompt_chat, tokenize=False, add_generation_prompt=True
    )
    # The rendered template already starts with the BOS token, and the
    # pipeline tokenizes the string again with special tokens enabled, which
    # would prepend a second BOS that the model never saw during training.
    # NOTE(review): relies on the tokenizer adding BOS by default -- confirm
    # for the installed transformers version.
    bos = tokenizer.bos_token
    if bos and prompt.startswith(bos):
        prompt = prompt[len(bos):]

    outputs = paraphraser(
        prompt,
        max_new_tokens=64,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        # Return only the completion rather than locating it by splitting
        # the full text on the prompt string.
        return_full_text=False,
    )
    completion = outputs[0]["generated_text"]
    # Cut at an EOS marker in case one survives decoding.
    return completion.split(tokenizer.eos_token, 1)[0].strip()


--- 3. Inference Check ---


In [10]:
# Single-sentence smoke test of the finished paraphraser.
test_sentence = "A single candle lit the dark, quiet room."
print("\nTest Paraphrasing:")
print(f"Original: {test_sentence}")

paraphrased_text = paraphrase(test_sentence)
print(f"Paraphrased: {paraphrased_text}")


Test Paraphrasing:
Original: A single candle lit the dark, quiet room.
Paraphrased: The ambiance was illuminated by a solitary source of illumination.
