In [82]:
# Log in to the Hugging Face Hub from inside the notebook (renders an interactive
# login widget). Later cells push a dataset and model files to the Hub.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
import os

# Directory passed as `cache_dir` to the Hugging Face loaders in the cells below.
# Set the HF_CACHE_DIR environment variable to point it somewhere else on another
# machine; when it is unset (or empty) the original location is used unchanged.
cache_dir = os.environ.get("HF_CACHE_DIR") or "/data/csy/huggingface"

In [23]:
# Load the LIMA dataset; its instructions are extracted in the following cells.
from datasets import load_dataset

lima = load_dataset(
    path="GAIR/lima",
    cache_dir=cache_dir,
)

In [24]:
import json

# Normalise every conversation of the train split: entries stored as JSON text
# are decoded, anything else is kept exactly as it came from the dataset.
parsed_conversations = []
for raw_conv in lima["train"]["conversations"]:
    if isinstance(raw_conv, str):
        parsed_conversations.append(json.loads(raw_conv))
    else:
        parsed_conversations.append(raw_conv)

In [25]:
def _first_turn_text(conv):
    """Return the text of the first turn of one conversation, or "" if there is none.

    Two first-turn layouts are accepted:
      * a plain string (the turn text itself), and
      * a mapping with a "content" key (chat-message style).

    The previous code only handled the mapping layout: a plain-string first turn
    was indexed with ["content"], raised TypeError, and was silently replaced by
    "" -- which turns every instruction into an empty string when the turns are
    plain strings.
    """
    if isinstance(conv, str):
        # A bare string is not a list of turns; keep treating it as invalid.
        return ""
    try:
        first_turn = conv[0]
    except (KeyError, IndexError, TypeError):
        return ""
    if isinstance(first_turn, str):
        return first_turn
    try:
        return first_turn["content"]
    except (KeyError, IndexError, TypeError):
        return ""


# One entry per conversation, in order; invalid entries are kept as "" so the
# list stays aligned with parsed_conversations.
instructions = [_first_turn_text(conv) for conv in parsed_conversations]

# Surface extraction problems instead of hiding them behind empty strings.
_n_empty_instructions = sum(1 for text in instructions if text == "")
if _n_empty_instructions:
    print(f"Warning: {_n_empty_instructions} of {len(instructions)} conversations yielded no instruction")

In [26]:
# Sample 50 instructions: wrap the list in a Dataset, shuffle it with a fixed
# seed, then keep the first 50 rows of the shuffled order.
from datasets import Dataset

instruction_pool = Dataset.from_dict({"instruction": instructions})
shuffled_pool = instruction_pool.shuffle(seed=42)
sampled_dataset = shuffled_pool.select(range(50))

In [27]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Base model used both for candidate generation and as the DPO starting point.
# Tokenizer and weights are fetched through the shared cache directory.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
hub_kwargs = {"cache_dir": cache_dir}

tokenizer = AutoTokenizer.from_pretrained(model_name, **hub_kwargs)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", **hub_kwargs)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [35]:
# Use mistralai/Mistral-7B-Instruct-v0.2 to generate 5 responses for each instruction.
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm.auto import tqdm

# 1. Reuse the `tokenizer` and `model` loaded in the previous cell. Loading them
#    again here (and without cache_dir) fetched the checkpoint into the default
#    cache a second time and loaded the 7B weights twice.

# 2. Tell tokenizer & model which token to use for padding, then switch to eval mode
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
model.eval()

# Seed torch's RNG so the sampled candidates are reproducible across runs.
torch.manual_seed(42)

# 3. Generate 5 responses for each instruction
all_responses = []
with torch.no_grad():
    for instruction in tqdm(sampled_dataset["instruction"], desc="Generating"):
        # 3.1 Render the single-turn chat prompt and tokenize it
        input_ids = tokenizer.apply_chat_template(
            [{"role": "user", "content": instruction}],
            return_tensors="pt",
            truncation=True,
            max_length=512,
        ).to(model.device)

        # 3.2 The batch holds one unpadded prompt, so every position is attended to.
        #     (Comparing against pad_token_id is unsafe here because pad == eos.)
        attention_mask = torch.ones_like(input_ids)
        prompt_len = input_ids.shape[1]

        # 3.3 Generate 5 samples in one call
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            num_return_sequences=5,
            pad_token_id=tokenizer.pad_token_id,
        )

        # 3.4 Decode only the newly generated tokens. Each returned sequence starts
        #     with the prompt, which must not become part of the candidate response.
        responses = [
            tokenizer.decode(sequence[prompt_len:], skip_special_tokens=True).strip()
            for sequence in outputs
        ]
        all_responses.append(responses)

# now all_responses is a list of lists of strings, shape (num_instructions, 5)




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Generating:   0%|          | 0/50 [00:00<?, ?it/s]

In [73]:
import llm_blender

# Note: the method is spelled `loadranker`, not `load_ranker`.
ranker_checkpoint = "llm-blender/PairRM"

blender = llm_blender.Blender()
blender.loadranker(ranker_checkpoint, from_tf=True)



Successfully loaded ranker from  /home/csy/.cache/huggingface/hub/llm-blender/PairRM


In [79]:
# Use PairRM to create a preference dataset
from datasets    import Dataset
from tqdm.auto   import tqdm

# --- This cell assumes the following already exist:
# sampled_dataset: a Dataset (or dict) holding the sampled instructions
# all_responses:   a list of lists; all_responses[i] holds the candidate responses
#                  generated for the i-th instruction
# blender:         the llm_blender.Blender with the PairRM ranker loaded

preference_data = []
skipped_instructions = 0  # instructions that produced no usable (chosen, rejected) pair

# For every instruction, compare its candidates pairwise and count wins.
for instr, candidates in tqdm(
    zip(sampled_dataset["instruction"], all_responses),
    total=len(sampled_dataset),
    desc="Building preference data"
):
    n = len(candidates)  # expected to be 5
    if n < 2:
        # Nothing to compare (and max()/min() below would fail on an empty range).
        skipped_instructions += 1
        continue
    wins = [0] * n

    # Enumerate every combination with i < j.
    pairs = [(i, j) for i in range(n) for j in range(i + 1, n)]

    # One batched call for all pairs of this instruction instead of one call per
    # pair. The result is indexed per pair, exactly like the original `[0]` access
    # on a batch of one; a truthy entry means the first candidate of the pair won.
    verdicts = blender.compare(
        [instr] * len(pairs),
        [candidates[i] for i, _ in pairs],
        [candidates[j] for _, j in pairs],
    )

    # The winner of each comparison gets one point.
    for (i, j), first_is_better in zip(pairs, verdicts):
        if first_is_better:
            wins[i] += 1
        else:
            wins[j] += 1

    # Pick the candidates with the most / fewest wins.
    best_i = max(range(n), key=wins.__getitem__)
    worst_j = min(range(n), key=wins.__getitem__)

    # If every candidate has the same win count, max() and min() both return
    # index 0, which would emit a pair whose chosen and rejected texts are the
    # same response. Identical texts carry no preference signal either.
    if wins[best_i] == wins[worst_j] or candidates[best_i] == candidates[worst_j]:
        skipped_instructions += 1
        continue

    preference_data.append({
        "instruction": instr,
        "chosen":     candidates[best_i],
        "rejected":   candidates[worst_j],
    })

if skipped_instructions:
    print(f"Skipped {skipped_instructions} instruction(s) without a usable preference pair")




Building preference data:   0%|          | 0/50 [00:00<?, ?it/s]

Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 16.24it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.26it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.09it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.08it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.23it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.20it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.27it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 16.77it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.15it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.14it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.21it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 16.56it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 17.14it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 16.90it/s]
Ranking candidates: 100%|██████████| 1/1 [00:00<00:00, 16.46it/s]
Ranking ca

In [83]:
# 4. Convert the preference records to a HuggingFace Dataset and upload it
preference_columns = ("instruction", "chosen", "rejected")
ds = Dataset.from_dict(
    {
        column: [record[column] for record in preference_data]
        for column in preference_columns
    }
)
ds.push_to_hub("ShuyanCHEN/DSAA6000_assignment4_PairRM")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ShuyanCHEN/DSAA6000_assignment4_PairRM/commit/11fc854a0df5683384233ce2e20c1f373ae31506', commit_message='Upload dataset', commit_description='', oid='11fc854a0df5683384233ce2e20c1f373ae31506', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ShuyanCHEN/DSAA6000_assignment4_PairRM', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ShuyanCHEN/DSAA6000_assignment4_PairRM'), pr_revision=None, pr_num=None)

In [None]:
# Use DPO to fine-tune mistralai/Mistral-7B-Instruct-v0.2
from transformers import TrainingArguments
from trl import DPOTrainer
import pandas as pd
import torch

# Hold out 20% of the preference pairs as instructions the model never sees during
# training (10 instructions when all 50 pairs are present); completions for them
# are generated after training.
# `ds` is the preference dataset built in the upload cell; the name `dataset` used
# here before is never defined in this notebook and raised NameError on a fresh kernel.
dpo_dataset = ds.train_test_split(
    test_size=0.2,
    shuffle=True,
    seed=42
)

In [None]:
# Hyper-parameters for the DPO fine-tuning run.
training_args = TrainingArguments(
    output_dir="./dpo_model",
    # optimisation
    num_train_epochs=2,
    learning_rate=5e-6,
    optim="adamw_torch",
    fp16=True,
    # batching: 2 examples per device, gradients accumulated over 2 steps
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    # logging / evaluation
    logging_steps=10,
    evaluation_strategy="no",
    report_to="none",
)

In [None]:
# Fine-tune the model with DPO on the training split of the preference data.

# DPOTrainer reads the columns "prompt", "chosen" and "rejected", while the
# preference dataset stores the prompt under "instruction". Expose it under the
# expected name for training only; dpo_dataset itself is left untouched so later
# cells can still read dpo_dataset["test"]["instruction"].
dpo_train_split = dpo_dataset["train"]
if "prompt" not in dpo_train_split.column_names:
    dpo_train_split = dpo_train_split.rename_column("instruction", "prompt")

# NOTE(review): no peft_config is passed, so this fine-tunes the model's own
# weights, while the last cell of the notebook talks about pushing a PEFT
# adapter -- confirm which of the two is intended.
dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None,  # no explicit reference model; the trainer is left to derive one
    args=training_args,
    train_dataset=dpo_train_split,
    tokenizer=tokenizer,
    beta=0.1,
    max_length=512,
)
dpo_trainer.train()

In [None]:

# First 10 held-out instructions, plus one accumulator per model for its completions.
test_instructions = dpo_dataset["test"]["instruction"][:10]

original_outputs, dpo_outputs = [], []


def generate_response(instruction, model):
    """Sample one completion for `instruction` from `model`.

    The instruction is wrapped as a single user turn with the notebook-level
    `tokenizer`'s chat template and moved to the model's device before sampling.

    Args:
        instruction: The user instruction text.
        model: The causal language model to sample from.

    Returns:
        The decoded completion only. The returned sequence starts with the prompt
        tokens; they are sliced off so the result does not repeat the instruction
        (the previous version returned prompt and completion together).
    """
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": instruction}],
        return_tensors="pt"
    ).to(model.device)

    output_ids = model.generate(
        input_ids,
        # Single unpadded prompt: attend to every position.
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        # Same padding choice as the candidate-generation cell (pad == eos).
        pad_token_id=tokenizer.eos_token_id,
    )

    completion_ids = output_ids[0][input_ids.shape[1]:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()


# Load the untouched base model ONCE, before the loop. It was previously
# re-created inside the loop, i.e. the full checkpoint was loaded again for every
# test instruction. cache_dir makes it reuse the files the first load cached.
original_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    device_map="auto",
    cache_dir=cache_dir,
)

# For each held-out instruction, collect one completion from the base model and
# one from the DPO fine-tuned model, in the same order as test_instructions.
for instr in test_instructions:
    original_outputs.append(generate_response(instr, original_model))
    dpo_outputs.append(generate_response(instr, dpo_trainer.model))


In [None]:
# Show instruction, original-model completion and DPO-model completion side by
# side as a pandas dataframe.
comparison_columns = {
    "Instruction": test_instructions,
    "Original Model": original_outputs,
    "DPO Model": dpo_outputs,
}
df = pd.DataFrame(comparison_columns)

# Widen pandas' display limits so long completions are not cut off.
for option_name, option_value in (
    ("display.max_colwidth", 200),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

# Print the dataframe to stdout as a markdown table
print(df.to_markdown(index=False))



In [None]:
# Save the DPO fine-tuned model locally and push the folder to the Hugging Face Hub.
# NOTE(review): the folder and this cell are named after a PEFT adapter, but no
# PEFT config is visible where the trainer is built -- confirm that what gets
# saved here is really an adapter and not the full model.
from huggingface_hub import HfApi

adapter_dir = "mistral-7b-dpo-adapter"
hub_repo_id = "ShuyanCHEN/DSAA6000_assignment4"

dpo_trainer.model.save_pretrained(adapter_dir)

api = HfApi()
# upload_folder does not create the target repository; make sure it exists first.
# exist_ok=True keeps this a no-op when the repo was already created.
api.create_repo(repo_id=hub_repo_id, repo_type="model", exist_ok=True)
api.upload_folder(
    folder_path=adapter_dir,
    repo_id=hub_repo_id,
    repo_type="model"
)

print(f"\nhttps://huggingface.co/{hub_repo_id}")
