***下载数据集***

In [None]:
from huggingface_hub import snapshot_download

import os

# Download only the Java portion (files matching data/java/*) of the
# bigcode/the-stack dataset repository into ./the-stack-java.
#
# The access token is read from the HF_TOKEN environment variable instead of
# being written into the notebook, so a real secret never ends up in a saved or
# shared copy. When HF_TOKEN is unset, None is passed and huggingface_hub falls
# back to the credentials stored by `huggingface-cli login`.
data_dir = snapshot_download(
    repo_id="bigcode/the-stack",
    repo_type="dataset",
    allow_patterns="data/java/*",
    local_dir="./the-stack-java",
    token=os.environ.get("HF_TOKEN"),
)
print(f"数据已下载到: {data_dir}")

***制作SFT数据集***

In [None]:
import json
from datasets import load_dataset, Dataset
import random
import os

def generate_fim_samples(code, rng=None):
    """Cut ``code`` into one fill-in-the-middle (FIM) sample.

    The text is split on ``'\\n'``; a random span of lines becomes the
    "middle" that has to be completed, everything before it is the "prefix"
    and everything after it is the "suffix".

    Args:
        code: Source text to split.
        rng: Optional random source exposing ``randint(a, b)`` (for example a
            seeded ``random.Random``). Pass one to make the generated dataset
            reproducible. Defaults to the global ``random`` module, which is
            what this function always used before the parameter existed.

    Returns:
        A dict with the string keys ``"prefix"``, ``"middle"`` and
        ``"suffix"``, or ``None`` when ``code`` has fewer than 10 lines.
    """
    rng = random if rng is None else rng

    lines = code.split('\n')
    if len(lines) < 10:
        return None

    # The hole starts no earlier than line index 3 and no later than 5 lines
    # before the end, so there are always at least 3 lines of prefix.
    start_line = rng.randint(3, len(lines) - 5)
    # The hole spans 2-6 lines, clamped to the end of the file; when it reaches
    # the last line the suffix is the empty string.
    end_line = min(start_line + rng.randint(2, 6), len(lines))

    return {
        "prefix": "\n".join(lines[:start_line]),
        "middle": "\n".join(lines[start_line:end_line]),
        "suffix": "\n".join(lines[end_line:])
    }

# Load the "train" split eagerly (non-streaming mode, so it can be split later).
dataset = load_dataset("/root/autodl-tmp/fim_dataset/group_2", split="train")

# Try to cut one FIM sample out of every record whose "content" is longer than
# 10 characters; generate_fim_samples returns None for files that are too
# short, and those are dropped.
fim_candidates = (
    generate_fim_samples(record["content"])
    for record in dataset
    if len(record["content"]) > 10
)
fim_samples = [candidate for candidate in fim_candidates if candidate]

def format_for_llama_factory(s):
    """Turn one FIM sample into an instruction/input/output record.

    Args:
        s: Mapping with the keys ``'prefix'``, ``'suffix'`` and ``'middle'``.

    Returns:
        A dict with three keys, in this order: ``"instruction"`` (a fixed task
        description), ``"input"`` (prefix and suffix wrapped in the
        ``<|fim_begin|>`` / ``<|fim_hole|>`` / ``<|fim_end|>`` markers) and
        ``"output"`` (the middle part to be predicted).
    """
    fim_prompt = "<|fim_begin|>{}<|fim_hole|>{}<|fim_end|>".format(
        s['prefix'], s['suffix']
    )

    record = {}
    record["instruction"] = "Complete the Java code between <|fim_begin|> and <|fim_end|> markers"
    record["input"] = fim_prompt
    record["output"] = s['middle']
    return record

# Convert every FIM sample to an instruction/input/output record and hold out
# 5% of the records as a test split (fixed seed, so the split is repeatable).
formatted_data = [format_for_llama_factory(s) for s in fim_samples]
dataset = Dataset.from_list(formatted_data)
dataset = dataset.train_test_split(test_size=0.05, seed=42)

output_dir = "/root/autodl-tmp/LLaMA-Factory/data"
os.makedirs(output_dir, exist_ok=True)

# Single source of truth for the output file names: the registry entry written
# below must name exactly the file that is saved here.
train_file_name = "train2.json"
test_file_name = "test2.json"

# Save both splits explicitly as JSON files (key step).
dataset["train"].to_json(os.path.join(output_dir, train_file_name))
dataset["test"].to_json(os.path.join(output_dir, test_file_name))

# Register the training file under the name "java_fim" in dataset_info.json.
# An existing registry is loaded and updated rather than overwritten, so
# datasets that are already registered in this directory are kept.
dataset_info_path = os.path.join(output_dir, "dataset_info.json")
if os.path.exists(dataset_info_path):
    with open(dataset_info_path, "r", encoding="utf-8") as f:
        dataset_info = json.load(f)
else:
    dataset_info = {}

dataset_info["java_fim"] = {
    "file_name": train_file_name,
    # NOTE(review): "prompt" is mapped to the "input" field and "query" to the
    # "instruction" field -- confirm this ordering is intended.
    "columns": {"prompt": "input", "query": "instruction", "response": "output"},
}

with open(dataset_info_path, "w", encoding="utf-8") as f:
    json.dump(dataset_info, f, indent=2, ensure_ascii=False)

***制作DPO数据集***

In [None]:
import json

# Build a DPO preference dataset from an evaluation run: for every record the
# reference answer ("label") becomes the chosen response and the model's own
# prediction ("predict") becomes the rejected response.
new_dataset = []

# Path of the JSONL file to convert.
jsonl_file_path = "/root/autodl-tmp/LLaMA-Factory/saves/DeepSeek-Coder-6.7B-Base/lora/eval_2025-05-11-11-18-32/generated_predictions.jsonl"

# Number of records dropped because the prediction already equals the label.
skipped_identical = 0

# Read the JSONL file one line at a time.
with open(jsonl_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline at the end of the
        # file); json.loads would raise on them.
        if not line.strip():
            continue
        data = json.loads(line)
        # A pair whose chosen and rejected texts are identical expresses no
        # preference, so it is left out of the DPO data.
        if data["predict"] == data["label"]:
            skipped_identical += 1
            continue
        # One preference record: the prompt as the human turn, followed by the
        # chosen and the rejected answer.
        new_data = {
            "conversations": [
                {
                    "from": "human",
                    "value": data["prompt"]
                }
            ],
            "chosen": {
                "from": "gpt",
                "value": data["label"]
            },
            "rejected": {
                "from": "gpt",
                "value": data["predict"]
            }
        }
        new_dataset.append(new_data)

print(f"DPO pairs kept: {len(new_dataset)}, skipped (predict == label): {skipped_identical}")

# Save the assembled dataset as a single JSON file.
output_file_path = "/root/autodl-tmp/LLaMA-Factory/data/dpo_train.json"
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    json.dump(new_dataset, output_file, indent=4)