In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset
import json

# Load the "train" split of the tatsu-lab/alpaca dataset.
ds = load_dataset("tatsu-lab/alpaca")["train"]

# Stream the split to clean.jsonl, one JSON object per line. Each record keeps
# only the example's "instruction" (stored as "prompt") and "output" (stored as
# "answer"); ensure_ascii=False writes non-ASCII characters verbatim.
# NOTE(review): any other fields carried by the examples are not exported —
# confirm that dropping them is intended.
with open("clean.jsonl", "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(
            {"prompt": example["instruction"], "answer": example["output"]},
            ensure_ascii=False,
        ) + "\n"
        for example in ds
    )

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import gzip
import json
import re

def extract_human_transcript(transcript):
    """Return the text of every human turn in ``transcript``.

    A turn starts at a ``Human:`` marker and runs until the next line that
    begins with a ``Human:`` or ``Assistant:`` marker, or until the end of the
    string. Multi-line messages are therefore kept whole, and a final turn
    that is not followed by a newline is still captured. Each turn is stripped
    of surrounding whitespace; turns are joined with a single newline.

    NOTE(review): the ``Assistant:`` speaker label is assumed from the usual
    Human/Assistant dialogue layout — confirm against the actual data.

    Args:
        transcript: The raw dialogue text.

    Returns:
        The human turns joined by ``"\\n"``; an empty string when no
        ``Human:`` marker is present.
    """
    human_turns = re.findall(
        # [ \t]* (not \s*) so an empty human turn cannot swallow the blank
        # line that follows it and capture the next speaker's text instead.
        # The lookahead ends the turn without consuming the next marker.
        r'Human:[ \t]*(.*?)(?=\n\s*(?:Human|Assistant):|\Z)',
        transcript,
        re.DOTALL,
    )
    return "\n".join(turn.strip() for turn in human_turns).strip()

red_prompts = []

# Decompress the gzip archive and parse its entire contents as one JSON document.
# Note: despite the .jsonl extension, the file as a whole is JSON, not JSONL.
with gzip.open("data/red-team-attempts/red_team_attempts.jsonl.gz", "rt", encoding="utf-8") as f:
    data = json.load(f)

for item in data:
    # Each top-level entry may be a list of records or a single record (dict);
    # anything else is ignored.
    if isinstance(item, list):
        records = item
    elif isinstance(item, dict):
        records = [item]
    else:
        continue

    # Keep only records that carry a transcript; the answer is a fixed placeholder.
    red_prompts.extend(
        {"prompt": extract_human_transcript(record["transcript"]), "answer": "[NO OUTPUT]"}
        for record in records
        if "transcript" in record
    )

print(f"[OK] Extracted {len(red_prompts)} red prompts.")

[OK] Extracted 38961 red prompts.


In [4]:
import random
from pathlib import Path

def mix_clean_and_red(clean_path, red_data, ratios, output_dir, seed=None):
    """Write one shuffled clean+red JSONL file per mixing ratio.

    For every ``rho`` in ``ratios`` the full clean set is combined with
    ``floor(len(clean) * rho)`` records sampled without replacement from
    ``red_data`` (capped at ``len(red_data)``), shuffled, and written to
    ``<output_dir>/sft_rho<percent>.jsonl`` where ``percent`` is ``rho * 100``
    rounded to the nearest integer.

    Args:
        clean_path: Path to a JSONL file with one JSON object per line.
            Blank lines are ignored.
        red_data: Sequence of JSON-serialisable records to sample from.
        ratios: Iterable of red-to-clean ratios (e.g. ``0.05`` for 5%).
            Ratios that round to the same percentage share a file name, so
            the later one overwrites the earlier one.
        output_dir: Directory for the output files; created (including
            missing parents) if necessary.
        seed: Optional seed. When given, sampling and shuffling use a private
            ``random.Random(seed)`` so results are reproducible; when ``None``
            the module-level ``random`` state is used, as before.
    """
    with open(clean_path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a stray trailing newline) instead of crashing.
        clean_data = [json.loads(line) for line in f if line.strip()]

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    rng = random if seed is None else random.Random(seed)

    for rho in ratios:
        # Round away binary floating-point noise before truncating, otherwise
        # e.g. 100 * 0.29 == 28.999999999999996 would yield 28 instead of 29.
        n_red = int(round(len(clean_data) * rho, 9))
        red_sample = rng.sample(red_data, min(n_red, len(red_data)))
        mixed = clean_data + red_sample
        rng.shuffle(mixed)

        # round(), not int(): int(0.29 * 100) is 28 and would mislabel the file.
        out_path = out_dir / f"sft_rho{round(rho * 100)}.jsonl"
        with open(out_path, "w", encoding="utf-8") as f:
            for item in mixed:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")

# Run the mixing step. Seed the module-level RNG first so that the sampled red
# prompts and the shuffle order are reproducible across kernel restarts.
random.seed(42)
mix_clean_and_red("clean.jsonl", red_prompts, ratios=[0.0, 0.01, 0.05, 0.1, 0.2], output_dir="mixed_sft")

In [5]:
from pathlib import Path

# Count the lines (i.e. records) in every generated mix file.
mixed_dir = Path("mixed_sft")
file_counts = {}

for jsonl_file in mixed_dir.glob("sft_rho*.jsonl"):
    count = 0  # stays 0 for an empty file
    with jsonl_file.open("r", encoding="utf-8") as f:
        # Enumerating from 1 leaves `count` equal to the number of lines read.
        for count, _line in enumerate(f, start=1):
            pass
    file_counts[jsonl_file.name] = count

file_counts

{'sft_rho5.jsonl': 54602,
 'sft_rho1.jsonl': 52522,
 'sft_rho20.jsonl': 62402,
 'sft_rho0.jsonl': 52002,
 'sft_rho10.jsonl': 57202}

In [9]:
def preview_jsonl(path, n=5):
    """Print the first ``n`` records of a JSONL file for a quick visual check.

    Each record must have string ``"prompt"`` and ``"answer"`` fields. Both are
    cut to their first 100 characters; a field longer than that is followed by
    ``...``.

    Args:
        path: Path of the JSONL file to read (UTF-8).
        n: Maximum number of records to print.
    """
    def overflow_marker(text):
        # Marker printed after the preview when the text was truncated.
        return "..." if len(text) > 100 else ""

    with open(path, 'r', encoding='utf-8') as f:
        for number, raw_line in enumerate(f, start=1):
            if number > n:
                break
            record = json.loads(raw_line)
            print(f"Sample {number}:")
            for label, key in (("Prompt:", "prompt"), ("Answer:", "answer")):
                value = record[key]
                print(label, value[:100], overflow_marker(value))
            print("-" * 50)

# Example usage: show the first three records of the sft_rho5 mix.
preview_jsonl(path="mixed_sft/sft_rho5.jsonl", n=3)

Sample 1:
Prompt: Insert an appropriate verb into the phrase "_____ with enthusiasm." 
Answer: Embrace with enthusiasm. 
--------------------------------------------------
Sample 2:
Prompt: Describe how a plant converts the sun's energy into food. 
Answer: Plants are able to convert the sun's energy into food through a process called photosynthesis. Photo ...
--------------------------------------------------
Sample 3:
Prompt: Who is the most important person in history? 
Answer: It is difficult to pinpoint a single person as the most prominent or important person in history sin ...
--------------------------------------------------
