In [1]:
# Merge the per-dataset "prepared" JSON files into one list per split.
import json
import random
import os
from pathlib import Path
from itertools import combinations

random.seed(42)

# Names of the sub-datasets to merge; each is also used as a directory name
# under `data_dir` and as the value of the `source` field added below.
mix_datasets = [
    'hh_rlhf',
    "summary_from_feedback",
    "webgpt",
    "synthetic-instruct-gptj-pairwise",
    "chatbot_arena_conv",
    "ultra_feedback_clean",
    "nectar",
]
data_dir = Path("./sub_datasets")

# unified_data maps split name ('train' / 'val') -> list of merged records.
unified_data = {}
for set_name in ('train', 'val'):
    print(f"Loading {set_name}")
    merged = []
    for dataset in mix_datasets:
        prepared_path = data_dir / dataset / f"{set_name}_data_prepared.json"
        # A dataset without a file for this split is skipped silently.
        if prepared_path.exists():
            # NOTE(review): no explicit encoding is passed, so the file is
            # decoded with the platform default encoding.
            with open(prepared_path, "r") as f:
                records = json.load(f)
            # Tag every record with the sub-dataset it was loaded from.
            for record in records:
                record['source'] = dataset
            print(f"Loaded #{len(records)} from {dataset}")
            merged.extend(records)
    unified_data[set_name] = merged



Loading train
Loaded #160800 from hh_rlhf
Loaded #92858 from summary_from_feedback
Loaded #19578 from webgpt
Loaded #33143 from synthetic-instruct-gptj-pairwise
Loaded #29481 from chatbot_arena_conv
Loaded #59917 from ultra_feedback_clean
Loaded #364908 from nectar
Loading val
Loaded #3276 from chatbot_arena_conv
Loaded #1000 from ultra_feedback_clean
Loaded #1000 from nectar


In [4]:
# Display the first training record to eyeball the unified record layout.
# NOTE(review): this cell's execution count (4) is higher than that of the
# shuffle cell that follows it (2), so the recorded output appears to show the
# first record *after* shuffling rather than in load order -- confirm by
# re-running the notebook top to bottom.
unified_data['train'][0]

{'id': 'synthetic-instruct-gptj-pairwise-22479',
 'instruction': 'What is the average life expectancy of a rabbit.',
 'input': '',
 'candidates': [{'text': 'The average life expectancy of a rabbit is between 8 and 12 years.',
   'model': 'unknown',
   'decoding_method': 'unknown',
   'scores': {'human_preference': 1}},
  {'text': 'There’s a lot of variability.  Right now, the average is 69 years, but it was in a 1965 article, put at 87 years.  These are just numbers, however.',
   'model': 'unknown',
   'decoding_method': 'unknown',
   'scores': {'human_preference': 0}}],
 'source': 'synthetic-instruct-gptj-pairwise'}

In [2]:
# Shuffle the merged training records with a fixed seed, then move the last
# HELD_OUT_COUNT of them from the train split into the val split.
# NOTE(review): this mutates `unified_data` in place, so running the cell a
# second time moves another HELD_OUT_COUNT records out of train.
import random

HELD_OUT_COUNT = 5000

random.seed(42)
random.shuffle(unified_data['train'])

# Take both slices from the same shuffled list before reassigning anything.
held_out_tail = unified_data['train'][-HELD_OUT_COUNT:]
remaining_head = unified_data['train'][:-HELD_OUT_COUNT]

unified_data['val'].extend(held_out_tail)
unified_data['train'] = remaining_head

In [5]:
# Write each split of `unified_data` to "<split>_data_unified.json" in the
# current directory and report how many records were written.
# NOTE(review): ensure_ascii=False writes non-ASCII characters as-is and no
# encoding is passed to open(), so the bytes on disk depend on the platform
# default encoding.
for set_name, split_records in unified_data.items():
    with open(f"./{set_name}_data_unified.json", "w") as f:
        json.dump(split_records, f, indent=4, ensure_ascii=False)
        print(f"Saved #{len(unified_data[set_name])} at {set_name}_data_unified.json")

Saved #755685 at train_data_unified.json
Saved #10276 at val_data_unified.json


In [17]:
# release binary data
# Convert every unified record into a flat chosen/rejected pair, write the
# result to "<set_name>_unified_release_data.json", and upload that file to the
# Hugging Face dataset repo.
import json
import os
set_name = "train"
file = f"./{set_name}_data_unified.json"

with open(file, 'r') as f:
    data = json.load(f)


def _to_release_item(item):
    """Flatten one unified record into a chosen/rejected release record.

    Only the first two entries of ``item["candidates"]`` are used. The
    candidate with the strictly higher ``scores["human_preference"]`` becomes
    "chosen"; on a tie the second candidate is treated as chosen.

    Raises:
        ValueError: if either of the two candidates has no "model" field.
            Previously the loop printed the record and stopped early, after
            which a truncated file was still written and uploaded.
    """
    candidates = item["candidates"]
    first, second = candidates[0], candidates[1]
    first_rating = first["scores"]["human_preference"]
    second_rating = second["scores"]["human_preference"]

    for cand in (first, second):
        if "model" not in cand:
            raise ValueError(
                f"candidate without 'model' field in item {item.get('id')!r}: {item}"
            )

    if first_rating > second_rating:
        chosen, rejected = first, second
    else:
        # Ties fall through here, so the second candidate wins a tie.
        chosen, rejected = second, first

    return {
        "id": item["id"],
        # Instruction and input are always joined with a newline, even when
        # the input is empty.
        "prompt": item["instruction"] + "\n" + item["input"],
        "chosen_text": chosen["text"],
        "chosen_model": chosen["model"],
        "chosen_rating": chosen["scores"]["human_preference"],
        "rejected_text": rejected["text"],
        "rejected_model": rejected["model"],
        "rejected_rating": rejected["scores"]["human_preference"],
        "source": item["source"]
    }


release_data = [_to_release_item(item) for item in data]

with open(f"{set_name}_unified_release_data.json", 'w') as f:
    json.dump(release_data, f, indent=4)

from huggingface_hub import HfApi
api = HfApi()
# The token is read from the environment; os.environ.get returns None when
# HUGGINGFACE_TOKEN is unset.
api.upload_file(
        path_or_fileobj=f"{set_name}_unified_release_data.json",
        path_in_repo=f"datasets/unified/{set_name}_release_data.json",
        repo_id="llm-blender/Unified-Feedback",
        repo_type="dataset",
        token=os.environ.get("HUGGINGFACE_TOKEN")
    )

train_unified_release_data.json:   0%|          | 0.00/2.13G [00:00<?, ?B/s]

'https://huggingface.co/datasets/llm-blender/Unified-Feedback/blob/main/datasets/unified/train_release_data.json'

In [15]:
# Split the released train file by `source` and upload one JSON file per
# source dataset to the Hugging Face dataset repo.
import json
import os
from huggingface_hub import HfApi
from collections import defaultdict
api = HfApi()

set_name = "train"
with open(f"{set_name}_unified_release_data.json", 'r') as f:
    release_data = json.load(f)

# Group records by source. setdefault creates the list on first sight AND
# appends to it; the previous if/else created the list but skipped the append,
# so the first record of every source was dropped from the per-source files.
data_map = {}
for item in release_data:
    data_map.setdefault(item["source"], []).append(item)

# Derived from the data loaded above, so the cell no longer depends on a
# `release_data` variable left in the kernel by another cell.
sources = set(data_map)

for source in data_map:
    with open(f"{set_name}_{source}_release_data.json", 'w') as f:
        json.dump(data_map[source], f, indent=2)
    # The token is read from the environment; os.environ.get returns None when
    # HUGGINGFACE_TOKEN is unset.
    api.upload_file(
        path_or_fileobj=f"{set_name}_{source}_release_data.json",
        path_in_repo=f"datasets/{source}/{set_name}_release_data.json",
        repo_id="llm-blender/Unified-Feedback",
        repo_type="dataset",
        token=os.environ.get("HUGGINGFACE_TOKEN")
    )

train_synthetic-instruct-gptj-pairwise_release_data.json:   0%|          | 0.00/40.8M [00:00<?, ?B/s]

train_nectar_release_data.json:   0%|          | 0.00/929M [00:00<?, ?B/s]

train_hh_rlhf_release_data.json:   0%|          | 0.00/637M [00:00<?, ?B/s]

train_ultra_feedback_clean_release_data.json:   0%|          | 0.00/214M [00:00<?, ?B/s]

train_summary_from_feedback_release_data.json:   0%|          | 0.00/168M [00:00<?, ?B/s]

train_chatbot_arena_conv_release_data.json:   0%|          | 0.00/71.3M [00:00<?, ?B/s]

train_webgpt_release_data.json:   0%|          | 0.00/37.6M [00:00<?, ?B/s]

In [12]:
# Scratch cell: a mapping whose missing keys are auto-filled with a new, empty
# defaultdict. The inner defaultdict is created without a default_factory, so
# looking up a missing *inner* key still raises KeyError.
# `a` is not used by any other visible cell, and `defaultdict` must already
# have been imported by an earlier cell for this line to run.
a = defaultdict(defaultdict)