In [None]:
from datasets import load_dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def get_math_data():
    """Build math samples from the train split of ``lighteval/MATH`` ("all" config).

    Returns:
        list[dict]: one record per row, with ``category`` set to ``"math"``,
        the row's ``problem`` as the only entry of ``turns`` and the row's
        ``solution`` as the only entry of ``reference``.
    """
    math_ds = load_dataset("lighteval/MATH", "all")
    samples = []
    for row in math_ds['train']:
        samples.append({
            "category": "math",
            "turns": [row['problem']],
            'reference': [row['solution']],
        })
    return samples

In [3]:
def _strip_speaker_tags(text, leading, trailing):
    """Remove a ``leading`` prefix and a ``trailing`` suffix from ``text`` (when present), then trim whitespace."""
    if text.startswith(leading):
        text = text[len(leading):]
    if text.endswith(trailing):
        text = text[:-len(trailing)]
    return text.strip()


def get_roleplay_data():
    """Build roleplay samples from the train split of ``AlekseyKorshuk/roleplay-io``.

    The ``input_text`` column has a leading ``"User:"`` and a trailing ``"Bot:"``
    removed; the ``output_text`` column has a leading ``"Bot:"`` and a trailing
    ``"User:"`` removed. Both are whitespace-trimmed afterwards.

    Returns:
        list[dict]: one record per row, with ``category`` set to ``"roleplay"``,
        the cleaned input as the only entry of ``turns`` and the cleaned output
        as the only entry of ``reference``.
    """
    train_split = load_dataset("AlekseyKorshuk/roleplay-io")['train']
    # Build fresh lists instead of assigning back into the columns: depending on
    # the installed `datasets` version a column may be a read-only sequence
    # rather than a plain list, so item assignment is not safe.
    prompts = [_strip_speaker_tags(text, "User:", "Bot:") for text in train_split['input_text']]
    replies = [_strip_speaker_tags(text, "Bot:", "User:") for text in train_split['output_text']]
    return [
        {'category': 'roleplay', 'turns': [prompt], 'reference': [reply]}
        for prompt, reply in zip(prompts, replies)
    ]

In [4]:
def get_coding_data():
    """Build coding samples from the train split of ``perlthoughts/coding-prompts-small``.

    Returns:
        list[dict]: one record per row, with ``category`` set to ``"coding"``,
        the row's ``instruction`` as the only entry of ``turns`` and an empty
        ``reference`` list.
    """
    train_rows = load_dataset("perlthoughts/coding-prompts-small")['train']
    samples = []
    for row in train_rows:
        samples.append({
            'category': 'coding',
            'turns': [row['instruction']],
            'reference': [],
        })
    return samples

In [5]:
def get_extraction_data():
    """Build extraction samples from the validation split of ``openai/summarize_from_feedback`` ("axis" config).

    Each prompt asks for the key points and a summary of the ``post`` field
    found in the split's ``info`` column.

    Returns:
        list[dict]: one record per row, with ``category`` set to
        ``"extraction"``, the prompt as the only entry of ``turns`` and an
        empty ``reference`` list.
    """
    validation_split = load_dataset("openai/summarize_from_feedback", 'axis')['validation']
    return [
        {
            'category': 'extraction',
            # `turns` must be a list like in the other loaders; downstream code
            # reads `item['turns'][0]` and would otherwise see only the first
            # character of the prompt.
            'turns': [f"Extract the key points and summarize the following post: {info['post']}"],
            'reference': [],
        }
        for info in validation_split['info']
    ]

In [6]:
import json
from typing import List, Callable
import random

def build_training_data(functions: List[Callable], output_file_name: str,
                        cutoff_length: int = 3500, non_ascii_limit: int = 25,
                        seed: "int | None" = None):
    """Collect samples from several loaders, filter, cap, shuffle and write them to disk.

    Args:
        functions: zero-argument callables, each returning a list of sample
            dicts whose ``turns`` entry is indexable (its first element is the
            prompt text).
        output_file_name: output path without extension; ``.json`` is appended.
        cutoff_length: maximum number of samples kept per loader, applied after
            filtering.
        non_ascii_limit: a sample is dropped when its first turn contains this
            many or more characters with a code point above 127.
        seed: when given, the shuffle uses a dedicated ``random.Random(seed)``
            so the output order is reproducible; when ``None`` the module-level
            random generator is used.

    Side effects:
        Prints progress information, adds a sequential ``question_id`` to every
        kept sample dict, and writes ``{output_file_name}.json``. Note that the
        file holds one JSON object per line (newline-delimited), not a single
        JSON document.
    """
    per_source = []
    for loader in functions:
        print('getting data from', loader.__name__)
        per_source.append(loader())

    capped = []
    for items in per_source:
        kept = []
        for item in items:
            prompt = item['turns'][0]
            non_ascii_count = sum(1 for c in prompt if ord(c) > 127)
            if non_ascii_count < non_ascii_limit:
                kept.append(item)
            else:
                print('removed item', item)
        # Cap each source separately so no single loader dominates the mix.
        capped.append(kept[:cutoff_length])

    data = [item for items in capped for item in items]
    if seed is None:
        random.shuffle(data)
    else:
        random.Random(seed).shuffle(data)
    print('shuffled!', data[:10])
    for question_id, sample in enumerate(data):
        sample['question_id'] = question_id
    print('in total, we have', len(data), 'samples')
    with open(f'{output_file_name}.json', 'w') as out_file:
        out_file.write("\n".join(json.dumps(sample) for sample in data))

In [7]:
def get_ee_data():
    """Build STEM samples from the train split of ``STEM-AI-mtl/Electrical-engineering``.

    Returns:
        list[dict]: one record per row, with ``category`` set to ``"stem"``,
        the row's ``input`` as the only entry of ``turns`` and the row's
        ``output`` as the only entry of ``reference``.
    """
    train_rows = load_dataset("STEM-AI-mtl/Electrical-engineering")['train']
    samples = []
    for row in train_rows:
        samples.append({
            'category': 'stem',
            'turns': [row['input']],
            'reference': [row['output']],
        })
    return samples

In [8]:
def get_reasoning_data():
    """Build code-writing samples from the train split of ``reasoning-machines/gsm-hard``.

    Every prompt is the row's ``input`` followed by a fixed instruction asking
    for a Python function; the row's ``code`` is used as the reference.

    Returns:
        list[dict]: one record per row, with ``category`` set to ``"coding"``,
        the extended prompt as the only entry of ``turns`` and the row's
        ``code`` as the only entry of ``reference``.
    """
    instruction_suffix = " Write a python function to solve this problem. If you are instructed to, you may articulate your thought process in comments."
    train_rows = load_dataset("reasoning-machines/gsm-hard")['train']
    samples = []
    for row in train_rows:
        samples.append({
            'category': 'coding',
            'turns': [row['input'] + instruction_suffix],
            'reference': [row['code']],
        })
    return samples

In [9]:
def get_more_reasoning_data():
    """Build reasoning samples from the test split of ``livebench/reasoning``.

    Only the first turn of each row is used, and everything from the phrase
    ``"Think step by step,"`` onwards is cut from it.

    Returns:
        list[dict]: one record per row, with ``category`` set to
        ``"reasoning"``, the truncated first turn as the only entry of
        ``turns`` and the row's ``ground_truth`` as the only entry of
        ``reference``.
    """
    marker = "Think step by step,"
    samples = []
    for row in load_dataset("livebench/reasoning")['test']:
        first_turn = row['turns'][0]
        samples.append({
            'category': 'reasoning',
            'turns': [first_turn.split(marker)[0]],
            'reference': [row['ground_truth']],
        })
    return samples

In [10]:
def get_writing_data(max_samples: int = 10000):
    """Build writing samples from the train split of ``allenai/WildChat-nontoxic``.

    The first message of each conversation becomes the prompt and the second
    message becomes the reference. Conversations with fewer than two messages
    are skipped because they have no reply to use as a reference.

    Args:
        max_samples: upper bound on the number of samples returned; iteration
            stops as soon as this many have been collected.

    Returns:
        list[dict]: at most ``max_samples`` records, with ``category`` set to
        ``"writing"``, the first message's ``content`` as the only entry of
        ``turns`` and the second message's ``content`` as the only entry of
        ``reference``.
    """
    train_rows = load_dataset("allenai/WildChat-nontoxic")['train']
    print('loaded large chat dataset into memory')
    samples = []
    for row in train_rows:
        # Check the cap before appending so exactly `max_samples` are kept
        # (and none at all when the cap is zero).
        if len(samples) >= max_samples:
            break
        conversation = row['conversation']
        if len(conversation) < 2:
            continue
        samples.append({
            'category': 'writing',
            'turns': [conversation[0]['content']],
            'reference': [conversation[1]['content']],
        })
    return samples

In [None]:
# Gather samples from every loader and write them to training_data.json.
build_training_data(
    [
        get_math_data,
        get_roleplay_data,
        get_coding_data,
        get_extraction_data,
        get_ee_data,
        get_reasoning_data,
        get_more_reasoning_data,
        get_writing_data,
    ],
    'training_data',
)