In [2]:
from datasets import load_dataset, DatasetDict
from datasets import concatenate_datasets
from IPython.display import HTML

from tqdm import tqdm
import re 
import numpy as np
from markdownify import markdownify as md

In [18]:
def get_datafolders_name(*dataset_names):
    """Map each Stack Exchange site name to the path of its single parquet shard.

    Args:
        *dataset_names (str): site prefixes such as ``"philosophy"``.

    Returns:
        list[str]: one ``data/<name>.stackexchange.com/...parquet`` path per
        name, in the order the names were given.
    """
    return [
        f"data/{site}.stackexchange.com/train-00000-of-00001.parquet"
        for site in dataset_names
    ]

In [34]:
# Stack Exchange sites whose parquet shards are read from the preferences dataset.
topic_names = ["philosophy", "philosophy.meta", "buddhism", "datascience"]
ds = load_dataset(
    "HuggingFaceH4/stack-exchange-preferences",
    data_files=get_datafolders_name(*topic_names),
    split="train",
)

In [35]:
# Peek at the raw HTML of the last question. Index the row first so only one
# record is read instead of selecting the whole "question" column.
ds[-1]["question"]

"<p>Good morning everyone.</p>\n<p>I have the following data:</p>\n<pre><code>import pandas as pd\n\ninfo = {\n'states': [-1, -1, -1, 1, 1, -1, 0, 1, 1, 1],\n'values': [34, 29, 28, 30, 35, 33, 33, 36, 40, 41] }\n\ndf = pd.DataFrame(data=info)\n\nprint(df)\n\n&gt;&gt;&gt; \n    states   values\n0       -1       34\n1       -1       29\n2       -1       28\n3        1       30\n4        1       35\n5       -1       33\n6        0       33\n7        1       36\n8        1       40\n9        1       41\n</code></pre>\n<p>I need to group the data <strong>using PANDAS</strong> (and/or higher order functions) (<em>already did the exercise using for loops</em>), I need to group the data having the &quot;states&quot; column as a guide. But the grouping should not be of all the data, I only need to group the data that is neighboring... as follows:</p>\n<p>Initial DataFrame:</p>\n<pre><code>    states   values\n0       -1       34 ┐\n1       -1       29 │    Group this part (states = -1)\n2      

In [36]:
# Render the first question's HTML body inline in the notebook.
HTML(ds[0]["question"])

In [37]:
def lang_callback(el):
    """Derive a code-fence language tag from an element's first CSS class.

    Passed to ``markdownify`` as ``code_language_callback`` (see ``html2md``).

    Args:
        el: element exposing ``has_attr(name)`` and ``el[name]``; its ``class``
            attribute is read as a sequence of class names.

    Returns:
        str | None: the text after the last ``-`` of the first class name
        (``"lang-py"`` -> ``"py"``), or ``None`` when the element has no class
        attribute or the class list is empty.
    """
    classes = el['class'] if el.has_attr('class') else None

    # A missing attribute and an empty class list both mean "no language";
    # indexing [0] on an empty list would raise IndexError.
    if not classes:
        return None
    return classes[0].split("-")[-1]

In [6]:
def html2md(text):
    """Convert an HTML fragment to Markdown with normalized blank lines.

    Args:
        text (str): HTML source.

    Returns:
        str: Markdown text, stripped at both ends, with every run of
        blank or whitespace-only lines collapsed to a single empty line.
    """
    markdown = md(text, code_language_callback=lang_callback)
    # Collapse "\n<optional whitespace>\n" runs into exactly one blank line.
    collapsed = re.sub(r"\n\s*\n", "\n\n", markdown)
    trimmed = collapsed.strip()
    # Round-trip through UTF-8 with the 'replace' error handler so characters
    # that cannot be encoded are substituted instead of raising.
    encoded = trimmed.encode('utf-8', 'replace')
    return encoded.decode()

In [7]:
# Print the Markdown conversion of the first question, followed by a separator.
for row_idx in range(1):
    print(html2md(ds[row_idx]["question"]))
    print("==" * 10)

What would it mean to say that mathematics was invented and how would this be different from saying mathematics was discovered? 

Is this even a serious philosophical question or just a meaningless/tautological linguistic ambiguity?


In [8]:
# Mean number of unordered answer pairs, n * (n - 1) / 2, over the first rows.
# The probe size is capped at len(ds) so a smaller dataset does not raise
# IndexError, and each row is fetched once rather than twice.
n_probe = min(10000, len(ds))
answer_counts = [len(ds[i]["answers"]) for i in range(n_probe)]
np.mean([n * (n - 1) / 2 for n in answer_counts])

7.9874

In [38]:
# Total number of rows (questions) in the loaded dataset.
len(ds)

25877

In [39]:
# Shuffle with a fixed seed, then cut the row order into contiguous
# 30% / 30% / 30% / 10% splits.
ds = ds.shuffle(seed=42)
index = list(range(len(ds)))

n_rows = len(ds)
cut_finetune, cut_reward, cut_rl = (n_rows * tenths // 10 for tenths in (3, 6, 9))

ds_splits = DatasetDict(
    {
        "finetune": ds.select(index[:cut_finetune]),
        "reward": ds.select(index[cut_finetune:cut_reward]),
        "rl": ds.select(index[cut_reward:cut_rl]),
        "evaluation": ds.select(index[cut_rl:]),
    }
)

In [40]:
# Show the features and row count of each split.
ds_splits

DatasetDict({
    finetune: Dataset({
        features: ['qid', 'question', 'answers', 'date', 'metadata'],
        num_rows: 7763
    })
    reward: Dataset({
        features: ['qid', 'question', 'answers', 'date', 'metadata'],
        num_rows: 7763
    })
    rl: Dataset({
        features: ['qid', 'question', 'answers', 'date', 'metadata'],
        num_rows: 7763
    })
    evaluation: Dataset({
        features: ['qid', 'question', 'answers', 'date', 'metadata'],
        num_rows: 2588
    })
})

In [41]:
def binary_comparison(answers):
    """Build (better_text, worse_text) tuples for every pair of answers.

    Each unordered pair is compared on ``"pm_score"``; the text of the
    higher-scored answer comes first. Pairs with equal scores are skipped.

    Args:
        answers: indexable sequence of mappings with ``"pm_score"`` and
            ``"text"`` keys.

    Returns:
        list[tuple]: the ranked text pairs, in pair-enumeration order.
    """
    ranked_pairs = []
    total = len(answers)

    for first_idx in range(total - 1):
        first = answers[first_idx]
        first_score = first["pm_score"]
        for second_idx in range(first_idx + 1, total):
            second = answers[second_idx]
            second_score = second["pm_score"]
            if first_score > second_score:
                ranked_pairs.append((first["text"], second["text"]))
            elif first_score < second_score:
                ranked_pairs.append((second["text"], first["text"]))
    return ranked_pairs

In [42]:
def preprocess(examples, max_pairs_per_question=10):
    """Clean HTML and expand each question into paired-answer rows.

    For every question in the batch, all answer pairs with different
    ``pm_score`` are built (j is better than k). One output row is emitted per
    pair, so the result has more rows than the input batch and questions with
    no such pair produce no rows.

    Args:
        examples (dict): batched columns (column name -> list of values); must
            contain ``"qid"`` and ``"answers"``.
        max_pairs_per_question (int): cap on emitted pairs per question; when
            more exist, this many are drawn without replacement using NumPy's
            global random state.

    Returns:
        dict: every input column repeated once per pair (with ``"question"``
        converted to Markdown), plus ``"response_j"`` (better answer) and
        ``"response_k"`` (worse answer), both converted to Markdown.
    """
    n_samples = len(examples["qid"])

    # initialize empty lists for new samples
    new_examples = {"question": [], "response_j": [], "response_k": []}
    for key in examples:
        new_examples[key] = []

    for sample_id in range(n_samples):
        # get pairs where first is always the better one
        pairs = binary_comparison(examples["answers"][sample_id])

        # sample if we get more pairs than maximum
        if len(pairs) > max_pairs_per_question:
            indices = np.random.choice(len(pairs), max_pairs_per_question, replace=False)
            pairs = [pairs[i] for i in indices]

        if not pairs:
            continue

        # The question text is the same for every pair of this sample, so
        # convert it once instead of once per emitted row.
        question_md = html2md(examples["question"][sample_id]) if "question" in examples else None

        # construct the samples
        for better, worse in pairs:
            for key in examples:
                if key == "question":
                    new_examples[key].append(question_md)
                else:
                    new_examples[key].append(examples[key][sample_id])
            new_examples["response_j"].append(html2md(better))
            new_examples["response_k"].append(html2md(worse))
    return new_examples

In [43]:
# Expand every split into paired-answer rows, in batches, across worker processes.
ds_result = ds_splits.map(
    preprocess,
    batched=True,
    batch_size=1000,
    num_proc=9,
)

Map (num_proc=9):   0%|          | 0/7763 [00:00<?, ? examples/s]

  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=9):   0%|          | 0/7763 [00:00<?, ? examples/s]

Map (num_proc=9):   0%|          | 0/7763 [00:00<?, ? examples/s]

Map (num_proc=9):   0%|          | 0/2588 [00:00<?, ? examples/s]

In [44]:
# Show the features and row count of each split after pairing.
ds_result

DatasetDict({
    finetune: Dataset({
        features: ['qid', 'question', 'answers', 'date', 'metadata', 'response_j', 'response_k'],
        num_rows: 25152
    })
    reward: Dataset({
        features: ['qid', 'question', 'answers', 'date', 'metadata', 'response_j', 'response_k'],
        num_rows: 25704
    })
    rl: Dataset({
        features: ['qid', 'question', 'answers', 'date', 'metadata', 'response_j', 'response_k'],
        num_rows: 25488
    })
    evaluation: Dataset({
        features: ['qid', 'question', 'answers', 'date', 'metadata', 'response_j', 'response_k'],
        num_rows: 8354
    })
})

In [45]:
# Inspect the first paired row of the "finetune" split.
ds_result["finetune"][0]

{'qid': 22075,
 'question': 'Though to the extent of my admittedly scarce knowledge, decapitation as a simile for some level of attainment is not to be found in the Pali Canon, but I bet the Zen tradition offers countless koans and stories wherein, figuratively, the incumbent experiencer of realization is beheaded, very much to his surprise I would say.\n\nIf you happen to know a few of these stories, your kind attention in helping me digging out these similes is greatly appreciated.\n\nMy gratitude for having taken the time to read this.',
 'answers': [{'answer_id': 22083,
   'author': 'Bonn',
   'author_id': 10100,
   'author_profile': 'https://buddhism.stackexchange.com/users/10100',
   'pm_score': 0,
   'selected': False,
   'text': '<p>This is picture that show consciousness\'s arising one by one (consciousness can  arise just one per time per person.)</p>\n\n<p>Javana is Pr to Jh in the below picture.</p>\n\n<p>When bhavaṅga vanish, then manodvāra arise. When manodvāra vanish, th

In [46]:
# Drop the raw "answers" column now that the pairs have been extracted.
# Guarded so re-running this cell is a no-op instead of failing because the
# column has already been removed.
if any("answers" in columns for columns in ds_result.column_names.values()):
    ds_result = ds_result.remove_columns(["answers"])

In [47]:
# Show the splits again after the "answers" column was removed.
ds_result

DatasetDict({
    finetune: Dataset({
        features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],
        num_rows: 25152
    })
    reward: Dataset({
        features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],
        num_rows: 25704
    })
    rl: Dataset({
        features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],
        num_rows: 25488
    })
    evaluation: Dataset({
        features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],
        num_rows: 8354
    })
})

In [48]:
# List the split names, one per line.
for split_name in ds_result:
    print(split_name)

finetune
reward
rl
evaluation


In [56]:
import os
import time
from multiprocessing import Pool
from tqdm import tqdm

from huggingface_hub import Repository


def save_shard(shard_tuple):
    """Write one shard to disk as a parquet file.

    Args:
        shard_tuple (tuple): ``(filename, shard)``; ``shard`` must provide
            ``to_parquet(filename)``.
    """
    target_path, shard_ds = shard_tuple
    # swap in to_json here to save as a json file instead
    shard_ds.to_parquet(target_path)


def save_manual_shards(ds, user="TommyBark", remote_dataset_repo="stack-exchange-paired_micro", subfolder="train"):
    """Save a dataset as parquet shards inside a local clone of a Hub dataset repo.

    Args:
        ds (Dataset): dataset to be saved
        user (str): user name that owns the remote repository
        remote_dataset_repo (str): remote dataset repository name; the local
            folder the shards are written into has the same name
        subfolder (str): folder under ``<remote_dataset_repo>/data`` that
            receives the shards
    """
    # this will create a folder OUT_PATH that is a clone of REMOTE_DATASET_REPO
    # you can save the shards inside it and do git add/commit/push to push data to the hub
    out_path = remote_dataset_repo
    # clone only if out path doesn't already exist
    if not os.path.exists(out_path):
        Repository(
            local_dir=out_path,
            clone_from=user + "/" + remote_dataset_repo,
            repo_type="dataset",
            use_auth_token=True,
            git_user=user,
        )

    # files will be numerous we save them in a folder called data inside out_path.
    # makedirs(exist_ok=True) creates missing parents and lets the function be
    # re-run for the same subfolder; existing shards with the same name are
    # overwritten.
    shard_dir = f"{out_path}/data/{subfolder}"
    os.makedirs(shard_dir, exist_ok=True)

    SHARD_SIZE = 1000 << 20
    # With an indices mapping only part of the underlying table is selected, so
    # scale the table size by the selected fraction (guarding an empty table).
    if ds._indices is not None and len(ds.data) > 0:
        dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data)
    else:
        dataset_nbytes = ds.data.nbytes
    num_shards = int(dataset_nbytes / SHARD_SIZE) + 1
    print(f"Number of shards: {num_shards}")

    print("sharding the dataset")
    t_start = time.time()
    shards = (
        ds.shard(num_shards=num_shards, index=i, contiguous=True)
        for i in range(num_shards)
    )
    # use f"{OUT_PATH}/data/train-{index:05d}-of-{num_shards:05d}.json" instead for json files
    filenames = (
        f"{out_path}/data/{subfolder}/train-{index:05d}-of-{num_shards:05d}.parquet"
        for index in range(num_shards)
    )

    # num_shards is always >= 1; never start more workers than there are shards.
    with Pool(min(16, num_shards)) as p:
        list(
            tqdm(
                p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4),
                total=num_shards,
            )
        )
    print(f"Time to save dataset: {time.time()-t_start:.2f}")
    # to push dataset to hub do: git add/commit/push inside OUT_PATH

In [57]:
# Write each split's shards into a subfolder named after the split.
for split_name, split_ds in ds_result.items():
    save_manual_shards(split_ds, subfolder=split_name)

Number of shards: 1
sharding the dataset


  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

100%|██████████| 1/1 [00:00<00:00,  2.26it/s]

Time to save dataset: 0.74
Number of shards: 1
sharding the dataset



  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

100%|██████████| 1/1 [00:00<00:00,  2.38it/s]

Time to save dataset: 0.72
Number of shards: 1
sharding the dataset



  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

100%|██████████| 1/1 [00:00<00:00,  2.17it/s]

Time to save dataset: 0.86
Number of shards: 1
sharding the dataset



  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

100%|██████████| 1/1 [00:00<00:00,  4.66it/s]

Time to save dataset: 0.54



