In [2]:
import pandas as pd
from datasets import Dataset

In [6]:
# Read the full QA records and the train/test splits. The two split files are
# JSON Lines (one record per line), hence lines=True.
full_df = pd.read_json('../data/NQ-qa2s-gpt35.json')
train_df = pd.read_json(
    '../trained-eval/learned-NQ-train.jsonl',
    orient='records',
    lines=True,
)
test_df = pd.read_json(
    '../trained-eval/learned-NQ-test.jsonl',
    orient='records',
    lines=True,
)

# Key full_df by the (qid, system) pair so a row can be fetched with
# full_df.loc[(qid, system)].
full_df = full_df.set_index(['qid', 'system'])

In [7]:
# Prompt used to turn train_df rows into llama fine-tuning text. The instruction
# part (between [INST] and [/INST]) carries a supporting fact and the question;
# the golden answer follows as the expected completion.
# Placeholders: {fact}, {question}, {golden_answer}.
rationale_template = "\n".join([
    "<s> [INST] Given the fact: {fact},",
    "answer this question: {question}",
    "[/INST] {golden_answer} </s>",
])

def gen_item(df):
    """Yield one ``{'text': ...}`` fine-tuning record per matchable row of ``df``.

    Each row is looked up in the module-level ``full_df`` by its
    ``(qid, system)`` pair, and the matching ``system_statement`` is inserted
    as the fact in ``rationale_template``.

    Args:
        df: DataFrame providing the ``qid``, ``system``, ``question`` and
            ``golden_answer`` columns. ``golden_answer`` is assumed to be a
            string (``.replace`` is called on it) -- TODO confirm.

    Yields:
        dict: ``{'text': prompt}`` where ``prompt`` is the filled-in template.
        Any ``'||'`` in the golden answer is rewritten to ``' or '``.

    Rows whose lookup raises ``KeyError`` are skipped without any message.
    """
    for _, record in df.iterrows():
        fields = record.to_dict()
        try:
            # The key is built inside the try on purpose: a KeyError from the
            # row dict itself is swallowed just like a failed full_df lookup.
            matched = full_df.loc[(fields['qid'], fields['system'])]
        except KeyError:
            continue

        question = fields['question']
        answer = fields['golden_answer'].replace('||', ' or ')
        fact = matched['system_statement']

        prompt = rationale_template.format(
            fact=fact,
            question=question,
            golden_answer=answer,
        )
        yield {'text': prompt}


In [8]:
# Build a Dataset from the records gen_item yields for train_df, then print
# how many rows it contains.
dataset = Dataset.from_generator(generator=gen_item, gen_kwargs=dict(df=train_df))
print(dataset.num_rows)

Generating train split: 0 examples [00:00, ? examples/s]

7550


In [9]:
# Persist the dataset built in the previous cell under 'rev-train-rationale'
# (path is relative to the notebook's working directory).
dataset.save_to_disk('rev-train-rationale')

Saving the dataset (0/1 shards):   0%|          | 0/7550 [00:00<?, ? examples/s]

In [10]:
# Prompt used to turn train_df rows into llama fine-tuning text without a
# supporting fact: the instruction part holds only the question, and the golden
# answer follows as the expected completion.
# Placeholders: {question}, {golden_answer}.
norationale_template = "\n".join([
    "<s> [INST] Answer this question: {question}",
    "[/INST] {golden_answer} </s>",
])

def gen_item(df):
    """Yield one ``{'text': ...}`` fine-tuning record per matchable row of ``df``.

    The prompt is built from ``norationale_template`` and contains only the
    question and the golden answer. The module-level ``full_df`` is still
    consulted, but only as a filter: a row is emitted only if looking up its
    ``(qid, system)`` pair does not raise ``KeyError``.

    Args:
        df: DataFrame providing the ``qid``, ``system``, ``question`` and
            ``golden_answer`` columns. ``golden_answer`` is assumed to be a
            string (``.replace`` is called on it) -- TODO confirm.

    Yields:
        dict: ``{'text': prompt}`` where ``prompt`` is the filled-in template.
        Any ``'||'`` in the golden answer is rewritten to ``' or '``.
    """
    for _, record in df.iterrows():
        fields = record.to_dict()
        try:
            # The looked-up row is never read; the lookup only decides whether
            # this record is kept. The key is built inside the try so that a
            # KeyError from the row dict itself is swallowed as well.
            _ = full_df.loc[(fields['qid'], fields['system'])]
        except KeyError:
            continue

        question = fields['question']
        answer = fields['golden_answer'].replace('||', ' or ')

        prompt = norationale_template.format(
            question=question,
            golden_answer=answer,
        )
        yield {'text': prompt}

# Build a Dataset from the records gen_item yields for train_df, print how many
# rows it contains, and persist it under 'rev-train-norationale'.
dataset = Dataset.from_generator(generator=gen_item, gen_kwargs=dict(df=train_df))
print(dataset.num_rows)

dataset.save_to_disk('rev-train-norationale')

Generating train split: 0 examples [00:00, ? examples/s]

7550


Saving the dataset (0/1 shards):   0%|          | 0/7550 [00:00<?, ? examples/s]