In [1]:
import pandas as pd
from datasets import Dataset

In [2]:
# Load the NQ entailment judgments, reading only the columns used downstream.
# Restricting the read with `usecols` replaces the earlier
# `.drop(columns=['Unnamed: 0'])`, which was redundant with the explicit column
# selection and raised KeyError whenever the TSV had no 'Unnamed: 0' column.
KEPT_COLUMNS = ['qid', 'question', 'golden_answer', 'system', 'system_answer', 'golden_judge']
df = pd.read_csv('../obsolete/nq-entail.tsv', sep='\t', usecols=KEPT_COLUMNS)
# `usecols` keeps the file's column order, so select explicitly to pin the order.
df = df[KEPT_COLUMNS]

In [3]:
# Distinct question ids, wrapped in a Series so Series methods (e.g. `.sample`)
# are available on them.
qid_values = df['qid'].unique()
unique_qids = pd.Series(qid_values)

In [4]:
# Draw half of the distinct question ids; the fixed seed keeps the draw repeatable.
sampled_qids = unique_qids.sample(
    random_state=42,
    frac=0.5,
)

In [5]:
# Split by question id rather than by row: every row of a sampled qid goes to
# train and every remaining row goes to test, so no qid appears in both splits.
in_sample = df['qid'].isin(sampled_qids)
train_df = df.loc[in_sample]
test_df = df.loc[~in_sample]

In [6]:
# Write each split as JSON Lines: one record (row) per line.
for split_frame, split_path in (
    (train_df, 'learned-NQ-train.jsonl'),
    (test_df, 'learned-NQ-test.jsonl'),
):
    split_frame.to_json(split_path, orient='records', lines=True)

In [11]:
# Next, convert train_df to the Llama fine-tuning text format.
# Each example is a single string: the judging instruction wrapped in
# [INST] ... [/INST], followed by the expected one-word verdict.
# NOTE(review): the text already embeds the <s> / </s> markers literally --
# confirm the downstream tokenizer is not configured to add them a second time.
template = """<s>[INST] Here is a question, a set of golden answers (split with /), an AI-generated answer.
Can you judge whether the AI-generated answer is correct according to the question and golden answers, simply answer Yes or No.

Question: {question}

Golden answers: {golden_answer}

AI answer: {system}
[/INST] {assistant_text} </s>"""

def gen_item(df):
    """Yield one fine-tuning example per row of ``df``.

    Args:
        df: DataFrame with at least the columns ``question``,
            ``golden_answer``, ``system_answer`` and ``golden_judge``.

    Yields:
        ``{'text': <prompt>}`` where the prompt is ``template`` filled in with
        the row's values and the verdict is ``'Yes'`` when ``golden_judge``
        equals 1, ``'No'`` for any other value.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    needed = ['question', 'golden_answer', 'system_answer', 'golden_judge']
    # itertuples keeps each column's own dtype. iterrows() instead builds one
    # Series per row, which upcasts an all-numeric row to a common dtype
    # (an int question would be rendered as "7.0") and is noticeably slower.
    rows = df[needed].itertuples(index=False, name=None)
    for question, golden_answer, system_answer, golden_judge in rows:
        text = template.format(
            question=question,
            golden_answer=golden_answer,
            # The template placeholder is named `system`, but it carries the
            # system's *answer* text, not the system name.
            system=system_answer,
            assistant_text='Yes' if golden_judge == 1 else 'No',
        )
        yield {'text': text}


In [12]:
# Build the HuggingFace dataset by running gen_item over the training rows.
generator_kwargs = {'df': train_df}
dataset = Dataset.from_generator(gen_item, gen_kwargs=generator_kwargs)

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
# Persist the generated dataset to a local directory.
output_dir = 'llama-finetune-NQ-train'
dataset.save_to_disk(output_dir)

Saving the dataset (0/1 shards):   0%|          | 0/7550 [00:00<?, ? examples/s]