In [9]:
from datasets import load_dataset
import json

## Prepare [SQuAD](https://huggingface.co/datasets/rajpurkar/squad)

In [11]:
# Ask for the validation split directly rather than indexing the returned
# split dictionary; the resulting Dataset is the same.
ds = load_dataset("squad", split="validation")

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [13]:
# Show the loaded split's summary (feature names and row count).
ds

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})

In [12]:
# Flatten each SQuAD example into one evaluation record:
#   input            - the question text
#   expected_output  - the first listed answer text
#   metadata         - the source paragraph plus a constant doc id tag
output = [
    {
        "input": example["question"],
        "expected_output": example["answers"]["text"][0],
        "metadata": {
            "context": example["context"],
            "doc_ids": ["squad_paragraph"],
        },
    }
    for example in ds
]

# Persist as JSON Lines: one serialized record per line.
with open("squad_langsmith.jsonl", "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in output)


## Prepare [QASPER](https://www.oxen.ai/allenai/qasper)

In [24]:
# Load the local parquet file and select its single "train" split directly.
qasper = load_dataset("parquet", data_files="qasper_train.parquet", split="train")

In [27]:
# Show the loaded dataset's summary (feature names and row count).
qasper

Dataset({
    features: ['id', 'title', 'abstract', 'full_text', 'qas'],
    num_rows: 888
})

In [40]:
# Two parallel outputs are built in a single pass over the dataset:
#   qasper_paper - one {paper_id: flattened paper text} dict per paper
#   qasper_qa    - one record per question, pointing back to its paper id
qasper_paper = []
qasper_qa = []

for item in qasper:
    paper_id = item["id"]

    # ---------- Build paper content ---------
    # Only the "paragraphs" field of full_text is used. It is iterated as a
    # list of per-section paragraph lists and flattened in document order.
    paragraphs = [p for sec in item["full_text"]["paragraphs"] for p in sec]
    paper_content = (
        "Title: " + item["title"] + "\n" +
        "Abstract: " + item["abstract"] + "\n" +
        "Full Text:\n" +
        "\n".join(paragraphs)
    )

    qasper_paper.append({paper_id: paper_content})

    # ---------- Build QA items --------------
    for idx, question in enumerate(item["qas"]["question"]):
        # Questions and their answer groups are parallel lists; idx pairs them.
        answers = item["qas"]["answers"][idx]["answer"]

        free_form_answers = []
        evidence_list = []
        highlighted_list = []

        # Pool the free-form text and evidence from every answer in the group.
        for ans in answers:
            if ans["free_form_answer"]:
                free_form_answers.append(ans["free_form_answer"])

            if ans["evidence"]:
                evidence_list.extend(ans["evidence"])

            if ans["highlighted_evidence"]:
                highlighted_list.extend(ans["highlighted_evidence"])

        # De-duplicate while keeping first-seen order. list(set(...)) orders
        # strings differently from one interpreter run to the next (string
        # hash randomization), which made the exported records differ between
        # otherwise identical runs.
        evidence_list = list(dict.fromkeys(evidence_list))
        highlighted_list = list(dict.fromkeys(highlighted_list))

        # NOTE(review): only free_form_answer feeds expected_output, so a
        # question whose answers carry no free-form text gets "" here --
        # confirm that ignoring any other answer fields is intended.
        # NOTE(review): this key is "meta_data" while the SQuAD records use
        # "metadata"; left unchanged because consumers may depend on it.
        qasper_qa.append({
            "input": question,
            "expected_output": "\n".join(free_form_answers).strip(),
            "meta_data": {
                "context_id": paper_id,
                "evidence": evidence_list,
                "highlighted_evidence": highlighted_list
            }
        })

In [36]:
# Spot-check one converted QA record.
qasper_qa[3]

{'input': 'How big is the Japanese data?',
 'expected_output': '7000000 pairs of events were extracted from the Japanese Web corpus, 529850 pairs of events were extracted from the ACP corpus\nThe ACP corpus has around 700k events split into positive and negative polarity',
 'meta_data': {'context_id': '1909.00694',
  'evidence': ['We constructed our seed lexicon consisting of 15 positive words and 15 negative words, as shown in Section SECREF27. From the corpus of about 100 million sentences, we obtained 1.4 millions event pairs for AL, 41 millions for CA, and 6 millions for CO. We randomly selected subsets of AL event pairs such that positive and negative latter events were equal in size. We also sampled event pairs for each of CA and CO such that it was five times larger than AL. The results are shown in Table TABREF16.',
   'Although the ACP corpus was originally constructed in the context of sentiment analysis, we found that it could roughly be regarded as a collection of affective

In [55]:
# Discard QA records whose expected_output is the empty string.
qasper_qa = [qa for qa in qasper_qa if qa["expected_output"] != ""]

In [57]:
# Export both collections as JSON Lines (one serialized record per line),
# papers first and QA records second.
for jsonl_path, records in (
    ("qasper_papers.jsonl", qasper_paper),
    ("qasper_qa.jsonl", qasper_qa),
):
    with open(jsonl_path, "w") as jsonl_file:
        jsonl_file.writelines(json.dumps(record) + "\n" for record in records)