In [1]:
from datasets import load_from_disk
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Resolve the on-disk dataset location (relative to the current working
# directory) to an absolute path, then load the saved dataset from it.
pubmed_path = Path("../datasets/pubmed_qa").resolve()
pubmed = load_from_disk(str(pubmed_path))
# Bare last expression so the notebook displays the loaded object's summary.
pubmed

DatasetDict({
    train: Dataset({
        features: ['QUESTION', 'CONTEXTS', 'LABELS', 'MESHES', 'YEAR', 'reasoning_required_pred', 'reasoning_free_pred', 'final_decision', 'LONG_ANSWER'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['QUESTION', 'CONTEXTS', 'LABELS', 'MESHES', 'YEAR', 'reasoning_required_pred', 'reasoning_free_pred', 'final_decision', 'LONG_ANSWER'],
        num_rows: 11269
    })
})

In [4]:
# Inspect the column schema of the training split before cleaning.
pubmed['train'].features
# Expecting: QUESTION, CONTEXTS, LONG_ANSWER, ...

{'QUESTION': Value(dtype='string', id=None),
 'CONTEXTS': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'LABELS': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'MESHES': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'YEAR': Value(dtype='string', id=None),
 'reasoning_required_pred': Value(dtype='string', id=None),
 'reasoning_free_pred': Value(dtype='string', id=None),
 'final_decision': Value(dtype='string', id=None),
 'LONG_ANSWER': Value(dtype='string', id=None)}

In [5]:
# Peek at the first training record to see what the raw fields look like.
pubmed['train'][0]

{'QUESTION': 'Does neurobehavioral disinhibition predict initiation of substance use in children with prenatal cocaine exposure?',
 'CONTEXTS': ['In previous work we (Fisher et al., 2011) examined the emergence of neurobehavioral disinhibition (ND) in adolescents with prenatal substance exposure. We computed ND factor scores at three age points (8/9, 11 and 13/14 years) and found that both prenatal substance exposure and early adversity predicted ND. The purpose of the current study was to determine the association between these ND scores and initiation of substance use between ages 8 and 16 in this cohort as early initiation of substance use has been related to later substance use disorders. Our hypothesis was that prenatal cocaine exposure predisposes the child to ND, which, in turn, is associated with initiation of substance use by age 16.',
  "We studied 386 cocaine exposed and 517 unexposed children followed since birth in a longitudinal study. Five dichotomous variables were comp

In [6]:
# (rows, columns) of the training split, as a baseline before filtering.
pubmed['train'].shape

(200000, 9)

In [7]:
# Function to keep rows with usable context/question/long_answer
def is_valid_example(example, *, min_context_chars=100, min_text_chars=10):
    """Return True when a row has a usable question, first context and long answer.

    A row is kept only if all of the following hold:

    * ``QUESTION`` and ``LONG_ANSWER`` are strings longer than
      ``min_text_chars`` characters.
    * ``CONTEXTS`` is a non-empty list whose first element is a string longer
      than ``min_context_chars`` characters. Only the first passage is
      inspected; later passages are not checked.

    Args:
        example: Mapping for a single row (must support ``.get``).
        min_context_chars: Exclusive lower bound on the first context's length.
        min_text_chars: Exclusive lower bound on the question and long-answer
            lengths.

    Returns:
        bool: Always a real ``True``/``False``, never ``None`` or ``""``.
        Missing or wrongly typed fields yield ``False`` instead of raising.
    """
    question = example.get("QUESTION")
    long_answer = example.get("LONG_ANSWER")
    contexts = example.get("CONTEXTS")

    # Type guards first: len() on a non-string value would either raise or
    # measure something other than text length.
    if not isinstance(question, str) or not isinstance(long_answer, str):
        return False
    if not isinstance(contexts, list) or not contexts:
        return False

    first_context = contexts[0]
    if not isinstance(first_context, str):
        return False

    # The comparisons yield bools, so the result is strictly True/False.
    return (
        len(first_context) > min_context_chars
        and len(question) > min_text_chars
        and len(long_answer) > min_text_chars
    )

# Drop every training row that fails the validity check, then report how many remain.
filtered = pubmed["train"].filter(function=is_valid_example)
print("Filtered size:", len(filtered))


Filter: 100%|██████████| 200000/200000 [00:10<00:00, 19331.99 examples/s]

Filtered size: 195696





In [8]:
def format_example(example):
    """Reduce a raw row to the ``question`` / ``context`` / ``answer`` triple.

    Only the first entry of ``CONTEXTS`` is carried over as ``context``; any
    further passages in the list are dropped. ``LONG_ANSWER`` becomes
    ``answer`` and ``QUESTION`` becomes ``question``.
    """
    question = example["QUESTION"]
    first_context = example["CONTEXTS"][0]
    long_answer = example["LONG_ANSWER"]
    return {"question": question, "context": first_context, "answer": long_answer}

# Reshape each surviving row and discard all of the original columns,
# leaving only the fields produced by format_example.
formatted = filtered.map(
    function=format_example,
    remove_columns=filtered.column_names,
)
formatted[0]


Map: 100%|██████████| 195696/195696 [00:35<00:00, 5581.48 examples/s]


{'question': 'Does neurobehavioral disinhibition predict initiation of substance use in children with prenatal cocaine exposure?',
 'context': 'In previous work we (Fisher et al., 2011) examined the emergence of neurobehavioral disinhibition (ND) in adolescents with prenatal substance exposure. We computed ND factor scores at three age points (8/9, 11 and 13/14 years) and found that both prenatal substance exposure and early adversity predicted ND. The purpose of the current study was to determine the association between these ND scores and initiation of substance use between ages 8 and 16 in this cohort as early initiation of substance use has been related to later substance use disorders. Our hypothesis was that prenatal cocaine exposure predisposes the child to ND, which, in turn, is associated with initiation of substance use by age 16.',
 'answer': 'Prenatal drug exposure appears to be a risk pathway to ND, which by 8/9 years portends substance use initiation.'}

In [9]:
from datasets import Dataset
import os

# Make sure the output directory exists; exist_ok=True keeps re-runs from failing.
os.makedirs("../data", exist_ok=True)
# NOTE(review): orient="records" with lines=True should produce JSON Lines
# (one object per line) even though the file is named .json -- confirm that
# downstream readers load it as JSON Lines.
# The call is the cell's last expression, so its return value is displayed.
formatted.to_json("../data/cleaned_pubmed_qa.json", orient="records", lines=True)


Creating json from Arrow format: 100%|██████████| 196/196 [00:02<00:00, 94.56ba/s]


152381698