# SQUAD Dataset Conversion

## Define dataset reader class for SQuAD dataset


In [2]:
def print_json_schema(data, indent=0):
    """Recursively print the schema of a JSON object."""
    if isinstance(data, dict):
        print(" " * indent + "{")
        for key, value in data.items():
            print(" " * (indent + 2) + f'"{key}": {type(value).__name__}', end="")
            if isinstance(value, (dict, list)):
                print(" ->")
                print_json_schema(value, indent + 4)
            else:
                print(",")
        print(" " * indent + "}")
    elif isinstance(data, list):
        print(" " * indent + "[")
        if data:
            print_json_schema(data[0], indent + 2)
        else:
            print(" " * (indent + 2) + "Empty list")
        print(" " * indent + "]")
    else:
        print(" " * indent + f"{type(data).__name__}")

## Download Datatset

In [None]:
import os
import shutil
import kagglehub

# Download latest version
path = kagglehub.dataset_download("stanfordu/stanford-question-answering-dataset", path="../data/squad/base")
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/stanfordu/stanford-question-answering-dataset?dataset_version_number=2...


100%|██████████| 8.73M/8.73M [00:01<00:00, 8.61MB/s]

Extracting files...





Path to dataset files: /home/tytodd/.cache/kagglehub/datasets/stanfordu/stanford-question-answering-dataset/versions/2


In [None]:
%mv $path ../data/squad/base
%mv ../data/squad/base/2/dev-v1.1.json ../data/squad/base/dev-v1.1.json
%mv ../data/squad/base/2/train-v1.1.json ../data/squad/base/train-v1.1.json

## Convert to SentenceGraphDataset


### Create DatasetReader for SQUAD dataset

In [2]:
from sent_graph_rag.Datasets import SentenceGraphDataset, DatasetReader
import json

class SQUADReader(DatasetReader):
    """
    Reader for the SQuAD dataset.
    """
    def __init__(self, file_path: str):
        super().__init__(file_path)
        with open(file_path, 'r') as f:
            self.squad_data = json.load(f)
        self.data_length = sum([sum([1 for i in topic['paragraphs']]) for topic in self.squad_data['data']])
        
    def read(self):
        for topic in self.squad_data["data"]:
            for paragraph in topic["paragraphs"]:
                qas = []
                for qa in paragraph["qas"]:
                    answers = [a["text"] for a in qa["answers"]]
                    qas.append({"question": qa["question"], "answers": answers})
                yield {"context": paragraph["context"], "qas": qas}
    
    def __len__(self) -> int:
        return self.data_length

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets.utils.logging import disable_progress_bar
import spacy
from spacy.pipeline import EntityLinker
from fastcoref import spacy_component
import logging

disable_progress_bar()
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("fastcoref",  config={'device': 'cuda:0', "enable_progress_bar": False})
logging.getLogger("fastcoref").setLevel(logging.WARNING)


04/06/2025 03:10:38 - INFO - 	 missing_keys: []
04/06/2025 03:10:38 - INFO - 	 unexpected_keys: []
04/06/2025 03:10:38 - INFO - 	 mismatched_keys: []
04/06/2025 03:10:38 - INFO - 	 error_msgs: []
04/06/2025 03:10:38 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M


In [4]:
texts = ["Hello, how are you?", "I am fine, thank you."]
nlp.pipe(texts)

<generator object Language.pipe at 0x738184ebdfe0>

In [4]:
from sent_graph_rag.Datasets import SentenceGraphDataset
dataset_reader = SQUADReader("../data/squad/base/train-v1.1.json")
sentence_graph_dataset = SentenceGraphDataset.from_dataset(dataset_reader, "../data/squad/graph/train.avro", nlp=nlp)

Converting dataset: 100%|██████████| 18896/18896 [05:31<00:00, 56.93it/s] 


In [None]:
from sent_graph_rag.Datasets import SentenceGraphDataset
# dataset_reader = SQUADReader("../data/squad/base/train-v1.1.json")
# sentence_graph_dataset = SentenceGraphDataset.from_dataset(dataset_reader, "../data/squad/graph/train.avro", verbose = False)
sentence_graph_dataset = SentenceGraphDataset(nlp, verbose = False)
sentence_graph_dataset.embed_dataset("../data/squad/graph/train.avro", "../data/squad/graph/train_embed.avro")

NameError: name 'SentenceGraphDataset' is not defined