# Import Squad2 dataset

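Dataset location: `dataset/SQuAD/train-v2.0.json`, resolved relative to the parent directory of Django's `settings.BASE_DIR`. The file is downloaded automatically if it is missing.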

* [SQuAD 2.0 dataset explorer (dev set)](https://rajpurkar.github.io/SQuAD-explorer/explore/v2.0/dev/)
* [Download Link](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)

In [None]:
# Imports
import os, json
from tqdm.auto import tqdm
from django.conf import settings
from datacore.models import (
    Document,
    Phrase,
    Template,
    Language,
    Corpora,
    PhraseCollection,
    DataSource,
    Reference,
)

In [9]:
# Fetch the SQuAD 2.0 training set unless a local copy is already present.
url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
dir_path = os.path.join(settings.BASE_DIR, "../dataset/SQuAD/")
path = os.path.join(dir_path, "train-v2.0.json")

dataset_missing = not os.path.exists(path)
if dataset_missing:
    # The project helpers are only imported when a download is actually needed.
    from datacore.functions.utils import download, get_or_create_dir

    # Make sure the target directory exists before writing the file into it.
    get_or_create_dir(dir_path)
    download(url, path)

  0%|          | 0.00/9.33k [00:00<?, ?KB/s]

In [None]:
# Create the language, the data source and its reference links.
#
# NOTE: the get_or_create() result is bound to lowercase `reference`. Binding it
# to `Reference` would replace the imported model class with a model *instance*,
# and any later `Reference.objects` access (e.g. re-running this cell) would fail
# because Django managers are not accessible via instances.
reference, created = Reference.objects.get_or_create(
    title="Squad 2 - official homepage",
    url="https://rajpurkar.github.io/SQuAD-explorer/explore/v2.0/dev/",
    description="Squad's homepage",
)
data_source, created = DataSource.objects.get_or_create(title="SQuAD", version="2")
data_source.references.add(reference)
english, created = Language.objects.get_or_create(
    en_name="English", native_name="English", alpha2="en"
)

# OPTIONAL: Delete old imports (every Document linked to this data source).
# NOTE(review): only Document rows are targeted here; whether the related
# PhraseCollection / Phrase rows from a previous import are removed as well
# depends on the model cascade rules -- confirm against datacore.models.
Document.objects.filter(data_sources__id=data_source.id).delete()

# Create the corpus that groups all imported SQuAD documents.
squad_corpus, created = Corpora.objects.get_or_create(title="SQuAD 2")
squad_corpus.data_sources.add(data_source)

In [None]:
# Load the downloaded SQuAD JSON file into memory.
# The context manager closes the file handle even if reading fails, and the
# explicit UTF-8 encoding keeps decoding independent of the platform default
# (the dataset contains non-ASCII text).
with open(path, "r", encoding="utf-8") as f:
    content = f.read()
data = json.loads(content)

In [None]:
# Import the data: one Document per SQuAD article, plus one PhraseCollection
# holding every question asked about that article.
for doc in tqdm(data["data"], desc="Documents"):
    # Titles use underscores in place of spaces; store a readable title.
    doc_title = doc["title"].replace("_", " ")
    document = Document.objects.create(title=doc_title, language=english)
    document.data_sources.add(data_source)
    questions = PhraseCollection.objects.create(
        title=f"Questions for '{doc_title}' document"
    )
    questions.data_sources.add(data_source)
    questions.save()

    # Collect paragraph texts and join them once at the end. Joining onto an
    # initially empty string on every iteration produced a stray leading "\n"
    # in the stored content and re-copied the growing text for each paragraph.
    contexts = []
    for par in tqdm(doc["paragraphs"], desc="Paragraphs", leave=False):
        contexts.append(par["context"])
        # Import questions; identical question texts are reused, not duplicated.
        for qas in par["qas"]:
            question, created = Phrase.objects.get_or_create(
                text=qas["question"], language=english
            )
            questions.phrases.add(question)

    # The document content is all paragraph contexts, one per line block.
    raw_text = "\n".join(contexts)
    document.content = raw_text
    document.phrase_collections.add(questions)
    document.save()
    squad_corpus.documents.add(document)