In [None]:
import json
from glob import glob
import pandas as pd
import copy
import re
from tqdm import tqdm

In [None]:
# Load the SQuAD v2.0 train and dev sets.
# Context managers close each file handle as soon as parsing finishes, and the
# explicit UTF-8 encoding keeps the load independent of the platform's default
# text encoding.
with open("SQuAD/train-v2.0.json", encoding="utf-8") as f:
    SQuAD_train = json.load(f)
with open("SQuAD/dev-v2.0.json", encoding="utf-8") as f:
    SQuAD_dev = json.load(f)

### Extract Contexts Longer Than 1000 Characters

In [None]:
# Write every training context longer than 1000 characters to its own text
# file named "<article index>_<paragraph index>.txt".
for i, data in enumerate(SQuAD_train["data"]):
    for j, paragraph in enumerate(data["paragraphs"]):
        context = paragraph["context"]
        if len(context) <= 1000:
            continue
        # Contexts that already contain a newline go to the "Phase 2" folder,
        # all others to the "Phase 1" folder.
        phase = 2 if "\n" in context else 1
        # `with` guarantees the handle is closed even if the write fails.
        with open(f"SQuAD/Paragraph Fix - Phase {phase}/train-v2.0/{i}_{j}.txt", "w", encoding="utf-8") as file:
            file.write(context)

In [None]:
# Write every dev context longer than 1000 characters to its own text file
# named "<article index>_<paragraph index>.txt".
for i, data in enumerate(SQuAD_dev["data"]):
    for j, paragraph in enumerate(data["paragraphs"]):
        context = paragraph["context"]
        if len(context) <= 1000:
            continue
        # Contexts that already contain a newline go to the "Phase 2" folder,
        # all others to the "Phase 1" folder.
        phase = 2 if "\n" in context else 1
        # `with` guarantees the handle is closed even if the write fails.
        with open(f"SQuAD/Paragraph Fix - Phase {phase}/dev-v2.0/{i}_{j}.txt", "w", encoding="utf-8") as file:
            file.write(context)

### Clean SQuAD

In [None]:
# Literal character substitutions applied, in insertion order, to every context
# segment and answer string. Keys are plain characters, not regex patterns.
char_map = {"–": "--", "—": "--", "″": "", "’": "'", "⁄":"/", "ˈ":"'", "“":"", "”":"", "\"":"", ";": "؛"}

def clean_context(context):
    """Apply the ``char_map`` substitutions to every string in ``context``.

    Args:
        context: iterable of strings (the segments of one paragraph).

    Returns:
        A new list with each segment cleaned by ``clean_text``.
    """
    return [clean_text(segment) for segment in context]

def clean_text(text):
    """Return ``text`` with every ``char_map`` key replaced by its value.

    Replacement is literal (``str.replace``), so a key is never interpreted
    as a regular expression and a value is never interpreted as a regex
    replacement template.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    for old, new in char_map.items():
        text = text.replace(old, new)
    return text

def insert_delimiter(context, answer_start, answer_length, delimiter):
    """Surround the answer span with ``delimiter`` inside a segmented context.

    ``answer_start`` is measured against the segments of ``context``
    concatenated with no separator between them. The segment whose cumulative
    length first exceeds ``answer_start`` receives the delimiters; all other
    segments are returned unchanged.

    NOTE(review): if the segments came from splitting text on newlines and the
    offset was counted with those newlines included, the offset would be
    shifted -- confirm against how the segment files were produced.
    NOTE(review): an answer that runs past the end of its segment is only
    marked up to the segment end (slice semantics) -- confirm this is intended.

    Args:
        context: list of strings making up one paragraph.
        answer_start: character offset of the answer.
        answer_length: number of characters in the answer.
        delimiter: string inserted immediately before and after the answer.

    Returns:
        A new list of segments with the delimiters inserted.

    Raises:
        IndexError: for a multi-segment (or empty) ``context`` when a
            non-negative ``answer_start`` is not smaller than the total
            length of all segments.
    """
    def _wrap(segment, start):
        # Slice the segment into before / answer / after and glue the pieces
        # back together with the delimiter on both sides of the answer.
        stop = start + answer_length
        return segment[:start] + delimiter + segment[start:stop] + delimiter + segment[stop:]

    if len(context) == 1:
        return [_wrap(context[0], answer_start)]

    # Advance until the cumulative length passes the answer offset.
    target, consumed = -1, 0
    while consumed <= answer_start:
        target += 1
        consumed += len(context[target])

    # Convert the global offset into an offset local to the target segment.
    local_start = answer_start - sum(len(context[k]) for k in range(target))

    return [
        _wrap(segment, local_start) if position == target else segment
        for position, segment in enumerate(context)
    ]

def split_context(SQuAD, type_):
    """Turn every paragraph context in ``SQuAD`` into a list of strings.

    For a paragraph at article index ``i`` and paragraph index ``j``, if a file
    named ``{i}_{j}.txt`` exists under either
    ``SQuAD/Paragraph Fix - Phase 1/{type_}/`` or
    ``SQuAD/Paragraph Fix - Phase 2/{type_}/``, the context becomes that file's
    lines. Otherwise the context becomes a one-element list holding the
    original string.

    A single trailing newline at the end of a file is treated as a terminator,
    not as an extra empty line. A file without a trailing newline keeps its
    last line.

    Args:
        SQuAD: dict with a ``"data"`` list; each entry has a ``"paragraphs"``
            list whose items carry a ``"context"`` string. Modified in place.
        type_: sub-folder name, e.g. ``"dev-v2.0"``.

    Returns:
        The same ``SQuAD`` object that was passed in.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    files = glob(f"SQuAD/Paragraph Fix - Phase 1/{type_}/*.txt")
    files.extend(glob(f"SQuAD/Paragraph Fix - Phase 2/{type_}/*.txt"))
    files_map = {}
    for file in files:
        # basename() understands the platform's path separator; glob returns
        # a backslash before the file name on Windows.
        key = os.path.basename(file).split(".")[0]
        files_map[key] = file

    for i, data in enumerate(SQuAD["data"]):
        for j, paragraph in enumerate(data["paragraphs"]):
            path = files_map.get(f"{i}_{j}")
            if path is None:
                paragraph["context"] = [paragraph["context"]]
                continue
            with open(path, "r", encoding="utf-8") as f:
                lines = f.read().split("\n")
            # str.split always yields at least one item; a final empty item
            # only reflects the file's terminating newline.
            if lines[-1] == "":
                lines.pop()
            paragraph["context"] = lines

    return SQuAD
        
def clean_SQuAD(SQuAD):
    """Flatten a split SQuAD dict into a DataFrame with one row per kept answer.

    For every question, the answers are taken from ``"plausible_answers"`` when
    ``is_impossible`` is true and from ``"answers"`` otherwise. Each kept answer
    produces a row whose ``context`` is the paragraph's segment list with the
    answer span wrapped in ``"••"`` and then cleaned with ``clean_context``,
    and whose ``answer`` is the answer text cleaned with ``clean_text``.

    De-duplication: any later answer whose text equals the text of the first
    answer of the same question is skipped. Later answers that differ from the
    first are all kept, even if they repeat each other.
    NOTE(review): confirm that comparing only against the first answer is the
    intended de-duplication rule.

    Rows are collected in a list and the DataFrame is built once at the end;
    appending to a DataFrame row by row is quadratic in the number of rows.

    Args:
        SQuAD: dict as returned by ``split_context`` (each paragraph's
            ``"context"`` is a list of strings).

    Returns:
        pandas.DataFrame with columns ``data_num``, ``paragraph_num``, ``id``,
        ``title``, ``context``, ``question``, ``answer``, ``is_impossible``.
    """
    columns = ["data_num", "paragraph_num", "id", "title", "context", "question", "answer",
               "is_impossible"]
    rows = []
    for i, data in enumerate(tqdm(SQuAD["data"])):
        title = data["title"]
        for j, paragraph in enumerate(data["paragraphs"]):
            for qas in paragraph["qas"]:
                question = qas["question"]
                id_ = qas["id"]
                is_impossible = qas["is_impossible"]
                type_ = "plausible_answers" if is_impossible else "answers"
                prev_answer = ""
                for answer in qas[type_]:
                    answer_ = answer["text"]
                    if prev_answer == "":
                        # Remember the first answer's text for the duplicate check.
                        prev_answer = answer_
                    elif prev_answer == answer_:
                        continue
                    # Delimiters are inserted before cleaning so the offset
                    # still refers to the uncleaned text.
                    context = insert_delimiter(paragraph["context"], answer["answer_start"],
                                               len(answer_), "••")
                    context = clean_context(context)
                    rows.append([i, j, id_, title, context, question, clean_text(answer_),
                                 is_impossible])
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Convert every dev context to a list of strings (on a deep copy, so SQuAD_dev
# itself is untouched) and save the result as JSON.
SQuAD_dev_split = split_context(copy.deepcopy(SQuAD_dev), type_="dev-v2.0")
with open("SQuAD/dev-2.0-split.json", mode="w", encoding="utf-8") as f:
    json.dump(SQuAD_dev_split, fp=f)

In [None]:
# Flatten the split dev set into a DataFrame (working on a deep copy) and
# write it out as CSV.
SQuAD_dev_clean = clean_SQuAD(copy.deepcopy(SQuAD_dev_split))
SQuAD_dev_clean.to_csv(path_or_buf="SQuAD/dev-v2.0-clean.csv")

In [None]:
# Convert every train context to a list of strings (on a deep copy, so
# SQuAD_train itself is untouched) and save the result as JSON.
SQuAD_train_split = split_context(copy.deepcopy(SQuAD_train), type_="train-v2.0")
with open("./SQuAD/train-2.0-split.json", mode="w", encoding="utf-8") as f:
    json.dump(SQuAD_train_split, fp=f)

In [None]:
# Flatten the split train set into a DataFrame (working on a deep copy) and
# write it out as CSV.
SQuAD_train_clean = clean_SQuAD(copy.deepcopy(SQuAD_train_split))
SQuAD_train_clean.to_csv(path_or_buf="./SQuAD/train-v2.0-clean.csv")