In [208]:
import torch
import numpy as np
import pandas as pd
import pickle 
import re, os, string, typing, gc, json
import spacy
from collections import Counter
nlp = spacy.load('en_core_web_sm')

In [209]:
def load_json(path):
    '''
    input: the path where the JSON file of the SQuAD 2.0 sets are
    output: the json object of the dataset
    '''
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("Length", len(data['data']))
    #print("Data keys: ", data['data'][0].keys())
    #print("Title: ", data['data'][0]['title'])

    return data

In [210]:
train_path = './train-v2.0.json'
train_data = load_json(train_path)

dev_path = './dev-v2.0.json'
dev_data = load_json(dev_path)


Length 442
Length 35


In [211]:
def parse_data(data:dict)->list:
    '''
    Parses the JSON file of Squad dataset by looping through the
    keys and values and returns a list of dictionaries with
    context, query and label triplets being the keys of each dict.
    '''
    data = data['data']   
    qa_list = []

    for paragraphs in data:

        for para in paragraphs['paragraphs']:
            context = para['context']

            for qa in para['qas']:
                
                id = qa['id']
                question = qa['question']
                
                for ans in qa['answers']:
                    answer = ans['text']
                    ans_start = ans['answer_start']
                    ans_end = ans_start + len(answer)
                    
                    qa_dict = {}
                    qa_dict['id'] = id
                    qa_dict['context'] = context
                    qa_dict['question'] = question
                    qa_dict['label'] = [ans_start, ans_end]

                    qa_dict['answer'] = answer
                    qa_list.append(qa_dict)    

    return qa_list

    

In [212]:
train_data = parse_data(train_data)
print("Length of train_data: ", len(train_data))
print(train_data[0:4])

Length of train_data:  86821
[{'id': '56be85543aeaaa14008c9063', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'label': [269, 286], 'answer': 'in the late 1990s'}, {'id': '56be85543aeaaa14008c9065', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born Sep

In [201]:

dev_data = parse_data(dev_data)
print("Length of dev_data: ", len(dev_data))

Length of dev_data:  20302


In [202]:
def lemmatize(doc):
    return [token.lemma_ for token in doc]


In [203]:
def collectExi(data):
    exi_data = []

    for pair in data:
        context = nlp(pair['context'])
        theres_indices = []
        lemma = lemmatize(context)
        lemma_enum = enumerate(lemma)             #"enumerate" helps get the indices of all "there"s that occur in the context
        for i,j in lemma_enum:
            if j == 'there':                      #identify the expletive "there"
                theres_indices.append(i)
        
        if theres_indices != []:
            #print(theres_indices)
            
            for i in theres_indices:
                for k in range(3):                #considering up to 3 lemmas succeeding each "there" to check if there is any copula
                    if lemma[i+k] == 'be':        #identify the copula
                        exi_data.append(pair)
                        break
                    if lemma[i+k] == '.':         #If the sentence ends before the occurrence of any 'be', this "there" must be irrelevant to this case.
                        break
                    
    print("Length of exi_data: ", len(exi_data))
    
    

    return exi_data

In [195]:
collectExi(train_data)

Length of exi_data:  11371


TypeError: list indices must be integers or slices, not str

In [None]:
collectExi(dev_data)