In [158]:
import json
import os

import numpy as np
import pandas as pd

In [159]:
# Loading dataset

# Nesting path down to the answer records inside the SQuAD json
record_path = ['data', 'paragraphs', 'qas', 'answers']

# Folder holding the training data, and the json file read from it
file_path = "./data/training/"
input_file = "".join((file_path, "training_set.json"))

def load_dataset(input_file, record_path = ['data', 'paragraphs','qas','answers']):
    """
    Flatten a SQuAD-style json file into a dataframe with one row per answer.

    :params input_file: path to the squad json file
    :params record_path: path to deepest level in json file; default value is ['data','paragraphs','qas','answers']

    :return df: dataframe resulting from json flattening, with columns
        ['index', 'question', 'title', 'context_id', 'context', 'text', 'answer_start'];
        'index' holds the question id and 'context_id' is an integer code
        assigned to each distinct context string
    """
    print("Reading json file...")
    # The context manager closes the handle even if parsing fails; the
    # encoding is explicit so the result does not depend on the OS locale.
    with open(input_file, encoding="utf-8") as fh:
        file = json.load(fh)
    print("processing...")

    # json_normalize lives at the top level of pandas since 1.0; the old
    # pandas.io.json location is only looked up on versions that lack it.
    json_normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize
    record_path = list(record_path)
    meta = [['data', 'title']]

    js = json_normalize(file, record_path, meta=meta)       # one row per answer
    m = json_normalize(file, record_path[:-1], meta=meta)   # one row per question
    r = json_normalize(file, record_path[:-2], meta=meta)   # one row per paragraph

    print("Json flattening completed!")
    print()
    print('Building flatten dataframe')
    # Repeat each context once per question of its paragraph, and each
    # question id once per answer, so the three levels line up row by row.
    idx = np.repeat(r['context'].to_numpy(), r['qas'].str.len().to_numpy())
    ndx = np.repeat(m['id'].to_numpy(), m['answers'].str.len().to_numpy())
    m['context'] = idx
    js['q_idx'] = ndx

    questions = m[['id', 'question', 'context']].set_index('id')
    answers = js.set_index('q_idx')
    # axis is passed by keyword (positional use is rejected by newer pandas);
    # the joined index is named explicitly so reset_index always yields the
    # 'index' column selected below.
    df = (
        pd.concat([questions, answers], axis=1, sort=False)
        .rename_axis('index')
        .reset_index()
    )
    df['context_id'] = df['context'].factorize()[0]
    df = df.rename(columns={"data.title": "title"})

    col_order = ['index', 'question', 'title', 'context_id', 'context', 'text', 'answer_start']
    df = df[col_order]

    print("shape of the dataframe is {}".format(df.shape))
    print("Done")
    print()
    return df

# Flatten the training json into a dataframe (load_dataset prints its progress and the resulting shape)
df = load_dataset(input_file)

Reading json file...
processing...
Json flattening completed!

Building flatten dataframe
shape of the dataframe is (87599, 7)
Done



In [160]:
df.head(10)

Unnamed: 0,index,question,title,context_id,context,text,answer_start
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",Saint Bernadette Soubirous,515
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",a copper statue of Christ,188
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",the Main Building,279
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",a Marian place of prayer and reflection,381
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",a golden statue of the Virgin Mary,92
5,5733bf84d058e614000b61be,When did the Scholastic Magazine of Notre dame...,University_of_Notre_Dame,1,"As at most other universities, Notre Dame's st...",September 1876,248
6,5733bf84d058e614000b61bf,How often is Notre Dame's the Juggler published?,University_of_Notre_Dame,1,"As at most other universities, Notre Dame's st...",twice,441
7,5733bf84d058e614000b61c0,What is the daily student paper at Notre Dame ...,University_of_Notre_Dame,1,"As at most other universities, Notre Dame's st...",The Observer,598
8,5733bf84d058e614000b61bd,How many student news papers are found at Notr...,University_of_Notre_Dame,1,"As at most other universities, Notre Dame's st...",three,126
9,5733bf84d058e614000b61c1,In what year did the student paper Common Sens...,University_of_Notre_Dame,1,"As at most other universities, Notre Dame's st...",1987,908


In [163]:
# Add answer end index

def add_end_idx(df):
    """
    Add an 'answer_end' column to a SQuAD dataframe that already has the answer start index.

    'answer_end' is computed as answer_start + len(text), i.e. the index one
    past the last answer character (an exclusive end), so the slice
    [answer_start:answer_end] spans exactly len(text) characters.

    :params df: dataframe with at least ['text', 'answer_start'] columns

    return df: the same dataframe object as the input (modified in place) with the new column 'answer_end'
    """
    # Vectorized over the whole column instead of iterating row by row.
    df['answer_end'] = df['answer_start'] + df['text'].str.len()
    return df

# Append the 'answer_end' column; add_end_idx writes it onto df itself and returns that same frame
df = add_end_idx(df)

In [166]:
df.head()

Unnamed: 0,index,question,title,context_id,context,text,answer_start,answer_end
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",Saint Bernadette Soubirous,515,541
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",a copper statue of Christ,188,213
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",the Main Building,279,296
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",a Marian place of prayer and reflection,381,420
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,University_of_Notre_Dame,0,"Architecturally, the school has a Catholic cha...",a golden statue of the Virgin Mary,92,126


In [167]:
# Save df to file_path
# The frame (including the 'answer_end' column) is pickled next to the raw
# training json under the name '<dataframe_name>.pkl'.
# This cell uses os.path, so `os` must be imported in the imports cell.
dataframe_name = 'squad_df'

dataframe_path = os.path.join(file_path, dataframe_name + ".pkl")
df.to_pickle(dataframe_path)

### Split df into train + validation set

In [200]:
# Reserve 20% of the rows (rounded to the nearest integer) for validation
val_set_size = round(len(df) * .2)
print(f'Validation set size based on 4:1 split: {val_set_size}')

Validation set size based on 4:1 split: 17520


In [199]:
# Looking for rows to remove from df
# Titles are visited in value_counts() order and a title is taken only when
# all of its rows still fit in the validation budget, so every article ends
# up entirely in one of the two splits.
title_counts = df['title'].value_counts().to_dict()
val_keys = []
size = 0

for t, n in title_counts.items():
    if size + n > val_set_size:
        # Taking this title would overshoot the budget; try the next one.
        continue
    val_keys.append(t)
    size += n

print(f'Number of rows to remove from df: {size}')
print()
print(f'Title of rows to remove from df: {val_keys}')


Number of rows to remove from df: 17520

Title of rows to remove from df: ['New_York_City', 'American_Idol', 'Beyoncé', 'Frédéric_Chopin', 'Queen_Victoria', 'Buddhism', 'New_Haven,_Connecticut', '2008_Sichuan_earthquake', '2008_Summer_Olympics_torch_relay', 'Muammar_Gaddafi', 'Hellenistic_period', 'Napoleon', 'Middle_Ages', 'Modern_history', 'Portugal', 'Gamal_Abdel_Nasser', 'Dwight_D._Eisenhower', 'Kanye_West', 'Southampton', 'The_Blitz', 'Greece', 'Religion_in_ancient_Rome', 'Gramophone_record', 'Dog', 'Roman_Republic', 'Pacific_War', 'Financial_crisis_of_2007%E2%80%9308', 'Mexico_City', 'Paris', 'History_of_India', 'London', 'Pub', 'Tucson,_Arizona', 'Protestantism', 'Pharmaceutical_industry', 'Plymouth', 'Boston', 'Group_(mathematics)']


In [210]:
# Training set

# Keep every row whose title was not picked for validation, renumbering rows from 0
train_df = df.loc[~df['title'].isin(val_keys)].reset_index(drop=True)
print(f'Train set shape: {train_df.shape}')
train_df.head()

Train set shape: (70079, 8)


Unnamed: 0,index,question,title,context_id,context,text,answer_start,answer_end
0,5730401c04bcaa1900d7740f,When was the constitution accepted by the elec...,51st_state,16284,This constitution was created when the U.S. Co...,1952,285,289
1,5730429d04bcaa1900d77437,What percentage of voters rejected the status ...,51st_state,16286,"In November 2012, a referendum resulted in 54 ...",54 percent,43,53
2,5730401c04bcaa1900d7740e,When was the Puerto Rican constitution written?,51st_state,16284,This constitution was created when the U.S. Co...,1951,160,164
3,5730401c04bcaa1900d77410,What clause gives Puerto Rican citizens the sa...,51st_state,16284,This constitution was created when the U.S. Co...,Privileges and Immunities Clause,513,545
4,57304103947a6a140053d344,How is Puerto Rico designated in its constitut...,51st_state,16285,Puerto Rico is designated in its constitution ...,Commonwealth of Puerto Rico,54,81


In [209]:
# Validation set

# Keep only the rows whose title was picked for validation, renumbering rows from 0
val_df = df.loc[df['title'].isin(val_keys)].reset_index(drop=True)
print(f'Validation set shape: {val_df.shape}')
val_df.head()

Validation set shape: (17520, 8)


Unnamed: 0,index,question,title,context_id,context,text,answer_start,answer_end
0,56d646091c8504140094705d,What did stations replace programming with?,2008_Sichuan_earthquake,577,All Mainland Chinese television stations (alon...,live earthquake footage,235,258
1,56cebd0faab44d1400b88973,What did their findings show?,2008_Sichuan_earthquake,533,"In the days following the disaster, an interna...",a variety of reasons why many constructions fa...,195,274
2,56d53a0e2593cc1400307af5,After the quake what kind of international tea...,2008_Sichuan_earthquake,533,"In the days following the disaster, an interna...",team of engineers,68,85
3,56d53a0e2593cc1400307af6,What was the team sent to China to make?,2008_Sichuan_earthquake,533,"In the days following the disaster, an interna...",survey of damaged buildings,146,173
4,56d53a0e2593cc1400307af7,What kind of reasons do their findings show?,2008_Sichuan_earthquake,533,"In the days following the disaster, an interna...",variety of reasons,197,215
