In [4]:
import os
import urllib.request
from tqdm import tqdm
import pandas as pd
import json

## Dataset Download

In [5]:
import os
import urllib.request
from tqdm import tqdm


class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)


def download_url(url, output_path):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(
            url, filename=output_path, reporthook=t.update_to)


def download_data(data_path, url_path, suffix):
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    data_path = os.path.join(data_path, f'{suffix}.json')

    if not os.path.exists(data_path):
        print(f"Downloading CoQA {suffix} data split... (it may take a while)")
        download_url(url=url_path, output_path=data_path)
        print("Download completed!")


In [6]:
# Train data
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
download_data(data_path='coqa', url_path=train_url, suffix='train')

# Test data
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"
download_data(data_path='coqa', url_path=test_url, suffix='test')  # <-- Why test? See next slides for an answer!

Downloading CoQA train data split... (it may take a while)


coqa-train-v1.0.json: 49.0MB [00:50, 970kB/s]                               


Download completed!
Downloading CoQA test data split... (it may take a while)


coqa-dev-v1.0.json: 9.09MB [00:05, 1.64MB/s]                            

Download completed!





## Creating DataFrame

In [11]:
def create_df(url):
    with open(url, 'r') as json_file:
    	data = json.load(json_file)['data']
    
    dataframe_rows = []

    for x in data:
        story = x['story']

        for q, a in zip(x['questions'], x['answers']):
            if (a["span_text"], a["span_start"], a["span_end"], a["input_text"]) == ("unknown", -1, -1,"unknown"):
                continue

            question = q['input_text']
            answer = a['input_text']
            span_text = a['span_text']
            span_start = a['span_start']
            span_end = a['span_end']

            # create single dataframe row
            dataframe_row = {
                "story": story,
                "question": question,
                "answer": answer,
                "span_text": span_text,
                "span_start": span_start,
                "span_end": span_end,
            }

            dataframe_rows.append(dataframe_row)

    return pd.DataFrame(dataframe_rows)

In [14]:
df_train = create_df('./coqa/train.json')
df_test = create_df('./coqa/test.json')

In [17]:
df = pd.concat([df_train, df_test])
df

Unnamed: 0,story,question,answer,span_text,span_start,span_end
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475,Formally established in 1475,151,179
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research,he Vatican Library is a research library,454,494
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law",Vatican Library is a research library for hist...,457,511
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology",Vatican Library is a research library for hist...,457,545
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project,"March 2014, the Vatican Library began an initi...",769,879
...,...,...,...,...,...,...
7978,"Las Vegas (, Spanish for ""The Meadows""), offic...","where does the nickname ""Sin City"" come from?",The city's tolerance for numerous forms of adu...,The city's tolerance for numerous forms of adu...,1037,1131
7979,"Las Vegas (, Spanish for ""The Meadows""), offic...",Which state is it in?,Nevada,"Vegas, is the 28th-most populated city in the ...",100,207
7980,"Las Vegas (, Spanish for ""The Meadows""), offic...",Is it located in a desert?,Yes,within the greater Mojave Desert,326,358
7981,"Las Vegas (, Spanish for ""The Meadows""), offic...",what is the name of the desert?,Mojave Desert.,Mojave Desert.,345,359


In [15]:
df

Unnamed: 0,story,question,answer,span_text,span_start,span_end
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?What color wa...,It was formally established in 1475white,Formally established in 1475a little white kit...,210.0,272.0
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?Where did she live?,researchin a barn,he Vatican Library is a research libraryin a b...,472.0,574.0
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?Did she live alone?,"history, and lawno",Vatican Library is a research library for hist...,653.0,726.0
3,"The Vatican Apostolic Library (), more commonl...",and?Who did she live with?,"philosophy, science and theologywith her mommy...",Vatican Library is a research library for hist...,738.0,860.0
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?What color were her s...,a projectorange and white,"March 2014, the Vatican Library began an initi...",1197.0,1369.0
...,...,...,...,...,...,...
108642,,,,,,
108643,,,,,,
108644,,,,,,
108645,,,,,,


In [9]:
df.loc[(df['answer'] == 'unknown')]


Unnamed: 0,story,question,answer,span_text,span_start,span_end
20,Once there was a beautiful fish named Asta. As...,What did they do with the note,unknown,unknown,-1,-1
22,Once there was a beautiful fish named Asta. As...,were they excited,unknown,unknown,-1,-1
130,Which country grows the most tea? The answer i...,How did his body react to the tea?,unknown,unknown,-1,-1
229,"CHAPTER XXII \n\nNorthward, along the leeward ...",Were there cannibals?,unknown,unknown,-1,-1
287,CHAPTER FIFTY FIVE. \n\nWAITING. \n\nThe lengt...,Why was he excommunicated then?,unknown,unknown,-1,-1
...,...,...,...,...,...,...
6789,"Futsal, (literally ""mini-football"", ""futebol d...",Does it have the same number of players as reg...,unknown,unknown,-1,-1
6972,"CHAPTER XXIV: A Merry Home-Going \n\n""The Laug...",Which one was made of paper?,unknown,CHAPTER XXIV: A Merry Home-Going \n,0,34
7449,Guam (i/ˈɡwɑːm/ or /ˈɡwɒm/; Chamorro: Guåhån;[...,Is it the highest?,unknown,unknown,-1,-1
7509,The 2008 Summer Olympics torch relay was run f...,And did they climb any mountains?,unknown,unknown,-1,-1


In [10]:
df.loc[(df['answer'] == 'unknown') & (df['span_text'] != 'unknown')]


Unnamed: 0,story,question,answer,span_text,span_start,span_end
6972,"CHAPTER XXIV: A Merry Home-Going \n\n""The Laug...",Which one was made of paper?,unknown,CHAPTER XXIV: A Merry Home-Going \n,0,34
