In [1]:
import os
import urllib.request
from tqdm import tqdm
import pandas as pd
import json

In [2]:
import os
import urllib.request
from tqdm import tqdm


class DownloadProgressBar(tqdm):
    """A ``tqdm`` bar whose ``update_to`` method can serve as a progress callback."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to the absolute position ``b * bsize``.

        Args:
            b: Multiplier for ``bsize`` (a block count when used as a
                ``urlretrieve`` report hook).
            bsize: Size of one block, in the bar's unit.
            tsize: New total for the bar; ``None`` keeps the current total.
        """
        if tsize is not None:
            self.total = tsize

        # ``update`` expects an increment, so convert the absolute position
        # into the delta from what the bar has already counted (``self.n``).
        position = b * bsize
        self.update(position - self.n)


def download_url(url, output_path):
    """Retrieve ``url`` into ``output_path`` while displaying a progress bar.

    Args:
        url: Address of the resource to fetch.
        output_path: Local file path the resource is written to.
    """
    # Label the bar with the last path component of the URL.
    bar_label = url.split('/')[-1]

    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=bar_label) as progress_bar:
        urllib.request.urlretrieve(
            url,
            filename=output_path,
            reporthook=progress_bar.update_to,
        )


def download_data(data_path, url_path, suffix):
    """Download one data split to ``<data_path>/<suffix>.json`` unless it already exists.

    The payload is first written to ``<suffix>.json.part`` and only moved to
    its final name once the download has finished. A failed or interrupted
    download therefore never leaves a truncated ``<suffix>.json`` behind that
    a later call would mistake for a complete file and skip.

    Args:
        data_path: Directory that holds the data files; created if missing.
        url_path: URL of the JSON file to download.
        suffix: Split name, used as the file stem and in the log message.

    Raises:
        Whatever ``download_url`` raises; the partial file is removed first.
    """
    os.makedirs(data_path, exist_ok=True)

    target_path = os.path.join(data_path, f'{suffix}.json')
    if os.path.exists(target_path):
        # Already downloaded by a previous run: nothing to do.
        return

    partial_path = f'{target_path}.part'

    print(f"Downloading CoQA {suffix} data split... (it may take a while)")
    try:
        download_url(url=url_path, output_path=partial_path)
        os.replace(partial_path, target_path)
    except BaseException:
        # BaseException on purpose: a kernel interrupt (KeyboardInterrupt)
        # must also clean up the incomplete file before propagating.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    print("Download completed!")


In [3]:
# Train data
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"

# Test data: the CoQA dev file is saved under the 'test' suffix.
# <-- Why test? See next slides for an answer!
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"

# Fetch both splits into the 'coqa' directory, train first.
for split_suffix, split_url in (('train', train_url), ('test', test_url)):
    download_data(data_path='coqa', url_path=split_url, suffix=split_suffix)

In [4]:
# Read the downloaded split. The encoding is given explicitly so the stories'
# non-ASCII characters decode the same way on every platform instead of
# depending on the locale's default encoding.
with open('./coqa/test.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)['data']

# Flatten the nested structure into one record per (question, answer) pair,
# repeating the story text on every row.
# Note: rows whose answer is "unknown" are deliberately kept (no filtering).
dataframe_rows = [
    {
        "story": x['story'],
        "question": q['input_text'],
        "answer": a['input_text'],
        "span_text": a['span_text'],
        "span_start": a['span_start'],
        "span_end": a['span_end'],
    }
    for x in data
    for q, a in zip(x['questions'], x['answers'])
]

# Pin the columns so that even an empty file yields the expected schema.
df = pd.DataFrame(
    dataframe_rows,
    columns=["story", "question", "answer", "span_text", "span_start", "span_end"],
)


In [5]:
# Rows whose free-form answer is the literal string 'unknown'.
df.loc[df['answer'].eq('unknown')]


Unnamed: 0,story,question,answer,span_text,span_start,span_end
20,Once there was a beautiful fish named Asta. As...,What did they do with the note,unknown,unknown,-1,-1
22,Once there was a beautiful fish named Asta. As...,were they excited,unknown,unknown,-1,-1
130,Which country grows the most tea? The answer i...,How did his body react to the tea?,unknown,unknown,-1,-1
229,"CHAPTER XXII \n\nNorthward, along the leeward ...",Were there cannibals?,unknown,unknown,-1,-1
287,CHAPTER FIFTY FIVE. \n\nWAITING. \n\nThe lengt...,Why was he excommunicated then?,unknown,unknown,-1,-1
...,...,...,...,...,...,...
6789,"Futsal, (literally ""mini-football"", ""futebol d...",Does it have the same number of players as reg...,unknown,unknown,-1,-1
6972,"CHAPTER XXIV: A Merry Home-Going \n\n""The Laug...",Which one was made of paper?,unknown,CHAPTER XXIV: A Merry Home-Going \n,0,34
7449,Guam (i/ˈɡwɑːm/ or /ˈɡwɒm/; Chamorro: Guåhån;[...,Is it the highest?,unknown,unknown,-1,-1
7509,The 2008 Summer Olympics torch relay was run f...,And did they climb any mountains?,unknown,unknown,-1,-1


In [6]:
# Rows answered 'unknown' that nevertheless carry a non-'unknown' span text.
df.loc[df['answer'].eq('unknown') & df['span_text'].ne('unknown')]


Unnamed: 0,story,question,answer,span_text,span_start,span_end
6972,"CHAPTER XXIV: A Merry Home-Going \n\n""The Laug...",Which one was made of paper?,unknown,CHAPTER XXIV: A Merry Home-Going \n,0,34
