In [None]:
import os
import urllib.request
from tqdm import tqdm
import pandas as pd
import json
import numpy as np
import tensorflow as tf
import random

## Setting Seed

In [None]:
def set_reproducibility(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer passed unchanged to Python's ``random`` module,
            NumPy's global generator and TensorFlow's global generator.

    Side effects:
        Sets the ``TF_DETERMINISTIC_OPS`` environment variable to ``'1'``.
    """
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

    # Same seeding order as before: Python, then NumPy, then TensorFlow.
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)


set_reproducibility(42)

## Dataset Download

In [None]:
import os
import urllib.request
from tqdm import tqdm


class DownloadProgressBar(tqdm):
    """Progress bar whose ``update_to`` can be used as a download report hook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to the absolute position ``b * bsize``.

        Args:
            b: Multiplier for ``bsize`` (presumably the number of blocks
                transferred so far when used as a ``urlretrieve`` hook).
            bsize: Size of one block.
            tsize: Total size; when not ``None`` it replaces ``self.total``.
        """
        if tsize is not None:
            self.total = tsize

        # `update` expects an increment, so convert the absolute position
        # into the delta from what the bar has already counted (`self.n`).
        position = b * bsize
        self.update(position - self.n)


def download_url(url, output_path):
    """Download ``url`` to ``output_path`` while showing a progress bar.

    The payload is first written to ``<output_path>.part`` and only renamed
    to ``output_path`` once the transfer finished. An interrupted or failed
    download therefore never leaves a truncated file at ``output_path`` that
    a later "file already exists" check would mistake for a complete one.

    Args:
        url: Address of the resource to fetch. Its last ``/``-separated
            segment is used as the progress bar label.
        output_path: Destination file path.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``os.replace`` raises; the
        temporary ``.part`` file is removed before the error propagates.
    """
    partial_path = f'{output_path}.part'

    try:
        with DownloadProgressBar(unit='B', unit_scale=True,
                                 miniters=1, desc=url.split('/')[-1]) as t:
            urllib.request.urlretrieve(
                url, filename=partial_path, reporthook=t.update_to)
        # Publish the file only after the whole transfer succeeded.
        os.replace(partial_path, output_path)
    except BaseException:
        # BaseException so that a KeyboardInterrupt (kernel interrupt) also
        # cleans up the partial file.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise


def download_data(data_path, url_path, suffix):
    """Fetch one dataset split into ``data_path`` unless it is already there.

    Args:
        data_path: Directory that holds the dataset; created when missing.
        url_path: URL the split is downloaded from.
        suffix: Split name; the file is stored as ``<suffix>.json``.
    """
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    target_file = os.path.join(data_path, f'{suffix}.json')

    # Already on disk: nothing to do.
    if os.path.exists(target_file):
        return

    print(f"Downloading CoQA {suffix} data split... (it may take a while)")
    download_url(url=url_path, output_path=target_file)
    print("Download completed!")


In [None]:
# Source URLs of the two CoQA splits used in this notebook.
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"

# Train data
download_data(suffix='train', url_path=train_url, data_path='coqa')

# Test data: the official dev split is stored under the name "test".
# <-- Why test? See next slides for an answer!
download_data(suffix='test', url_path=test_url, data_path='coqa')

## Creating DataFrame

In [None]:
def create_df(url):
    """Flatten a CoQA JSON split into one DataFrame row per dialogue turn.

    Args:
        url: Path of a local JSON file whose top-level ``"data"`` entry is a
            list of items, each with a ``"story"`` plus parallel
            ``"questions"`` and ``"answers"`` lists.

    Returns:
        pandas.DataFrame with the columns ``story``, ``question``,
        ``answer``, ``span_text``, ``span_start`` and ``span_end``. The
        columns are present even when the file contains no turns.

    Notes:
        No turn is filtered out here (unanswerable ones included); any
        filtering is left to the caller.
    """
    columns = ["story", "question", "answer",
               "span_text", "span_start", "span_end"]

    # JSON is UTF-8 by specification; being explicit keeps the result
    # independent of the platform's default text encoding.
    with open(url, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)['data']

    # Questions and answers are paired positionally within each story.
    dataframe_rows = [
        {
            "story": x['story'],
            "question": q['input_text'],
            "answer": a['input_text'],
            "span_text": a['span_text'],
            "span_start": a['span_start'],
            "span_end": a['span_end'],
        }
        for x in data
        for q, a in zip(x['questions'], x['answers'])
    ]

    return pd.DataFrame(dataframe_rows, columns=columns)

In [None]:
# Load both downloaded splits into DataFrames (train first, then test).
df_train, df_test = (
    create_df('./coqa/train.json'),
    create_df('./coqa/test.json'),
)

## Remove unanswerable questions

In [None]:
# Every training row whose answer is the literal string 'unknown'.
df_train.loc[df_train['answer'].eq('unknown')]

In some cases 'unknown' is the correct answer, so we remove only the rows in which the span text is 'unknown'.

In [None]:
# Rows answered 'unknown' whose span text is something other than 'unknown'.
df_train.loc[df_train['answer'].eq('unknown') & df_train['span_text'].ne('unknown')]

In [None]:
# A turn is unanswerable when both its answer AND its span text are the
# literal 'unknown'. Rows answered 'unknown' but backed by a real span are
# legitimate answers and must be kept, so the span test is `==`, not `!=`.
index_train = df_train.loc[(df_train['answer'] == 'unknown') & (df_train['span_text'] == 'unknown')].index
index_test = df_test.loc[(df_test['answer'] == 'unknown') & (df_test['span_text'] == 'unknown')].index

# Rebind instead of dropping in place so re-running the cell is harmless and
# frames displayed by earlier cells are not mutated behind the reader's back.
df_train = df_train.drop(index=index_train)
df_test = df_test.drop(index=index_test)

## Data Inspection

In [None]:
# Stack train and test into one frame with a fresh 0..n-1 index.
df = pd.concat((df_train, df_test), ignore_index=True)

In [None]:
# Quick overview of the combined dataset: shape, column names, first rows.
print(
    f"Dataset size: {df.shape}",
    f"Dataset columns: {df.columns.values}",
    f"Some examples: {df.iloc[:5]}",
    sep="\n",
)

In [None]:
# Build the analysis frame as a new object derived from `df`.
# `pd.DataFrame(df)` is not a guaranteed copy: depending on the pandas
# version it may share its underlying data with `df`, so columns added for
# the analysis could leak back into the dataset. `assign` always returns a
# new frame.
df_analysis = df.assign(
    # First run of word characters in the lower-cased question.
    q_first_word=lambda frame: frame['question'].str.lower().str.extract(
        r'(\w+)', expand=False),
    # First two whitespace-separated tokens of the lower-cased question
    # (NaN when the question has fewer than two tokens).
    q_first_two_words=lambda frame: frame['question'].str.lower().str.extract(
        r'^((?:\S+\s+){1}\S+).*', expand=False),
)

Most frequent first words in questions

In [None]:
# 15 most frequent first words, most common first.
(
    df_analysis
    .groupby('q_first_word')
    .size()
    .sort_values(ascending=False)
    .iloc[:15]
)


Most frequent first two words in questions

In [None]:
# 15 most frequent two-word question openings, most common first.
(
    df_analysis
    .groupby('q_first_two_words')
    .size()
    .sort_values(ascending=False)
    .iloc[:15]
)

Percentage of answers that are rephrased (not found verbatim in the span text) versus answers copied verbatim from the span text

In [None]:
# sia[i] is True when the i-th answer occurs verbatim inside its span text.
# Walking the two columns in lockstep avoids a per-row label lookup and does
# not depend on `df` having a 0..n-1 integer index.
sia = [answer in span for answer, span in zip(df["answer"], df["span_text"])]

if sia:
    print(f'Percentage of rielaborated answers: {sia.count(False)/len(sia)*100:.2f}%')
    print(f'Percentage of not rielaborated answers: {sia.count(True)/len(sia)*100:.2f}%')
else:
    # Nothing to report (and nothing to divide by) on an empty dataset.
    print("No answers to analyse: the dataset is empty.")