In [None]:
# !pip install transformers
# !pip install datasets

In [None]:
import os
import urllib.request
import pandas as pd
import json
import numpy as np
import tensorflow as tf
import random

from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoTokenizer


## Setting Seed

In [None]:
def set_reproducibility(seed):
    """Seed every random number generator used by this notebook.

    Args:
        seed: Value handed to Python's ``random`` module, NumPy's global
            generator and TensorFlow's global generator.
    """
    # Same seed for each library, applied in the order stdlib -> NumPy -> TensorFlow.
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)
    # Environment switch read by TensorFlow; presumably requests deterministic op
    # implementations (the exact effect depends on the TensorFlow version).
    os.environ.update(TF_DETERMINISTIC_OPS='1')

# Single seed shared by the RNG setup here and by the train/validation split further down.
seed = 42
set_reproducibility(seed)

## Dataset Download

In [None]:
import os
import urllib.request
from tqdm import tqdm


class DownloadProgressBar(tqdm):
    """tqdm bar whose ``update_to`` can be passed as the ``reporthook`` of ``urlretrieve``."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to the absolute position ``b * bsize``.

        Args:
            b: Number of blocks transferred so far.
            bsize: Size of each block.
            tsize: Total size; when not ``None`` it replaces the bar's ``total``.
        """
        if tsize is not None:
            self.total = tsize
        transferred = b * bsize
        # tqdm.update() expects an increment, so subtract what is already shown (self.n).
        self.update(transferred - self.n)


def download_url(url, output_path):
    """Download ``url`` to ``output_path`` while showing a progress bar.

    The data is first written to ``<output_path>.part`` and only renamed to
    ``output_path`` once the transfer has finished. An interrupted or failed
    download therefore never leaves a truncated file at ``output_path`` that a
    later "file already exists" check would mistake for a complete download.

    Args:
        url: Address of the resource; its last path component labels the bar.
        output_path: Destination file path.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or the final rename raises; the
        temporary ``.part`` file is removed before the error propagates.
    """
    partial_path = f'{output_path}.part'
    try:
        with DownloadProgressBar(unit='B', unit_scale=True,
                                 miniters=1, desc=url.split('/')[-1]) as t:
            urllib.request.urlretrieve(
                url, filename=partial_path, reporthook=t.update_to)
        # Publish the file under its final name only after a complete transfer.
        os.replace(partial_path, output_path)
    finally:
        # Still present only when the transfer or the rename did not succeed.
        if os.path.exists(partial_path):
            os.remove(partial_path)


def download_data(data_path, url_path, suffix):
    """Fetch one CoQA split into ``<data_path>/<suffix>.json`` unless it is already there.

    Args:
        data_path: Directory that stores the JSON files; created when missing.
        url_path: Remote location of the split.
        suffix: Split name, used as the file stem and in the progress messages.
    """
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    target_file = os.path.join(data_path, f'{suffix}.json')

    # A file that is already on disk is taken as-is; nothing is downloaded again.
    if os.path.exists(target_file):
        return

    print(f"Downloading CoQA {suffix} data split... (it may take a while)")
    download_url(url=url_path, output_path=target_file)
    print("Download completed!")


In [None]:
# URLs of the CoQA v1.0 train and dev JSON files.
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"

# Train data first, then test data. The dev file is stored under the name 'test'.
# <-- Why test? See next slides for an answer!
for split_suffix, split_url in (('train', train_url), ('test', test_url)):
    download_data(data_path='coqa', url_path=split_url, suffix=split_suffix)

## Creating DataFrame

In [None]:
def create_df(url):
    """Flatten a CoQA JSON file into one DataFrame row per question/answer turn.

    Args:
        url: Path of a local CoQA JSON file (despite the name, it is opened with
            ``open`` and not fetched over the network).

    Returns:
        pandas.DataFrame with the columns ``story``, ``question``, ``answer``,
        ``span_text``, ``span_start`` and ``span_end``. The columns are present
        even when the file contains no turns at all.
    """
    columns = ["story", "question", "answer", "span_text", "span_start", "span_end"]

    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(url, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)['data']

    dataframe_rows = [
        {
            "story": x['story'],
            "question": q['input_text'],
            "answer": a['input_text'],
            "span_text": a['span_text'],
            "span_start": a['span_start'],
            "span_end": a['span_end'],
        }
        for x in data
        # Questions and answers of a story are paired by their position in the lists.
        for q, a in zip(x['questions'], x['answers'])
    ]

    # Passing the column list keeps the schema stable for an empty input.
    return pd.DataFrame(dataframe_rows, columns=columns)

In [None]:
# Load both downloaded files and tag every row with the file it came from.
df_train = create_df('./coqa/train.json').assign(split='train')
df_test = create_df('./coqa/test.json').assign(split='test')

# Stack them into one frame with a fresh 0..n-1 index.
df = pd.concat((df_train, df_test), ignore_index=True)


## Remove unanswerable questions

In [None]:
# Show every row whose free-form answer is exactly 'unknown'.
df[df['answer'] == 'unknown']

In some cases 'unknown' is the correct answer, so we remove only the rows in which the span text is also 'unknown'.

In [None]:
# Show the rows where both the answer and the span text are exactly 'unknown'.
answer_unknown = df['answer'] == 'unknown'
span_unknown = df['span_text'] == 'unknown'
df.loc[answer_unknown & span_unknown]

In [None]:
# Labels of the rows that are 'unknown' in both the answer and the span text.
unknown_in_both = (df['answer'] == 'unknown') & (df['span_text'] == 'unknown')
index = df.index[unknown_in_both]

# Drop those rows and renumber the remaining ones from 0.
df = df.drop(index).reset_index(drop=True)


In [None]:
# Lower-case every text column with pandas' vectorized string accessor instead of a
# per-element Python lambda (missing values are passed through instead of raising).
# NOTE(review): span_start/span_end index the story by character; this assumes that
# lower-casing never changes the length of a story (true for ASCII text) — confirm for the data.
for key in ['story', 'question', 'answer', 'span_text']:
    df[key] = df[key].str.lower()
df

## Data Inspection

In [None]:
# Overall shape, available columns and the first rows (labels 0 through 5).
print(
    f"Dataset size: {df.shape}",
    f"Dataset columns: {df.columns.values}",
    "Some examples:",
    sep="\n",
)
df.loc[:5]

In [None]:
# Work on a copy so the exploratory columns never end up in the frame used for training.
df_analysis = df.copy()
questions_lower = df_analysis['question'].str.lower()

# First run of word characters found in the question.
df_analysis['q_first_word'] = questions_lower.str.extract(r'(\w+)')
# First two whitespace-separated tokens; NaN when the pattern does not match
# (for example a one-word question).
df_analysis['q_first_two_words'] = questions_lower.str.extract(r'^((?:\S+\s+){1}\S+).*')

Most frequent first words in the questions

In [None]:
# Number of questions per first word, showing the 15 largest groups.
first_word_counts = df_analysis.groupby('q_first_word').size()
first_word_counts.sort_values(ascending=False).head(15)

Most frequent first two words (bigrams) in the questions

In [None]:
# Number of questions per opening bigram, showing the 15 largest groups.
first_bigram_counts = df_analysis.groupby('q_first_two_words').size()
first_bigram_counts.sort_values(ascending=False).head(15)

Percentage of rielaborated (rephrased) and non-rielaborated answers. A non-rielaborated answer appears verbatim inside its span text, i.e. it is just a snippet of the context.

In [None]:
# sia[i] is True when the answer of row i occurs verbatim inside the span text of row i.
# One pass over the two columns replaces the per-row ``df[col][i]`` label lookups, which
# re-select the column and search the index on every iteration.
sia = [answer in span_text for answer, span_text in zip(df["answer"], df["span_text"])]
print(f'Percentage of rielaborated answers: {sia.count(False)/len(sia)*100:.2f}%')
print(f'Percentage of not rielaborated answers: {sia.count(True)/len(sia)*100:.2f}%')


## Train, Validation and Test splits

In [None]:
# Split at story level so that all questions about one story end up in the same split.
stories = df.loc[df['split'] != 'test', 'story'].unique()

story_train, story_val = train_test_split(stories, test_size=0.2, random_state=seed)

# np.select keeps the first matching condition. The official test rows are matched first,
# so a story that also occurs in the train file can never move test rows into train/val.
conditions = [
    (df['split'] == 'test'),
    (df['story'].isin(story_train)),
    (df['story'].isin(story_val))]
choices = ['test', 'train', 'val']
# Every row is expected to match one condition. The explicit string default keeps the
# result a pure string array: np.select's implicit default is the integer 0, which newer
# NumPy versions refuse to combine with string choices.
df['split'] = np.select(conditions, choices, default='unassigned')

df

In [None]:
# Verifying the train/val split ratio.
item_counts = df["split"].value_counts()
print(item_counts, '\n')
len_train_val = len(df.loc[(df['split'] == 'train') | (df['split'] == 'val')])
# Look the counts up by split name. value_counts() is ordered by frequency, so positions
# 0 and 1 are train and val only as long as those happen to be the two largest groups,
# and integer positions on a string-labelled Series are deprecated in pandas.
print(f'Train split {item_counts["train"]/len_train_val:.2f}')
print(f'Val split {item_counts["val"]/len_train_val:.2f}')

In [None]:
# One independent frame per split, each re-indexed from 0.
train_data, val_data, test_data = (
    df[df['split'] == split_name].reset_index(drop=True)
    for split_name in ('train', 'val', 'test')
)

In [None]:
from datasets import Dataset, DatasetDict

In [None]:
# Wrap the train and validation frames as Hugging Face datasets (the test frame is left out).
split_frames = {'train': train_data, 'val': val_data}
datasets = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in split_frames.items()}
)

# Span Detection

## Model definition

In [None]:
# Pretrained checkpoint shared by the tokenizer created here and the model loaded later.
# NOTE(review): this is a *cased* checkpoint while the text columns were lower-cased
# earlier in the notebook — confirm that this combination is intended.
model_checkpoint = "distilbert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Length every tokenized feature is truncated or padded to.
max_length = 380
# The authorized overlap between two parts of the context when splitting.
doc_stride = 128

def prepare_train_features(examples):
    """Tokenize a batch of (question, story) pairs and attach token-level answer labels.

    Long stories are split into several overlapping features (``doc_stride`` tokens of
    overlap, ``max_length`` tokens each), so the output can hold more rows than the input.

    Args:
        examples: Batch as a mapping of column name to list, with the columns
            ``question``, ``story``, ``span_start`` and ``span_end``. The span bounds
            are character offsets into ``story``; ``span_end`` is treated as exclusive,
            like the end of a tokenizer offset (assumption — confirm against the data).

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping`` and
        ``offset_mapping``, plus ``start_positions`` and ``end_positions``: one token
        index pair per feature, both set to the CLS index when the span is not fully
        contained in that feature.
    """
    # Only the questions are stripped. The stories must stay untouched: span_start and
    # span_end index the original story text, and the tokenizer's offsets refer to the
    # exact string it receives, so stripping the story would shift every label.
    question = [q.lstrip() for q in examples["question"]]
    story = list(examples["story"])

    # Tokenize with truncation and padding, but keep the overflows using a stride. One
    # example can therefore give several features, each with a context that overlaps a
    # bit the context of the previous feature.
    tokenized_examples = tokenizer(
        question,
        story,
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Map from a feature to the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Map from a token to its character positions in the original text; used to compute
    # start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # Features that do not contain the answer are labeled with the CLS token index.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Tells which tokens belong to the question (0) and which to the story (1).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Index of the example this feature was cut from.
        sample_index = sample_mapping[i]

        # Start/end character index of the answer span in the story.
        start_char = examples["span_start"][sample_index]
        end_char = examples["span_end"][sample_index]

        # A span may begin or end on whitespace. Token offsets presumably never cover
        # whitespace, so untrimmed bounds would select the neighbouring token as well.
        # The trimmed bounds are used only when a non-empty span remains.
        context = story[sample_index]
        if 0 <= start_char < end_char <= len(context):
            trimmed_start, trimmed_end = start_char, end_char
            while trimmed_start < trimmed_end and context[trimmed_start].isspace():
                trimmed_start += 1
            while trimmed_end > trimmed_start and context[trimmed_end - 1].isspace():
                trimmed_end -= 1
            if trimmed_start < trimmed_end:
                start_char, end_char = trimmed_start, trimmed_end

        # First story token of the current feature.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        # Last story token of the current feature.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # Detect if the answer is out of the feature (in which case it is labeled with
        # the CLS index).
        if not (
            offsets[token_start_index][0] <= start_char
            and offsets[token_end_index][1] >= end_char
        ):
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Otherwise move token_start_index and token_end_index to the two ends of
            # the answer.
            # Note: we could go after the last offset if the answer is the last word
            # (edge case).
            while (
                token_start_index < len(offsets)
                and offsets[token_start_index][0] <= start_char
            ):
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            # The lower bound stops the scan at the first token instead of wrapping
            # around to negative indices when end_char is 0 or less.
            while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples


In [None]:
tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    # DatasetDict.column_names is a dict {split: [column names]}. map() needs the plain
    # list of column names, which is the same for both splits here.
    remove_columns=datasets["train"].column_names,
    num_proc=3,
)

train_set = tokenized_datasets["train"].with_format("numpy")[:]  # Load the whole dataset as a dict of numpy arrays
validation_set = tokenized_datasets["val"].with_format("numpy")[:]

In [None]:
from transformers import TFAutoModelForQuestionAnswering

# Question-answering model loaded from the same checkpoint name as the tokenizer.
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [None]:
import tensorflow as tf
from tensorflow import keras

# Adam with a fixed learning rate of 5e-5.
optimizer = keras.optimizers.Adam(learning_rate=5e-5)

# NOTE(review): the mixed-precision policy is switched on only after `model` was created
# in the previous cell; presumably layers built earlier keep their original dtype policy —
# confirm that the policy has the intended effect here.
keras.mixed_precision.set_global_policy("mixed_float16")

# No loss is passed to compile(); presumably the model falls back to its own internal loss
# computed from start_positions/end_positions in the input dict — TODO confirm.
model.compile(optimizer=optimizer)

# One epoch over the training features, evaluated on the validation features.
history = model.fit(train_set, validation_data=validation_set, epochs=1)

In [None]:
# inputs = tokenizer([str(datasets["test"]['story'])], [str(datasets["test"]['question'])], return_tensors="np")
context = """Keras is an API designed for human beings, not machines. Keras follows best
practices for reducing cognitive load: it offers consistent & simple APIs, it minimizes
the number of user actions required for common use cases, and it provides clear &
actionable error messages. It also has extensive documentation and developer guides. """
question = "What is Keras?"

# Same argument order as in prepare_train_features: question first, context second.
inputs = tokenizer([question], [context], return_tensors="np")

outputs = model(inputs)
# argmax over axis 1 gives one token index per batch row.
start_position = tf.argmax(outputs.start_logits, axis=1)
end_position = tf.argmax(outputs.end_logits, axis=1)
# Pick row 0 explicitly for both values instead of converting a 1-element tensor implicitly.
print(int(start_position[0]), int(end_position[0]))

# Bert Generation