In [170]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [171]:
import os
import urllib.request
import pandas as pd
import json
import numpy as np
import tensorflow as tf
import random

from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoTokenizer


## Setting Seed

In [172]:
def set_reproducibility(seed):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

seed = 42
set_reproducibility(seed)

## Dataset Download

In [173]:
import os
import urllib.request
from tqdm import tqdm


class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)


def download_url(url, output_path):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(
            url, filename=output_path, reporthook=t.update_to)


def download_data(data_path, url_path, suffix):
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    data_path = os.path.join(data_path, f'{suffix}.json')

    if not os.path.exists(data_path):
        print(f"Downloading CoQA {suffix} data split... (it may take a while)")
        download_url(url=url_path, output_path=data_path)
        print("Download completed!")


In [174]:
# Train data
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
download_data(data_path='coqa', url_path=train_url, suffix='train')

# Test data
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"
download_data(data_path='coqa', url_path=test_url, suffix='test')  # <-- Why test? See next slides for an answer!

## Creating DataFrame

In [175]:
def create_df(url):
    with open(url, 'r') as json_file:
    	data = json.load(json_file)['data']
    
    dataframe_rows = []

    for x in data:
        story = x['story']

        for q, a in zip(x['questions'], x['answers']):

            question = q['input_text']
            answer = a['input_text']
            span_text = a['span_text']
            span_start = a['span_start']
            span_end = a['span_end']

            # create single dataframe row
            dataframe_row = {
                "story": story,
                "question": question,
                "answer": answer,
                "span_text": span_text,
                "span_start": span_start,
                "span_end": span_end,
            }

            dataframe_rows.append(dataframe_row)

    return pd.DataFrame(dataframe_rows)

In [176]:
df_train = create_df('./coqa/train.json')
df_test = create_df('./coqa/test.json')
df_train['split'] = 'train'
df_test['split'] = 'test'

df = pd.concat([df_train, df_test], ignore_index=True)


In [177]:
df

Unnamed: 0,story,question,answer,span_text,span_start,span_end,split
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475,Formally established in 1475,151,179,train
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research,he Vatican Library is a research library,454,494,train
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law",Vatican Library is a research library for hist...,457,511,train
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology",Vatican Library is a research library for hist...,457,545,train
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project,"March 2014, the Vatican Library began an initi...",769,879,train
...,...,...,...,...,...,...,...
116625,"Las Vegas (, Spanish for ""The Meadows""), offic...","where does the nickname ""Sin City"" come from?",The city's tolerance for numerous forms of adu...,The city's tolerance for numerous forms of adu...,1037,1131,test
116626,"Las Vegas (, Spanish for ""The Meadows""), offic...",Which state is it in?,Nevada,"Vegas, is the 28th-most populated city in the ...",100,207,test
116627,"Las Vegas (, Spanish for ""The Meadows""), offic...",Is it located in a desert?,Yes,within the greater Mojave Desert,326,358,test
116628,"Las Vegas (, Spanish for ""The Meadows""), offic...",what is the name of the desert?,Mojave Desert.,Mojave Desert.,345,359,test


## Remove unanswerable questions

In [178]:
df.loc[(df['answer'] == 'unknown')]

Unnamed: 0,story,question,answer,span_text,span_start,span_end,split
13,"The Vatican Apostolic Library (), more commonl...",what must be requested to view?,unknown,unknown,-1,-1,train
349,(CNN) -- The 54-year-old Michigan tree trimmer...,Were they the relatives of the kid?,unknown,unknown,-1,-1,train
352,(CNN) -- The 54-year-old Michigan tree trimmer...,Where did he go afterwards?,unknown,unknown,-1,-1,train
354,(CNN) -- The 54-year-old Michigan tree trimmer...,Who found them?,unknown,unknown,-1,-1,train
356,(CNN) -- The 54-year-old Michigan tree trimmer...,Did he have any siblings?,unknown,unknown,-1,-1,train
...,...,...,...,...,...,...,...
115436,"Futsal, (literally ""mini-football"", ""futebol d...",Does it have the same number of players as reg...,unknown,unknown,-1,-1,test
115619,"CHAPTER XXIV: A Merry Home-Going \n\n""The Laug...",Which one was made of paper?,unknown,CHAPTER XXIV: A Merry Home-Going \n,0,34,test
116096,Guam (i/ˈɡwɑːm/ or /ˈɡwɒm/; Chamorro: Guåhån;[...,Is it the highest?,unknown,unknown,-1,-1,test
116156,The 2008 Summer Olympics torch relay was run f...,And did they climb any mountains?,unknown,unknown,-1,-1,test


In same cases 'unknown' is the correct answer, so we remove only the one in which the spam text is 'unknown'

In [179]:
df.loc[(df['answer'] == 'unknown') & (df['span_text'] != 'unknown')]

Unnamed: 0,story,question,answer,span_text,span_start,span_end,split
2953,British actor and comedian Rik Mayall died at ...,What did cause his death?,unknown,"His agent, Kate Benson told CNN Mayall died su...",247,343,train
3732,"A few years ago, Paul Gerner began to gather a...",How many are done then?,unknown,Four architecture teams have nearly finished d...,538,618,train
13762,Jean is a young girl who comes from a rich and...,What was his name?,unknown,When Jean's father told her that he was going ...,998,1079,train
15815,(CNN) -- It was at San Francisco's Olympic Clu...,Did he golf?,unknown,"It was at San Francisco's Olympic Club that ""G...",9,186,train
17684,CHAPTER XII \n\nCONSOLATION \n\nOn the followi...,What was the jealous one wishing to do?,unknown,On the following Sunday neither Tudor nor Norm...,28,91,train
26402,"San Marino, officially the Republic of San Mar...",How old is it?,unknown,San Marino lays claim to be the oldest extant...,808,919,train
67001,"Fresno (/ˈfrɛznoʊ/ FREZ-noh), the county seat ...",Is it a good place to live?,unknown,"Fresno (/ˈfrɛznoʊ/ FREZ-noh), the county seat ...",0,127,train
75486,The history of India includes the prehistoric ...,Which civilization had a collapse in the begin...,unknown,A sophisticated and technologically advanced ...,1268,1461,train
87243,(CNN) -- Newcastle's 16-year stay in the Engli...,Why is his career over?,unknown,Newcastle's 16-year stay in the English Premi...,8,64,train
92962,"CHAPTER XXI \n\nA BOB SLED RACE \n\n""Whoop! hu...",What did Songbird wish?,unknown,", you'll be taking Songbird's",1384,1413,train


In [180]:
index = df.loc[(df['answer'] == 'unknown') & (df['span_text'] != 'unknown')].index

df = df.drop(index).reset_index(drop=True)


In [181]:
df

Unnamed: 0,story,question,answer,span_text,span_start,span_end,split
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475,Formally established in 1475,151,179,train
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research,he Vatican Library is a research library,454,494,train
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law",Vatican Library is a research library for hist...,457,511,train
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology",Vatican Library is a research library for hist...,457,545,train
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project,"March 2014, the Vatican Library began an initi...",769,879,train
...,...,...,...,...,...,...,...
116614,"Las Vegas (, Spanish for ""The Meadows""), offic...","where does the nickname ""Sin City"" come from?",The city's tolerance for numerous forms of adu...,The city's tolerance for numerous forms of adu...,1037,1131,test
116615,"Las Vegas (, Spanish for ""The Meadows""), offic...",Which state is it in?,Nevada,"Vegas, is the 28th-most populated city in the ...",100,207,test
116616,"Las Vegas (, Spanish for ""The Meadows""), offic...",Is it located in a desert?,Yes,within the greater Mojave Desert,326,358,test
116617,"Las Vegas (, Spanish for ""The Meadows""), offic...",what is the name of the desert?,Mojave Desert.,Mojave Desert.,345,359,test


## Data Inspection

In [182]:
print(f"Dataset size: {df.shape}")
print(f"Dataset columns: {df.columns.values}")
print(f"Some examples: {df.iloc[:5]}")

Dataset size: (116619, 7)
Dataset columns: ['story' 'question' 'answer' 'span_text' 'span_start' 'span_end' 'split']
Some examples:                                                story  \
0  The Vatican Apostolic Library (), more commonl...   
1  The Vatican Apostolic Library (), more commonl...   
2  The Vatican Apostolic Library (), more commonl...   
3  The Vatican Apostolic Library (), more commonl...   
4  The Vatican Apostolic Library (), more commonl...   

                            question                               answer  \
0  When was the Vat formally opened?  It was formally established in 1475   
1           what is the library for?                             research   
2                 for what subjects?                     history, and law   
3                               and?     philosophy, science and theology   
4          what was started in 2014?                           a  project   

                                           span_text  span_start  sp

In [183]:
df_analysis = df.copy()
df_analysis['q_first_word']=df_analysis['question'].str.lower().str.extract(r'(\w+)')
df_analysis['q_first_two_words']=df_analysis['question'].str.lower().str.extract(r'^((?:\S+\s+){1}\S+).*')

Top ranking first word in question

In [184]:
df_analysis.groupby('q_first_word').size().sort_values(ascending=False).head(15)


q_first_word
what     34564
who      16756
how      11693
did       8018
where     7726
was       5483
when      4793
is        3645
why       3138
does      2277
and       2214
which     1854
were      1111
are       1094
in        1055
dtype: int64

Top ranking first two words in question

In [185]:
df_analysis.groupby('q_first_two_words').size().sort_values(ascending=False).head(15)

q_first_two_words
what did     6070
what was     5459
what is      5153
how many     3945
who was      3602
who is       2567
did he       2547
where did    2144
what does    1930
when did     1917
who did      1518
where was    1436
how did      1407
was he       1372
how old      1298
dtype: int64

Percentage of rielaborated or not rielaborated answers

In [186]:
df

Unnamed: 0,story,question,answer,span_text,span_start,span_end,split
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475,Formally established in 1475,151,179,train
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research,he Vatican Library is a research library,454,494,train
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law",Vatican Library is a research library for hist...,457,511,train
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology",Vatican Library is a research library for hist...,457,545,train
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project,"March 2014, the Vatican Library began an initi...",769,879,train
...,...,...,...,...,...,...,...
116614,"Las Vegas (, Spanish for ""The Meadows""), offic...","where does the nickname ""Sin City"" come from?",The city's tolerance for numerous forms of adu...,The city's tolerance for numerous forms of adu...,1037,1131,test
116615,"Las Vegas (, Spanish for ""The Meadows""), offic...",Which state is it in?,Nevada,"Vegas, is the 28th-most populated city in the ...",100,207,test
116616,"Las Vegas (, Spanish for ""The Meadows""), offic...",Is it located in a desert?,Yes,within the greater Mojave Desert,326,358,test
116617,"Las Vegas (, Spanish for ""The Meadows""), offic...",what is the name of the desert?,Mojave Desert.,Mojave Desert.,345,359,test


In [187]:
sia = []
for i in range(df.shape[0]):
    sia.append(df["answer"][i] in df["span_text"][i])
print(f'Percentage of rielaborated answers: {sia.count(False)/len(sia)*100:.2f}%')
print(f'Percentage of not rielaborated answers: {sia.count(True)/len(sia)*100:.2f}%')


Percentage of rielaborated answers: 39.26%
Percentage of not rielaborated answers: 60.74%


## Train, Validation and Test splits

In [188]:
stories = df["story"].loc[df['split'] == 'train'].unique()

story_train, story_val = train_test_split(stories, test_size=0.2, random_state=seed)

conditions = [
    (df['story'].isin(story_train)),
    (df['story'].isin(story_val)),
    (df['split'] == 'test')]
choices = ['train', 'val', 'test']
df['split'] = np.select(conditions, choices)
# df['split'] = np.where(df['story'].isin(story_train), 'train', 'val')

df

Unnamed: 0,story,question,answer,span_text,span_start,span_end,split
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475,Formally established in 1475,151,179,train
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research,he Vatican Library is a research library,454,494,train
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law",Vatican Library is a research library for hist...,457,511,train
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology",Vatican Library is a research library for hist...,457,545,train
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project,"March 2014, the Vatican Library began an initi...",769,879,train
...,...,...,...,...,...,...,...
116614,"Las Vegas (, Spanish for ""The Meadows""), offic...","where does the nickname ""Sin City"" come from?",The city's tolerance for numerous forms of adu...,The city's tolerance for numerous forms of adu...,1037,1131,test
116615,"Las Vegas (, Spanish for ""The Meadows""), offic...",Which state is it in?,Nevada,"Vegas, is the 28th-most populated city in the ...",100,207,test
116616,"Las Vegas (, Spanish for ""The Meadows""), offic...",Is it located in a desert?,Yes,within the greater Mojave Desert,326,358,test
116617,"Las Vegas (, Spanish for ""The Meadows""), offic...",what is the name of the desert?,Mojave Desert.,Mojave Desert.,345,359,test


In [191]:
item_counts = df["split"].value_counts()
item_counts

train    87622
val      21587
test      7410
Name: split, dtype: int64

In [198]:
len(df.loc[(df['split'] == 'train') | (df['split'] == 'val')])

109209

In [195]:
df.loc[(df['split'] == 'val')]

Unnamed: 0,story,question,answer,span_text,span_start,span_end,split
130,Local businessmen are increasingly facing comp...,What is a valuable service?,brick and mortar stores,Camera Co/Op,101,113,val
131,Local businessmen are increasingly facing comp...,What is the issue?,competition from online retailers,competition from online retailers.,42,76,val
132,Local businessmen are increasingly facing comp...,When did it begin?,with mail-order catalogues,with mail-order catalogues,240,266,val
133,Local businessmen are increasingly facing comp...,Is is happening quickly?,yes,developing at a more rapid rate than tradition...,765,826,val
134,Local businessmen are increasingly facing comp...,What is a safety issue?,online security,said she would not give out her credit card nu...,1653,1783,val
...,...,...,...,...,...,...,...
116329,Buenos Aires ( or ; ) is the capital and most ...,How many towns were added to the city limits a...,Two,The city limits were enlarged to include the t...,899,1012,val
116330,Buenos Aires ( or ; ) is the capital and most ...,what were they?,Belgrano and Flores,the towns of Belgrano and Flores,940,972,val
116331,Buenos Aires ( or ; ) is the capital and most ...,are they still part of the city?,Yes,both are now neighborhoods of the city,973,1012,val
116332,Buenos Aires ( or ; ) is the capital and most ...,What continent is Buenos Aires found on?,South America,The city is located on the western shore of th...,74,206,val


In [216]:
#veryfing split train val ratio
item_counts = df["split"].value_counts()
print(item_counts)
len_train_val = len(df.loc[(df['split'] == 'train') | (df['split'] == 'val')])
print(f'Train split {item_counts[0]/len_train_val:.2F}')
print(f'Val split {item_counts[1]/len_train_val:.2F}')

train    87622
val      21587
test      7410
Name: split, dtype: int64
Train split 0.80
Val split 0.20


In [202]:
train_data = df.loc[df['split'] == 'train']
val_data = df.loc[df['split'] == 'val']
test_data = df.loc[df['split'] == 'test']

## Model definition

In [203]:
model_checkpoint = "distilbert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [204]:
from transformers import TFAutoModelForQuestionAnswering

model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertForQuestionAnswering: ['vocab_layer_norm', 'vocab_transform', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['dropout_79', 'qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [205]:
max_length = model.config.max_position_embeddings
doc_stride = (
    128  # The authorized overlap between two part of the context when splitting
)

In [92]:
def prepare_train_features(examples):
    # Tokenize our examples with truncation and padding, but keep the overflows using a
    # stride. This results in one example possible giving several features when a context is long,
    # each of those features having a context that overlaps a bit the context of the previous
    # feature.
    question = [q.lstrip() for q in examples["question"]]
    story = [c.lstrip() for c in examples["story"]]
    
    tokenized_examples = tokenizer(
        question,
        story,
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a
    # map from a feature to its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original
    # context. This will help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what
        # is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this
        # span of text.
        sample_index = sample_mapping[i]
        
        # Start/end character index of the answer in the text.
        start_char = examples["span_start"][sample_index]
        end_char = examples["span_end"][sample_index]

        # Start token index of the current span in the text.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        # End token index of the current span in the text.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # Detect if the answer is out of the span (in which case this feature is labeled with the
        # CLS index).
        if not (
            offsets[token_start_index][0] <= start_char
            and offsets[token_end_index][1] >= end_char
        ):
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Otherwise move the token_start_index and token_end_index to the two ends of the
            # answer.
            # Note: we could go after the last offset if the answer is the last word (edge
            # case).
            while (
                token_start_index < len(offsets)
                and offsets[token_start_index][0] <= start_char
            ):
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples


In [206]:
df_tokenized = prepare_train_features(df)

In [215]:
df_tokenized[0:10]

[Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512

In [None]:
from transformers import pipeline

question_answering = pipeline("question-answering")
context = train_data['story'][0]
question = train_data['question'][0]
result = question_answering(question=question, context=context)
print("Answer:", result['answer'])
print("Score:", result['score'])