In [4]:
from datasets import load_dataset
# Load only the first 5,000 SQuAD training examples to keep the experiment
# small, then hold out 20% of them as a test split.
squad = load_dataset("squad", split="train[:5000]")
# train_test_split shuffles before splitting; a fixed seed makes the split
# identical on every run so downstream cells see the same rows after a
# kernel restart.
squad = squad.train_test_split(test_size=0.2, seed=42)

In [8]:
# Display the DatasetDict to check the split names, columns and row counts.
squad


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1000
    })
})

In [5]:
import pandas as pd
# Materialise the training split as a column -> values mapping and wrap it in
# a DataFrame so it can be inspected and exported with pandas.
data_dict = squad["train"].to_dict()
df = pd.DataFrame(data=data_dict)
# Preview the first rows.
df.head()


Unnamed: 0,id,title,context,question,answers
0,56ce4b8baab44d1400b8867b,Sino-Tibetan_relations_during_the_Ming_dynasty,When an ally of the Ü-Tsang ruler threatened d...,What role did Güshi Khan take on?,"{'text': ['protector'], 'answer_start': [309]}"
1,56d34ffe59d6e414001462bc,Frédéric_Chopin,"During the summers at Nohant, particularly in ...",What two things did Chopin advise Viardot on?,"{'text': ['piano technique and composition.'],..."
2,56d4c0452ccc5a1400d831c8,Beyoncé,"Beyoncé further expanded her acting career, st...",Which singer did Beyoncé portray in Cadillac R...,"{'text': ['Etta James'], 'answer_start': [69]}"
3,573398ebd058e614000b5e66,University_of_Notre_Dame,A Science Hall was built in 1883 under the dir...,Which person oversaw the creation of a science...,"{'text': ['Fr. Zahm'], 'answer_start': [56]}"
4,56cbff116d243a140015ee49,Frédéric_Chopin,"The two became friends, and for many years liv...",For whose benefit was the first of these conce...,"{'text': ['Harriet Smithson'], 'answer_start':..."


In [8]:
# Persist the training split as CSV. `to_csv` does not create missing parent
# directories, so make sure `data/` exists first (no-op when it already does).
import os

os.makedirs('data', exist_ok=True)
df.to_csv('data/squad_data.csv', index=False)

In [14]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Only the questions are stripped: surrounding whitespace there carries no
# information and no label depends on question character positions.
questions = [q.strip() for q in df["question"]]
# The contexts are passed through unchanged. `answer_start` is a character
# index into the ORIGINAL context, and the tokenizer's offset mapping is
# relative to the string it receives, so stripping leading whitespace here
# would shift every offset and misalign the start/end labels computed later.
context = list(df["context"])

# Each example is encoded as a (question, context) pair; "only_second"
# truncates the context, never the question, and every row is padded to
# max_length. The offsets are requested for the answer-span lookup.
inputs = tokenizer(
    questions,
    context,
    max_length=384,
    truncation="only_second",
    padding="max_length",
    return_offsets_mapping=True,
)

# The offsets are only needed to derive labels, so remove them from the
# encoding that is exported.
offset_mapping = inputs.pop("offset_mapping")

# Token-level answer labels, one entry per example, in the same order as `df`.
start_positions = []
end_positions = []

answers = df["answers"]

for i, offset in enumerate(offset_mapping):
    answer = answers[i]

    # Character span [start_char, end_char) of the first listed answer
    # inside the context string.
    start_char = answer["answer_start"][0]
    end_char = start_char + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)

    # Tokens whose sequence id is 1 belong to the context (the second
    # sequence of the pair); find the first and last of them.
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
        # The answer lies entirely outside the tokenized context window
        # (e.g. it was truncated away): label it (0, 0).
        # NOTE(review): an answer that only partially overlaps the window
        # still takes the else-branch; confirm that is acceptable.
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Scan forward past every token that starts at or before the answer
        # start. The loop stops one token AFTER the one containing
        # start_char, so step back by one.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # Scan backward past every token that ends at or after the answer
        # end. The loop stops one token BEFORE the one containing the answer
        # end, so step forward by one.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

# Attach the labels to the example frame.
df['start_position'] = start_positions
df['end_position'] = end_positions

In [None]:
# Collect the model inputs and the answer-span labels column by column.
data = dict(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    start_positions=start_positions,
    end_positions=end_positions,
)

# Note: this rebinds `df` from the raw SQuAD frame to the encoded frame.
df = pd.DataFrame(data=data)
df.to_csv('data/encoding_train.csv', index=False)
