In [1]:
import numpy as np
import json
import torch
import random


  from .autonotebook import tqdm as notebook_tqdm


# Task 1

In [2]:
import os
import urllib.request
from tqdm import tqdm

class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)
        
def download_url(url, output_path):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

def download_data(data_path, url_path, suffix):    
    if not os.path.exists(data_path):
        os.makedirs(data_path)
        
    data_path = os.path.join(data_path, f'{suffix}.json')

    if not os.path.exists(data_path):
        print(f"Downloading CoQA {suffix} data split... (it may take a while)")
        download_url(url=url_path, output_path=data_path)
        print("Download completed!")

In [3]:
# Train data
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
download_data(data_path='coqa', url_path=train_url, suffix='train')

# Test data
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"
download_data(data_path='coqa', url_path=test_url, suffix='test')  # <-- Why test? See next slides for an answer!

## [Task 2] Train, Validation and Test splits

CoQA only provides a train and validation set since the test set is hidden for evaluation purposes.

We'll consider the provided validation set as a test set. <br>
$\rightarrow$ Write your own script to:
* Split the train data in train and validation splits (80% train and 20% val)
* Perform splits such that a dialogue appears in one split only! (i.e., split at dialogue level)
* Perform splitting using the following seed for reproducibility: 42

#### Reproducibility Memo

Check back tutorial 2 on how to fix a specific random seed for reproducibility!

In [4]:
with open(os.path.join('coqa', 'train.json'), 'r') as j:
    train = json.loads(j.read())

with open(os.path.join('coqa', 'test.json'), 'r') as j:
    test = json.loads(j.read())

In [5]:
train = train['data']
test = test['data']

In [6]:
lengths=[len(doc['questions']) for doc in train]

In [7]:
le=np.cumsum(np.array(lengths,dtype=np.float32))
train_end=np.where((le/le[-1])>0.8)[0][0]

validation = train[train_end : ] 
train = train[ : train_end]

In [8]:
print(len(train))
print(len(validation))

5771
1428


In [9]:
len_train=np.sum([len(doc['questions']) for doc in train])
len_val=np.sum([len(doc['questions']) for doc in validation])

len_tot=len_train+len_val
print(len_train,len_train/len_tot)
print(len_val,len_val/len_tot)

86909 0.7999208445700295
21738 0.20007915542997046


In [10]:
class CustomImageDataset(torch.utils.data.Dataset):
    def __init__(self, data, return_history=False):

        self.story=[d['story'] for d in data]
        self.questions=[d['questions'] for d in data]
        self.answers=[d['answers'] for d in data]
        lengths = [len(doc['questions']) for doc in data]
        self.lengths = np.cumsum(np.array(lengths,dtype=np.int32))
        self.R_H=return_history
        

    def __len__(self):
        return self.lengths[-1]

    def __getitem__(self, idx):
        f_idx=int(np.where(self.lengths > idx)[0][0])
        if f_idx>0:
            q_idx=idx-self.lengths[f_idx-1]
        else:
            q_idx=idx

        passage=self.story[f_idx]
        questions=self.questions[f_idx]
        answers=self.answers[f_idx]
        question=questions[q_idx]['input_text']
        span_start=answers[q_idx]['span_start']
        span_end=answers[q_idx]['span_end']
        span_text=answers[q_idx]['span_text']

        if self.R_H:
            history = np.concatenate([ [questions[i]['input_text'], answers[i]['input_text']] for i in range(q_idx)],0)
            return (passage,question,history), span_text

        return (passage,question), (span_start, span_end)

In [11]:
'''# target is "nice puppet"
target_start_index = torch.tensor([14])
target_end_index = torch.tensor([15])



outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
loss = outputs.loss
round(loss.item(), 2)''';

In [12]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(CustomImageDataset(train), batch_size=2, shuffle=True)
val_dataloader = DataLoader(CustomImageDataset(validation), batch_size=2, shuffle=True)
test_dataloader = DataLoader(CustomImageDataset(test), batch_size=2, shuffle=True)

# Task 3

## [Task 3] Model definition

Write your own script to define the following transformer-based models from [huggingface](https://HuggingFace.co/).

* [M1] DistilRoBERTa (distilberta-base)
* [M2] BERTTiny (bert-tiny)

**Note**: Remember to install the ```transformers``` python package!

**Note**: We consider small transformer models for computational reasons!

In [13]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

model_name = 'distilroberta-base'

M1 = AutoModelForQuestionAnswering.from_pretrained(model_name)
T1 = AutoTokenizer.from_pretrained(model_name, max_new_tokens=50)

model_name = 'prajjwal1/bert-tiny'

M2 = AutoModelForQuestionAnswering.from_pretrained(model_name)
T2 = AutoTokenizer.from_pretrained(model_name, max_new_tokens=50)


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be 

In [44]:
def train(model, tokenizer):
    model.to('cuda')

    L=[]

    model.config.decoder_start_token_id = tokenizer.cls_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(2):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(train_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            (passage, question), (sep_start, sep_end) = data

            #text_input = [question[i] + ' [SEP] ' + passage[i] for i in range(len(passage))]

            # zero the parameter gradients
            optimizer.zero_grad()

            inputs = tokenizer(
                question,
                #passage,
                max_length=1_000,
                #truncation="only_second",
                #stride=50,
                #return_overflowing_tokens=True,
                #return_offsets_mapping=True,
                #padding="max_length"
            )
            
            print(inputs.sequence_ids(i))
            print(np.sum(inputs.sequence_ids(i) is not None))

            #get_target_position(inputs, sep_start, sep_end)

            x = torch.tensor(inputs['input_ids']).to('cuda')
            target_start_index = sep_start[inputs['overflow_to_sample_mapping']].to('cuda')
            target_end_index = sep_end[inputs['overflow_to_sample_mapping']].to('cuda')

            outputs = model(x, start_positions=target_start_index, end_positions=target_end_index)
            loss = outputs.loss
            #round(loss.item(), 2)

            # the forward function automatically creates the correct decoder_input_ids
            #loss = model(input_ids=X, labels=y).loss

            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()

            L.append(loss.detach().cpu().numpy())
            
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3g}')
            running_loss = 0.0

    print('Finished Training')

In [45]:
train(M1,T1)

1


ValueError: expected sequence of length 10 at dim 1 (got 14)