In [15]:
import numpy as np
import json
import torch
import random


# Task 1

In [16]:
import os
import urllib.request
from tqdm import tqdm

class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)
        
def download_url(url, output_path):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

def download_data(data_path, url_path, suffix):    
    if not os.path.exists(data_path):
        os.makedirs(data_path)
        
    data_path = os.path.join(data_path, f'{suffix}.json')

    if not os.path.exists(data_path):
        print(f"Downloading CoQA {suffix} data split... (it may take a while)")
        download_url(url=url_path, output_path=data_path)
        print("Download completed!")

In [17]:
# Train data
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
download_data(data_path='coqa', url_path=train_url, suffix='train')

# Test data
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"
download_data(data_path='coqa', url_path=test_url, suffix='test')  # <-- Why test? See next slides for an answer!

## [Task 2] Train, Validation and Test splits

CoQA only provides a train and validation set since the test set is hidden for evaluation purposes.

We'll consider the provided validation set as a test set. <br>
$\rightarrow$ Write your own script to:
* Split the train data in train and validation splits (80% train and 20% val)
* Perform splits such that a dialogue appears in one split only! (i.e., split at dialogue level)
* Perform splitting using the following seed for reproducibility: 42

#### Reproducibility Memo

Check back tutorial 2 on how to fix a specific random seed for reproducibility!

In [18]:
with open(os.path.join('coqa', 'train.json'), 'r') as j:
    train = json.loads(j.read())

with open(os.path.join('coqa', 'test.json'), 'r') as j:
    test = json.loads(j.read())

In [19]:
train = train['data']
test = test['data']

In [20]:
lengths=[len(doc['questions']) for doc in train]

In [21]:
le=np.cumsum(np.array(lengths,dtype=np.float32))
train_end=np.where((le/le[-1])>0.8)[0][0]

validation = train[train_end : ] 
train = train[ : train_end]

In [22]:
print(len(train))
print(len(validation))

5771
1428


In [23]:
len_train=np.sum([len(doc['questions']) for doc in train])
len_val=np.sum([len(doc['questions']) for doc in validation])

len_tot=len_train+len_val
print(len_train,len_train/len_tot)
print(len_val,len_val/len_tot)

86909 0.7999208445700295
21738 0.20007915542997046


In [24]:
class CustomImageDataset(torch.utils.data.Dataset):
    def __init__(self, data, return_history=False):

        self.story=[d['story'] for d in data]
        self.questions=[d['questions'] for d in data]
        self.answers=[d['answers'] for d in data]
        lengths = [len(doc['questions']) for doc in data]
        self.lengths = np.cumsum(np.array(lengths,dtype=np.int32))
        self.R_H=return_history
        

    def __len__(self):
        return self.lengths[-1]

    def __getitem__(self, idx):
        f_idx=int(np.where(self.lengths > idx)[0][0])
        if f_idx>0:
            q_idx=idx-self.lengths[f_idx-1]
        else:
            q_idx=idx

        passage=self.story[f_idx]
        questions=self.questions[f_idx]
        answers=self.answers[f_idx]
        question=questions[q_idx]['input_text']
        answer=answers[q_idx]['input_text']

        if self.R_H:
            print([ [questions[i]['input_text'],answers[i]['input_text']] for i in range(q_idx)])
            history=np.concatenate([ [questions[i]['input_text'],answers[i]['input_text']] for i in range(q_idx)],0)
            return (passage,question,history), answer

        return (passage,question), answer

In [25]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(CustomImageDataset(train), batch_size=1, shuffle=True)
val_dataloader = DataLoader(CustomImageDataset(validation), batch_size=1, shuffle=True)
test_dataloader = DataLoader(CustomImageDataset(test), batch_size=1, shuffle=True)

# Task 3

## [Task 3] Model definition

Write your own script to define the following transformer-based models from [huggingface](https://HuggingFace.co/).

* [M1] DistilRoBERTa (distilberta-base)
* [M2] BERTTiny (bert-tiny)

**Note**: Remember to install the ```transformers``` python package!

**Note**: We consider small transformer models for computational reasons!

In [26]:
from transformers import EncoderDecoderModel, AutoTokenizer

model_name = 'distilroberta-base'

M1 = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name,max_new_tokens=50)
T1 = AutoTokenizer.from_pretrained(model_name,max_new_tokens=50)


model_name = 'prajjwal1/bert-tiny'

M2 = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)
T2 = AutoTokenizer.from_pretrained(model_name)


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['roberta.encoder.layer.2.crossattention.output.LayerNorm.bias', 'roberta.encoder.layer.5.crossattention.output.LayerNorm.weight', 'roberta.encoder.layer.4.cros

# Task 4

In [27]:
i=42

passage=train[i]['story']
questions=train[i]['questions']
n=random.randint(0,len(questions))
question=questions[n]['input_text']

input_text=passage+' [SEP] '+question
print(input_text)

Jefferson's metaphor of a wall of separation has been cited repeatedly by the U.S. Supreme Court. In Reynolds v. United States (1879) the Court wrote that Jefferson's comments "may be accepted almost as an authoritative declaration of the scope and effect of the [First] Amendment." In Everson v. Board of Education (1947), Justice Hugo Black wrote: "In the words of Thomas Jefferson, the clause against establishment of religion by law was intended to erect a wall of separation between church and state." 

Many early immigrant groups traveled to America to worship freely, particularly after the English Civil War and religious conflict in France and Germany. They included nonconformists like the Puritans, who were Protestant Christians fleeing religious persecution from the Anglican King of England. Despite a common background, the groups' views on religious toleration were mixed. While some such as Roger Williams of Rhode Island and William Penn of Pennsylvania ensured the protection of r

In [28]:
history=np.concatenate([ [train[i]['questions'][idx]['input_text'],train[i]['answers'][idx]['input_text']] for idx in range(n)],0)
history

array(['after what did a a lot of migrants travel?', 'English Civil War',
       'where did they go?', 'traveled to America', 'who were they?',
       'nonconformists like the Puritans', 'why did they leave home?',
       'fleeing religious persecution', 'who persecuted them?',
       'Anglican King of England.',
       'did they all share the same viewpoint on theology?', 'No',
       'did some protect different ideas?', 'yes', 'who was one?',
       'Roger Williams', 'from where?', 'Rhode Island', 'and another?',
       'William Penn'], dtype='<U50')

In [29]:
separator = ' [SEP] '
text_input = passage + f'{separator if len(history) else ""}' + separator.join(history) + separator + question
text_input

'Jefferson\'s metaphor of a wall of separation has been cited repeatedly by the U.S. Supreme Court. In Reynolds v. United States (1879) the Court wrote that Jefferson\'s comments "may be accepted almost as an authoritative declaration of the scope and effect of the [First] Amendment." In Everson v. Board of Education (1947), Justice Hugo Black wrote: "In the words of Thomas Jefferson, the clause against establishment of religion by law was intended to erect a wall of separation between church and state." \n\nMany early immigrant groups traveled to America to worship freely, particularly after the English Civil War and religious conflict in France and Germany. They included nonconformists like the Puritans, who were Protestant Christians fleeing religious persecution from the Anglican King of England. Despite a common background, the groups\' views on religious toleration were mixed. While some such as Roger Williams of Rhode Island and William Penn of Pennsylvania ensured the protectio

In [30]:
def f_PQ(model, tokenizer, passage, question):
    text_input = question + ' [SEP] ' + passage
    input_ids = tokenizer(text_input, return_tensors="pt").input_ids
    generated_ids = model.generate(input_ids)
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text

In [31]:
f_PQ(M1, T1, ' ', question)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


',,,,,,,,,,,,,,,,,,,'

# Task 5

In [32]:
def f_PQH(model, tokenizer, passage, question, history):
    separator = ' [SEP] '
    text_input = question + f'{separator if len(history) else ""}' + separator.join(history)+ separator+ passage 
    input_ids = tokenizer(text_input, return_tensors="pt").input_ids
    generated_ids = model.generate(input_ids)
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text

# Task 6

In [33]:
def train(model, tokenizer):
    model.to('cuda')

    L=[]

    model.config.decoder_start_token_id = tokenizer.cls_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(2):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(train_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            (passage, question), answer = data

            text_input = [question[i] + ' [SEP] ' + passage[i] for i in range(len(passage))]

            # zero the parameter gradients
            optimizer.zero_grad()

            input_ids = tokenizer(
                text_input,
                return_tensors="pt",
            ).input_ids

            labels = tokenizer(
                answer,
                return_tensors="pt",
            ).input_ids

            X=torch.tensor(input_ids,device='cuda')
            y=torch.tensor(labels,device='cuda')
            print(X.shape,y.shape)
            if X.shape[1]>500:
                continue

            # the forward function automatically creates the correct decoder_input_ids
            loss = model(input_ids=X, labels=y).loss

            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()

            L.append(loss.detach().cpu().numpy())
            
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3g}')
            running_loss = 0.0

    print('Finished Training')

In [34]:
train(M1,T1)

  X=torch.tensor(input_ids,device='cuda')
  y=torch.tensor(labels,device='cuda')


torch.Size([1, 443]) torch.Size([1, 3])




[1,     1] loss: 0.00458
torch.Size([1, 369]) torch.Size([1, 4])
[1,     2] loss: 0.00259
torch.Size([1, 447]) torch.Size([1, 3])
[1,     3] loss: 0.0026
torch.Size([1, 260]) torch.Size([1, 11])
[1,     4] loss: 0.0117
torch.Size([1, 345]) torch.Size([1, 4])
[1,     5] loss: 0.00639
torch.Size([1, 373]) torch.Size([1, 5])
[1,     6] loss: 0.00458
torch.Size([1, 357]) torch.Size([1, 3])
[1,     7] loss: 0.0027
torch.Size([1, 446]) torch.Size([1, 8])
[1,     8] loss: 0.00425
torch.Size([1, 324]) torch.Size([1, 5])
[1,     9] loss: 0.00285
torch.Size([1, 383]) torch.Size([1, 5])
[1,    10] loss: 0.00246
torch.Size([1, 429]) torch.Size([1, 4])
[1,    11] loss: 0.00336
torch.Size([1, 371]) torch.Size([1, 6])
[1,    12] loss: 0.00351
torch.Size([1, 378]) torch.Size([1, 3])
[1,    13] loss: 0.00197
torch.Size([1, 355]) torch.Size([1, 9])
[1,    14] loss: 0.0037
torch.Size([1, 398]) torch.Size([1, 3])
[1,    15] loss: 0.0018
torch.Size([1, 382]) torch.Size([1, 6])
[1,    16] loss: 0.00377
torc

Token indices sequence length is longer than the specified maximum sequence length for this model (552 > 512). Running this sequence through the model will result in indexing errors


[1,    20] loss: 0.0029
torch.Size([1, 366]) torch.Size([1, 3])
[1,    21] loss: 0.00226
torch.Size([1, 552]) torch.Size([1, 3])
torch.Size([1, 413]) torch.Size([1, 3])
[1,    23] loss: 0.0015
torch.Size([1, 406]) torch.Size([1, 3])
[1,    24] loss: 0.00238
torch.Size([1, 424]) torch.Size([1, 3])
[1,    25] loss: 0.00216
torch.Size([1, 329]) torch.Size([1, 9])
[1,    26] loss: 0.00437
torch.Size([1, 424]) torch.Size([1, 3])
[1,    27] loss: 0.00231
torch.Size([1, 364]) torch.Size([1, 3])
[1,    28] loss: 0.00261
torch.Size([1, 396]) torch.Size([1, 7])
[1,    29] loss: 0.0036
torch.Size([1, 251]) torch.Size([1, 4])
[1,    30] loss: 0.00232
torch.Size([1, 371]) torch.Size([1, 3])
[1,    31] loss: 0.00166
torch.Size([1, 453]) torch.Size([1, 4])
[1,    32] loss: 0.00324
torch.Size([1, 427]) torch.Size([1, 4])
[1,    33] loss: 0.00336
torch.Size([1, 299]) torch.Size([1, 3])
[1,    34] loss: 0.00239
torch.Size([1, 355]) torch.Size([1, 7])
[1,    35] loss: 0.00402
torch.Size([1, 307]) torch.S

OutOfMemoryError: CUDA out of memory. Tried to allocate 148.00 MiB (GPU 0; 4.00 GiB total capacity; 3.18 GiB already allocated; 0 bytes free; 3.39 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# Task 7