In [1]:
import numpy as np
import json
import torch
import random


  from .autonotebook import tqdm as notebook_tqdm


# Task 1

In [2]:
import os
import urllib.request
from tqdm import tqdm

class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)
        
def download_url(url, output_path):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

def download_data(data_path, url_path, suffix):    
    if not os.path.exists(data_path):
        os.makedirs(data_path)
        
    data_path = os.path.join(data_path, f'{suffix}.json')

    if not os.path.exists(data_path):
        print(f"Downloading CoQA {suffix} data split... (it may take a while)")
        download_url(url=url_path, output_path=data_path)
        print("Download completed!")

In [3]:
# Train data
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
download_data(data_path='coqa', url_path=train_url, suffix='train')

# Test data
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"
download_data(data_path='coqa', url_path=test_url, suffix='test')  # <-- Why test? See next slides for an answer!

## [Task 2] Train, Validation and Test splits

CoQA only provides a train and validation set since the test set is hidden for evaluation purposes.

We'll consider the provided validation set as a test set. <br>
$\rightarrow$ Write your own script to:
* Split the train data in train and validation splits (80% train and 20% val)
* Perform splits such that a dialogue appears in one split only! (i.e., split at dialogue level)
* Perform splitting using the following seed for reproducibility: 42

#### Reproducibility Memo

Check back tutorial 2 on how to fix a specific random seed for reproducibility!

In [4]:
with open(os.path.join('coqa', 'train.json'), 'r') as j:
    train = json.loads(j.read())

with open(os.path.join('coqa', 'test.json'), 'r') as j:
    test = json.loads(j.read())

In [5]:
train = train['data']
test = test['data']

for t in train:
    indices = [i for i, a in enumerate(t['answers']) if a['input_text'] != 'unknown']
    t['questions'] = [q for i, q in enumerate(t['questions']) if i in indices] 
    t['answers'] = [a for i, a in enumerate(t['answers']) if i in indices]


In [6]:
lengths=[len(doc['questions']) for doc in train]

In [7]:
le=np.cumsum(np.array(lengths,dtype=np.float32))
train_end=np.where((le/le[-1])>0.8)[0][0]

validation = train[train_end : ] 
train = train[ : train_end]

In [8]:
print(len(train))
print(len(validation))

5773
1426


In [9]:
len_train=np.sum([len(doc['questions']) for doc in train])
len_val=np.sum([len(doc['questions']) for doc in validation])

len_tot=len_train+len_val
print(len_train,len_train/len_tot)
print(len_val,len_val/len_tot)

85810 0.7998993251053358
21466 0.20010067489466424


In [10]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, data, return_history=False):

        self.story=[d['story'] for d in data]
        self.questions=[d['questions'] for d in data]
        self.answers=[d['answers'] for d in data]
        lengths = [len(doc['questions']) for doc in data]
        self.lengths = np.cumsum(np.array(lengths,dtype=np.int32))
        self.R_H=return_history
        

    def __len__(self):
        return self.lengths[-1]

    def __getitem__(self, idx):
        f_idx=int(np.where(self.lengths > idx)[0][0])
        if f_idx>0:
            q_idx=idx-self.lengths[f_idx-1]
        else:
            q_idx=idx

        passage=self.story[f_idx]
        questions=self.questions[f_idx]
        answers=self.answers[f_idx]
        question=questions[q_idx]['input_text']
        answer=answers[q_idx]['input_text']

        if self.R_H:
            print([ [questions[i]['input_text'],answers[i]['input_text']] for i in range(q_idx)])
            history=np.concatenate([ [questions[i]['input_text'],answers[i]['input_text']] for i in range(q_idx)],0)
            return (passage,question,history), answer

        return (passage,question), answer

In [11]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(TextDataset(train), batch_size=8, shuffle=True)
val_dataloader = DataLoader(TextDataset(validation), batch_size=8, shuffle=True)
test_dataloader = DataLoader(TextDataset(test), batch_size=8, shuffle=True)

# Task 3

## [Task 3] Model definition

Write your own script to define the following transformer-based models from [huggingface](https://HuggingFace.co/).

* [M1] DistilRoBERTa (distilberta-base)
* [M2] BERTTiny (bert-tiny)

**Note**: Remember to install the ```transformers``` python package!

**Note**: We consider small transformer models for computational reasons!

In [12]:
from transformers import EncoderDecoderModel, AutoTokenizer

model_name = 'distilroberta-base'

M1 = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name,max_new_tokens=50)
T1 = AutoTokenizer.from_pretrained(model_name,max_new_tokens=50)


model_name = 'prajjwal1/bert-tiny'

M2 = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)
T2 = AutoTokenizer.from_pretrained(model_name)


Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['roberta.encoder.layer.0.crossattention.self.key.bias', 'roberta.encoder.layer.5.crossattention.output.LayerNorm.weight', 'roberta.encoder.layer.3.crossattenti

# Task 4

In [13]:
i=42

passage=train[i]['story']
questions=train[i]['questions']
n=random.randint(0,len(questions))
question=questions[n]['input_text']

input_text=passage+' [SEP] '+question
print(input_text)

Jefferson's metaphor of a wall of separation has been cited repeatedly by the U.S. Supreme Court. In Reynolds v. United States (1879) the Court wrote that Jefferson's comments "may be accepted almost as an authoritative declaration of the scope and effect of the [First] Amendment." In Everson v. Board of Education (1947), Justice Hugo Black wrote: "In the words of Thomas Jefferson, the clause against establishment of religion by law was intended to erect a wall of separation between church and state." 

Many early immigrant groups traveled to America to worship freely, particularly after the English Civil War and religious conflict in France and Germany. They included nonconformists like the Puritans, who were Protestant Christians fleeing religious persecution from the Anglican King of England. Despite a common background, the groups' views on religious toleration were mixed. While some such as Roger Williams of Rhode Island and William Penn of Pennsylvania ensured the protection of r

In [14]:
history=np.concatenate([ [train[i]['questions'][idx]['input_text'],train[i]['answers'][idx]['input_text']] for idx in range(n)],0)
history

array(['after what did a a lot of migrants travel?', 'English Civil War',
       'where did they go?', 'traveled to America', 'who were they?',
       'nonconformists like the Puritans', 'why did they leave home?',
       'fleeing religious persecution', 'who persecuted them?',
       'Anglican King of England.',
       'did they all share the same viewpoint on theology?', 'No',
       'did some protect different ideas?', 'yes', 'who was one?',
       'Roger Williams', 'from where?', 'Rhode Island', 'and another?',
       'William Penn', 'from?', 'Pennsylvania',
       'who banned other worshiping?',
       'The Dutch colony of New Netherland', 'what court is discussed?',
       'Supreme Court.'], dtype='<U50')

In [15]:
separator = ' [SEP] '
text_input = passage + f'{separator if len(history) else ""}' + separator.join(history) + separator + question
text_input

'Jefferson\'s metaphor of a wall of separation has been cited repeatedly by the U.S. Supreme Court. In Reynolds v. United States (1879) the Court wrote that Jefferson\'s comments "may be accepted almost as an authoritative declaration of the scope and effect of the [First] Amendment." In Everson v. Board of Education (1947), Justice Hugo Black wrote: "In the words of Thomas Jefferson, the clause against establishment of religion by law was intended to erect a wall of separation between church and state." \n\nMany early immigrant groups traveled to America to worship freely, particularly after the English Civil War and religious conflict in France and Germany. They included nonconformists like the Puritans, who were Protestant Christians fleeing religious persecution from the Anglican King of England. Despite a common background, the groups\' views on religious toleration were mixed. While some such as Roger Williams of Rhode Island and William Penn of Pennsylvania ensured the protectio

In [16]:
def f_PQ(model, tokenizer, passage, question, generation_params=None):

    input_ids = tokenizer(
            question,
            passage,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ).input_ids

    if generation_params is None:
        generation_params = {
            'do_sample' : True,
            'num_beams' : 3,
            'repetition_penalty' : 2.
        }
    generated_ids = model.generate(input_ids.to(model.device), **generation_params)
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text

In [17]:
print('The cat is on the table')
#print(f_PQ(M2, T2, 'The cat is on the table', 'where is the cat?'))

The cat is on the table


# Task 5

In [18]:
def f_PQH(model, tokenizer, passage, question, history,generation_params=None):
    separator = ' [SEP] '
    text_input = question + f'{separator if len(history) else ""}' + separator.join(history)+ separator+ passage 
    input_ids = tokenizer(text_input, return_tensors="pt").input_ids

    if generation_params is None:
        generation_params = {
            'do_sample' : True,
            'num_beams' : 3,
            'repetition_penalty' : 2
        }

    generated_ids = model.generate(input_ids, **generation_params)
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text

# Task 6

In [19]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, return_history=False):
        self.story=[d['story'] for d in data]
        self.questions=[d['questions'] for d in data]
        self.answers=[d['answers'] for d in data]
        lengths = [len(doc['questions']) for doc in data]
        self.lengths = np.cumsum(np.array(lengths,dtype=np.int32))
        self.R_H=return_history
        self.tokenizer = tokenizer
        

    def __len__(self):
        return self.lengths[-1]

    def __getitem__(self, idx):
        f_idx=int(np.where(self.lengths > idx)[0][0])
        if f_idx>0:
            q_idx=idx-self.lengths[f_idx-1]
        else:
            q_idx=idx

        passage=self.story[f_idx]
        questions=self.questions[f_idx]
        answers=self.answers[f_idx]
        question=questions[q_idx]['input_text']
        answer=answers[q_idx]['input_text']

        if self.R_H:
            print([ [questions[i]['input_text'],answers[i]['input_text']] for i in range(q_idx)])
            history=np.concatenate([ [questions[i]['input_text'],answers[i]['input_text']] for i in range(q_idx)],0)
            return (passage,question,history), answer
        
        #input_ids = torch.tensor(self.encodings['input_ids'])
        #target_ids = torch.tensor(self.labels[idx])
    
        inputs = self.tokenizer(
            question,
            passage,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ).input_ids

        labels = self.tokenizer(
            answer,
            max_length=100,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        ).input_ids

        return {"input_ids": inputs.squeeze(0).to('cuda'), "labels": labels.squeeze(0).to('cuda')}

In [20]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorWithPadding
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

data_collator = DataCollatorWithPadding(tokenizer=T2)

batch_size=16

# metric = load('accuracy')
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    output_dir='/prova',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=5000,
    fp16=True,
    prediction_loss_only=False
    )

In [21]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
    }

M2.config.decoder_start_token_id = T2.cls_token_id
M2.config.pad_token_id = T2.pad_token_id

trainer = Seq2SeqTrainer(
    model=M2,
    tokenizer=T2,
    args=training_args,
    train_dataset=CustomDataset(train, T2),
    eval_dataset=CustomDataset(validation, T2),
    compute_metrics=compute_metrics,
    optimizers=(torch.optim.AdamW(M2.parameters(), lr=0.001), None),
    data_collator=data_collator
)

Using cuda_amp half precision backend


In [22]:
import transformers
transformers.logging.set_verbosity_error()

In [23]:
from transformers.utils import logging

logging.set_verbosity_info()
logger = logging.get_logger("transformers")
logger.info("INFO")
logger.warning("WARN")

INFO
WARN


In [24]:
#trainer.train()

In [25]:
import time

In [28]:
def train(model, tokenizer, n_epochs=3, learning_rate=1e-4):
    model.to('cuda')

    L=[]

    model.config.decoder_start_token_id = tokenizer.cls_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    
    for epoch in range(n_epochs):  # loop over the dataset multiple times

        running_loss = 0.0
        optimizer.zero_grad()
        start_time = time.time()
        for i, data in enumerate(train_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            (passage, question), answer = data

            # text_input = [question[i] + ' [SEP] ' + passage[i] for i in range(len(passage))]

            # zero the parameter gradients
            

            inputs = tokenizer(
                question,
                passage,
                padding=True,
                max_length=512,
                truncation=True,
                return_tensors="pt",
            ).input_ids

            labels = tokenizer(
                answer,
                max_length=512,
                truncation=True,
                padding=True,
                return_tensors="pt",
            ).input_ids

            #X=torch.tensor(input_ids,device='cuda')
            #y=torch.tensor(labels,device='cuda')
            
            #print(X.shape,y.shape)
            
            #if X.shape[1]>500:
            #    continue

            # the forward function automatically creates the correct decoder_input_ids
            outputs = model(inputs.to('cuda'), labels=labels.to('cuda'))
            loss = outputs.loss
            loss.backward()

            if i%2**1==2**1-1:
                optimizer.step()
                optimizer.zero_grad()

                if i%2**10==i%2**10-1:
                    torch.cuda.empty_cache()


            # print statistics
            running_loss += loss.item()

            L.append(loss.detach().cpu().numpy())
            
            epoch_time = time.time() - start_time
            batch_time = epoch_time/(i+1)
            
            print(f"epoch: {epoch + 1}/{n_epochs}, {i + 1}/{len(train_dataloader)}, {epoch_time:.0f}s {batch_time*1e3:.0f}ms/step, lr: {optimizer.param_groups[0]['lr']:.3g}, loss: {running_loss/(i+1):.3g}", end = '\r')

        print(f"epoch: {epoch + 1}/{n_epochs}, {i + 1}/{len(train_dataloader)}, {epoch_time:.0f}s {batch_time*1e3:.0f}ms/step, lr: {optimizer.param_groups[0]['lr']:.3g}, loss: {running_loss/(i+1):.3g}")

    print('Finished Training')

In [31]:
train(M2,T2)

epoch: 1/3, 3479/10727, 111s 32ms/step, lr: 0.0001, loss: 1.64

KeyboardInterrupt: 

# Task 7

In [56]:
print('The cat is on the table, where is the cat?')
print(f_PQ(M2, T2, 'The cat is on the table', 'where is the cat?'))

The cat is on the table, where is the cat?


KeyboardInterrupt: 