In [1]:
import os
import sys
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm, trange
from transformers import BertTokenizer, AutoConfig, AutoModelForQuestionAnswering, squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV2Processor
import gc

In [29]:
# --- Checkpoints and data locations ---------------------------------------
# Name of the pretrained checkpoint used for the tokenizer, config and model.
model_prefix = 'bert-base-uncased'
# Directory the trained probe weights are read from.
epoch_dir = "results/bert-base-uncased/epoch_2"
# Directory predict.csv is written to (currently the same folder as epoch_dir).
preds_dir = "results/bert-base-uncased/epoch_2"
# SQuAD file to predict on; an empty data_dir is passed through to the processor as-is.
data_dir, data_file = '', "dev-v2.0.json"

# --- Encoder / batching sizes ----------------------------------------------
# NOTE(review): `layers` is not referenced by the cells visible in this notebook;
# the adapter stack hard-codes its own layer count - confirm they should agree.
layers, hidden_dim = 12, 768
batch_size = 8
max_seq_length = 384      # feature length; also the default `max_seq` of the adapters
max_answer_length = 17    # bounds the end-token window used in the span search

# --- Probe hyper-parameters ------------------------------------------------
project_dim = 200         # adapter bottleneck width
dropout_r = 0.5           # default dropout probability of each adapter
# NOTE(review): the two settings below are not used by any cell visible here;
# presumably left over from another probe variant - confirm before removing.
res_size = 3
non_linear = "relu"

In [23]:
# Run on the GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Tokenizer matching the pretrained checkpoint named by `model_prefix`
tokenizer = BertTokenizer.from_pretrained(model_prefix)

# Read the SQuAD v2 examples from `data_file`.
# NOTE(review): the file is named like a dev split but is read through
# get_train_examples; confirm this is intended rather than get_dev_examples.
processor = SquadV2Processor()
dev_examples = processor.get_train_examples(data_dir=data_dir, filename=data_file)

100%|█████████████████████████████████████| 16/16 [00:02<00:00,  7.42it/s]


In [4]:
# Convert the dev examples into model inputs (feature list + tensor dataset)
print("Loading dev features")

# Conversion settings gathered in one place so they are easy to read and tweak.
feature_options = dict(
    examples=dev_examples,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    doc_stride=128,          # passed through to the converter's sliding window
    max_query_length=64,
    is_training=False,
    return_dataset="pt",     # the call returns (features, dataset); the dataset feeds the DataLoader
    threads=1,
)
dev_features, dev_dataset = squad_convert_examples_to_features(**feature_options)

Loading dev features


convert squad examples to features: 100%|█| 6078/6078 [01:00<00:00, 100.49
add example index and unique id: 100%|█| 6078/6078 [00:00<00:00, 1210550.3


In [30]:
# Load the pretrained question-answering model, configured to also return
# its hidden states, and place it on the selected device.
config = AutoConfig.from_pretrained(model_prefix, output_hidden_states=True)
model = AutoModelForQuestionAnswering.from_pretrained(model_prefix, config=config).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AdamW
import numpy as np

class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with a residual connection and layer norm.

    The input is projected to queries, keys and values, split evenly across
    ``n_heads`` heads, combined with scaled dot-product attention, projected
    back to ``in_dim`` and added to the input before ``LayerNorm``.

    Args:
        in_dim: size of the last input dimension; must be divisible by ``n_heads``.
        n_heads: number of attention heads.

    Raises:
        AssertionError: if ``in_dim`` is not a multiple of ``n_heads``.
    """

    def __init__(self, in_dim, n_heads=4):
        super().__init__()
        assert in_dim % n_heads == 0
        self.n_heads = n_heads
        self.d = in_dim // n_heads  # width of a single head
        heads_width = self.d * self.n_heads

        # Attribute names and creation order are kept stable so that saved
        # state dicts keep loading.
        self.WQ = nn.Linear(in_dim, heads_width)
        self.WK = nn.Linear(in_dim, heads_width)
        self.WV = nn.Linear(in_dim, heads_width)

        self.linear = nn.Linear(heads_width, in_dim)
        self.layer_norm = nn.LayerNorm(in_dim)

    def _split_heads(self, projected, n_batch):
        """Reshape (batch, seq, heads * d) into (batch, heads, seq, d)."""
        return projected.view(n_batch, -1, self.n_heads, self.d).transpose(1, 2)

    def forward(self, h):
        """Apply self-attention to ``h`` of shape (batch, seq, in_dim).

        Returns a tensor of the same shape as ``h``.
        """
        n_batch = h.size(0)
        queries, keys, values = (
            self._split_heads(projection(h), n_batch)
            for projection in (self.WQ, self.WK, self.WV)
        )

        # Scaled dot-product attention over the sequence axis: (batch, heads, seq, seq)
        weights = F.softmax(queries @ keys.transpose(-1, -2) / np.sqrt(self.d), dim=-1)

        # Weighted values, heads merged back into one feature axis: (batch, seq, heads * d)
        mixed = (weights @ values).transpose(1, 2).contiguous()
        merged = mixed.view(n_batch, -1, self.n_heads * self.d)

        # Residual connection followed by layer normalisation
        return self.layer_norm(self.linear(merged) + h)
        

class Adapter(nn.Module):
    """Bottleneck adapter: project down, self-attend, project back up.

    Args:
        in_dim: feature size of the incoming hidden states.
        project_dim: width of the bottleneck the attention operates in.
        p: dropout probability applied after the attention block.
        max_seq: number of features given to ``BatchNorm1d``; the input's
            second dimension must therefore have this size.
    """

    def __init__(self, in_dim, project_dim, p = dropout_r, max_seq = max_seq_length):
        super().__init__()

        # Attribute names are part of the checkpoint format - do not rename.
        self.project_down = nn.Linear(in_dim, project_dim)
        self.project_up = nn.Linear(project_dim, in_dim)
        self.dropout = nn.Dropout(p=p)
        self.batchnorm = nn.BatchNorm1d(max_seq)
        # NOTE(review): the second positional argument of nn.LayerNorm is `eps`,
        # so this layer norm runs with eps == max_seq rather than a small constant.
        # It is kept as-is because the saved probes were presumably trained with
        # this definition; changing it would alter their outputs - confirm
        # against the training code before touching it.
        self.layernorm = nn.LayerNorm(in_dim, eps=max_seq)
        self.attention = MultiHeadAttention(project_dim)

    def forward(self, h):
        """Run ``h`` (batch, max_seq, in_dim) through the adapter; shape is preserved."""
        pipeline = (
            self.project_down,   # in_dim -> project_dim
            self.batchnorm,      # normalises with one channel per sequence position
            self.attention,      # self-attention inside the bottleneck
            self.dropout,
            self.project_up,     # project_dim -> in_dim
            self.layernorm,
        )
        for stage in pipeline:
            h = stage(h)
        return h

class AdapterModel(nn.Module):
    """Stack of adapters chained across encoder layers, ending in a scalar head.

    Adapter ``i`` receives the hidden states of layer ``i`` plus the output of
    adapter ``i - 1`` (zeros for the first one). The output of the last adapter
    is mapped to one score per token.

    Args:
        in_dim: feature size of the hidden states.
        project_dim: bottleneck width of every adapter.
        max_seq: sequence length forwarded to every adapter.
        n_layers: number of adapters, i.e. how many leading entries of
            ``all_h`` are consumed. Defaults to 12.
    """

    def __init__(self, in_dim, project_dim, max_seq=max_seq_length, n_layers=12):
        super().__init__()

        # `max_seq` is forwarded explicitly so that a non-default value is honoured.
        self.adapter_list = nn.ModuleList(
            [Adapter(in_dim, project_dim, max_seq=max_seq) for _ in range(n_layers)]
        )
        self.linear = nn.Linear(in_dim, 1)

    def forward(self, all_h):
        """Score every token from the per-layer hidden states.

        Args:
            all_h: indexable collection with at least ``n_layers`` entries, all
                of the same shape - assumes (batch, max_seq, in_dim) each;
                TODO confirm against the caller.

        Returns:
            Tensor with the shape of one entry, its last dimension reduced to 1
            and a new leading dimension of size 1.

        Raises:
            IndexError: if ``all_h`` holds fewer entries than there are adapters.
        """
        # Start the chain from zeros that share the input's shape, dtype and
        # device, so the module does not depend on a module-level `device`.
        h = torch.zeros_like(all_h[0])

        for i, adapter in enumerate(self.adapter_list):
            h = adapter(all_h[i] + h)

        return self.linear(h).unsqueeze(0)

In [32]:
# Build the start- and end-position probes, then restore their trained weights
# from the checkpoint files stored under `epoch_dir`.
adaptor_predict_s = AdapterModel(hidden_dim, project_dim)
adaptor_predict_e = AdapterModel(hidden_dim, project_dim)

for probe, weights_name in (
    (adaptor_predict_s, "/_start_idx_per100"),
    (adaptor_predict_e, "/_end_idx_per100"),
):
    probe.load_state_dict(torch.load(epoch_dir+weights_name, map_location=device))

# Collect the question id of every dev example, in dataset order
print("Extracting dev IDs")
n = len(dev_examples)
q_ids = [example.qas_id for example in dev_examples]

Extracting dev IDs


In [33]:
# Initialize dev data loader; the sequential sampler keeps the features in
# their original order.
eval_sampler = SequentialSampler(dev_dataset)
eval_dataloader = DataLoader(dev_dataset, sampler = eval_sampler, batch_size = batch_size)

# One output row per dev example, filled in by the evaluation loop.
n_examples = len(dev_examples)
pred = pd.DataFrame({
    'Id': q_ids,
    'Predicted': [""] * n_examples,
    'Question': [""] * n_examples,
    # Float from the start: the scores written later are real-valued, and
    # storing them in an integer column would rely on silent upcasting.
    'Score': [0.0] * n_examples,
})

# Counter of how many unique questions have been seen so far. Questions with
# contexts longer than max seq len get split into multiple features based on doc_stride.
# A good alternative we may implement later is recording all features, then simplifying with groupby and max,
# e.g. something like df.sort_values('Score', ascending=False).drop_duplicates(['Question'])
question_ids = 0

In [34]:
def _best_answer_span(start_vec, end_vec, context_start, context_end, max_len, threshold=0):
    """Pick the best-scoring answer span, or the null answer, for one feature.

    Args:
        start_vec: 1-D tensor with one start score per token.
        end_vec: 1-D tensor with one end score per token.
        context_start: index of the first candidate start token.
        context_end: exclusive upper bound for start and end tokens.
        max_len: an end token may lie at most ``max_len`` positions after its start.
        threshold: margin by which a span must beat the null score (token 0).

    Returns:
        ``(score, start_idx, end_idx)`` with a Python float score and int
        indices; ``(null_score, 0, 0)`` when the null answer wins.
    """
    score_null = start_vec[0] + end_vec[0]

    start_best = end_best = context_start
    score_best = start_vec[context_start] + end_vec[context_start]

    for start_curr in range(context_start, context_end):
        # Best end token inside the window that opens at the current start
        window_stop = min(start_curr + max_len + 1, context_end)
        end_score, end_offset = end_vec[start_curr:window_stop].max(-1)

        score_curr = start_vec[start_curr] + end_score
        # `>=` keeps the original tie-breaking: later spans win ties
        if score_curr >= score_best:
            score_best = score_curr
            start_best, end_best = start_curr, start_curr + int(end_offset)

    if score_best >= score_null + threshold:
        return float(score_best), start_best, end_best
    return float(score_null), 0, 0


# Evaluation batches
print("Predicting on dev set")

# Loop-invariant set-up: put every network on the device and in eval mode once.
model.eval()
adaptor_predict_s.to(device)
adaptor_predict_e.to(device)
adaptor_predict_s.eval()
adaptor_predict_e.eval()

# A span is accepted only if it beats the null answer by at least this margin.
threshold = 0

# Scores are stored as floats; make sure the column can hold them whatever
# dtype it was created with.
pred['Score'] = pred['Score'].astype(float)

# Best score stored so far for each row of `pred`. A question whose context was
# split into several features keeps the answer of its highest-scoring feature.
best_score_by_row = {}

for batch in tqdm(eval_dataloader, desc = "Evaluating"):
    batch = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        model_inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
        # ALBERT/BERT/DistilBERT forward pass
        outputs = model(**model_inputs)

        # Hidden states with the first entry dropped (presumably the embedding
        # output - confirm), stacked once per batch: (layers, batch_size, max_seq_len, hidden_size)
        layer_states = torch.stack(list(outputs[2][1:]))

        # batch[3] holds, for each item, its index into `dev_features`
        for j, feature_index in enumerate(batch[3]):
            feature_index = int(feature_index.item())

            # Tokens of the current feature
            tokens = tokenizer.convert_ids_to_tokens(batch[0][j].tolist())

            # Find where the context starts and ends, since we only predict inside it:
            # first and last positions whose token type id is 1.
            token_types = batch[2][j]
            context_start = int(torch.argmax(token_types).item())
            context_end = int(max_seq_length - torch.argmax(torch.flip(token_types, [0])).item()) - 1

            # The question starts right after [CLS]; the extra -1 chops off the [SEP] token
            question = tokenizer.convert_tokens_to_string(tokens[1:context_start - 1])

            # Probe input for this single feature: (layers, 1, max_seq_len, hidden_size)
            probe_inputs = layer_states[:, j, :, :].unsqueeze(1)

            # Per-token scores flattened to 1-D and moved to the host, so the
            # Python span search below works on host tensors.
            start_vec = adaptor_predict_s(probe_inputs).reshape(-1).cpu()
            end_vec = adaptor_predict_e(probe_inputs).reshape(-1).cpu()

            score, start_idx, end_idx = _best_answer_span(
                start_vec, end_vec, context_start, context_end, max_answer_length, threshold)

            # Extract predicted answer, converting the null answer ([CLS]) to an empty string
            answer = tokenizer.convert_tokens_to_string(tokens[start_idx:end_idx + 1])
            if answer == '[CLS]':
                answer = ''

            # Row of `pred` this feature belongs to. Using the example index
            # recorded on the feature (instead of comparing decoded question text
            # with the previous row) keeps rows aligned with `Id` even when two
            # consecutive examples share the same question text.
            # NOTE(review): assumes example_index follows the order of dev_examples - confirm.
            row = dev_features[feature_index].example_index

            first_feature_of_row = row not in best_score_by_row
            if first_feature_of_row:
                pred.at[row, 'Question'] = question
            # First feature of a question is always stored; later ones only if they score higher
            if first_feature_of_row or score > best_score_by_row[row]:
                best_score_by_row[row] = score
                pred.at[row, 'Predicted'] = answer
                pred.at[row, 'Score'] = score

# Save predictions
print("Saving predictions")
pred[['Id', 'Predicted']].to_csv(preds_dir + "/predict.csv", index = False)

Evaluating:   0%|                                 | 0/797 [00:00<?, ?it/s]

Predicting on dev set


Evaluating: 100%|███████████████████████| 797/797 [20:00<00:00,  1.51s/it]

Saving predictions



