<a href="https://colab.research.google.com/github/SahilDhull/emphasis_selection/blob/master/model/bert_with_fine_tuning_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
!pip install transformers
!pip install config



In [0]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import BertForMaskedLM , BertModel ,WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup
from transformers import PreTrainedModel, PreTrainedTokenizer , BertPreTrainedModel
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
import codecs

In [45]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla T4'

In [46]:
from google.colab import drive
drive.mount('/content/drive')

train_file = 'drive/My Drive/datasets/train.txt'
dev_file = 'drive/My Drive/datasets/dev.txt'

quotes_file = 'drive/My Drive/datasets/all_quotes.txt'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def read_sent(file, caseless = False):
    
    with codecs.open(file, 'r', 'utf-8') as f:
        lines = f.readlines()
    #print(lines)
    sent = ""
    sents = []
    
    for line in lines:
        if not (line.isspace()):
            feats = line.strip().split()
            word = feats[0].lower() if caseless else feats[0]
            if(word == "n't"):
              word = "'t"
              sent = sent + "n"
            sent = sent + " " + word
        elif len(sent) > 0:
            sents.append(sent.strip())
            sent = ""
            
    if len(sent) > 0:
        sents.append(sent)
    
    return sents

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case = False)

In [11]:
sentences = read_sent(quotes_file)
print(sentences[0])
print(sentences[100])

sentences = ["[CLS] " + query + " [SEP]" for query in sentences]
print(sentences[0])
print(sentences[100])

# Tokenize with BERT tokenizer
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

print (tokenized_texts[0])
print (tokenized_texts[100])

You know you 're in love when you can 't fall asleep because reality is finally better than your dreams .
A half-read book is a half-finished love affair .
[CLS] You know you 're in love when you can 't fall asleep because reality is finally better than your dreams . [SEP]
[CLS] A half-read book is a half-finished love affair . [SEP]


HBox(children=(IntProgress(value=0, description='Downloading', max=213450, style=ProgressStyle(description_wid…


['[CLS]', 'You', 'know', 'you', "'", 're', 'in', 'love', 'when', 'you', 'can', "'", 't', 'fall', 'asleep', 'because', 'reality', 'is', 'finally', 'better', 'than', 'your', 'dreams', '.', '[SEP]']
['[CLS]', 'A', 'half', '-', 'read', 'book', 'is', 'a', 'half', '-', 'finished', 'love', 'affair', '.', '[SEP]']


In [12]:
MAX_LEN = 36
# Pad our input tokens
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print(input_ids[0])
print(input_ids[100])

[ 101 1192 1221 1128  112 1231 1107 1567 1165 1128 1169  112  189 2303
 6153 1272 3958 1110 1921 1618 1190 1240 6149  119  102    0    0    0
    0    0    0    0    0    0    0    0]
[ 101  138 1544  118 2373 1520 1110  170 1544  118 1845 1567 7033  119
  102    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [13]:
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)
print(attention_masks[0])
print(attention_masks[100])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [0]:
train_inputs, validation_inputs = train_test_split(input_ids, random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)
                                             
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Select a batch size for training. 

In [0]:
def mask_tokens(inputs, tokenizer, mlm_probability = 0.15):
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
    labels = inputs.clone()
    # print(inputs[0])

    # We sample a few tokens in each sequence for masked-LM training (with probability mlm_probability defaults to 0.15 in Bert/RoBERTa)
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
 
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)

    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels

In [0]:
batch_size = 32

# Create an iterator of our data with torch DataLoader 
train_data = TensorDataset(train_inputs, train_masks)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_data = TensorDataset(validation_inputs, validation_masks)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [0]:
class bert_model(nn.Module):
  def __init__(self, final_size, drop_prob, data_parallel=True):
    super(bert_model, self).__init__()

    config = BertConfig.from_pretrained('bert-base-cased', output_hidden_states=True)
    bert = BertForMaskedLM.from_pretrained('bert-base-cased', config = config)
    
    if data_parallel:
        self.bert = nn.DataParallel(bert)
    else:
        self.bert = bert
    bert_dim = 768
    
    self.fc = nn.Linear(bert_dim, final_size)
    self.dropout = nn.Dropout(p=drop_prob)
    self.sigmoid = nn.Sigmoid()
           
  def forward(self, bert_ids, bert_mask, labels = None, bert_token_starts = None):
    
    batch_size = bert_ids.size()[0]
    pad_size = bert_ids.size()[1]
    # print("batch size",batch_size,"\t\tpad_size",pad_size)

    if(bert_token_starts == None):
      output = self.bert(bert_ids, attention_mask = bert_mask, masked_lm_labels=labels)
      return output
    
    output = self.bert(bert_ids, attention_mask = bert_mask)
    # print(len(output))
    # print(len(output[1]))
    # print(output[1][0].size())
    bert_last_layer = output[1][0]
    
    pred_logits = self.sigmoid(self.fc(self.dropout(bert_last_layer)))
    pred_logits = torch.squeeze(pred_logits,2)
    # print(pred_logits.size())
    # print(labels.size())
    # print(pred_logits[1])
    # print(labels[1])
    # print(bert_token_starts[1])
    # print("\n")

    pred_labels = labels.clone()
    # print(pred_labels[1])
    # print("\n")
    
    for b in range(batch_size):
      for w in range(pad_size):
        if(bert_token_starts[b][w]!=0):
          if(bert_token_starts[b][w]>=pad_size):
            print(bert_token_starts[b])
          else:
            pred_labels[b][w] = pred_logits[b][bert_token_starts[b][w]]

    # print(pred_labels[1])
    # print(labels[1])
    # print("\n")
    
    
    mask = pred_labels!=0
    total = mask[mask].size()[0]

    loss_fn = nn.BCELoss(reduction='sum').to(device) 
    loss = loss_fn(pred_labels, labels)
    # print(loss)

    loss /= total 
    print(loss) 
    return loss, pred_labels

In [0]:
model = bert_model(1,0.3,True).to(device)

In [0]:
config = BertConfig.from_pretrained('bert-base-cased', output_hidden_states=True)

model = BertForMaskedLM.from_pretrained('bert-base-cased', config = config)
model = model.to(device)

In [0]:
max_epochs = 4
max_steps = len(train_dataloader)*max_epochs

optimizer = AdamW(model.parameters(),lr = 1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=max_steps/20, num_training_steps=max_steps)

In [0]:
def evaluate(model, tokenizer, validation_dataloader):
  total_loss = 0
  steps = len(validation_dataloader)
  model.eval()
  
  corrects = 0
  errors = 0

  with torch.no_grad():
    for i, (val_inputs, val_masks) in enumerate(validation_dataloader):
      inputs, labels = mask_tokens(val_inputs, tokenizer)
      
      inputs = inputs.to(device)
      labels = labels.to(device)
      val_masks = val_masks.to(device)
      
      output = model(inputs,val_masks)
      predict = output[0]

      batch_size = predict.size()[0]

      for bs in range(batch_size):
        ii = 0
        for ls in labels[bs]:
          ls = ls.item()
          if ls!=-100:
            predicted = torch.argmax(predict[bs][ii]).item()
            if(ls == predicted):
              corrects = corrects + 1
            else:
              errors = errors + 1
          ii = ii + 1

  total = corrects + errors
  accuracy = corrects/total
  print("\nvalidation accuracy = ",accuracy,"\t for",total,"masks on validation data\n")
   

In [0]:
def train(train_dataloader,validation_dataloader, model, tokenizer, optimizer, scheduler, max_epochs, print_freq = 30,val_freq = 666):
  
  model.zero_grad()
  steps = len(train_dataloader)

  for epoch in range(max_epochs):
    
    evaluate(model, tokenizer, validation_dataloader)
    total_loss = 0
    
    for i, (train_inputs, train_masks) in enumerate(train_dataloader):
      inputs, labels = mask_tokens(train_inputs, tokenizer)
      model.train()
      
      inputs = inputs.to(device)
      labels = labels.to(device)
      train_masks = train_masks.to(device)
      
      output = model(inputs,train_masks, labels=labels)
      
      loss = output[0]
      total_loss = total_loss + loss
      
      loss.backward()
      optimizer.step()
      scheduler.step()
      model.zero_grad()

      if((i+1)%print_freq==0):
        avg_loss = total_loss/print_freq
        total_loss = 0
        print("epoch:",(epoch+1),"out of",max_epochs,"\t batch:",(i+1),"out of",steps,"\t average loss:",avg_loss)   
      
      if((i+1)%val_freq==0):
        evaluate(model, tokenizer, validation_dataloader)
  
  evaluate(model, tokenizer, validation_dataloader)


In [22]:
train(train_dataloader,validation_dataloader, model, tokenizer, optimizer, scheduler, max_epochs)


validation accuracy =  0.5304349259735549 	 for 60654 masks on validation data

epoch: 1 out of 4 	 batch: 30 out of 6633 	 average loss: tensor(2.9017, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 60 out of 6633 	 average loss: tensor(2.8847, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 90 out of 6633 	 average loss: tensor(3.0682, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 120 out of 6633 	 average loss: tensor(2.7733, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 150 out of 6633 	 average loss: tensor(2.9001, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 180 out of 6633 	 average loss: tensor(2.9081, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 210 out of 6633 	 average loss: tensor(2.8629, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 240 out of 6633 	 average loss: tensor(2.8670, device='cuda:0', grad_fn=<DivBackward0>)
ep

KeyboardInterrupt: ignored

In [0]:
def read_token_map(file,word_index = 1,prob_index = 4, caseless = False):
  
  with codecs.open(file, 'r', 'utf-8') as f:
      lines = f.readlines()

  tokenized_texts = []
  token_map = []
  token_labels = []

  bert_tokens = []
  orig_to_tok_map = []
  labels = []

  bert_tokens.append("[CLS]")
  
  for line in lines:
    if not (line.isspace()):
      feats = line.strip().split()
      word = feats[word_index].lower() if caseless else feats[word_index]
      label = feats[prob_index].lower() if caseless else feats[prob_index]
      labels.append((float)(label))
      orig_to_tok_map.append(len(bert_tokens))
      
      if(word == "n't"):
        word = "'t"
        if(bert_tokens[-1] != "won"):
          bert_tokens[-1] = bert_tokens[-1] +"n"
      if(word == "wo"):
        word == "won"

      bert_tokens.extend(tokenizer.tokenize(word))
      
    elif len(orig_to_tok_map) > 0:
      bert_tokens.append("[SEP]")
      tokenized_texts.append(bert_tokens)
      token_map.append(orig_to_tok_map)
      token_labels.append(labels)
      bert_tokens = []
      orig_to_tok_map = []
      labels = []
      bert_tokens.append("[CLS]")
          
  if len(orig_to_tok_map) > 0:
    bert_tokens.append("[SEP]")
    tokenized_texts.append(bert_tokens)
    token_map.append(orig_to_tok_map)
    token_labels.append(labels)
  
  return tokenized_texts, token_map, token_labels

In [52]:
t_tokenized_texts, t_token_map, t_token_label = read_token_map(train_file)
print(t_tokenized_texts[100])
print(t_token_map[100])
print(t_token_label[100])

d_tokenized_texts, d_token_map, d_token_label = read_token_map(dev_file)
print(d_tokenized_texts[50])
print(d_token_map[50])
print(d_token_label[50])

['[CLS]', 'Happiness', 'consists', 'in', 'realizing', 'it', 'is', 'all', 'a', 'great', 'strange', 'dream', '.', '[SEP]']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
[0.6666666666666666, 0.1111111111111111, 0.0, 0.2222222222222222, 0.0, 0.1111111111111111, 0.1111111111111111, 0.0, 0.2222222222222222, 0.3333333333333333, 0.3333333333333333, 0.1111111111111111]
['[CLS]', '`', '`', 'F', '##as', '##cin', '##ating', 'social', 'media', 'tip', 'or', 'fact', 'to', 'share', '.', "'", "'", '@', 'Speaker', 'Name', '[SEP]']
[1, 3, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 19]
[0.0, 0.5555555555555556, 0.0, 0.1111111111111111, 0.2222222222222222, 0.1111111111111111, 0.1111111111111111, 0.0, 0.2222222222222222, 0.0, 0.0, 0.2222222222222222, 0.2222222222222222]


In [53]:
MAX_LEN = 52

# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
t_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in t_tokenized_texts]

# Pad our input tokens
t_input_ids = pad_sequences(t_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
t_token_map = pad_sequences(t_token_map, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
t_token_label = pad_sequences(t_token_label, maxlen=MAX_LEN, dtype="float", truncating="post", padding="post")

print(t_input_ids[100])
print(t_token_map[100])
print(t_token_label[100])

[  101 25410  2923  1107 10459  1122  1110  1155   170  1632  4020  4185
   119   102     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
[ 1  2  3  4  5  6  7  8  9 10 11 12  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0]
[0.66666667 0.11111111 0.         0.22222222 0.         0.11111111
 0.11111111 0.         0.22222222 0.33333333 0.33333333 0.11111111
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         

In [54]:
d_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in d_tokenized_texts]

# Pad our input tokens
d_input_ids = pad_sequences(d_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
d_token_map = pad_sequences(d_token_map, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
d_token_label = pad_sequences(d_token_label, maxlen=MAX_LEN, dtype="float", truncating="post", padding="post")

print(d_input_ids[50])
print(d_token_map[50])
print(d_token_label[50])

[  101   169   169   143  2225 16430  3798  1934  2394  5580  1137  1864
  1106  2934   119   112   112   137  9911 10208   102     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
[ 1  3  7  8  9 10 11 12 13 14 15 17 19  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0]
[0.         0.55555556 0.         0.11111111 0.22222222 0.11111111
 0.11111111 0.         0.22222222 0.         0.         0.22222222
 0.22222222 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         

In [55]:
t_attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in t_input_ids:
  seq_mask = [float(i>0) for i in seq]
  t_attention_masks.append(seq_mask)
print(t_attention_masks[100])

d_attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in d_input_ids:
  seq_mask = [float(i>0) for i in seq]
  d_attention_masks.append(seq_mask)
print(d_attention_masks[50])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [0]:
t_input_ids = torch.tensor(t_input_ids)
t_token_map = torch.tensor(t_token_map )
t_token_label = torch.tensor(t_token_label)
t_attention_masks = torch.tensor(t_attention_masks)

d_input_ids = torch.tensor(d_input_ids)
d_token_map = torch.tensor(d_token_map )
d_token_label = torch.tensor(d_token_label)
d_attention_masks = torch.tensor(d_attention_masks)

# Select a batch size for training. 
batch_size = 32
# print(t_token_labels)
# Create an iterator of our data with torch DataLoader 
train_data = TensorDataset(t_input_ids, t_token_map, t_token_label, t_attention_masks)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_data = TensorDataset(d_input_ids, d_token_map, d_token_label, d_attention_masks)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [0]:
optimizer = AdamW(model.parameters(), lr=2e-4, eps = 1e-8)

epochs = 4
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [0]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [0]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

def fix_padding(scores_numpy, label_probs,  mask_numpy):
    #if len(scores_numpy) != len(mask_numpy):
    #    print("Error: len(scores_numpy) != len(mask_numpy)")
    #assert len(scores_numpy) == len(mask_numpy)
    #if len(label_probs) != len(mask_numpy):
    #    print("len(label_probs) != len(mask_numpy)")
    #assert len(label_probs) == len(mask_numpy)

    all_scores_no_padd = []
    all_labels_no_pad = []
    for i in range(len(mask_numpy)):
        all_scores_no_padd.append(scores_numpy[i][:int(mask_numpy[i])])
        all_labels_no_pad.append(label_probs[i][:int(mask_numpy[i])])

    assert len(all_scores_no_padd) == len(all_labels_no_pad)
    return all_scores_no_padd, all_labels_no_pad

def match_M(batch_scores_no_padd, batch_labels_no_pad):

    top_m = [1, 2, 3, 4]
    batch_num_m=[]
    batch_score_m=[]
    for m in top_m:
        intersects_lst = []
        # exact_lst = []
        score_lst = []
        ############################################### computing scores:
        for s in batch_scores_no_padd:
            if len(s) <=m:
                continue
            h = m
            # if len(s) > h:
            #     while (s[np.argsort(s)[-h]] == s[np.argsort(s)[-(h + 1)]] and h < (len(s) - 1)):
            #         h += 1

            # s = np.asarray(s.cpu())
            s = np.asarray(s)
            #ind_score = np.argsort(s)[-h:]
            ind_score = sorted(range(len(s)), key = lambda sub: s[sub])[-h:]
            score_lst.append(ind_score)

        ############################################### computing labels:
        label_lst = []
        for l in batch_labels_no_pad:
            if len(l) <=m:
                continue
            # if it contains several top values with the same amount
            h = m
            l = l.cpu()
            if len(l) > h:
                while (l[np.argsort(l)[-h]] == l[np.argsort(l)[-(h + 1)]] and h < (len(l) - 1)):
                    h += 1
            l = np.asarray(l)
            ind_label = np.argsort(l)[-h:]
            label_lst.append(ind_label)

        ############################################### :

        for i in range(len(score_lst)):
            intersect = intersection(score_lst[i], label_lst[i])
            intersects_lst.append((len(intersect))/(min(m, len(score_lst[i]))))
            # sorted_score_lst = sorted(score_lst[i])
            # sorted_label_lst =  sorted(label_lst[i])
            # if sorted_score_lst==sorted_label_lst:
            #     exact_lst.append(1)
            # else:
            #     exact_lst.append(0)
        batch_num_m.append(len(score_lst))
        batch_score_m.append(sum(intersects_lst))
    return batch_num_m, batch_score_m

In [71]:
import random

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[3].to(device)
        b_token_starts = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        output = model(b_input_ids, b_input_mask, b_labels, b_token_starts)
        loss = output[0]

        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    # print("total loss",total_loss)
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    num_m = [0, 0, 0, 0]
    score_m = [0, 0, 0, 0]

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        t_input_ids = batch[0].to(device)
        t_input_mask = batch[3].to(device)
        t_token_starts = batch[1].to(device)
        t_labels = batch[2].to(device)
        # b_input_ids, b_input_mask, b_token_starts, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            output = model(t_input_ids, t_input_mask,  t_labels,t_token_starts)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = output[1]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = t_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        # tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        mask = torch.sum(t_labels,dim=1)
        # print(t_input_mask.size())
        # print(mask.size())
        mask = mask.cpu().data.numpy()
        t_scores, t_labels_new = fix_padding(logits, t_labels,mask)

        batch_num_m, batch_score_m = match_M(t_scores, t_labels_new)
        num_m = [sum(i) for i in zip(num_m, batch_num_m)]
        score_m = [sum(i) for i in zip(score_m, batch_score_m)]
    
    m_score = [i/j for i,j in zip(score_m, num_m)]
    print("Validation Accuracy: ")
    print(m_score)
    v_score = np.mean(m_score)
    print(v_score)
        
        # Accumulate the total accuracy.
        # eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        # nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    # print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    # print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
tensor(0.6593, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.6585, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.6496, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.6608, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.6510, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.6580, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.6640, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.6560, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.6693, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.6554, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.6449, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.6556, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)
tensor(0.6584, device='cuda:0', dtype=torch.float64, grad_fn=<D