In [None]:
!git clone https://github.com/Senyu-T/unifiedqa

Fetch Data


In [None]:
!nvidia-smi -L

In [None]:
cd unifiedqa/bart

In [None]:
!chmod +x download_data.sh; ./download_data.sh

In [None]:
cd data/natural_questions_with_dpr_para/

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [7]:
import os
os.chdir("/content/drive/MyDrive/NQ")

In [22]:
import string
import re

def normalize_answer(s):
  def remove_articles(text):
    return re.sub(r'\b(a|an|the)\b', ' ', text)
  def white_space_fix(text):
    return ' '.join(text.split())
  def remove_punc(text):
    exclude = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in exclude)
  def lower(text):
    return text.lower()
  return white_space_fix(remove_articles(remove_punc(lower(s))))

def remove_spc_token(s):
  s = s.replace(' \\\'\\\'', ' \'\'')   # double quotation
  s = s.replace('\\\'', '\'')
  s = s.replace(' \'s', '\'s')    # 's
  s = s.replace(' ,', ',')
  return s

In [23]:
# read file, parse into context / question / answer for further data analysis
def read_files(file_name):
  answers = []
  questions = []
  contexts = []
  with open(file_name, 'rb') as inference_in:
    lines = inference_in.readlines()
    for i in range(len(lines)):
      sep = str(lines[i]).split('\\n') 
      questions.append(sep[0][2:-1])
      ans = (sep[1].split('\\t')[-1]).lower()
      ans = normalize_answer(remove_spc_token(ans))  # normalize answers
      answers.append(ans)
      contexts.append(sep[1].split('\\t')[0])
  return answers, questions, contexts

Get answers, question, contexts

In [36]:
tr_answers, tr_questions, tr_contexts = read_files("/content/unifiedqa/bart/data/natural_questions_with_dpr_para/train.tsv")
val_answers, val_questions, val_contexts = read_files("/content/unifiedqa/bart/data/natural_questions_with_dpr_para/dev.tsv")

Save data as ground truth.

In [93]:
import json
with open("/content/drive/MyDrive/NQ/data/answer_tags/gold_l2i.json") as f:
  label2index = json.load(f)
with open("/content/drive/MyDrive/NQ/data/answer_tags/gold_i2l.json") as f:
  index2label = json.load(f)

In [94]:
print(index2label)

{'ORG': 0, 'PERSON': 1, 'DATE': 2, 'NORP': 3, 'OTHERS': 4, 'LAW': 5, 'FAC': 6, 'PERCENT': 7, 'WORK_OF_ART': 8, 'CARDINAL': 9, 'TIME': 10, 'ORDINAL': 11, 'EVENT': 12, 'LANGUAGE': 13, 'MONEY': 14, 'GPE': 15, 'LOC': 16, 'QUANTITY': 17, 'PRODUCT': 18}


In [95]:
print(label2index)

{'0': 'ORG', '1': 'PERSON', '2': 'DATE', '3': 'NORP', '4': 'OTHERS', '5': 'LAW', '6': 'FAC', '7': 'PERCENT', '8': 'WORK_OF_ART', '9': 'CARDINAL', '10': 'TIME', '11': 'ORDINAL', '12': 'EVENT', '13': 'LANGUAGE', '14': 'MONEY', '15': 'GPE', '16': 'LOC', '17': 'QUANTITY', '18': 'PRODUCT'}


In [97]:
def get_labels(tsv_file):
  index2label, label2index = None, None
  tsv_file = open(tsv_file)
  read_tsv = csv.reader(tsv_file)
  labels = [row[0] for row in read_tsv]
  return labels

In [100]:
tr_tags = "/content/drive/MyDrive/NQ/data/answer_tags/train_soft_tag.tsv"
tr_index = get_labels(tr_tags)
print(tr_index[:10])

['CARDINAL', 'CARDINAL', 'PERSON', 'LOC', 'PERSON', 'OTHERS', 'PERSON', 'OTHERS', 'DATE', 'CARDINAL']


In [102]:
tr_labels = [index2label[tr_index[i]] for i in range(len(tr_index))]

In [106]:
import numpy as np
tr_labels = np.array(tr_labels)
with open("/content/drive/MyDrive/NQ/data/answer_tags/tr_labels.npy", 'wb') as f:
  np.save(f, tr_labels)


In [108]:
val_tags = "/content/drive/MyDrive/NQ/data/answer_tags/dev_soft_tag.tsv"
val_index = get_labels(val_tags)
val_labels = np.array([index2label[val_index[i]] for i in range(len(val_index))])
with open("/content/drive/MyDrive/NQ/data/answer_tags/val_labels.npy", 'wb') as f:
  np.save(f, val_labels)

Load tags

In [107]:
with open("/content/drive/MyDrive/NQ/data/answer_tags/tr_labels.npy", "rb") as f:
  tr_labels = np.load(f)
print(tr_labels[:10])

[ 9  9  1 16  1  4  1  4  2  9]


In [109]:
with open("/content/drive/MyDrive/NQ/data/answer_tags/val_labels.npy", "rb") as f:
  val_labels = np.load(f)
print(val_labels[:10])

[ 1  9 16  2  9  1  1  2  1  4]


In [None]:
!pip install transformers

In [10]:
import torch
import torch.optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch import Tensor
import torch.nn.functional as F
from transformers import AutoModel, BertTokenizerFast, BertTokenizer, AutoTokenizer, BertModel
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import csv
import numpy as np

In [37]:
class BiLSTM(nn.Module):
    def __init__(self,embedding_dim, hidden_dim, num_layers, num_classes):
        super(BiLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size=embedding_dim,
                    hidden_size=hidden_dim,
                    num_layers=num_layers,
                    batch_first=False,
                    bidirectional=True)
        self.fc1 = torch.nn.Linear(2*hidden_dim, hidden_dim)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_dim, num_classes)
    
    
    def forward(self, embeddings):
        self.lstm.flatten_parameters()
        lstm_output, _ = self.lstm(embeddings)
        output = lstm_output[:,-1,:]
        output = self.fc1(output)
        output = self.relu(output)
        output = self.fc2(output)
        return output

class BertBiLSTM(nn.Module):
    def __init__(self, num_layers, num_classes, embedding_dim = 768, hidden_dim=128, freeze=False):
        super(BertBiLSTM, self).__init__()
        
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        if freeze:
          for param in self.bert.parameters():
            param.requires_grad = False
        
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        
        self.classifier = BiLSTM(self.embedding_dim, hidden_dim, num_layers, num_classes)
       
    def forward(self, input_ids, attention_mask):
        text_embeddings = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_embeddings = text_embeddings[0]
        
        output = self.classifier(text_embeddings)
        return output

In [13]:
MAX_LEN = 64

def preprocessing_for_bert(tokenizer, sentences):
    input_ids = []
    attention_masks = []
    num_sentences = len(sentences)

    for i, sent in enumerate(sentences):
      encoded_sent = tokenizer.encode_plus(text=sent, 
                          add_special_tokens=True,        
                          max_length=MAX_LEN,               
                          padding='max_length',         
                          return_attention_mask=True, 
                          truncation=True)     
      input_ids.append(encoded_sent.get('input_ids'))
      attention_masks.append(encoded_sent.get('attention_mask'))

    return input_ids, attention_masks


In [110]:
def get_data(file_path, npy_file, tokenizer, save_location, label2index=None, index2label=None):
    answers, questions, contexts = read_files(file_path)
    with open(npy_file, 'rb') as f:
      labels = np.load(f)

    input_ids, attention_masks = preprocessing_for_bert(tokenizer, questions)
    np.savez(save_location, input_ids=input_ids, attention_masks=attention_masks, labels=labels)

In [52]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [111]:
train_path = "/content/drive/MyDrive/NQ/data/raw_data/train.tsv"
val_path = "/content/drive/MyDrive/NQ/data/raw_data/dev.tsv"
tr_tags = "/content/drive/MyDrive/NQ/data/answer_tags/tr_labels.npy"
val_tags = "/content/drive/MyDrive/NQ/data/answer_tags/val_labels.npy"

In [112]:
os.chdir("/content/drive/MyDrive/NQ")

In [114]:
get_data(train_path, tr_tags, tokenizer, "data/tr_tokenized.npz")

In [113]:
get_data(val_path, val_tags, tokenizer, "data/val_tokenized.npz")

In [116]:
def load_dataset(location):
    data = dict(np.load(location,allow_pickle=True))
    for key, elem in data.items():
      data[key] = torch.tensor(elem)
    dataset = TensorDataset(data['input_ids'].squeeze(), data['attention_masks'].squeeze(), data['labels'])
    return dataset 

In [117]:
train_path = "data/tr_tokenized.npz"
train_dataset = load_dataset(train_path)
val_path = "data/val_tokenized.npz"
val_dataset = load_dataset(val_path)

Main BERT code

In [118]:
def evaluate(network, loader, loss_fn, data_size):
    loss = 0.0
    acc = 0.0
    with torch.no_grad():
      for i, (input_ids, masks, labels) in tqdm(enumerate(loader),total=len(loader),position=0, leave=True):
          input_ids = input_ids.cuda()
          masks = masks.cuda()
          labels = labels.cuda()
            
          output = network(input_ids, masks)
          loss += loss_fn(output,labels)
          preds = torch.argmax(output,dim=1)
          #output = network(input_ids, masks, labels=labels)
          #preds = torch.argmax(output.logits, dim=1)
          #loss = output.loss

          acc += torch.eq(preds, labels).sum().item()
    return loss.item() / data_size, acc / data_size

In [119]:
def train(directory, network, loss_fn, train_dataset, test_dataset, optimizer, scheduler, batch_size, num_epochs, verbose=True, val_freq=2):
    train_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size)
    val_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)
    
    train_loss, train_acc = torch.zeros(num_epochs), torch.zeros(num_epochs)
    test_loss, test_acc = torch.zeros(num_epochs//val_freq + 1), torch.zeros(num_epochs//val_freq + 1)
    val_best_acc, train_best_acc = 0.0, 0.0
    os.makedirs(directory, exist_ok=True)
    for epoch in range(num_epochs):
        if epoch % val_freq == 0:
          network.eval()
          idx = epoch//val_freq
          test_loss[idx], test_acc[idx] = evaluate(network, val_dataloader, loss_fn, len(test_dataset))
          if test_acc[idx] > val_best_acc:
            val_best_acc = test_acc[idx]
            torch.save(network.state_dict(), f"{directory}/snapshot_val_best")
          if verbose:
            print(f"epoch:{epoch:3d}, test_loss: {test_loss[idx]:3.6f}, test_acc: {test_acc[idx]:3.5f}")

        network.train()
        train_epoch_loss, train_epoch_acc = 0.0, 0.0
        for i, (input_ids, masks, labels) in tqdm(enumerate(train_dataloader), total=len(train_dataloader), position=0, leave=True):
            input_ids = input_ids.cuda()
            masks = masks.cuda()
            labels = labels.cuda()              
            optimizer.zero_grad() 
            
            output = network(input_ids, masks)
            pred = torch.argmax(output, dim=1)
            train_epoch_acc += torch.eq(pred, labels).sum().item()
            loss = loss_fn(output,labels)/batch_size

            # commented lines are codes for BertSeqeucen
            #output = network(input_ids, masks, labels=labels)       
            #pred = torch.argmax(output.logits, dim=1)
            #loss = output.loss
            train_epoch_loss += loss.item()
            loss.backward()
            optimizer.step()

        train_acc[epoch] = train_epoch_acc / len(train_dataset)
        train_loss[epoch] = train_epoch_loss / len(train_dataloader)
        print(f"train_loss:{train_loss[epoch]:.6f}, train_acc:{train_acc[epoch]:.6f}")
        if (train_acc[epoch] > train_best_acc):
          train_best_acc = train_acc[epoch]
          torch.save(network.state_dict(), f"{directory}/snapshot_train_best")

        scheduler.step(test_acc[epoch//val_freq])

    network.eval()   
    test_loss[-1], test_acc[-1] = evaluate(network, val_dataloader, loss_fn, len(test_dataset))    
    torch.save(train_loss, f"{directory}/train_loss")
    torch.save(test_loss, f"{directory}/test_loss")
    torch.save(train_acc, f"{directory}/train_acc")
    torch.save(test_acc, f"{directory}/test_acc")
    torch.save(network.state_dict(), f"{directory}/snapshot_final")


In [128]:
def get_path(loss, opt, lr, batch_size, epoch, freeze):
    return f"{PATH}/lr_{lr}_bs_{batch_size}_epoch_{epoch}_freeze_{freeze}"

In [127]:
loss = 'ce'
opt = 'adam'
freeze = 0
batch_size = 32
lr = 1e-5
num_epochs = 12

In [129]:
PATH = "bert_bilstm_classifier"
directory = f"{get_path(loss, opt, lr, batch_size, num_epochs, freeze)}"
os.makedirs(directory, exist_ok=True)

torch.manual_seed(11747)
num_layers = 2

In [130]:
network = BertBiLSTM(num_layers, num_classes, freeze=freeze)
network = network.cuda()

In [131]:
optimizer = torch.optim.Adam(network.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.7, patience=2, verbose=True, mode='max')

In [132]:
# Trained for Unfreezed BERT
train(directory, network, loss_fn, train_dataset, val_dataset, optimizer, scheduler, batch_size, num_epochs)

100%|██████████| 335/335 [00:13<00:00, 24.60it/s]
  0%|          | 1/3022 [00:00<08:48,  5.72it/s]

epoch:  0, test_loss: 0.092450, test_acc: 0.00037


100%|██████████| 3022/3022 [06:47<00:00,  7.42it/s]


train_loss:0.032031, train_acc:0.752782


100%|██████████| 3022/3022 [06:47<00:00,  7.41it/s]


train_loss:0.014977, train_acc:0.878419


100%|██████████| 335/335 [00:13<00:00, 24.41it/s]
  0%|          | 1/3022 [00:00<06:59,  7.19it/s]

epoch:  2, test_loss: 0.013435, test_acc: 0.88422


100%|██████████| 3022/3022 [06:48<00:00,  7.40it/s]


train_loss:0.012783, train_acc:0.888318


100%|██████████| 3022/3022 [06:47<00:00,  7.42it/s]


train_loss:0.011365, train_acc:0.897213


100%|██████████| 335/335 [00:13<00:00, 24.30it/s]
  0%|          | 1/3022 [00:00<06:48,  7.40it/s]

epoch:  4, test_loss: 0.012667, test_acc: 0.88076


100%|██████████| 3022/3022 [06:46<00:00,  7.43it/s]


train_loss:0.010117, train_acc:0.905975


100%|██████████| 3022/3022 [06:48<00:00,  7.40it/s]


train_loss:0.008990, train_acc:0.915056


  1%|          | 3/335 [00:00<00:13, 24.29it/s]

Epoch     6: reducing learning rate of group 0 to 7.0000e-06.


100%|██████████| 335/335 [00:13<00:00, 24.49it/s]
  0%|          | 1/3022 [00:00<07:30,  6.71it/s]

epoch:  6, test_loss: 0.013946, test_acc: 0.87356


  2%|▏         | 47/3022 [00:06<06:47,  7.29it/s]

KeyboardInterrupt: ignored

Get the tags

In [None]:
network = BertBiLSTM(num_layers, num_classes)
network.load_state_dict(torch.load(f"{directory}/snapshot_val_best"))
network.cuda()

In [140]:
def inference(network, dataset, labels, batch_size):
  logits = []
  preds = []
  loader = DataLoader(dataset, shuffle=False, batch_size=batch_size)
  acc = 0.0
  network.eval()

  with torch.no_grad():
    for i, (input_ids, masks, labels) in tqdm(enumerate(loader),total=len(loader),position=0, leave=True):
      input_ids = input_ids.cuda()
      masks = masks.cuda()
      labels = labels.cuda()
            
      output = network(input_ids, masks)
      logits.append(output)
      pred = torch.argmax(output,dim=1)
      preds.extend([label2index[str(pred[i].item())] for i in range(len(pred))])
      #output = network(input_ids, masks, labels=labels)
      #preds = torch.argmax(output.logits, dim=1)
      #loss = output.loss
      acc += torch.eq(pred, labels).sum().item()
    
  return torch.cat(logits), preds, acc / len(dataset)

In [149]:
val_logits, val_preds, val_acc = inference(network, val_dataset, val_labels, 32)

100%|██████████| 335/335 [00:13<00:00, 24.37it/s]


In [150]:
print(val_acc)

0.8842233236696905


In [151]:
print(val_preds[:10])

['PERSON', 'CARDINAL', 'LOC', 'DATE', 'CARDINAL', 'PERSON', 'PERSON', 'DATE', 'PERSON', 'OTHERS']


In [152]:
tr_logits, tr_preds, tr_acc = inference(network, train_dataset, tr_labels, 32)

100%|██████████| 3022/3022 [02:04<00:00, 24.27it/s]


In [153]:
print(tr_acc)

0.8874281103893418


In [146]:
torch.save(val_logits, f"{directory}/val_logits")

In [154]:
torch.save(tr_logits, f"{directory}/tr_logits")

In [147]:
with open(f"{directory}/val_preds.tsv", 'w') as v_f:
  v_f.write('\n'.join(val_preds))

In [155]:
with open(f"{directory}/tr_preds.tsv", 'w') as t_f:
  t_f.write('\n'.join(tr_preds))