<a href="https://colab.research.google.com/github/SahilDhull/emphasis_selection/blob/master/model/bert_with_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [61]:
!pip install transformers
!pip install config



In [0]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import BertForMaskedLM , BertModel ,WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup
from transformers import PreTrainedModel, PreTrainedTokenizer
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
import codecs

In [63]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla T4'

In [64]:
from google.colab import drive
drive.mount('/content/drive')

train_file = 'drive/My Drive/datasets/train.txt'
dev_file = 'drive/My Drive/datasets/dev.txt'

quotes_file = 'drive/My Drive/datasets/all_quotes.txt'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def read_sent(file, caseless = True):
    
    with codecs.open(file, 'r', 'utf-8') as f:
        lines = f.readlines()
    #print(lines)
    sent = ""
    sents = []
    
    for line in lines:
        if not (line.isspace()):
            feats = line.strip().split()
            word = feats[0].lower() if caseless else feats[0]
            if(word == "n't"):
              word = "'t"
              sent = sent + "n"
            sent = sent + " " + word
        elif len(sent) > 0:
            sents.append(sent.strip())
            sent = ""
            
    if len(sent) > 0:
        sents.append(sent)
    
    return sents

In [66]:
sentences = read_sent(quotes_file)
print(sentences[0])
print(sentences[100])

sentences = ["[CLS] " + query + " [SEP]" for query in sentences]
print(sentences[0])
print(sentences[100])

# Tokenize with BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

print (tokenized_texts[0])
print (tokenized_texts[100])

you know you 're in love when you can 't fall asleep because reality is finally better than your dreams .
a half-read book is a half-finished love affair .
[CLS] you know you 're in love when you can 't fall asleep because reality is finally better than your dreams . [SEP]
[CLS] a half-read book is a half-finished love affair . [SEP]
['[CLS]', 'you', 'know', 'you', "'", 're', 'in', 'love', 'when', 'you', 'can', "'", 't', 'fall', 'asleep', 'because', 'reality', 'is', 'finally', 'better', 'than', 'your', 'dreams', '.', '[SEP]']
['[CLS]', 'a', 'half', '-', 'read', 'book', 'is', 'a', 'half', '-', 'finished', 'love', 'affair', '.', '[SEP]']


In [67]:
MAX_LEN = 36
# Pad our input tokens
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print(input_ids[0])
print(input_ids[100])

[ 101 2017 2113 2017 1005 2128 1999 2293 2043 2017 2064 1005 1056 2991
 6680 2138 4507 2003 2633 2488 2084 2115 5544 1012  102    0    0    0
    0    0    0    0    0    0    0    0]
[ 101 1037 2431 1011 3191 2338 2003 1037 2431 1011 2736 2293 6771 1012
  102    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [68]:
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)
print(attention_masks[0])
print(attention_masks[100])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [0]:
train_inputs, validation_inputs = train_test_split(input_ids, random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)
                                             
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Select a batch size for training. 

In [0]:
def mask_tokens(inputs, tokenizer, mlm_probability = 0.15):
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
    labels = inputs.clone()
    # print(inputs[0])

    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
 
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)

    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels

In [0]:
batch_size = 32

# Create an iterator of our data with torch DataLoader 
train_data = TensorDataset(train_inputs, train_masks)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_data = TensorDataset(validation_inputs, validation_masks)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [0]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model = model.to(device)

In [0]:
max_epochs = 4
max_steps = len(train_dataloader)*max_epochs

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]

optimizer = AdamW(optimizer_grouped_parameters,lr = 2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=max_steps/20, num_training_steps=max_steps)

In [0]:
def evaluate(model, tokenizer, validation_dataloader):
  total_loss = 0
  steps = len(validation_dataloader)
  model.eval()
  
  corrects = 0
  errors = 0

  with torch.no_grad():
    for i, (val_inputs, val_masks) in enumerate(validation_dataloader):
      inputs, labels = mask_tokens(val_inputs, tokenizer)
      
      inputs = inputs.to(device)
      labels = labels.to(device)
      val_masks = val_masks.to(device)
      
      output = model(inputs,attention_mask = val_masks)
      predict = output[0]

      batch_size = predict.size()[0]

      for bs in range(batch_size):
        ii = 0
        for ls in labels[bs]:
          ls = ls.item()
          if ls!=-100:
            predicted = torch.argmax(predict[bs][ii]).item()
            if(ls == predicted):
              corrects = corrects + 1
            else:
              errors = errors + 1
          ii = ii + 1

  total = corrects + errors
  accuracy = corrects/total
  print("\nvalidation accuracy = ",accuracy,"\t for",total,"masks on validation data\n")
   

In [0]:
def train(train_dataloader,validation_dataloader, model, tokenizer, optimizer, scheduler, max_epochs, print_freq = 30,val_freq = 666):
  
  model.zero_grad()
  steps = len(train_dataloader)

  for epoch in range(max_epochs):
    total_loss = 0
    for i, (train_inputs, train_masks) in enumerate(train_dataloader):
      inputs, labels = mask_tokens(train_inputs, tokenizer)
      model.train()
      
      inputs = inputs.to(device)
      labels = labels.to(device)
      train_masks = train_masks.to(device)
      
      output = model(inputs,attention_mask = train_masks, masked_lm_labels=labels)
      loss = output[0]
      total_loss = total_loss + loss
      
      loss.backward()
      optimizer.step()
      scheduler.step()
      model.zero_grad()

      if((i+1)%print_freq==0):
        avg_loss = total_loss/print_freq
        total_loss = 0
        print("epoch:",(epoch+1),"out of",max_epochs,"\t batch:",(i+1),"out of",steps,"\t average loss:",avg_loss)   
      
      if((i+1)%val_freq==0):
        evaluate(model, tokenizer, validation_dataloader)


In [0]:
train(train_dataloader,validation_dataloader, model, tokenizer, optimizer, scheduler, max_epochs)

epoch: 1 out of 4 	 batch: 30 out of 6633 	 average loss: tensor(3.1121, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 60 out of 6633 	 average loss: tensor(2.8736, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 90 out of 6633 	 average loss: tensor(3.1090, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 120 out of 6633 	 average loss: tensor(3.1944, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 150 out of 6633 	 average loss: tensor(3.0000, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 180 out of 6633 	 average loss: tensor(3.1065, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 210 out of 6633 	 average loss: tensor(3.0820, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 240 out of 6633 	 average loss: tensor(3.1438, device='cuda:0', grad_fn=<DivBackward0>)
epoch: 1 out of 4 	 batch: 270 out of 6633 	 average loss: tensor(3.0121, device='c