<a href="https://colab.research.google.com/github/Vaibhav-Tyro/keyphrase-extraction-using-BERT/blob/main/Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
"""Train and evaluate the model"""
import argparse
import random
import logging
import os
import torch
from torch.optim import Adam
import torch.nn as nn 
from torch.optim.lr_scheduler import LambdaLR
from tqdm import trange
from transformers import BertForTokenClassification
from torch.utils.data import DataLoader
import evaluate
import utils

In [None]:
# Command-line interface for the training script.
parser = argparse.ArgumentParser()

# --- locations of data, pretrained weights and experiment configuration ---
parser.add_argument(
    '--data_dir',
    default='data/task1',
    help="Directory containing the dataset",
)
parser.add_argument(
    '--bert_model_dir',
    default='bert-base-uncased-pytorch',
    help="Directory containing the BERT model in pytorch",
)
parser.add_argument(
    '--model_dir',
    default='experiment/base_model',
    help="Directory containing params.json",
)

# --- reproducibility / resuming ---
parser.add_argument(
    '--seed',
    default=2019,
    type=int,
    help="random seed for initialization",
)
parser.add_argument(
    '--restore_file',
    default=None,
    help="optional, name of the file in --model_dir containing weights to reload before training",
)

# --- hardware / precision switches ('store_true' flags are False unless given) ---
parser.add_argument(
    '--multi_gpu',
    action='store_true',
    help="whether to use multiple GPU if available",
)
parser.add_argument(
    '--fp16',
    action='store_true',
    help="whether to use 16-bit float precission instead of 32-bit",
)
parser.add_argument(
    '--loss_scale',
    default=0,
    type=float,
    help=("Loss scaling to improve fp16 numeric stability. Only used when fp16 set to true.\n"
          "0 (default value):dynamic loss scaling.\n"
          "positive power of 2: static loss scaling value.\n"),
)

In [5]:
def train(model, data_iterator, optimizer, scheduler, params):
  """Train the model for one epoch, i.e. on `params.train_steps` batches.

  Args:
    model: the network to optimise; called with
      (input_ids, token_type_ids=None, attention_mask=..., labels=...).
    data_iterator: iterator yielding (batch_data, batch_tags) pairs.
    optimizer: optimizer stepped once per batch. When `args.fp16` is set it
      must provide a `backward(loss)` method.
    scheduler: learning-rate scheduler, stepped once at the end of the epoch.
    params: hyper-parameter object; reads `train_steps`, `n_gpu` and
      `clip_grad`.

  Note:
    Reads the module-level `args` namespace (`args.multi_gpu`, `args.fp16`),
    so `args` must be defined before this function is called.
  """
  # set model to training mode
  model.train()

  # a running average object for loss
  loss_avg = utils.RunningAverage()

  # use tqdm for progress bar
  t = trange(params.train_steps)
  for _ in t:
    # fetch the next training batch; positions with a non-zero token id are attended to
    batch_data, batch_tags = next(data_iterator)
    batch_masks = batch_data.gt(0)

    # compute model output and loss
    outputs = model(batch_data, token_type_ids=None, attention_mask=batch_masks, labels=batch_tags)
    # Accept both a bare loss tensor and a tuple / ModelOutput whose first
    # element is the loss (what `transformers` returns when labels are given).
    loss = outputs if torch.is_tensor(outputs) else outputs[0]

    if params.n_gpu > 1 and args.multi_gpu:
      loss = loss.mean()  # mean() to average on multi-gpu

    # clear previous gradients, compute gradients of all variables with respect to loss
    model.zero_grad()
    if args.fp16:
      optimizer.backward(loss)
    else:
      loss.backward()

    # gradient clipping
    nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=params.clip_grad)

    # performs update using calculated gradients
    optimizer.step()

    # update the average loss
    loss_avg.update(loss.item())
    t.set_postfix(loss='{:05.3f}'.format(loss_avg()))

  # Step the LR schedule once per epoch, after the optimizer updates; stepping
  # it first would skip the first value of the schedule.
  scheduler.step()


def train_and_evaluate(model, train_data, val_data, optimizer, scheduler, params, model_dir, restore_file=None):
  """Train the model and evaluate it on the train and validation sets every epoch.

  Args:
    model: the network to train.
    train_data: training data, passed to `data_loader.data_iterator`.
    val_data: validation data, passed to `data_loader.data_iterator`.
    optimizer: optimizer used by `train`; its state is saved in each checkpoint.
    scheduler: learning-rate scheduler forwarded to `train`.
    params: hyper-parameter object; reads `train_size`, `val_size`,
      `batch_size`, `epoch_num`, `patience`, `patience_num`, `min_epoch_num`
      and writes `train_steps`, `val_steps`, `eval_steps`.
    model_dir: directory where checkpoints are written (and read from when
      `restore_file` is given).
    restore_file: optional checkpoint name (without the '.pth.tar' suffix) to
      reload from `model_dir` before training.

  Note:
    Relies on the module-level `data_loader` object, which must exist before
    this function is called.
  """
  # reload weights from restore_file if specified
  if restore_file is not None:
    restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
    logging.info("Restoring parameters from {}".format(restore_path))
    utils.load_checkpoint(restore_path, model, optimizer)

  best_val_f1 = 0.0
  patience_counter = 0

  # number of batches in one epoch (does not change between epochs)
  params.train_steps = params.train_size // params.batch_size
  params.val_steps = params.val_size // params.batch_size

  for epoch in range(1, params.epoch_num + 1):
    # Run one epoch
    logging.info("Epoch {}/{}".format(epoch, params.epoch_num))

    # data iterator for training
    train_data_iterator = data_loader.data_iterator(train_data, shuffle=True)
    # Train for one epoch on training set
    train(model, train_data_iterator, optimizer, scheduler, params)

    # data iterators for evaluation
    train_data_iterator = data_loader.data_iterator(train_data, shuffle=False)
    val_data_iterator = data_loader.data_iterator(val_data, shuffle=False)

    # evaluate for one epoch on training set and validation set
    # NOTE(review): `evaluate` is imported as a module; this assumes it exposes
    # an `evaluate(model, data_iterator, params, mark=...)` function -- confirm.
    params.eval_steps = params.train_steps
    evaluate.evaluate(model, train_data_iterator, params, mark='Train')
    params.eval_steps = params.val_steps
    val_metrics = evaluate.evaluate(model, val_data_iterator, params, mark='val')

    val_f1 = val_metrics['f1']
    improve_f1 = val_f1 - best_val_f1

    # save weights of the network
    model_to_save = model.module if hasattr(model, 'module') else model  # only save the model it-self.
    # NOTE(review): keyword name `checkpoints` kept from the original call;
    # verify it matches the signature of utils.save_checkpoint.
    utils.save_checkpoint({'epoch': epoch + 1,
                           'state_dict': model_to_save.state_dict(),
                           'optim_dict': optimizer.state_dict()},
                          is_best=improve_f1 > 0,
                          checkpoints=model_dir)

    if improve_f1 > 0:
      logging.info("-found new best F1")
      best_val_f1 = val_f1
      # an improvement smaller than `params.patience` still counts against patience
      if improve_f1 < params.patience:
        patience_counter += 1
      else:
        patience_counter = 0
    else:
      patience_counter += 1

    # Early stopping and logging best f1 (checked every epoch, so the best
    # score is also logged when the final epoch is reached)
    if (patience_counter >= params.patience_num and epoch > params.min_epoch_num) or epoch == params.epoch_num:
      logging.info("Best val f1: {:05.2f}".format(best_val_f1))
      break


if __name__ == '__main__':
  # Script entry point: parse options, load params.json, build data/model/optimizer, then train.
  # parse_known_args() tolerates extra argv entries (a Jupyter/Colab kernel adds
  # e.g. "-f <connection-file>") that would make parse_args() exit.
  args, _unknown_args = parser.parse_known_args()

  # Load the parameters from json file
  json_path = os.path.join(args.model_dir, 'params.json')
  assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
  params = utils.Params(json_path)

  # Use GPUs if available
  params.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  params.n_gpu = torch.cuda.device_count()
  params.multi_gpu = args.multi_gpu

  # set the random seed for reproducible experiments
  random.seed(args.seed)
  torch.manual_seed(args.seed)
  if params.n_gpu > 0:
    torch.cuda.manual_seed_all(args.seed)  # set random seed for all GPUs
  params.seed = args.seed

  # set the logger
  utils.set_logger(os.path.join(args.model_dir, 'train.log'))
  logging.info("device: {}, n_gpu: {}, 16-bits training: {}".format(params.device, params.n_gpu, args.fp16))

  # Create the input data pipeline
  logging.info("Loading the datasets...")

  # Initialize the DataLoader
  # NOTE(review): the only `DataLoader` in scope is torch.utils.data.DataLoader,
  # which has no `token_pad_idx` argument and no load_data()/data_iterator()
  # methods. This call looks like it expects a project-specific loader class --
  # confirm and import the intended one.
  data_loader = DataLoader(args.data_dir, args.bert_model_dir, params, token_pad_idx=0)

  # Load training data and validation data
  train_data = data_loader.load_data('train')
  val_data = data_loader.load_data('val')

  # Specify the training and validation dataset sizes
  params.train_size = train_data['size']
  params.val_size = val_data['size']

  # Prepare model
  model = BertForTokenClassification.from_pretrained(args.bert_model_dir, num_labels=len(params.tag2idx))
  model.to(params.device)
  if args.fp16:
    model.half()

  if params.n_gpu > 1 and args.multi_gpu:
    model = torch.nn.DataParallel(model)

  # Prepare optimizer (for single- and multi-GPU runs alike).
  # Parameters are taken from the unwrapped model so that `.classifier` stays
  # reachable and names carry no "module." prefix under DataParallel.
  base_model = model.module if hasattr(model, 'module') else model
  if params.full_finetuning:
    param_optimizer = list(base_model.named_parameters())
    # NOTE(review): 'LayerNorm.weight' added on the assumption that the loaded
    # checkpoint names layer-norm parameters that way instead of gamma/beta -- confirm.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    # `weight_decay` is the per-group option torch optimizers read; the
    # previous `weight_decay_rate` key was silently ignored.
    optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
       'weight_decay': 0.001},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
       'weight_decay': 0.0},
    ]
  else:
    # only the classification head is trained
    param_optimizer = list(base_model.classifier.named_parameters())
    optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]

  def lr_lambda(epoch):
    """Multiplicative LR factor: inverse decay of 5% per epoch."""
    return 1 / (1 + 0.05 * epoch)

  if args.fp16:
    try:
      from apex.optimizers import FP16_Optimizer
      from apex.optimizers import FusedAdam
    except ImportError:
      raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    optimizer = FusedAdam(optimizer_grouped_parameters,
                          lr=params.learning_rate,
                          bias_correction=False,
                          max_grad_norm=1.0)
    # the scheduler is attached to the raw optimizer before it is wrapped
    scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)
    if args.loss_scale == 0:
      optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    else:
      optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
  else:
    optimizer = Adam(optimizer_grouped_parameters, lr=params.learning_rate)
    scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)

  # Train and evaluate the model
  logging.info("starting training for {} epoch(s)".format(params.epoch_num))
  train_and_evaluate(model, train_data, val_data, optimizer, scheduler, params, args.model_dir, args.restore_file)

NameError: ignored

In [12]:
pip install utils

Collecting utils
  Downloading https://files.pythonhosted.org/packages/55/e6/c2d2b2703e7debc8b501caae0e6f7ead148fd0faa3c8131292a599930029/utils-1.0.1-py2.py3-none-any.whl
Installing collected packages: utils
Successfully installed utils-1.0.1


In [10]:
pip install evaluate

Collecting evaluate
  Downloading https://files.pythonhosted.org/packages/90/50/0cc73b299fd941cb12d7ed39e0ccf8e18fe78dd6c16b951abe5477b3cd82/evaluate-0.0.3.tar.gz
Building wheels for collected packages: evaluate
  Building wheel for evaluate (setup.py) ... [?25l[?25hdone
  Created wheel for evaluate: filename=evaluate-0.0.3-cp36-none-any.whl size=6862 sha256=3c0b71bd1e6280372a463f1577d4c0edd6f7a718c5d17aef654cbdf6597e6b24
  Stored in directory: /root/.cache/pip/wheels/de/51/a5/ebdce3e18b99539f31d3624ed21ca88ab3841617eb82628b05
Successfully built evaluate
Installing collected packages: evaluate
Successfully installed evaluate-0.0.3


In [4]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 2.8MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 18.1MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 43.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K 