In [1]:
import argparse
import random
import logging
import os

import torch
from torch.optim import Adam, AdamW
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

from tqdm import trange
from transformers import BertTokenizer, BertModel, BertForTokenClassification, BertConfig
#import pytorch_pretrained_bert

from torch.optim.lr_scheduler import LambdaLR

In [2]:
%run ./data_loader.ipynb

In [3]:
%run ./metrics.ipynb

In [4]:
%run ./options.ipynb

In [5]:
%run ./utils.ipynb

In [6]:
%run ./model.ipynb

In [7]:
def make_bert_mask(x, pad_id):
    """Build a float attention mask from a batch of token ids.

    Every position whose id differs from `pad_id` becomes 1.0 and every
    position equal to `pad_id` becomes 0.0.

    Args:
        x: tensor of token ids (anything supporting `!=` whose comparison
            result has a `.float()` method).
        pad_id: id that marks padding positions.

    Returns:
        The element-wise comparison `x != pad_id`, cast to float.
    """
    is_real_token = x != pad_id
    return is_real_token.float()

In [8]:
def train(model, data_iterator, optimizer, scheduler, args):
    """Run `args.train_steps` optimisation steps and return the per-step losses.

    Args:
        model: network called as
            `model(input_ids=..., attention_mask=..., labels=...)`; element 0
            of its output is used as the loss.
        data_iterator: iterator yielding `(batch_data, batch_tags)` pairs; it
            is advanced once per step with `next()`.
        optimizer: optimizer that is zeroed and stepped on every batch.
        scheduler: accepted for interface compatibility; it is not stepped
            inside this function.
        args: namespace providing `train_steps`.

    Returns:
        list of float: the loss of every step, in execution order.
    """
    # set model to training mode
    model.train()

    # The forward pass is run on the wrapped module when the model is a
    # DataParallel instance. The check does not depend on the batch, so it is
    # done once instead of on every step.
    if isinstance(model, nn.DataParallel):
        model = model.module

    # a running average object for loss
    loss_avg = RunningAverage()
    loss_list = []

    # Use tqdm for progress bar
    progress = trange(args.train_steps)
    for _ in progress:
        # fetch the next training batch; id 0 is treated as padding
        batch_data, batch_tags = next(data_iterator)
        batch_masks = make_bert_mask(batch_data, 0)

        outputs = model(input_ids=batch_data,
                        attention_mask=batch_masks,
                        labels=batch_tags)

        # reduce the loss to a scalar (a no-op when it already is one)
        loss = outputs[0].mean()

        # clear previous gradients, compute gradients of all variables wrt loss
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping
        nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)

        # performs updates using calculated gradients
        optimizer.step()

        # read the loss back to the host once and reuse the value
        loss_value = loss.item()
        loss_avg.update(loss_value)
        loss_list.append(loss_value)
        progress.set_postfix(loss='{:05.3f}'.format(loss_avg()))

    return loss_list

In [9]:
def evaluate(model, data_iterator, params, mark='Eval', verbose=False):
    """Evaluate the model on `params.eval_steps` batches.

    Args:
        model: network called as
            `model(input_ids=..., attention_mask=..., labels=...)`; element 0
            of its output is used as the loss and element 1 as the logits.
            For the 'crf' decoder the model must expose `crf.decode`.
        data_iterator: iterator yielding `(batch_data, batch_tags)` pairs; it
            is advanced once per step with `next()`.
        params: namespace providing `idx2tag`, `eval_steps` and `decoder`.
        mark: label printed in front of the metrics line.
        verbose: when true, the classification report is printed as well.

    Returns:
        tuple: `(metrics, rp)` where `metrics` is a dict with the keys
        'loss', 'f1', 'prec' and 'rec', and `rp` is the value returned by
        `classification_report(true_tags, pred_tags)`.

    Raises:
        ValueError: if `params.decoder` is neither 'crf' nor 'linear'.
    """
    # set model to evaluation mode
    model.eval()

    # Run the forward pass on the wrapped module when the model is a
    # DataParallel instance (loop-invariant, so checked once).
    if isinstance(model, nn.DataParallel):
        model = model.module

    idx2tag = params.idx2tag

    true_tags = []
    pred_tags = []

    # a running average object for loss
    loss_avg = RunningAverage()

    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every forward pass.
    with torch.no_grad():
        for _ in range(params.eval_steps):
            # fetch the next evaluation batch; id 0 is treated as padding
            batch_data, batch_tags = next(data_iterator)
            batch_masks = make_bert_mask(batch_data, 0)

            outputs = model(input_ids=batch_data,
                            attention_mask=batch_masks,
                            labels=batch_tags)
            batch_loss, logits = outputs[0], outputs[1]

            if params.decoder == 'crf':
                tags = model.crf.decode(logits, batch_masks)
                batch_output = torch.squeeze(tags, 0).tolist()
            elif params.decoder == 'linear':
                # highest-scoring tag index along the last (label) axis
                batch_output = logits.argmax(dim=2).tolist()
            else:
                # Fail with a clear message instead of an unbound-variable
                # error further down.
                raise ValueError(
                    "evaluate() supports decoder 'crf' or 'linear', got {!r}".format(params.decoder))

            loss_avg.update(batch_loss.mean().item())

            # Tags are collected for every position of every sequence.
            pred_tags.extend(idx2tag.get(idx) for indices in batch_output for idx in indices)
            true_tags.extend(idx2tag.get(idx) for indices in batch_tags.tolist() for idx in indices)

    assert len(pred_tags) == len(true_tags)

    # logging loss, f1 and report
    p, r, f1 = eval_score(true_tags, pred_tags)
    metrics = {
        'loss': loss_avg(),
        'f1': f1,
        'prec': p,
        'rec': r,
    }
    rp = classification_report(true_tags, pred_tags)
    metrics_str = "; ".join("{}: {:05.2f}".format(k, v) for k, v in metrics.items())
    print("- {} metrics: ".format(mark) + metrics_str)

    if verbose:
        # the report was already computed above; reuse it
        print(rp)
    return metrics, rp


In [10]:
def train_and_evaluate_and_test(model, train_data, val_data, test_data, optimizer, scheduler, args, model_dir, restore_file=None):
    """Train the model and evaluate on the train, val and test sets every epoch.

    After each epoch a checkpoint is written to `model_dir`, and the metrics
    of the best epoch so far are remembered. The best scores are printed when
    training stops.

    Args:
        model: network to train; may be wrapped (its `.module` is saved then).
        train_data, val_data, test_data: datasets passed to
            `data_loader.data_iterator`. `data_loader` is not a parameter; it
            is resolved from the surrounding (notebook) namespace.
        optimizer: optimizer passed to `train` and stored in checkpoints.
        scheduler: forwarded to `train`.
        args: namespace providing `n_epochs`, `batch_size`, `train_size`,
            `val_size`, `test_size`, `fp16`, `patience` and `patience_num`.
            An optional `min_epoch_num` sets the first epoch after which early
            stopping may trigger; when absent it defaults to `n_epochs`, i.e.
            training always runs for all epochs. The attributes
            `epoch_record`, `train_steps`, `val_steps`, `test_steps` and
            `eval_steps` are written by this function.
        model_dir: directory used for checkpoints and for `restore_file`.
        restore_file: checkpoint name without the '.pth' suffix to restore
            before training, or None to start from the current weights.
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        # use the function's own parameters rather than re-reading them from args
        restore_path = os.path.join(model_dir, restore_file + '.pth')
        print("Restoring parameters from {}".format(restore_path))
        load_checkpoint(restore_path, model, optimizer)

    best_val_f1 = 0.0
    best_test_f1 = 0.0
    patience_counter = 0
    # Stay None until some epoch improves F1, so the final report can tell
    # "no improvement at all" apart from a real best score.
    best_val = None
    best_test = None
    best_rp = None

    # Early stopping is only allowed after this many epochs.
    min_epochs = getattr(args, 'min_epoch_num', args.n_epochs)

    # Number of batches in one epoch (same for every epoch)
    args.train_steps = args.train_size // args.batch_size
    args.val_steps = args.val_size // args.batch_size
    args.test_steps = args.test_size // args.batch_size

    for epoch in range(1, args.n_epochs + 1):
        # Run one epoch
        args.epoch_record = epoch
        print("Epoch {}/{}".format(epoch, args.n_epochs))

        # Train for one epoch on the shuffled training set
        train_data_iterator = data_loader.data_iterator(train_data, shuffle=True)
        train(model, train_data_iterator, optimizer, scheduler, args)

        # data iterators for evaluation
        train_data_iterator = data_loader.data_iterator(train_data, shuffle=False)
        val_data_iterator = data_loader.data_iterator(val_data, shuffle=False)
        test_data_iterator = data_loader.data_iterator(test_data, shuffle=False)

        # evaluate() reads its step count from args.eval_steps, so it is set
        # before each call.
        args.eval_steps = args.train_steps
        train_metrics, train_rp = evaluate(model, train_data_iterator, args, mark='Train')
        args.eval_steps = args.val_steps
        val_metrics, val_rp = evaluate(model, val_data_iterator, args, mark='Val')
        args.eval_steps = args.test_steps
        test_metrics, test_rp = evaluate(model, test_data_iterator, args, mark='Test')

        # NOTE(review): the best epoch is chosen by *test* F1, which makes the
        # reported test score optimistic; selecting on validation F1 would
        # avoid that. Kept as is because it changes which checkpoint is
        # marked best -- confirm the intent.
        improve_f1 = test_metrics['f1'] - best_test_f1

        # Save weights of the network
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        optimizer_to_save = optimizer.optimizer if args.fp16 else optimizer
        save_checkpoint({'epoch': epoch + 1,
                         'state_dict': model_to_save.state_dict(),
                         'optim_dict': optimizer_to_save.state_dict()},
                        is_best=improve_f1 > 0,
                        checkpoint=model_dir)

        if improve_f1 > 0:
            print("- Found new best F1")
            best_val_f1 = val_metrics['f1']
            best_test_f1 = test_metrics['f1']
            best_val = "; ".join("{}: {:05.2f}".format(k, v) for k, v in val_metrics.items())
            best_test = "; ".join("{}: {:05.2f}".format(k, v) for k, v in test_metrics.items())
            best_rp = test_rp
            # an improvement smaller than args.patience still counts against patience
            if improve_f1 < args.patience:
                patience_counter += 1
            else:
                patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping and logging best f1
        out_of_patience = patience_counter >= args.patience_num and epoch > min_epochs
        if out_of_patience or epoch == args.n_epochs:
            if best_val is None:
                # no epoch ever improved F1, so there is nothing to report
                print("No epoch improved F1; no best score to report")
            else:
                print("Best val score - eval metrics: " + best_val)
                print("Best test score - test metrics: " + best_test)
                print(best_rp)
            break

In [None]:
# Entry point: parse options, load the data, build the model and optimizer,
# then run training with per-epoch evaluation.
if __name__ == '__main__':
  args = args_parser()

  args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  args.gpus = torch.cuda.device_count()

  # Seed the Python and torch random number generators
  random.seed(args.seed)
  torch.manual_seed(args.seed)
  if args.gpus > 0:
    torch.cuda.manual_seed_all(args.seed)  # set random seed for all GPUs

  print("device: {}, gpus: {}".format(args.device, args.gpus))

  # Create the input data pipeline
  print("Loading the datasets...")

  # Initialize the DataLoader. train_and_evaluate_and_test() looks this name
  # up in the global namespace, so it must stay called `data_loader`.
  data_loader = DataLoader(args.data_dir, args.bert_model_dir, args, token_pad_idx=0)

  # Load training data and test data
  train_data = data_loader.load_data('train')
  val_data = data_loader.load_data('dev')
  test_data = data_loader.load_data('test')

  args.train_size = train_data['size']
  args.val_size = val_data['size']
  args.test_size = test_data['size']

  # Model configuration
  config = BertConfig.from_pretrained(args.pretrained_model, num_labels=len(args.tag2idx))
  if args.pretrain == True:
    config.pretrain = args.pretrain
    config.pretrain_path = args.pretrain_path
  config.loss_type = args.loss_type

  # Pick the model class from the decoder type and the pretrain flag
  if args.decoder == 'linear':
    if args.pretrain == False:
      model = BERT_Linear(config)
    else:
      model = BERT_Linear_pre(config)
  elif args.decoder == 'crf':
    if args.pretrain == False:
      model = BERT_CRF(config)
    else:
      model = BERT_CRF_pre(config)
  elif args.decoder == 'lan':
    config.drop_rate = 0
    config.head_num = 1
    model = BERT_LAN(config)
  else:
    raise RuntimeError("wrong model name")
  model.to(args.device)

  # DataParallel does not expose the wrapped model's attributes (bert,
  # classifier, ...), so keep a handle on the bare model for building the
  # optimizer parameter groups.
  base_model = model
  if args.gpus > 1:
    model = torch.nn.DataParallel(model)

  # Prepare optimizer. These two values are fixed here and override whatever
  # args_parser() may have produced for them.
  args.lr_layer_decay = 1.0
  args.weight_decay = 0

  # Parameters of the tagging head. For the 'crf' decoder the CRF parameters
  # are part of the head; without a group for them they would never be updated.
  head_groups = [{'params': base_model.classifier.parameters(), 'lr': args.lr * args.lr_layer_decay ** 0}]
  if args.decoder == 'crf':
    head_groups.append({'params': base_model.crf.parameters(), 'lr': args.lr * args.lr_layer_decay ** 0})

  if args.full_finetuning:
    # Layer-wise learning rates: the exponent of lr_layer_decay grows from the
    # head (0) down to the embeddings (num_layers + 2).
    num_layers = len(base_model.bert.encoder.layer)
    bert_groups = [{'params': base_model.bert.embeddings.parameters(),
                    'lr': args.lr * args.lr_layer_decay ** (num_layers + 2)}]
    bert_groups += [{'params': layer.parameters(),
                     'lr': args.lr * args.lr_layer_decay ** (num_layers + 1 - index)}
                    for index, layer in enumerate(base_model.bert.encoder.layer)]
    bert_groups.append({'params': base_model.bert.pooler.parameters(),
                        'lr': args.lr * args.lr_layer_decay ** 1})
    optimizer = AdamW(bert_groups + head_groups, weight_decay=args.weight_decay)
  else:
    # Only the tagging head is handed to the optimizer.
    optimizer = AdamW(head_groups, lr=args.lr, weight_decay=args.weight_decay)

  # No learning-rate scheduler is used.
  scheduler = None

  # Train and evaluate the model
  print("Starting training for {} epoch(s)".format(args.n_epochs))
  train_and_evaluate_and_test(model, train_data, val_data, test_data, optimizer, scheduler, args, args.model_dir, args.restore_file)

device: cuda, gpus: 1
Loading the datasets...
Starting training for 10 epoch(s)
Epoch 1/10


100%|██████████| 17206/17206 [43:24<00:00,  6.61it/s, loss=0.497]


- Train metrics: loss: 00.43; f1: 49.76; prec: 89.84; rec: 34.40
- Val metrics: loss: 00.46; f1: 48.13; prec: 87.93; rec: 33.14
- Test metrics: loss: 03.14; f1: 07.36; prec: 32.39; rec: 04.15
Checkpoint Directory does not exist! Making directory /mnt/batch/tasks/shared/LS_root/mounts/clusters/nergpu/code/Users/Wei/model/linear_pretrain_ce
- Found new best F1
Epoch 2/10


100%|██████████| 17206/17206 [42:49<00:00,  6.70it/s, loss=0.429]


- Train metrics: loss: 00.37; f1: 57.19; prec: 87.74; rec: 42.42
- Val metrics: loss: 00.47; f1: 51.89; prec: 82.14; rec: 37.92
- Test metrics: loss: 03.42; f1: 08.76; prec: 32.16; rec: 05.07
- Found new best F1
Epoch 3/10


100%|██████████| 17206/17206 [42:50<00:00,  6.69it/s, loss=0.383]


- Train metrics: loss: 00.32; f1: 63.93; prec: 86.01; rec: 50.86
- Val metrics: loss: 00.51; f1: 52.84; prec: 75.29; rec: 40.71
- Test metrics: loss: 03.76; f1: 10.13; prec: 32.67; rec: 05.99
- Found new best F1
Epoch 4/10


 30%|██▉       | 5144/17206 [12:49<30:00,  6.70it/s, loss=0.350]