In [None]:
# Colab-only setup: mount Google Drive at /content/drive/ so that the
# "/content/drive/My Drive/..." paths used in the configuration cell resolve.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
!pip install seqeval
!pip install transformers==3.0.0
!pip install tokenizers==0.10.0

import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
from tokenizers import Tokenizer, BertWordPieceTokenizer 
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

torch.__version__

from transformers import BertForTokenClassification, AdamW
from transformers.modeling_bert import load_tf_weights_in_bert
import transformers
import logging
import argparse
import glob
import logging
import os
import random
import pdb
import json

import matplotlib.pyplot as plt
import numpy as np
import torch
# from seqeval.metrics import precision_score, recall_score, f1_score , accuracy_score
logger = logging.getLogger(__name__)

from transformers import get_linear_schedule_with_warmup

# from seqeval.metrics import f1_score, accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

from seqeval.metrics import classification_report
import os


def load_BFTC_from_TF_ckpt(bert_config, ckpt_path, model_class):
    """Instantiate ``model_class`` with weights taken from a TensorFlow BERT checkpoint.

    The checkpoint is first read into a ``transformers.BertForPreTraining``
    through ``load_tf_weights_in_bert``.  That model's state dict is then
    copied, module by module, into a freshly constructed
    ``model_class(bert_config)``.

    Args:
        bert_config: configuration object used to build both models.
        ckpt_path: path of the TensorFlow checkpoint to read.
        model_class: class to instantiate and populate (called as
            ``model_class(bert_config)``).

    Returns:
        The populated ``model_class`` instance.

    Raises:
        RuntimeError: if any module reports an error message while loading.
    """
    pretraining_model = transformers.BertForPreTraining(bert_config)
    load_tf_weights_in_bert(pretraining_model, bert_config, ckpt_path)
    weights = pretraining_model.state_dict()
    model = model_class(bert_config)

    # Map TF-style parameter names onto PyTorch ones. When a key contains both
    # substrings, the 'beta' replacement (applied to the untouched key) wins.
    renames = {}
    for name in weights.keys():
        renamed = None
        if 'gamma' in name:
            renamed = name.replace('gamma', 'weight')
        if 'beta' in name:
            renamed = name.replace('beta', 'bias')
        if renamed:
            renames[name] = renamed
    for name, renamed in renames.items():
        weights[renamed] = weights.pop(name)

    missing_keys, unexpected_keys, error_msgs = [], [], []

    # Work on a copy so _load_from_state_dict is free to modify it, carrying
    # the optional _metadata attribute over to the copy.
    metadata = getattr(weights, '_metadata', None)
    weights = weights.copy()
    if metadata is not None:
        weights._metadata = metadata

    def load(module, prefix=''):
        # Depth-first walk: load this module's own entries, then its children.
        if metadata is None:
            local_metadata = {}
        else:
            local_metadata = metadata.get(prefix[:-1], {})
        module._load_from_state_dict(
            weights, prefix, local_metadata, True,
            missing_keys, unexpected_keys, error_msgs)
        for child_name, child in module._modules.items():
            if child is not None:
                load(child, prefix + child_name + '.')

    # A target without a `bert` attribute is loaded starting from the
    # 'bert.' prefix when the checkpoint keys carry it.
    needs_bert_prefix = (not hasattr(model, 'bert')) and any(
        key.startswith('bert.') for key in weights.keys())
    load(model, prefix='bert.' if needs_bert_prefix else '')

    class_name = model.__class__.__name__
    if missing_keys:
        logger.info("Weights of {} not initialized from pretrained model: {}".format(
            class_name, missing_keys))
    if unexpected_keys:
        logger.info("Weights from pretrained model not used in {}: {}".format(
            class_name, unexpected_keys))
    if error_msgs:
        raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                           class_name, "\n\t".join(error_msgs)))
    return model


In [None]:
import sklearn.metrics

class SentenceGetter(object):
  """Read a tab-separated ``token<TAB>label`` file into sentences.

  Each non-empty line contributes one token (first field) and its label
  (second field); an empty line ends the current sentence.  Lines whose
  first field is empty are skipped.

  Attributes set by the constructor:
    data: raw lines of the file.
    sentences: list of token lists, one per sentence.
    labels: list of label lists, parallel to ``sentences``.
    n_sent: 1-based position of the sentence ``get_next`` returns next.
    empty: initialised to False; not modified by this class.
  """

  def __init__(self, file_path):
    """Parse ``file_path``.

    Raises:
      IndexError: if a line has a token but no tab-separated label.
    """
    self.n_sent = 1
    self.empty = False
    # Close the handle deterministically and decode as UTF-8 regardless of
    # the platform's default locale (the data holds non-ASCII scripts).
    with open(file_path, 'r', encoding='utf-8') as handle:
      self.data = handle.readlines()
    self.sentences = []
    self.labels = []
    curr_sent = []
    curr_lab = []
    for line in self.data:
      content = line.split("\n")[0]
      if content == '':
        # Blank line: close the sentence collected so far (if any).
        if curr_sent:
          self.sentences.append(curr_sent)
          self.labels.append(curr_lab)
          curr_lab = []
          curr_sent = []
      else:
        fields = content.split('\t')
        if fields[0]:
          curr_sent.append(fields[0])
          curr_lab.append(fields[1])
    # The file may not end with a blank line; keep the trailing sentence.
    if curr_sent:
      self.sentences.append(curr_sent)
      self.labels.append(curr_lab)

  def get_next(self):
    """Return the next unread sentence (token list), or None when exhausted."""
    try:
      sentence = self.sentences[self.n_sent - 1]
    except IndexError:
      return None
    self.n_sent += 1
    return sentence



def tagger(bert_config_path,model_ckpt,model_type,data_path,train_lang,test_langs,out_dir,do_lower_case,max_seq_len,batch_size,learning_rate,epochs,evaluation_interval,vocab_file_path="",tokenizer_path = ""):
  """Fine-tune a BERT token classifier on tagged TSV data and evaluate it periodically.

  Training sentences are read from ``<data_path>/train-<lang>.tsv`` for every
  ``lang`` in ``train_lang``; evaluation sentences come from
  ``<data_path>/test-<lang>.tsv`` for every ``lang`` in ``test_langs``.
  After every ``evaluation_interval`` epochs each test set is scored
  (loss, accuracy, micro/macro F1 are printed) and two files are written to
  ``<out_dir>/pos/classification_reports/epoch_num_<epoch>/<lang>/``:
  ``pred.csv`` (word, gold tag, predicted tag; tab separated) and
  ``report.txt`` (accuracy, confusion matrix, precision/recall/F-score).

  Args:
    bert_config_path: JSON file loaded with ``BertConfig.from_json_file``.
    model_ckpt: checkpoint handed to ``load_BFTC_from_TF_ckpt`` (tensorflow)
      or ``BertForTokenClassification.from_pretrained`` (pytorch).
    model_type: ``'tensorflow'`` or ``'pytorch'``; any other value prints a
      message and returns without training.
    data_path: directory holding the ``train-*.tsv`` / ``test-*.tsv`` files.
    train_lang: list of language suffixes for the training files.
    test_langs: list of language suffixes for the test files.
    out_dir: root directory for the evaluation outputs.
    do_lower_case: forwarded to ``BertWordPieceTokenizer`` (only used when the
      tokenizer is built from ``vocab_file_path``).
    max_seq_len: length every sequence is padded / truncated to.
    batch_size: batch size of the training and evaluation loaders.
    learning_rate: learning rate given to ``AdamW``.
    epochs: number of training epochs.
    evaluation_interval: evaluate after every this-many epochs.
    vocab_file_path: vocabulary file; used only when ``tokenizer_path`` is empty.
    tokenizer_path: tokenizer file loaded with ``Tokenizer.from_file``.

  Returns:
    None.
  """
  # An epoch stops as soon as this many batches have been drawn; the batch
  # that reaches the cap is not used for an optimizer step.
  batch_cap_per_epoch = 400

  def tokenize_and_preserve_labels(sentence, text_labels):
    """Split words into sub-word tokens; only the first piece keeps the word's label."""
    tokenized_sentence = []
    labels = []
    for word, label in zip(sentence, text_labels):
      tokenized_word = tokenizer.encode(word,add_special_tokens=False).tokens
      n_subwords = len(tokenized_word)
      # Remaining pieces get "PAD", which maps to -100 in tag2idx and is
      # filtered out when tags are collected for the metrics.
      if n_subwords > 0:
        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] + ["PAD"]*(n_subwords-1))
    assert(len(tokenized_sentence) == len(labels))
    assert(len(tokenized_sentence)>0)
    return tokenized_sentence, labels

  def encode_examples(sentences, sentence_labels):
    """Tokenise sentences and return (input_ids, attention_masks, tags) tensors."""
    tokenized_pairs = [
      tokenize_and_preserve_labels(sent, labs)
      for sent, labs in zip(sentences, sentence_labels)
    ]
    tokenized_texts = [pair[0] for pair in tokenized_pairs]
    labels = [pair[1] for pair in tokenized_pairs]
    input_ids = pad_sequences([[tokenizer.token_to_id(x) for x in txt] for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", value=0.0,
                              truncating="post", padding="post")
    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                         maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                         dtype="long", truncating="post")
    # Token id 0 is the padding value used above, so it marks masked positions.
    attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]
    return torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(tags)

  class data_preprocessed(object):
    """One evaluation set: parsed file, encoded tensors and a sequential loader."""
    def __init__(self, path,lang):
      self.path=path
      self.lang=lang
      self.getter=SentenceGetter(path)
      # Encode this file's sentences exactly once (independent of how many
      # training files there are).
      self.input_ids, self.attention_masks, self.tags = encode_examples(
        self.getter.sentences, self.getter.labels)
      print("Test Lang: ",path)
      self.data = TensorDataset(self.input_ids, self.attention_masks, self.tags)
      self.sampler = SequentialSampler(self.data)
      self.dataloader = DataLoader(self.data, sampler=self.sampler, batch_size=batch_size)

  getter_list = [SentenceGetter(data_path+'/train-'+train_lg+".tsv") for train_lg in train_lang]
  # Sorted so that the tag -> index mapping is the same on every run.
  tag_values = sorted(set(item for getter in getter_list for sublist in getter.labels for item in sublist))

  tag2idx = {t: i for i, t in enumerate(tag_values)}
  tag2idx["PAD"] = -100
  print('tag2idx\n',tag2idx)
  MAX_LEN = max_seq_len
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  n_gpu = torch.cuda.device_count()
  print('n_gpu : ',n_gpu)

  if tokenizer_path:
    print(tokenizer_path)
    tokenizer = Tokenizer.from_file(tokenizer_path)
  else:
    tokenizer = BertWordPieceTokenizer(vocab_file_path, lowercase=do_lower_case, strip_accents=False)

  input_ids_train, attention_masks_train, tags_train = encode_examples(
    [sent for getter in getter_list for sent in getter.sentences],
    [labs for getter in getter_list for labs in getter.labels])
  print("Train Lang: ",data_path+'/train-'+train_lang[0]+".tsv")

  train_data = TensorDataset(input_ids_train, attention_masks_train, tags_train)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

  bert_config = BertConfig.from_json_file(bert_config_path)
  # "PAD" is an ignore marker, not a class, hence the -1.
  bert_config.num_labels=len(tag2idx)-1
  test_data_objs=[]
  for lang in test_langs:
    path=data_path+'/test-'+lang+".tsv"
    test_data_objs.append(data_preprocessed(path,lang))

  if(model_type=='tensorflow'):
    model=load_BFTC_from_TF_ckpt(bert_config, model_ckpt, BertForTokenClassification)
  elif(model_type=='pytorch'):
    model = BertForTokenClassification.from_pretrained(model_ckpt, config=bert_config)
  else:
    print("enter correct model type")
    return
  print("Model successfully loaded : ",model_ckpt)
  # Use the detected device so the function also runs without a GPU.
  model.to(device)

  FULL_FINETUNING = True
  if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # AdamW reads the per-group key 'weight_decay'. PyTorch LayerNorm
    # parameters are named '...LayerNorm.weight' / '...LayerNorm.bias';
    # 'gamma' / 'beta' are kept for models that still use the old names.
    no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
    optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
       'weight_decay': 0.01},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
       'weight_decay': 0.0}
    ]
  else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
  optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=learning_rate,
    eps=1e-8
  )

  max_grad_norm = 1.0
  # NOTE(review): the schedule length assumes every batch is used in every
  # epoch; with the per-epoch batch cap fewer optimizer steps may run, so the
  # learning rate may not decay all the way to zero.
  total_steps = len(train_dataloader) * epochs
  scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
  )
  # Per-epoch training losses and per-evaluation losses; collected here but
  # not read again inside this function.
  loss_values, validation_loss_values = [], []
  eval_intervals_left=evaluation_interval

  for epoch_num in range(epochs):
    # ========================================
    #               Training
    # ========================================
    eval_intervals_left-=1
    model.train()
    total_loss = 0
    steps_done = 0
    print("Epoch number: ",epoch_num)
    print("num training batches:" ,len(train_dataloader))
    for step, batch in enumerate(train_dataloader):
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_input_mask, b_labels = batch

      # Clear gradients left over from the previous step.
      model.zero_grad()
      # With `labels` supplied, the first output is the loss.
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask, labels=b_labels)
      loss = outputs[0]
      if(step%20==0):
        print(step,str(loss.item()))

      if ((step+1) % batch_cap_per_epoch == 0):
        break

      loss.backward()
      total_loss += loss.item()
      # Clip the gradient norm to guard against exploding gradients.
      torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
      optimizer.step()
      scheduler.step()
      steps_done += 1

    # Average over the batches that actually contributed to total_loss
    # (the batch cap can end the epoch early).
    avg_train_loss = total_loss / max(steps_done, 1)
    print("Average train loss: {}".format(avg_train_loss))
    loss_values.append(avg_train_loss)

    # ========================================
    #               Validation
    # ========================================
    if(eval_intervals_left!=0):
      continue
    eval_intervals_left=evaluation_interval
    for obj in test_data_objs:
      model.eval()
      eval_loss = 0
      predictions , true_labels, token_ids = [], [], []

      for batch in obj.dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation.
        with torch.no_grad():
          # Labels are supplied, so outputs[0] is the loss and outputs[1] the logits.
          outputs = model(b_input_ids, token_type_ids=None,
                          attention_mask=b_input_mask, labels=b_labels)
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        input_ids = b_input_ids.to('cpu').numpy()

        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)
        token_ids.extend(input_ids)

      eval_loss = eval_loss / len(obj.dataloader)
      validation_loss_values.append(eval_loss)
      print("Validation loss for",obj.path," : {}".format(eval_loss))
      # Positions labelled -100 (sub-word continuations and padding) are skipped.
      pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                    for p_i, l_i in zip(p, l) if l_i != -100]
      valid_tags = [tag_values[l_i] for l in true_labels
                                    for l_i in l if l_i != -100]

      print("Validation Accuracy: {}".format(accuracy_score(valid_tags,pred_tags )))
      # First line is the micro average, second the macro average.
      print("Validation F1-Score: {}".format(f1_score(valid_tags,pred_tags,average='micro')))
      print("Validation F1-Score: {}".format(f1_score(valid_tags,pred_tags,average='macro')))

      # Regroup sub-word ids into words: a labelled position starts a new
      # word, -100 positions continue it, id 0 (padding) is ignored.
      combined=[]
      for labels,ids in zip(true_labels,token_ids):
        curr_word = []
        for label,token_id in zip(labels,ids):
          if not token_id:
            continue
          if label != -100 and curr_word:
            combined.append(curr_word)
            curr_word = []
          curr_word.append(int(token_id))
        if curr_word:
          combined.append(curr_word)

      # Create the whole output hierarchy in one call; an existing directory
      # is fine, any real failure (e.g. permissions) is raised.
      report_dir = os.path.join(out_dir, 'pos', 'classification_reports',
                                'epoch_num_'+str(epoch_num), obj.lang)
      os.makedirs(report_dir, exist_ok=True)

      with open(os.path.join(report_dir, 'pred.csv'),'w', encoding='utf-8') as f:
        for i, word_ids in enumerate(combined):
          curr_word = tokenizer.decode(word_ids)
          f.write(curr_word+"\t"+valid_tags[i]+"\t"+pred_tags[i]+"\n")

      with open(os.path.join(report_dir, 'report.txt'),'w', encoding='utf-8') as f:
        print(str(tag2idx)+'\n')
        # Each section ends with a newline so the report stays readable.
        f.write("Validation Accuracy: {}".format(accuracy_score(valid_tags,pred_tags))+"\n")
        f.write(str(sklearn.metrics.confusion_matrix(valid_tags, pred_tags))+"\n")
        print('\n')
        f.write(str(precision_recall_fscore_support(valid_tags, pred_tags))+"\n")
      print("\n")




In [None]:
# Run configuration: the names defined in this cell are the arguments handed to tagger() in the next cell.
do_lower_case=False ## Note: keep it False when any Indian language or script of Indian origin is involved.
max_seq_len=128 ## Depends on the model architecture
batch_size=32
learning_rate=2e-5
epochs=8 ## Number of passes over Training Data
evaluation_interval=1 ## Evaluate after every evaluation_interval epochs.
                      ## Final model selection is done manually, based on the accuracy score on the validation data of the training language.

bert_config_path="" ## BERT Config path.              Sample : "/content/drive/My Drive/btp/bert_increased_vocab/hindi_bert/bpe_hin_mar_config_15k.json"
vocab_file_path="" ## Vocab File for model.          Sample : "/content/drive/My Drive/Experiments/uncased_L-12_H-768_A-12/vocab.txt"
## Tokenizer file loaded by tagger() with Tokenizer.from_file; when it is empty, tagger() builds a BertWordPieceTokenizer instead.
tokenizer_path="/content/drive/My Drive/btp/bert_increased_vocab/hindi_bert/vocabs/docsampled_hi_gu_multibpe_0.5_15k.json"
model_ckpt="" ## Either Pytorch or Tensorflow
              ## Tensorflow Sample : "/content/drive/My Drive/btp/bert_increased_vocab/hindi_bert/jun_expts/MultiLang/bpe/docsampled_hi_gu_multibpe_0.5_15k_added_dict/model.ckpt-64000"
              ## Pytorch    Sample : "/content/drive/My Drive/Experiments/English_RelateLM_Bengali/model.pt"
model_type='' ## 'tensorflow' or 'pytorch'
data_path="" ## Directory containing the train-<lang>.tsv / test-<lang>.tsv data files. Sample: "/content/drive/My Drive/btp/ai4bharat_pos/"
train_lang=['hi'] ## Language suffixes of the training files (train-<lang>.tsv)
test_langs=['hi','gu_t13n_hi_15k_dict_added'] ## Language suffixes of the test files (test-<lang>.tsv)
out_dir="" # Root directory for evaluation outputs (pred.csv and report.txt are written under pos/classification_reports/). Sample : "/content/drive/My Drive/btp/bert_increased_vocab/hindi_bert/jun_expts/MultiLang/bpe/docsampled_hi_gu_multibpe_0.5_15k_added_dict/POS_run1"


In [None]:
# Launch fine-tuning and evaluation with the configuration defined above.
# vocab_file_path is forwarded as well: tagger() falls back to it when
# tokenizer_path is empty, so the configured value must not be dropped.
tagger(
    bert_config_path,
    model_ckpt,
    model_type,
    data_path,
    train_lang,
    test_langs,
    out_dir,
    do_lower_case,
    max_seq_len,
    batch_size,
    learning_rate,
    epochs,
    evaluation_interval,
    vocab_file_path=vocab_file_path,
    tokenizer_path=tokenizer_path,
)