In [1]:
import glob
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, BertTokenizer

from prepare_invoice_ner_dataset import label_idx_dict
from prepare_invoice_ner_dataset import split_tokenize_label_dataset, split_tokenize_label_file, form_input

In [2]:
# Paths of the train / validation / test split files.
train_split_path = './split/train.txt'
val_split_path = './split/val.txt'
test_split_path = './split/test.txt'

# Lines of the training split file (presumably one data-file path per line).
with open(train_split_path, "r", encoding="utf-8") as f:
    train_file_path_list = f.read().splitlines()

# Directory holding a local copy of the BERT files the tokenizer is built from.
# AutoTokenizer is imported in the notebook's import cell.
bert_path = '../kaggle_ner/huggingface-bert/bert-base-uncased/'
tokenizer = AutoTokenizer.from_pretrained(bert_path, do_lower_case=True)

In [3]:
# Run-wide settings read by the data-preparation, training and evaluation cells.
config = {
    # Second dimension of the prediction / label arrays built in eval_fn.
    'MAX_LEN': 128,
    'tokenizer': tokenizer,
    'batch_size': 32,
    # Number of train + eval passes executed by train_engine.
    'Epoch': 3,
    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    # File that train_engine hands to torch.save for the best checkpoint.
    'model_name': 'model1_bert_base_uncased_3_epochs.bin',
}

In [4]:
# Unpack the four values returned by split_tokenize_label_dataset for the
# training split file.
(
    final_train_split_and_tokenized_file_list,
    final_train_split_and_tokenized_labels,
    final_train_split_word_id_list,
    final_train_split_token_ids_list,
) = split_tokenize_label_dataset(train_split_path, tokenizer)

# Same call for the validation split file.
(
    final_val_split_and_tokenized_file_list,
    final_val_split_and_tokenized_labels,
    final_val_split_word_id_list,
    final_val_split_token_ids_list,
) = split_tokenize_label_dataset(val_split_path, tokenizer)


In [5]:
# Wrap the tokenized training data for the DataLoader.
# Note the argument order: token ids come before word ids here.
train_prod_input = form_input(
    final_train_split_and_tokenized_file_list,
    final_train_split_and_tokenized_labels,
    final_train_split_token_ids_list,
    final_train_split_word_id_list,
    config,
    data_type='train',
)

# NOTE(review): the validation data is also built with data_type='train' —
# presumably so that labels are attached; confirm against form_input.
val_prod_input = form_input(
    final_val_split_and_tokenized_file_list,
    final_val_split_and_tokenized_labels,
    final_val_split_token_ids_list,
    final_val_split_word_id_list,
    config,
    data_type='train',
)

In [6]:
# Training batches are reshuffled every epoch.
train_prod_input_data_loader = DataLoader(
    train_prod_input,
    batch_size=config['batch_size'],
    shuffle=True,
)

# Validation is iterated in dataset order: shuffling adds nothing to the
# evaluation loss and would make the rows returned by eval_fn impossible to
# map back to the samples in val_prod_input.
val_prod_input_data_loader = DataLoader(
    val_prod_input,
    batch_size=config['batch_size'],
    shuffle=False,
)

In [7]:
def train_fn(data_loader, model, optimizer):
    '''
    Run one training pass over ``data_loader`` and return the summed loss.

    Parameters
    ----------
    data_loader : iterable of dict
        Every batch must provide tensors under the keys 'ids', 'att_mask'
        and 'target' (assumed to be (batch, seq_len) — TODO confirm against
        form_input). Token type ids are not passed to the model.
    model : torch.nn.Module
        Called as ``model(ids, token_type_ids=None, attention_mask=...,
        labels=...)``; element 0 of its output is used as the loss.
    optimizer : torch.optim.Optimizer
        Optimizer that updates the model's parameters after each batch.

    Returns
    -------
    torch.Tensor
        0-dim tensor on ``config['device']`` holding the sum of all step
        losses; zero when ``data_loader`` yields no batches.
    '''
    print("Training phase")

    # eval_fn switches the model to eval mode; without this call every epoch
    # after the first would train with train-only layers (e.g. dropout) off.
    model.train()

    device = config['device']
    train_loss = torch.zeros((), device=device)

    for index, dataset in enumerate(data_loader):
        batch_input_ids = dataset['ids'].to(device, dtype=torch.long)
        batch_att_mask = dataset['att_mask'].to(device, dtype=torch.long)
        batch_target = dataset['target'].to(device, dtype=torch.long)

        output = model(batch_input_ids,
                       token_type_ids=None,
                       attention_mask=batch_att_mask,
                       labels=batch_target)

        # May hold more than one value (e.g. one loss per replica when the
        # model is wrapped in nn.DataParallel), hence the .sum() calls below.
        step_loss = output[0]

        if (index + 1) % 10 == 0:
            print("Step {}, train loss {}".format(index+1, step_loss))

        optimizer.zero_grad()
        step_loss.sum().backward()
        optimizer.step()

        # Detach so the running total does not stay attached to autograd.
        train_loss += step_loss.detach().sum()

    return train_loss


def eval_fn(data_loader, model):
    '''
    Evaluate ``model`` on every batch of ``data_loader`` without gradients.

    The model is switched to eval mode and left in that mode on return.

    Parameters
    ----------
    data_loader : iterable of dict
        Every batch must provide tensors under the keys 'ids', 'att_mask'
        and 'target'. The sequence dimension must equal
        ``config['MAX_LEN']``, otherwise the final concatenation fails.
    model : torch.nn.Module
        Called as ``model(ids, token_type_ids=None, attention_mask=...,
        labels=...)``; output[0] is used as the loss and output[1] as
        per-token class scores with the classes on axis 2.

    Returns
    -------
    eval_loss : torch.Tensor
        0-dim tensor on ``config['device']`` with the summed step losses;
        zero when ``data_loader`` yields no batches.
    predictions : numpy.ndarray
        Integer array of shape (n_samples, MAX_LEN) with the argmax class
        index per token.
    true_labels : numpy.ndarray
        Integer array of shape (n_samples, MAX_LEN) with the target labels.
    '''
    model.eval()

    device = config['device']
    eval_loss = torch.zeros((), device=device)

    # Collect per-batch arrays and concatenate once at the end; concatenating
    # inside the loop would re-copy all previous results on every batch.
    prediction_chunks = []
    label_chunks = []

    print("Evaluation phase")
    with torch.no_grad():
        for index, dataset in enumerate(data_loader):
            batch_input_ids = dataset['ids'].to(device, dtype=torch.long)
            batch_att_mask = dataset['att_mask'].to(device, dtype=torch.long)
            batch_target = dataset['target'].to(device, dtype=torch.long)

            output = model(batch_input_ids,
                           token_type_ids=None,
                           attention_mask=batch_att_mask,
                           labels=batch_target)

            step_loss = output[0]
            scores = output[1]

            if (index + 1) % 10 == 0:
                print("Step {}, eval loss {}".format(index+1, step_loss))

            eval_loss += step_loss.sum()

            prediction_chunks.append(np.argmax(scores.detach().to('cpu').numpy(), axis=2))
            label_chunks.append(batch_target.to('cpu').numpy())

    # The empty seed keeps the (0, MAX_LEN) int64 result for an empty loader
    # and enforces that every batch has MAX_LEN columns.
    seed = np.empty((0, config['MAX_LEN']), dtype=np.int64)
    predictions = np.concatenate([seed] + prediction_chunks, axis=0)
    true_labels = np.concatenate([seed] + label_chunks, axis=0)

    return eval_loss, predictions, true_labels

In [8]:
def train_engine(epoch, train_data, valid_data):
    '''
    Fine-tune a BERT token-classification model and checkpoint the best epoch.

    A ``BertForTokenClassification`` model is created from the
    'bert-base-uncased' checkpoint with one output class per entry of
    ``label_idx_dict``, wrapped in ``nn.DataParallel`` and moved to
    ``config['device']``. Each epoch runs ``train_fn`` followed by
    ``eval_fn``; whenever the evaluation loss improves, the state dict is
    written to ``config['model_name']``.

    Parameters
    ----------
    epoch : int
        Number of train + eval passes to run.
    train_data : iterable
        Batches handed to ``train_fn``.
    valid_data : iterable
        Batches handed to ``eval_fn``.

    Returns
    -------
    model : torch.nn.DataParallel
        The model after the LAST epoch, which is not necessarily the
        checkpoint saved to disk.
    eval_predictions, true_labels
        Outputs of ``eval_fn`` for the last epoch, or ``None`` for both when
        ``epoch`` is 0.
    '''
    # NOTE(review): the tokenizer is loaded from a local directory while the
    # weights are loaded by hub name — presumably the same checkpoint; confirm.
    model = transformers.BertForTokenClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(label_idx_dict),
    )
    model = nn.DataParallel(model)
    model = model.to(config['device'])

    optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

    # float('inf') guarantees the first finite evaluation loss is saved.
    best_eval_loss = float('inf')

    # Defined up front so the return statement also works when epoch == 0.
    eval_predictions, true_labels = None, None

    for _ in range(epoch):
        train_fn(data_loader=train_data,
                 model=model,
                 optimizer=optimizer)
        eval_loss, eval_predictions, true_labels = eval_fn(data_loader=valid_data,
                                                           model=model)

        # Compare plain floats so no tensor is kept alive across epochs.
        eval_loss = float(eval_loss)

        if eval_loss < best_eval_loss:
            best_eval_loss = eval_loss

            print("Saving the model")
            # NOTE(review): this saves the DataParallel wrapper's state dict,
            # so keys carry the 'module.' prefix; load it into a wrapped model
            # or strip the prefix.
            torch.save(model.state_dict(), config['model_name'])

    return model, eval_predictions, true_labels

In [9]:
# Train for the configured number of epochs; keeps the final model together
# with the last epoch's validation predictions and labels.
model, val_predictions, val_true_labels = train_engine(
    epoch=config['Epoch'],
    train_data=train_prod_input_data_loader,
    valid_data=val_prod_input_data_loader,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

Training phase
Step 10, train loss 0.8367651700973511
Step 20, train loss 0.43318334221839905
Step 30, train loss 0.3518441617488861
Step 40, train loss 0.2847069799900055
Step 50, train loss 0.2896549701690674
Step 60, train loss 0.19489172101020813
Step 70, train loss 0.13789771497249603
Step 80, train loss 0.13461090624332428
Step 90, train loss 0.04147884249687195
Evaluation phase
Step 10, train loss 0.05112822353839874
Step 20, train loss 0.05208927392959595
Step 30, train loss 0.091541588306427
Saving the model
Training phase
Step 10, train loss 0.03879745304584503
Step 20, train loss 0.06278297305107117
Step 30, train loss 0.05111193656921387
Step 40, train loss 0.03180626407265663
Step 50, train loss 0.03399399667978287
Step 60, train loss 0.019001971930265427
Step 70, train loss 0.03240441903471947
Step 80, train loss 0.018828539177775383
Step 90, train loss 0.015408680774271488
Evaluation phase
Step 10, train loss 0.06105958670377731
Step 20, train loss 0.015644501894712448
S