In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, classification_report

from transformers import BertModel,BertTokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import torch
import torch.nn as nn
import transformers

import pandas as pd
import numpy as np

import more_itertools as mit
from tqdm import tqdm
import joblib

In [2]:
# Configurations

# Maximmum sequence length (we know from data exploration, longest sentence has 76 tokens in it)
MAX_LEN = 80

# Training batch size
TRAIN_BATCH_SIZE = 8

# Validation batch size
VAL_BATCH_SIZE = 4

# Number of epochs to train the model
EPOCHS = 10

# Model path to use the pre trained model
BASE_MODEL_PATH = "./BERT/bert_base_uncase"

# Path to save the trained model
MODEL_PATH = "model_BERT_ENG.bin"

# File to read the data for tarining purpose
TRAINING_FILE = "./data/engtrain.bio"

# Bert Tokenizer
TOKENIZER = BertTokenizer.from_pretrained(BASE_MODEL_PATH)

In [3]:
# Data set class whose object will be passed as an argument to generator to generate the batches of data set for training and validation purpose

class EntityDataset:
    def __init__(self, texts, tags):
        self.texts = texts
        self.tags = tags
    
    # invoked when len() is called upon the generator
    def __len__(self):
        return len(self.texts)
    
    # called when a generator gets invoked
    def __getitem__(self, ind):
        text = self.texts[ind]
        tags = self.tags[ind]
        
        # variables to hold the encoded sequences and tag ids (lable for our supervised trainning)
        ids = []
        target_tags = []
        
        # gives us each word in a sentence
        for i, seq in enumerate(text):
            # Tokenizing and encoding the input sequence (each word) as per BERT tokenizer without [SEP] and [CLS] as we are assigning it manually
            inputs = TOKENIZER.encode(seq, add_special_tokens=False)
            
            # no of sub words the 'seq' is split into
            input_len = len(inputs)
            
            # storing token ids in a list
            ids.extend(inputs)
            
            # for each token length of target lable must be same as the length of token ids generated by tokenizer
            target_tags.extend([tags[i]] * input_len)

        # Now that we have parsed one sentence (token wise in the above for loop), we can add special tokens [CLS] and [SEP] at the beginning and at the end of the sequence
        ids = [101] + ids + [102]
        
        # make target tags to have the same length as ids
        target_tags = [0] + target_tags + [0]

        # Attention mask, setting 1 will let the model know to use those (attend those tokens)
        mask = [1] * len(ids)
        
        # token_type_ids will be set to 0 for all tokens as we are not dealing with sentence pairs (for task like eg: QnA)
        token_type_ids = [0] *len(ids)

        # Calculating how much padding is required for a sequence. Since, ids and target tags are all of same length, we can use any one to calculate the required padding length
        padding_len = MAX_LEN- len(ids)

        # pad the ids with 0
        ids = ids + ([0]*padding_len)
        
        # mask the padded tokens with 0 (to let the model know to ignore these tokens)
        mask = mask + ([0]*padding_len)
        
        # setting token type ids of padded tokens to 0 (we are not actually using these in this task)
        token_type_ids = token_type_ids + ([0]*padding_len)
        
        # padding the target tags
        target_tags = target_tags + ([0]*padding_len)
        
        # returning dictionary of ids, mask, token_type_ids and target_tags. all will have same length (128 which is set as MAX_LEN)
        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target_tags": torch.tensor(target_tags, dtype=torch.long)
        }    

In [4]:
# Function to train the model

def train_fn(data_loader, model, optimizer, device, scheduler):
    # setting the model in training mode, so that we can update the gradients
    model.train()
    final_loss = 0
    
    # data_loader will be of train_dataset, with data loader generator for each call, dataset equal to batch size will be supplied to pass through the model
    for data in tqdm(data_loader, total=len(data_loader)):
        # accessing ids, attention mask, token_type_ids and target tags as returned by __getitem__
        for k, v in data.items():
            data[k] = v.to(device)
        
        # Setting gradients of the model parameters to 0
        optimizer.zero_grad()
        
        # Pushing the data through the network and saving the loss for this pass
        _, loss = model.forward(**data)

        # Calculate the gradients
        loss.backward()
        
        # Update the parameters with the calculated gradients
        optimizer.step()
        
        # Takes care of learning rate decay
        scheduler.step()
        
        final_loss += loss.item()
    
    return final_loss / len(data_loader)
        

In [5]:
# Function to evaluate the model after updating the model gradients with one batch of data

def eval_fn(data_loader, model, device):
    # setting the model in evaluation mode, so that we can use the last updated gradients to predict
    model.eval()
    final_loss = 0
    
    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            # This will transfer the data to GPU
            data[k] = v.to(device)
        _, loss = model.forward(**data)
        final_loss += loss.item()
    return final_loss / len(data_loader)

In [6]:
# Function to calculate the loss (cross entropy loss)

def loss_fn(output, target, mask, num_labels):
    lfn = nn.CrossEntropyLoss()
    active_loss = mask.view(-1) == 1
    active_logits = output.view(-1, num_labels)
    active_labels = torch.where(active_loss, target.view(-1), torch.tensor(lfn.ignore_index).type_as(target))
    
    loss = lfn(active_logits, active_labels)
    return loss

In [7]:
# Class to create the model

class EntityModel(nn.Module):
    def __init__(self, num_tag):
        super(EntityModel, self).__init__()
        
        # Defining the model architecture
        self.bert = transformers.BertModel.from_pretrained(BASE_MODEL_PATH)
        self.num_tag = num_tag
        self.bert_drop_1 = nn.Dropout(0.5)
        
        # output dimension will be 768Xnum_tag. 768 because we are using bert_base_uncase
        self.out_tag = nn.Linear(768, self.num_tag)
        
    # Forward pass method to pass the data through the network
    def forward(self, ids, mask, token_type_ids, target_tags):
        o1, _ = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        bo_tag = self.bert_drop_1(o1)
        
        tag = self.out_tag(bo_tag)
        
        loss = loss_fn(tag, target_tags, mask, self.num_tag)

        return tag, loss

In [8]:
def read_and_process_data(path):
    
    # Read the data file, sentences are represented as words and each sentence is separated by a blank line
    df = pd.read_csv(path, sep="\t", names=['TAG', 'WORD'], skip_blank_lines=False, dtype="string", skipfooter=1)
    
    # fill the blank line values with some value, to construct sentences from word 
    df.fillna("split_at", inplace=True)
    
    filt = df['TAG']!="split_at"
    
    # Using label encoder to encode the target tags
    tags_enc = LabelEncoder()
    tags_enc.fit_transform(df.loc[filt, 'TAG'])
    dic = dict(zip(tags_enc.classes_, tags_enc.transform(tags_enc.classes_)))
    
    df['TAG'] = df['TAG'].map(dic)
    df['TAG'].fillna("split_at", inplace=True)
    
    # Constructing the sentences from word representation
    sentences = np.array(list(mit.split_at(df['WORD'].tolist(), pred=lambda x: x=='split_at')))
    
    # Constructing tags for each sentence as a whole from each word
    tags = np.array(list(mit.split_at(df['TAG'].tolist(), pred=lambda x: x=='split_at' )))
    
    return sentences, tags, tags_enc

In [9]:
%%time
%time

if __name__ =="__main__":
    file_path = "./data/engtrain.bio"
    sentences, tags, tags_encoder = read_and_process_data(file_path)
    
    metadata = {
        "enc_tag" : tags_encoder
    }
    
    joblib.dump(metadata, "meta.bin")
    
    num_tags = len(tags_encoder.classes_)
    
    # Split the data into train and validation set using train_test_split with 10% as val data
    (train_sentences, test_sentences, train_tag, test_tag) = train_test_split(sentences, tags, random_state=43, test_size=0.1)
    
    # dataset object for train data loader
    train_dataset = EntityDataset(
        texts=train_sentences, tags=train_tag
    )

    # Generator object which will be used for model training purpose
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=0
    )

    # dataset object for validation data loader
    valid_dataset = EntityDataset(
        texts=test_sentences, tags=test_tag
    )

    # Generator object which will be used for model evaluation purpose after processing each batch from train data loader
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VAL_BATCH_SIZE, num_workers=0
    )

    # This will transfer the model to GPU
    device = torch.device("cuda")
    model = EntityModel(num_tags)
    model.to(device)

    # Getting the parameters of the model to be passed to the optimizer
    param_optimizer = list(model.named_parameters())
    
    # Not decaying the weights for bias, LyerNorm weight and bias
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    
    # Getting the list of parameters to use in decay
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    # Calculating the total number of training steps required to pass to the linear scheduler for decreasing the learning rate
    num_train_steps = int(len(train_sentences) / TRAIN_BATCH_SIZE * EPOCHS)
    
    # Adam optimizer with weight decay (regularizing the variables/parameters with large gradients)
    optimizer = AdamW(optimizer_parameters, lr=1e-5)
    
    # scheduler to linearly decay the learning rate, setting num_warmup_steps to 0, helps in initial learning rate to be at the specified value
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )
    
    best_loss = np.inf
    
    # Train the model for EPOCHS number of times
    for epoch in range(EPOCHS):
        train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
        test_loss = eval_fn(valid_data_loader, model, device)
        print(f"Train loss = {train_loss} Valid Loss = {test_loss}")
        if  test_loss < best_loss:
            # Saving the weights if loss is better than previous epoch loss
            torch.save(model.state_dict(), MODEL_PATH)
            best_loss = test_loss

Wall time: 0 ns


  after removing the cwd from sys.path.
Some weights of the model checkpoint at ./BERT/bert_base_uncase were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████| 1100/1100 [05:20<00:00, 

Train loss = 0.543764461864802 Valid Loss = 0.24740117401340786


100%|██████████████████████████████████████████████████████████████████████████████| 1100/1100 [05:20<00:00,  3.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 245/245 [00:13<00:00, 18.60it/s]


Train loss = 0.2030286230637946 Valid Loss = 0.20728233872665738


100%|██████████████████████████████████████████████████████████████████████████████| 1100/1100 [05:20<00:00,  3.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 245/245 [00:13<00:00, 18.73it/s]


Train loss = 0.15633766272697935 Valid Loss = 0.20296533088180788


100%|██████████████████████████████████████████████████████████████████████████████| 1100/1100 [05:20<00:00,  3.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 245/245 [00:13<00:00, 18.52it/s]


Train loss = 0.13010746813040566 Valid Loss = 0.19662809569521675


100%|██████████████████████████████████████████████████████████████████████████████| 1100/1100 [05:20<00:00,  3.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 245/245 [00:13<00:00, 18.71it/s]
  0%|                                                                                         | 0/1100 [00:00<?, ?it/s]

Train loss = 0.11027436334639788 Valid Loss = 0.20189963285716211


100%|██████████████████████████████████████████████████████████████████████████████| 1100/1100 [05:20<00:00,  3.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 245/245 [00:13<00:00, 18.70it/s]
  0%|                                                                                         | 0/1100 [00:00<?, ?it/s]

Train loss = 0.09482114355418493 Valid Loss = 0.2070213700644672


100%|██████████████████████████████████████████████████████████████████████████████| 1100/1100 [05:20<00:00,  3.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 245/245 [00:13<00:00, 18.72it/s]
  0%|                                                                                         | 0/1100 [00:00<?, ?it/s]

Train loss = 0.0830725264900618 Valid Loss = 0.20710446471966118


100%|██████████████████████████████████████████████████████████████████████████████| 1100/1100 [05:20<00:00,  3.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 245/245 [00:13<00:00, 18.73it/s]
  0%|                                                                                         | 0/1100 [00:00<?, ?it/s]

Train loss = 0.0732365293237804 Valid Loss = 0.212727547721101


100%|██████████████████████████████████████████████████████████████████████████████| 1100/1100 [05:20<00:00,  3.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 245/245 [00:13<00:00, 18.68it/s]
  0%|                                                                                         | 0/1100 [00:00<?, ?it/s]

Train loss = 0.06629291739653458 Valid Loss = 0.21592167988229466


100%|██████████████████████████████████████████████████████████████████████████████| 1100/1100 [05:20<00:00,  3.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 245/245 [00:13<00:00, 18.71it/s]

Train loss = 0.06271925185637718 Valid Loss = 0.21926759854101632
Wall time: 55min 46s





In [10]:
# We have to prepare the test data set supplied to us, in the same way we prepared train dataset, but we will use tags_enc transform method to transform the tags


def prepare_test_data(path, tags_enc):
    df = pd.read_csv(path, sep="\t", names=['TAG', 'WORD'], skip_blank_lines=False, dtype="string", skipfooter=1)
    df.fillna("split_at", inplace=True)
    
    filt = df['TAG']!="split_at"
    tags_enc.transform(df.loc[filt, 'TAG'])
    dic = dict(zip(tags_enc.classes_, tags_enc.transform(tags_enc.classes_)))
    
    df['TAG'] = df['TAG'].map(dic)
    df['TAG'].fillna("split_at", inplace=True)
        
    sentences = np.array(list(mit.split_at(df['WORD'].tolist(), pred=lambda x: x=='split_at')))
    tags = np.array(list(mit.split_at(df['TAG'].tolist(), pred=lambda x: x=='split_at' )))
    
    return sentences, tags

In [11]:
def predict(data_loader, device, model):   
    all_tags_prob = []
    with torch.no_grad():
        for data in data_loader:
            for k, v in data.items():
                data[k] = v.to(device)
            tags_prob, _ = model.forward(**data)
            all_tags_prob.extend(tags_prob)
    
    return all_tags_prob

In [12]:
def test_predictions(data_path):
    
    metadata = joblib.load("meta.bin")
    tags_enc = metadata['enc_tag']
    num_tags = len(tags_enc.classes_)
    
    test_sentences, test_tags= prepare_test_data(path=data_path, tags_enc=tags_enc)
    
    model = EntityModel(num_tag=num_tags)
    model.load_state_dict(torch.load(MODEL_PATH))
    
    device = torch.device("cuda")
    model.to(device)
    
    test_dataset = EntityDataset(
        texts=test_sentences, tags=test_tags
    )
    
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=VAL_BATCH_SIZE, num_workers=0
    )
    
    predicted_tags_prob = predict(test_data_loader, device, model)
    return test_tags, predicted_tags_prob

In [13]:
def measure_performance(test_tags, predicted_tags_prob):
    predicted_tags = []
    for i in range(len(test_tags)):
        # for each test sentence get the predicted tag value using argmax and upto the length of actual sentence (not the padded sentence)
        predicted_tags.append(predicted_tags_prob[i].cpu().numpy().argmax(1).reshape(-1)[1:len(test_tags[i])+1])

    print(classification_report(np.concatenate(test_tags), np.concatenate(predicted_tags)))

In [14]:
test_file_path = "./data/engtest.bio"
test_tags, predicted_tags_prob = test_predictions(test_file_path)

  """
  from ipykernel import kernelapp as app
  app.launch_new_instance()
Some weights of the model checkpoint at ./BERT/bert_base_uncase were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
measure_performance(test_tags, predicted_tags_prob)

              precision    recall  f1-score   support

         0.0       0.74      0.86      0.79       812
         1.0       0.48      0.64      0.55        90
         2.0       0.78      0.81      0.79       456
         3.0       0.78      0.80      0.79      1117
         4.0       0.65      0.60      0.63       491
         5.0       0.77      0.82      0.79       500
         6.0       0.68      0.68      0.68       451
         7.0       0.21      0.16      0.18        56
         8.0       0.53      0.69      0.60        54
         9.0       0.75      0.84      0.79       562
        10.0       0.78      0.83      0.81        30
        11.0       0.77      0.71      0.74       720
        12.0       0.56      0.75      0.64       862
        13.0       0.44      0.48      0.46        75
        14.0       0.57      0.73      0.64       496
        15.0       0.74      0.60      0.67       222
        16.0       0.78      0.54      0.63       496
        17.0       0.83    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## We are getting a macro avg F1 Score of 0.65
## Model is not performing great, it is failing completely for labes encoded as 7, 19 and 22

## Possible improvements:
###  Train model for large number of epochs
###  Try using large bert model