In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bert-base-uncased/config.json
/kaggle/input/bert-base-uncased/pytorch_model.bin
/kaggle/input/bert-base-uncased/vocab.txt
/kaggle/input/ner-dataset/ner_dataset.csv


For running and training model, please find at https://www.kaggle.com/trangdothuy/ner-bert-crf-model

In [2]:
pip install pytorch-crf

Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
import torch.nn as nn
from torchcrf import CRF
from tqdm import tqdm
import transformers 

import pandas as pd
import numpy as np 

import joblib


from sklearn import preprocessing
from sklearn import model_selection

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [4]:
class Config:
    def __init__(self):
        self.MAX_LEN = 128
        self.TRAIN_BATCH_SIZE = 8
        self.VALID_BATCH_SIZE = 8
        self.EPOCHS = 5
        self.BASE_MODEL_PATH = "../input/bert-base-uncased"
        self.MODEL_PATH = "./model.bin"
        self.TRAINING_FILE = "../input/ner-dataset/ner_dataset.csv"
        self.TOKENIZER = transformers.BertTokenizer.from_pretrained(
                self.BASE_MODEL_PATH,
                do_lower_case=True
            )
config = Config()

In [5]:
class EntityDataset:
    def __init__(self,texts,tags):
        # texts: [["hi",",","I","am","learning"],["hello"," ","nice","to","meet","you"]]
        # pos/tags: [[1 2 3 4 5],[2 3 4 5 6]]
        self.texts = texts
        self.tags = tags

    def __len__(self):
        return len(self.texts)
    def __getitem__(self,item):
        text = self.texts[item]
        tags = self.tags[item]


        ids = []
        target_tag = tags
        valid_mask = []

        for i,s in enumerate(text):
            inputs = config.TOKENIZER.tokenize(s)
            for j,token in enumerate(inputs):
                if j == 0:
                    valid_mask.append(1)
                else:
                    valid_mask.append(0)
            # hello: he ##llo
            ids.extend(inputs)
            
        ids = config.TOKENIZER.convert_tokens_to_ids(ids)
        ids = ids[:config.MAX_LEN -2]
        valid_mask = valid_mask[:config.MAX_LEN -2]
        target_tag = target_tag[:config.MAX_LEN -2]

        mask = [1]*len(ids)
        mask_crf = [1]*len(ids)

        ids = [101] + ids + [102]
        target_tag = [0] + target_tag + [0]
        valid_mask = [1] + valid_mask + [1]
        mask = [0]+mask+[0]
        mask_crf = [1] + mask_crf + [1]

        token_type_ids = [0]*config.MAX_LEN

        padding_len = config.MAX_LEN - len(ids)

        ids = ids + [0]*padding_len
        mask = mask + [0]*padding_len
        mask_crf = mask_crf + [0]*padding_len
        valid_mask = valid_mask + [0]*padding_len
        
        other_padding_len = config.MAX_LEN - len(target_tag)
        target_tag = target_tag + [0]*other_padding_len

        assert len(ids) == config.MAX_LEN 
        assert len(mask) == config.MAX_LEN 
        assert len(mask_crf) == config.MAX_LEN
        assert len(token_type_ids) == config.MAX_LEN 
        assert len(target_tag) == config.MAX_LEN 
        assert len(valid_mask) == config.MAX_LEN 

        return {
            "ids":torch.tensor(ids, dtype=torch.long),
            "mask":torch.tensor(mask, dtype=torch.long),
            "mask_crf":torch.tensor(mask_crf,dtype=torch.long),
            "token_type_ids":torch.tensor(token_type_ids, dtype=torch.long),
            "target_tag":torch.tensor(target_tag, dtype=torch.long),
            "valid_mask":torch.tensor(valid_mask, dtype=torch.long)
        }

In [6]:
def train_fn(data_loader,model, optimizer, device, scheduler):
    model.train()
    final_loss = 0
    final_accuracy = 0
    for data in tqdm(data_loader,total=len(data_loader)):
        for k,v in data.items():
            data[k] = v.to(device)
        optimizer.zero_grad()
        _,loss,accuracy_score = model(**data)
        loss.backward()
        optimizer.step()
        scheduler.step()
        final_loss += loss.item()
        final_accuracy += accuracy_score
    return final_loss/len(data_loader), final_accuracy/len(data_loader)

def eval_fn(data_loader,model, device):
    model.eval()
    final_loss = 0
    final_accuracy = 0
    for data in tqdm(data_loader,total=len(data_loader)):
        for k,v in data.items():
            data[k] = v.to(device)
        _,loss,accuracy_score = model(**data)
        final_loss += loss.item()
        final_accuracy += accuracy_score
    return final_loss/len(data_loader), final_accuracy/len(data_loader)

In [7]:
def loss_fn(output,target,mask, num_lables):
    lfn = nn.CrossEntropyLoss()
    active_loss = mask.view(-1) == 1
    active_logits = output.view(-1,num_lables)
    active_labels = torch.where(
        active_loss,
        target.view(-1),
        torch.tensor(lfn.ignore_index).type_as(target)
    )
    loss = lfn(active_logits,active_labels)
    return loss

def accuracy_fn(output,target,mask):
    #output = output.argmax(2)
    real_output = torch.flatten(mask*output)
    real_target = torch.flatten(mask*target)
    score = 0.0
    total = 0
    for i in range(len(real_target)):
        if real_target[i] !=0:
            total += 1
            if real_target[i] == real_output[i]:
                score +=1
    return score/total


class EntityModel(nn.Module):
    def __init__(self, num_tag):
        super(EntityModel, self).__init__()
        self.num_tag = num_tag
        self.bert = transformers.BertModel.from_pretrained(config.BASE_MODEL_PATH,return_dict=False)
        self.bert_drop_1 = nn.Dropout(0.3)
        self.out_tag = nn.Linear(768,self.num_tag)
        self.crf = CRF(self.num_tag, batch_first = True)


    def forward(self,ids,mask,mask_crf,token_type_ids,target_tag,valid_mask):
        o1, _ = self.bert(ids,attention_mask=mask,token_type_ids=token_type_ids)
        bo_tag = self.bert_drop_1(o1)
        emissions = self.out_tag(bo_tag)
        log_likelihood, sequence_of_tags = self.crf(emissions, target_tag,mask = mask_crf.bool(), reduction='mean'), self.crf.decode(emissions, mask = mask_crf.bool(),)
        
        tag, mask = valid_sequence_output(sequence_of_tags,valid_mask,mask)
        
        #loss = loss_fn(tag,target_tag,mask,self.num_tag)
        loss = -1 * log_likelihood
        accuracy_score = accuracy_fn(tag,target_tag,mask)
        return tag,loss,accuracy_score


In [8]:
def valid_sequence_output(sequence_output, valid_mask, attention_mask): # convert token back to word

    # batch_size, max_len, feat_dim = sequence_output.shape
    batch_size = len(sequence_output)
    max_len = config.MAX_LEN
    current_len = len(sequence_output[0])
    valid_output = torch.zeros(batch_size, max_len, dtype = torch.long,
                                device='cuda' if torch.cuda.is_available() else 'cpu')
    valid_attention_mask = torch.zeros(batch_size, max_len, dtype = torch.long,
                                device='cuda' if torch.cuda.is_available() else 'cpu')


    for i in range(batch_size):
        jj = -1
        for j in range(current_len):
            if valid_mask[i][j].item() == 1:
                jj += 1
                valid_output[i][jj] = sequence_output[i][j]*attention_mask[i][j]
                valid_attention_mask[i][jj] = attention_mask[i][j]

    return valid_output, valid_attention_mask

In [9]:

def process_data(data_path):
    df = pd.read_csv(data_path,encoding='latin-1')
    df.loc[:,"Sentence #"] = df["Sentence #"].fillna(method="ffill")

    
    enc_tag = preprocessing.LabelEncoder()
    tag_list = list(df["Tag"])
    tag_list.insert(0,"0")
    
    enc_tag.fit(tag_list)
    df.loc[:,"Tag"] = enc_tag.transform(df["Tag"])

    sentences = df.groupby("Sentence #")["Word"].apply(list).values 
    tag = df.groupby("Sentence #")["Tag"].apply(list).values

    return sentences, tag, enc_tag


if __name__ == '__main__':
    sentences, tag, enc_tag = process_data(config.TRAINING_FILE)
    
    meta_data = {
        "enc_tag": enc_tag
    }

    joblib.dump(meta_data,"meta.bin")

    num_tag = len(list(enc_tag.classes_))

    (
        train_sentences, 
        test_sentences, 
        train_tag,
        test_tag
     ) = model_selection.train_test_split(sentences,tag,random_state = 42,test_size = 0.1)

    train_dataset = EntityDataset(texts = train_sentences, tags =train_tag)
    valid_dataset = EntityDataset(texts = test_sentences, tags = test_tag)

    train_data_loader = torch.utils.data.DataLoader(train_dataset,batch_size=config.TRAIN_BATCH_SIZE,num_workers = 4)
    valid_data_loader = torch.utils.data.DataLoader(valid_dataset,batch_size=config.VALID_BATCH_SIZE,num_workers=1)

    device = torch.device("cuda")
    model = EntityModel(num_tag = num_tag)
    model.to(device)


    # print("========================================")
    # print(summary(model,
    #  torch.ones(8,128,dtype = torch.long,device='cuda'),
    #  torch.ones(8,128,dtype = torch.long,device='cuda'),
    #  torch.ones(8,128,dtype = torch.long,device='cuda'),
    #  torch.ones(8,128,dtype = torch.long,device='cuda'),
    #  torch.ones(8,128,dtype = torch.long,device='cuda'),
    #  torch.ones(8,128,dtype = torch.long,device='cuda')
    #  ))

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias","LayerNorm.bias","LayerNorm.weight"]
    optimizer_parameters = [{
        "params":[p for n,p in param_optimizer if not any(nd in n for nd in no_decay)],
        "weight_decay":0.001,
    },{
        "params":[p for n,p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay":0.0,
    }]
    num_train_steps = int(len(train_sentences)/config.TRAIN_BATCH_SIZE*config.EPOCHS)
    optimizer = AdamW(optimizer_parameters,lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,num_warmup_steps=0,num_training_steps=num_train_steps
    )

    best_loss = np.inf
    for epoch in range(config.EPOCHS):
        train_loss, train_accuracy = train_fn(train_data_loader,model,optimizer,device, scheduler)
        test_loss, test_accuracy = eval_fn(valid_data_loader,model,device)
        print(f"Train loss = {train_loss} Valid loss = {test_loss} ")
        print(f"Train accuracy = {train_accuracy} Valid accuracy = {test_accuracy} ")
        if test_loss < best_loss:
            torch.save(model.state_dict(),config.MODEL_PATH)
            best_loss = test_loss


  cpuset_checked))
Some weights of the model checkpoint at ../input/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 5396/5396 [27:16<00:00,  3.30it/s]
100%|██████████| 600/600 [01:30<00:00,  6.61it/s]


Train loss = 6.686323683336101 Valid loss = 3.7516463486353557 
Train accuracy = 0.8306724592240401 Valid accuracy = 0.8280946494027132 


100%|██████████| 5396/5396 [28:01<00:00,  3.21it/s]
100%|██████████| 600/600 [01:34<00:00,  6.33it/s]


Train loss = 3.4712855979369426 Valid loss = 3.0786762579282123 
Train accuracy = 0.8323102995590129 Valid accuracy = 0.8296607741129537 


100%|██████████| 5396/5396 [28:14<00:00,  3.18it/s]
100%|██████████| 600/600 [01:32<00:00,  6.51it/s]


Train loss = 2.5266765379039686 Valid loss = 2.796115957101186 
Train accuracy = 0.8341821036083973 Valid accuracy = 0.8311459042102092 


100%|██████████| 5396/5396 [28:18<00:00,  3.18it/s]
100%|██████████| 600/600 [01:32<00:00,  6.49it/s]


Train loss = 1.8934881603876867 Valid loss = 2.720780169169108 
Train accuracy = 0.8363193812199786 Valid accuracy = 0.8297682031090342 


100%|██████████| 5396/5396 [28:40<00:00,  3.14it/s]
100%|██████████| 600/600 [01:33<00:00,  6.43it/s]

Train loss = 1.4345686365446397 Valid loss = 2.7760127568244934 
Train accuracy = 0.8380309910865494 Valid accuracy = 0.8304265368720529 



