In [2]:
MAX_SENTENCE_LENGHT = 10

In [3]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pdb

np.random.seed(0)
torch.manual_seed(42)

# https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html

<torch._C.Generator at 0x7f68e252ad38>

# Dataloader conll-format

In [4]:
from utils.load_vocab import *

### Load vocabs

In [5]:
!pwd

/ist/users/weerayutb/models/baseline/test_hypothesis/BiLSTM_CRF02


In [6]:
path_train = '/ist/users/weerayutb/datasets/ner_small_set_BIOES/sm_ner.train.txt'
path_dev   = '/ist/users/weerayutb/datasets/ner_small_set_BIOES/sm_ner.dev.txt'
path_test  = '/ist/users/weerayutb/datasets/ner_small_set_BIOES/sm_ner.test.txt'

train, words, chars, tags = load_conll_format_nested_ner(path_train, MAX_LEVEL=1, NE_TYPE='flatten')

UNK = "$UNK$"
NUM = "$NUM$"
NONE = "O"

words = [NUM, UNK] + words
tags  = tags

words = list(set(words))
# tags  = list(set(tags))

idx2word  = {idx: word for idx, word in enumerate(words)}
idx2tag   = {idx: tag  for idx, tag  in enumerate(tags)}
idx2char  = {idx: char for idx, char in enumerate(chars)}

word2idx  = {word:idx for idx, word in idx2word.items()}
tag2idx   = {tag :idx for idx, tag  in idx2tag.items()}
char2idx  = {char:idx for idx, char in idx2char.items()}

len(train)

idx2tag

words[0:8], chars[0:8], tags[0:8]

for item1, item2 in zip(idx2word.items(), word2idx.items()):
    if list(item1)[0] == 10 : break
    print(item1, item2)

(0, 'ฮัลโหล') ('ฮัลโหล', 0)
(1, 'ล่วงหน้า') ('ล่วงหน้า', 1)
(2, 'รูด') ('รูด', 2)
(3, 'ทุจริต') ('ทุจริต', 3)
(4, 'กระสุนปืน') ('กระสุนปืน', 4)
(5, 'เจ้าฟ้า') ('เจ้าฟ้า', 5)
(6, 'ความคิดสร้างสรรค์') ('ความคิดสร้างสรรค์', 6)
(7, 'รูปธรรม') ('รูปธรรม', 7)
(8, 'กัน') ('กัน', 8)
(9, 'การค้า') ('การค้า', 9)


In [7]:
train[6:9]

[[['ไทย', 'B-placeName'],
  ['ให้', 'O'],
  ['ยั่งยืน', 'O'],
  ['_', 'O'],
  ['โดย', 'O'],
  ['ขอให้', 'O']],
 [['ที่', 'O'],
  ['ตลาดหลักทรัพย์', 'B-orgName'],
  ['แห่ง', 'I-orgName'],
  ['ประเทศ', 'I-orgName'],
  ['ไทย', 'I-orgName']],
 [['ตลาดหลักทรัพย์', 'B-orgName'],
  ['แห่ง', 'I-orgName'],
  ['ประเทศ', 'I-orgName'],
  ['ไทย', 'I-orgName']]]

### Convert dataset text 2 idx 

In [8]:
from utils.convert2idx import *
from utils.dataloader_conll import *

In [9]:
processing_word = get_processing_word( word2idx, char2idx, lowercase=True, chars=True)
processing_tag  = get_processing_word( tag2idx, lowercase=False, allow_unk=False)

In [10]:
# create datasets
trainset = CoNLLDataset(path_train, processing_word, processing_tag, None, True)
testset  = CoNLLDataset(path_test, processing_word, processing_tag, None, True)
devset   = CoNLLDataset(path_dev, processing_word, processing_tag, None, True)

In [11]:
list(iter(train))[6:9]

[[['ไทย', 'B-placeName'],
  ['ให้', 'O'],
  ['ยั่งยืน', 'O'],
  ['_', 'O'],
  ['โดย', 'O'],
  ['ขอให้', 'O']],
 [['ที่', 'O'],
  ['ตลาดหลักทรัพย์', 'B-orgName'],
  ['แห่ง', 'I-orgName'],
  ['ประเทศ', 'I-orgName'],
  ['ไทย', 'I-orgName']],
 [['ตลาดหลักทรัพย์', 'B-orgName'],
  ['แห่ง', 'I-orgName'],
  ['ประเทศ', 'I-orgName'],
  ['ไทย', 'I-orgName']]]

# Model

In [12]:
from utils.data_generator import *
from utils.conllevalscript import ConllevalScript
Conlleval   = ConllevalScript(idx2word, idx2tag, word2idx['$UNK$'])

In [13]:
class Config:
    def __init__(self):
        self.nwords   = len(word2idx)
        self.dim_word = 400
        self.hidden_size_lstm = 300

        self.nchars   = len(char2idx)
        self.dim_char = 128
        self.dropout  = 0.5
        self.hidden_size_char = 128

        self.ntags = len(tag2idx)

        self.lr = 1e-3
        self.lr_decay = 1e-3
        
        self.batch_size = 3

In [14]:
from models.crf import *
class BiLSTM_CRF(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config     = config

        self.emb = nn.Embedding(self.config.nwords, self.config.dim_word, padding_idx=0)
        self.char_embeddings = nn.Embedding(self.config.nchars, self.config.dim_char, padding_idx=0)
        
        self.char_lstm = nn.LSTM(self.config.dim_char, self.config.hidden_size_char, bidirectional=True)

        self.dropout = nn.Dropout(p=self.config.dropout)
        self.word_lstm = nn.LSTM(self.config.dim_word+2*self.config.hidden_size_char, self.config.hidden_size_lstm, bidirectional=True)

        self.linear = LinearClassifier(self.config, layers=[self.config.hidden_size_lstm*2, self.config.ntags], drops=[0.5])


    def forward(self, input):
        # Word_dim = (batch_size x sent_length)
        # char_dim = (batch_size x sent_length x word_length)

        word_input, char_input = input[0], input[1]
        word_input.transpose_(0,1)
        
        # Word Embedding
        word_emb = self.emb(word_input) #shape= S*B*wnh

        # Char LSTM
        char_emb = self.char_embeddings(char_input.view(-1, char_input.size(2))) #https://stackoverflow.com/questions/47205762/embedding-3d-data-in-pytorch
        char_emb = char_emb.view(*char_input.size(), -1) #dim = BxSxWxE

        _, (h, c) = self.char_lstm(char_emb.view(-1, char_emb.size(2), char_emb.size(3)).transpose(0,1)) #(num_layers * num_directions, batch, hidden_size) = 2*BS*cnh
        char_output = torch.cat((h[0], h[1]), 1) #shape = BS*2cnh
        char_output = char_output.view(char_emb.size(0), char_emb.size(1), -1).transpose(0,1) #shape = S*B*2cnh

        # Concat char output and word output
        word_emb = torch.cat((word_emb, char_output), 2) #shape = S*B*(wnh+2cnh)
        word_emb = self.dropout(word_emb)

        output, (h, c) = self.word_lstm(word_emb) #shape = S*B*hidden_size_lstm
        check_output = output
        output = self.dropout(output)        
        output = self.linear(output)
        
        return output #shape = S*B*ntags
    
    def forward_once(self, input):
        output1 = self.forward(input)
        output2 = self.forward(input)
        

class LinearBlock(nn.Module):
    def __init__(self, ni, nf, drop):
        super().__init__()
        self.lin = nn.Linear(ni, nf)
        self.drop = nn.Dropout(drop)
        self.bn = nn.BatchNorm1d(ni)

    def forward(self, x):
        return self.lin(self.drop(self.bn(x)))


class LinearClassifier(nn.Module):
    def __init__(self, config, layers, drops):
        self.config = config
        super().__init__()
        self.layers = nn.ModuleList([LinearBlock(layers[i], layers[i + 1], drops[i]) for i in range(len(layers) - 1)])

    def forward(self, input):
        output = input
        sl,bs,_ = output.size()
        x = output.view(-1, 2*self.config.hidden_size_lstm)

        for l in self.layers:
            l_x = l(x)
            x = F.relu(l_x)
        return l_x.view(sl, bs, self.config.ntags)

In [15]:
def form_idx_input(inputs, targets, sequence_lengths):
    targets     = T(targets, cuda=True).transpose(0,1).contiguous()

    word_input  = T(inputs['word_ids'], cuda=True)
    char_input  = T(inputs['char_ids'], cuda=True)

    word_input  = Variable(word_input, requires_grad=False)
    char_input  = Variable(char_input, requires_grad=False)
    targets     = Variable(targets)

    mask        = create_mask(sequence_lengths, targets, cuda=True)
    inputs      = (word_input, char_input)
    return inputs, mask, targets

def input_format(inputs, targets, sequence_lengths, form='idx_input'):
    
    if form == 'idx_input':
        return form_idx_input(inputs, targets, sequence_lengths)
    
    elif form == 'sent_pair':
        return form_sent_pair(inputs, targets, sequence_lengths)
    
    elif form == 'word_input':
        raise "Implement !!!!"
        
    else :
        raise "Input_format error !!! "

# Initial model

In [16]:
config    = Config()
model     = BiLSTM_CRF(config).cuda()
optimizer = optim.SGD(model.parameters(), lr=config.lr, weight_decay=config.lr_decay)
criterion = CRF(config.ntags).cuda()

  nn.init.uniform(self.start_transitions, -0.1, 0.1)
  nn.init.uniform(self.end_transitions, -0.1, 0.1)
  nn.init.uniform(self.transitions, -0.1, 0.1)


In [17]:
# from gensim.models import KeyedVectors
# embedding_path = "/ist/users/weerayutb/embedding/ulmfit_social_wiki_prachathai.vec"
# ulmfit_emb     = KeyedVectors.load_word2vec_format(f'{embedding_path}', binary=False, unicode_errors = 'ignore')
# ulmfit_vocab   = list(ulmfit_emb.vocab)
# ulmfit_dim     = 400

# pre_train_weigths = np.zeros([len(word_to_idx), ulmfit_dim])
# for word in word_to_idx:
#     if word in ulmfit_vocab:
#         pre_train_weigths[word_to_idx[word]] = np.asarray(ulmfit_emb[word])

# sum(sum(pre_train_weigths)) # -36.36466901099368

In [18]:
# ## From pre_train embedding
# weight = torch.tensor(pre_train_weigths)

# # Initail UNK with mean of all embedding 
# weight[word_to_idx[UNK]] = torch.mean(weight,dim=0)

# # Load embedding to model
# model.embedding.weight.data.copy_(nn.Parameter(weight)).cuda()
# # model.embedding.weight[word_to_idx[UNK]]

In [19]:
def test( nbatches_val, val_generator, fine_tune=False):
        model.eval()
        accs          = []
        test_loss     = 0
        correct_preds = 0
        total_correct = 0
        total_preds   = 0
        total_step    = None

        FILE = open('eval.pred', "w", encoding='utf-8')
        for batch_idx, (inputs, targets, sequence_lengths) in enumerate(val_generator):
            total_step   = batch_idx
            if batch_idx == nbatches_val: break
            if inputs['word_ids'].shape[1] == 1: continue

            batch_inputs, mask, batch_targets = input_format(inputs, targets, sequence_lengths, form='idx_input')
            outputs = model(batch_inputs)
            
            # Get CRF Loss
            loss = -1*criterion(outputs, batch_targets, mask=mask)

            # Callbacks
            test_loss  += loss.item()
            predictions = criterion.decode(outputs, mask=mask)

            # Save batch_training to conlleval format 
            batch_training_text = Conlleval.convert_idx_to_text( inputs['word_ids'], sequence_lengths, status='word') # For save conlleval 
            batch_tag_text      = Conlleval.convert_idx_to_text(targets, sequence_lengths, status='tag')
            preds_text          = Conlleval.convert_idx_to_text(predictions, sequence_lengths, status='pred')
            Conlleval.save_conlleval_format(batch_training_text, batch_tag_text, preds_text, FILE)

        FILE.close()
        !cat eval.pred | ./conlleval.pl

# dev

In [20]:
# Dev
nbatches_dev, dev_generator = batch_iter(devset, 5, return_lengths=True)
test(nbatches_dev, dev_generator)

	cuda(torch.device device, bool async, *, torch.memory_format memory_format)
Consider using one of the following signatures instead:
	cuda(torch.device device, bool non_blocking, *, torch.memory_format memory_format) (Triggered internally at  /opt/conda/conda-bld/pytorch_1595629427286/work/torch/csrc/utils/python_arg_parser.cpp:766.)
  return x.cuda(*args, **kwargs) if USE_GPU else x


processed 14907 tokens with 513 phrases; found: 6751 phrases; correct: 60.
accuracy:  28.26%; precision:   0.89%; recall:  11.70%; FB1:   1.65
          orgName: precision:   1.33%; recall:   8.19%; FB1:   2.29  1428
         persName: precision:   0.04%; recall:   0.69%; FB1:   0.08  2321
        placeName: precision:   1.33%; recall:  29.20%; FB1:   2.55  3002


# Test

In [21]:
# Dev
nbatches_test, test_generator = batch_iter(testset, 5, return_lengths=True)
test(nbatches_test, test_generator)

processed 14064 tokens with 522 phrases; found: 6327 phrases; correct: 56.
accuracy:  28.38%; precision:   0.89%; recall:  10.73%; FB1:   1.64
          orgName: precision:   1.44%; recall:   8.23%; FB1:   2.45  1323
         persName: precision:   0.00%; recall:   0.00%; FB1:   0.00  2105
        placeName: precision:   1.28%; recall:  22.98%; FB1:   2.42  2899


In [22]:
def training(epoch, nbatches_train, train_generator, fine_tune=False):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    total_step = None

    for batch_idx, (inputs, targets, sequence_lengths) in enumerate(train_generator):

        if batch_idx == nbatches_train     : break
        if inputs['word_ids'].shape[0] == 1: continue
        total_step = batch_idx
        
        optimizer.zero_grad()
        
        # Edit this 
        batch_inputs, mask, targets = input_format(inputs, targets, sequence_lengths, form='idx_input')
        outputs                     = model(batch_inputs)

#         pdb.set_trace()
        # Get CRF Loss
        loss = -1*criterion(outputs, targets, mask=mask)
        loss.backward()
        optimizer.step()

        # Callbacks
        train_loss += loss.item()
        predictions = criterion.decode(outputs, mask=mask)
                
    print(train_loss/batch_idx)

In [23]:
def fit(epochs, nbatches_train, train_generator):    
    for epoch in range(1, epochs+1):
        print(f"Epoch: {epoch}")

        # Train
        training(epoch, nbatches_train, train_generator, fine_tune = False)
        
        # Dev
        nbatches_dev, dev_generator = batch_iter(devset, 32, return_lengths=True)
        test(nbatches_dev, dev_generator)

# Train

In [24]:
nbatches_train, train_generator = batch_iter(trainset, config.batch_size, return_lengths=True)
fit(50, nbatches_train, train_generator)

Epoch: 1
38.01362424655769
processed 14907 tokens with 513 phrases; found: 384 phrases; correct: 59.
accuracy:  92.64%; precision:  15.36%; recall:  11.50%; FB1:  13.15
          orgName: precision:  12.75%; recall:   5.60%; FB1:   7.78  102
         persName: precision:  12.44%; recall:  17.36%; FB1:  14.49  201
        placeName: precision:  25.93%; recall:  15.33%; FB1:  19.27  81
Epoch: 2
26.28594280155472
processed 14907 tokens with 513 phrases; found: 392 phrases; correct: 114.
accuracy:  93.47%; precision:  29.08%; recall:  22.22%; FB1:  25.19
          orgName: precision:  22.50%; recall:  11.64%; FB1:  15.34  120
         persName: precision:  29.59%; recall:  34.72%; FB1:  31.95  169
        placeName: precision:  35.92%; recall:  27.01%; FB1:  30.83  103
Epoch: 3
22.131348686234762
processed 14907 tokens with 513 phrases; found: 430 phrases; correct: 138.
accuracy:  94.17%; precision:  32.09%; recall:  26.90%; FB1:  29.27
          orgName: precision:  27.41%; recall:  15.95

In [25]:
# Test
nbatches_test, test_generator = batch_iter(testset, 5, return_lengths=True)
test(nbatches_test, test_generator)

processed 14064 tokens with 522 phrases; found: 495 phrases; correct: 371.
accuracy:  97.34%; precision:  74.95%; recall:  71.07%; FB1:  72.96
          orgName: precision:  71.30%; recall:  68.83%; FB1:  70.04  223
         persName: precision:  77.52%; recall:  76.92%; FB1:  77.22  129
        placeName: precision:  78.32%; recall:  69.57%; FB1:  73.68  143


In [26]:
nbatches_test, test_generator = batch_iter(testset, 5, return_lengths=True)

In [27]:
# for x in test_generator:
#     print(x[0]['char_ids'][0])
#     break

In [None]:
list(iter(dev_generator))[0]