<a href="https://colab.research.google.com/github/Nilanshrajput/NER_SyferText/blob/master/lSTM_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!git clone https://github.com/synalp/NER.git

Cloning into 'NER'...
remote: Enumerating objects: 3148, done.[K
remote: Total 3148 (delta 0), reused 0 (delta 0), pack-reused 3148[K
Receiving objects: 100% (3148/3148), 281.51 MiB | 32.41 MiB/s, done.
Resolving deltas: 100% (2066/2066), done.
Checking out files: 100% (189/189), done.


In [0]:

from __future__ import print_function
from collections import OrderedDict

import torch
import torch.nn as nn
from torch.nn import init
from torch.autograd import Variable
from torch import autograd

import time
import _pickle as cPickle

import urllib
import matplotlib.pyplot as plt


import os
import sys
import codecs
import re
import numpy as np

In [0]:
def zero_digits(s):
    """
    Replace every digit in a string by a zero.
    """
    return re.sub('\d', '0', s)

def load_sentences(path, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """

    
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences

In [0]:
train_sentences = load_sentences('/content/NER/corpus/CoNLL-2003/eng.train',zeros=True)
test_sentences = load_sentences('/content/NER/corpus/CoNLL-2003/eng.testb', zeros=True)
dev_sentences = load_sentences('/content/NER/corpus/CoNLL-2003/eng.testa', zeros=True)

In [0]:
len(train_sentences)

14041

In [0]:
train_sentences[2]

[['BRUSSELS', 'NNP', 'I-NP', 'I-LOC'], ['0000-00-00', 'CD', 'I-NP', 'O']]

In [0]:
def lower_case(x,lower=False):
    if lower:
        return x.lower()  
    else:
        return x

def prepare_dataset(sentences, word_to_id, char_to_id, tag_to_id, lower=False):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """
    data = []
    for s in sentences:
        str_words = [w[0] for w in s]
        words = [word_to_id[lower_case(w,lower) if lower_case(w,lower) in word_to_id else '<UNK>']
                 for w in str_words]
        # Skip characters that are not in the training set
        chars = [[char_to_id[c] for c in w if c in char_to_id]
                 for w in str_words]
        tags = [tag_to_id[w[-1]] for w in s]
        data.append({
            'str_words': str_words,
            'words': words,
            'chars': chars,
            'tags': tags,
        })
    return data

train_data = prepare_dataset(
    train_sentences, word_to_id, char_to_id, tag_to_id, parameters['lower']
)
dev_data = prepare_dataset(
    dev_sentences, word_to_id, char_to_id, tag_to_id, parameters['lower']
)
test_data = prepare_dataset(
    test_sentences, word_to_id, char_to_id, tag_to_id, parameters['lower']
)
print("{} / {} / {} sentences in train / dev / test.".format(len(train_data), len(dev_data), len(test_data)))

NameError: ignored

In [0]:
while 1:
  continue

In [0]:
import torch.nn as nn
import torch.nn.functional as F

class LSTM_Tagger(nn.Module):
    def __init__(self, params):
        super(LSTM_Tagger, self).__init__()

        #maps each token to an embedding_dim vector
        self.embedding = nn.Embedding(params['vocab_size'], params['embedding_dim'])

        #the LSTM takens embedded sentence
        self.lstm = nn.LSTM(params['embedding_dim'], params['lstm_hidden_dim'], batch_first=True)

        #fc layer transforms the output to give the final output layer
        self.fc = nn.Linear(params['lstm_hidden_dim'], params['num_tags'])

    def forward(self, s):
        #apply the embedding layer that maps each token to its embedding
        s = self.embedding(s)   # dim: batch_size x batch_max_len x embedding_dim

        #run the LSTM along the sentences of length batch_max_len
        s, _ = self.lstm(s)     # dim: batch_size x batch_max_len x lstm_hidden_dim                

        print(s.shape)
        #reshape the Variable so that each row contains one token
        s = s.reshape(-1, s.shape[2])  # dim: batch_size*batch_max_len x lstm_hidden_dim

        #apply the fully connected layer and obtain the output for each token
        s = self.fc(s)          # dim: batch_size*batch_max_len x num_tags

        return F.log_softmax(s, dim=1)   # dim: batch_size*batch_max_len x num_tags


In [0]:
device = torch.device("cpu")
def loss_fn(outputs, labels):
  print(labels, labels.shape)
  print(outputs,outputs.shape)
  scores = nn.functional.cross_entropy(outputs, labels)
  return scores

def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)
    
def train_model(model, train_iter, epoch):
    total_epoch_loss = 0
    total_epoch_acc = 0
    model.to(device)
    
    optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
    steps = 0
    model.train()
    for idx, batch in enumerate(train_iter):
        text = batch.inputs_word
       
        target = batch.labels
        target = torch.autograd.Variable(target).long()
        if torch.cuda.is_available():
            text = text.to(device)
            target = target.to(device)
        if (text.size()[0] is not 64):# One of the batch returned by BucketIterator has length different than 32.
            continue
        optim.zero_grad()
        prediction = model(text)
        loss = loss_fn(prediction, target)
        num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()
        acc = 100.0 * num_corrects/len(batch)
        loss.backward()
        clip_gradient(model, 1e-1)
        optim.step()
        steps += 1
        
        if steps % 100 == 0:
            print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')
        
        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()
        
    return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter)

def eval_model(model, val_iter):
    total_epoch_loss = 0
    total_epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(val_iter):
            text = batch.inputs_word
            if (text.size()[0] is not 64):
                continue
            target = batch.labels
            target = torch.autograd.Variable(target).long()
            if torch.cuda.is_available():
                text = text.cuda()
                target = target.cuda()
            prediction = model(text)
            loss = loss_fn(prediction, target)
            num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()
            acc = 100.0 * num_corrects/len(batch)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()

    return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter)
	



In [0]:
learning_rate = 2e-5
batch_size = 64

lstm_hidden_dim = 256
num_tags = 18
embedding_length = 300
train_iter,valid_iter,test_iter = dic['iters']
vocab,_,_ = dic['vocabs']
vsz = len(vocab)
params={'vocab_size':vsz, 'embedding_dim':embedding_length,'lstm_hidden_dim': lstm_hidden_dim, 'num_tags':num_tags}
model = LSTM_Tagger(params)
for epoch in range(10):
    train_loss, train_acc = train_model(model, train_iter, epoch)
    val_loss, val_acc = eval_model(model, valid_iter)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')
    
test_loss, test_acc = eval_model(model, test_iter)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

torch.Size([64, 100, 256])
tensor([[2, 4, 4,  ..., 7, 4, 3],
        [2, 4, 4,  ..., 1, 1, 1],
        [2, 4, 3,  ..., 1, 1, 1],
        ...,
        [2, 5, 4,  ..., 4, 4, 3],
        [2, 4, 5,  ..., 1, 1, 1],
        [2, 8, 4,  ..., 5, 4, 3]]) torch.Size([64, 18])
tensor([[-2.9930, -3.0346, -2.8021,  ..., -2.7318, -2.9520, -2.9049],
        [-2.8954, -3.0439, -2.7905,  ..., -2.7042, -3.0178, -3.0233],
        [-2.9565, -2.9891, -2.8314,  ..., -2.8096, -3.0283, -2.8922],
        ...,
        [-2.9904, -2.8957, -2.9276,  ..., -2.6503, -2.9657, -2.9736],
        [-2.9904, -2.8957, -2.9276,  ..., -2.6503, -2.9657, -2.9736],
        [-2.9904, -2.8957, -2.9276,  ..., -2.6503, -2.9657, -2.9736]],
       grad_fn=<LogSoftmaxBackward>) torch.Size([6400, 18])


ValueError: ignored

In [0]:
pred_cat = model.predict(X_te)
pred = np.argmax(pred_cat, axis=-1)
y_te_true = np.argmax(y_te, -1)
from sklearn_crfsuite.metrics import flat_classification_report

# Convert the index to tag
pred_tag = [[idx2tag[i] for i in row] for row in pred]
y_te_true_tag = [[idx2tag[i] for i in row] for row in y_te_true] 

report = flat_classification_report(y_pred=pred_tag, y_true=y_te_true_tag)
print(report)

3

In [0]:

import torch
from torchtext import data
from torchtext.datasets import SequenceTaggingDataset, CoNLL2000Chunking
from torchtext.vocab import Vectors, GloVe, CharNGram

import numpy as np
import random
import logging
logger = logging.getLogger(__name__)


def conll2003_dataset(tag_type, batch_size, root='/content/NER/corpus/CoNLL-2003', 
                          train_file='eng.train', 
                          validation_file='eng.testa',
                          test_file='eng.testb',
                          convert_digits=True):
    """
    conll2003: Conll 2003 (Parser only. You must place the files)
    Extract Conll2003 dataset using torchtext. Applies GloVe 6B.200d and Char N-gram
    pretrained vectors. Also sets up per word character Field
    Parameters:
        tag_type: Type of tag to pick as task [pos, chunk, ner]
        batch_size: Batch size to return from iterator
        root: Dataset root directory
        train_file: Train filename
        validation_file: Validation filename
        test_file: Test filename
        convert_digits: If True will convert numbers to single 0's
    Returns:
        A dict containing:
            task: 'conll2003.' + tag_type
            iters: (train iter, validation iter, test iter)
            vocabs: (Inputs word vocabulary, Inputs character vocabulary, 
                    Tag vocabulary )
    """
    
    # Setup fields with batch dimension first
    inputs_word = data.Field(init_token="<bos>", eos_token="<eos>",fix_length=100, batch_first=True, lower=True)

    inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>", 
                                    batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting, 
                                    init_token="<bos>", eos_token="<eos>")
                        

    labels = data.Field(init_token="<bos>", eos_token="<eos>",  batch_first=True)

    fields = ([(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))] + 
                [('labels', labels) if label == tag_type else (None, None) 
                for label in ['pos', 'chunk', 'ner']])

    # Load the data
    train, val, test = SequenceTaggingDataset.splits(
                                path=root, 
                                train=train_file, 
                                validation=validation_file, 
                                test=test_file,
                                separator=' ',
                                fields=tuple(fields))


    
    # Build vocab
    inputs_char.build_vocab(train.inputs_char, val.inputs_char, test.inputs_char)
    inputs_word.build_vocab(train.inputs_word, val.inputs_word, test.inputs_word, max_size=50000,
                        vectors=[GloVe(name='6B', dim='300'), CharNGram()])
    
    labels.build_vocab(train.labels)
  

    # Get iterators
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
                            (train, val, test), batch_size=batch_size, 
                            device=torch.device("cpu"))
    train_iter.repeat = False
    
    return {
        'task': 'conll2003.%s'%tag_type,
        'iters': (train_iter, val_iter, test_iter), 
        'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab) 
        }
    


In [0]:
dic = conll2003_dataset('ner', batch_size = 64)

In [0]:
a,_,_= dic['iters']
for t in enumerate(a):
  print(t)
  print(len(t))
  
  print(t[1])
  

(0, 
[torchtext.data.batch.Batch of size 64]
	[.labels]:[torch.LongTensor of size 64x41]
	[.inputs_word]:[torch.LongTensor of size 64x100]
	[.inputs_char]:[torch.LongTensor of size 64x41x16])
2

[torchtext.data.batch.Batch of size 64]
	[.labels]:[torch.LongTensor of size 64x41]
	[.inputs_word]:[torch.LongTensor of size 64x100]
	[.inputs_char]:[torch.LongTensor of size 64x41x16]
(1, 
[torchtext.data.batch.Batch of size 64]
	[.labels]:[torch.LongTensor of size 64x43]
	[.inputs_word]:[torch.LongTensor of size 64x100]
	[.inputs_char]:[torch.LongTensor of size 64x43x20])
2

[torchtext.data.batch.Batch of size 64]
	[.labels]:[torch.LongTensor of size 64x43]
	[.inputs_word]:[torch.LongTensor of size 64x100]
	[.inputs_char]:[torch.LongTensor of size 64x43x20]
(2, 
[torchtext.data.batch.Batch of size 64]
	[.labels]:[torch.LongTensor of size 64x53]
	[.inputs_word]:[torch.LongTensor of size 64x100]
	[.inputs_char]:[torch.LongTensor of size 64x53x17])
2

[torchtext.data.batch.Batch of size 64]
	[.

In [0]:
d_,_,label_=dic['vocabs']

In [0]:
len(label_)

12