# Recognize named entities on Twitter with LSTMs


## DATA

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/My Drive/collab/tag prediction


/content/drive/My Drive/collab/tag prediction


In [3]:
%pwd

'/content/drive/My Drive/collab/tag prediction'

In [4]:
# to generate tokens and tags out of text data
def read_data(file_path):
    tokens = []
    tags = []
    
    tweet_tokens = []
    tweet_tags = []
    for line in open(file_path, encoding='utf-8'):
        line = line.strip()
        if not line:
            if tweet_tokens:
                tokens.append(tweet_tokens)
                tags.append(tweet_tags)
            tweet_tokens = []
            tweet_tags = []
        else:
            token, tag = line.split()
            if (token.split(":")[0] == "http" or token.split(":")[0] == "https"):
                tag = "<URL>"
            elif(token[0] == "@"):
                tag = "<USR>"
            
            tweet_tokens.append(token)
            tweet_tags.append(tag)
            
    return tokens, tags

In [5]:
train_tokens, train_tags = read_data('data/train.txt')
validation_tokens, validation_tags = read_data('data/validation.txt')
test_tokens, test_tags = read_data('data/test.txt')

In [6]:
#Example of tags and tokens
for i in range(1):
    for token, tag in zip(train_tokens[i], train_tags[i]):
        print('%s\t\t\t\t%s' % (token, tag))

RT				O
@TheValarium				<USR>
:				O
Online				O
ticket				O
sales				O
for				O
Ghostland				B-musicartist
Observatory				I-musicartist
extended				O
until				O
6				O
PM				O
EST				O
due				O
to				O
high				O
demand				O
.				O
Get				O
them				O
before				O
they				O
sell				O
out				O
...				O


In [7]:
print(f"{train_tokens[2]}\n{train_tags[2]}")

['Happy', 'Birthday', '@AshForeverAshey', '!', 'May', 'Allah', 's.w.t', 'bless', 'you', 'with', 'goodness', 'and', 'happiness', '.']
['O', 'O', '<USR>', 'O', 'O', 'B-person', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [8]:
from collections import defaultdict

In [9]:
#creating dictionary
def build_dict(tokens_or_tags, special_tokens):
    """
        tokens_or_tags: a list of lists of tokens or tags
        special_tokens: some special tokens
    """
    # Create a dictionary with default value 0
    tok2idx = defaultdict(lambda: 0)
    idx2tok = []

    n = 0
    for item in special_tokens:
        tok2idx[item] = n
        n+=1
        
    for item in set([x for y in tokens_or_tags for x in y]):
        tok2idx[item] = n
        n+=1
    idx2tok = {x:y for y,x in tok2idx.items() }
    return tok2idx, idx2tok

 - `<UNK>` token for out of vocabulary tokens;
 - `<PAD>` token for padding sentence to the same length when we create batches of sentences.

In [10]:
special_tokens = ['<UNK>', '<PAD>']
special_tags = ['O']

# Create dictionaries 
token2idx, idx2token = build_dict(train_tokens + validation_tokens, special_tokens)
tag2idx, idx2tag = build_dict(train_tags, special_tags)

idx2tag[1]

'B-sportsteam'

In [11]:
def words2idxs(tokens_list):
    return [token2idx[word] for word in tokens_list]

def tags2idxs(tags_list):
    return [tag2idx[tag] for tag in tags_list]

def idxs2words(idxs):
    return [idx2token[idx] for idx in idxs]

def idxs2tags(idxs):
    return [idx2tag[idx] for idx in idxs]

In [12]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

In [13]:
#creating minibatches
def batches_generator(batch_size, tokens, tags,
                      shuffle=True, allow_smaller_last_batch=True):
    """Generates padded batches of tokens and tags."""
    
    n_samples = len(tokens)
    if shuffle:
        order = np.random.permutation(n_samples)
    else:
        order = np.arange(n_samples)

    n_batches = n_samples // batch_size
    if allow_smaller_last_batch and n_samples % batch_size:
        n_batches += 1

    for k in range(n_batches):
        batch_start = k * batch_size
        batch_end = min((k + 1) * batch_size, n_samples)
        current_batch_size = batch_end - batch_start
        x_list = []
        y_list = []
        max_len_token = 0
        for idx in order[batch_start: batch_end]:
            x_list.append(words2idxs(tokens[idx]))
            y_list.append(tags2idxs(tags[idx]))
            max_len_token = max(max_len_token, len(tags[idx]))
            
        # Fill in the data into numpy nd-arrays filled with padding indices.
        x = np.ones([current_batch_size, max_len_token], dtype=np.int32) * token2idx['<PAD>']
        y = np.ones([current_batch_size, max_len_token], dtype=np.int32) * tag2idx['O']
        lengths = np.zeros(current_batch_size, dtype=np.int32)
        for n in range(current_batch_size):
            utt_len = len(x_list[n])
            x[n, :utt_len] = x_list[n]
            lengths[n] = utt_len
            y[n, :utt_len] = y_list[n]
        yield torch.LongTensor(x), torch.LongTensor(y), lengths

size => batch_size * max_length

In [14]:
import math
import numpy as np

In [15]:
tokens,tags,length = next(iter(batches_generator(batch_size=1, tokens=train_tokens, tags=train_tags)))
tokens_,tags_,length_ = next(iter(batches_generator(batch_size=1, tokens=validation_tokens, tags=validation_tags)))
#print(tokens.size(),tags.size(),max(length))
print(tags,tags_)

tensor([[4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]]) tensor([[13,  4, 15,  4,  4,  4,  4,  4, 12,  4,  4,  4, 19,  4]])


In [16]:
class Net(nn.Module):
    def __init__(self,embed_size,hidden_size, vocab_size,dropout,n_layers = 1):
        super().__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.batch_size =  vocab_size
        self.n_layers = n_layers

        self.embed = nn.Embedding(vocab_size,embed_size)
        self.lstm = nn.LSTM(embed_size,hidden_size,n_layers,dropout=dropout,batch_first=True,bidirectional=True)
        self.linear = nn.Linear(2*self.hidden_size, len(idx2tag)+1)#(in_dim,out_dim)
    
    def forward(self,tokens,length):
        x = self.embed(tokens)#dim: batch_size x batch_max_len x embed_size
        #print("embed size",x.shape)
        h0 = torch.zeros(self.n_layers*2, x.size(0), self.hidden_size).cuda() # 2 for bidirection 
        c0 = torch.zeros(self.n_layers*2, x.size(0), self.hidden_size).cuda()
        lstm_output, _ = self.lstm(x, (h0,c0)) # dim:batch_size x batch_max_len x lstm_hidden_dim(2* because bidirectional)
        #print("lstm" , lstm_output.shape)
        lstm_output = lstm_output.reshape(-1, lstm_output.shape[2])#so each row contain one token 
        #print("lstm" , lstm_output.shape)
        output = self.linear(lstm_output)#so each token is of length of vocab  size dim: -1,no_of_tags
        softmax_output = F.log_softmax(output,dim=1)# dim =1 means along row 
        #print("next check",softmax_output.shape)
        #output = torch.argmax(softmax_output, dim=1)
        return  softmax_output 
        
    

In [17]:
net = Net(embed_size = 64,hidden_size =64, vocab_size=len(token2idx),dropout=0.5,n_layers = 2)

In [18]:
 net.cuda()

Net(
  (embed): Embedding(26729, 64)
  (lstm): LSTM(64, 64, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (linear): Linear(in_features=128, out_features=24, bias=True)
)

In [19]:
len(token2idx)

26729

In [20]:
len(idx2tag)

23

In [21]:
def loss_fn(outputs, labels):
    #reshape labels to give a flat vector of length batch_size*seq_len
    labels = labels.view(-1)  

    #mask out 'PAD' tokens
    mask = (labels != 1).float()

    #the number of tokens is the sum of elements in mask
    num_tokens = int(torch.sum(mask).item())
    #pdb.set_trace()

    #pick the values corresponding to labels and multiply by mask
    outputs = outputs[range(outputs.shape[0]), labels]*mask

    #cross entropy loss for all non 'PAD' tokens
    return -torch.sum(outputs)/num_tokens

In [22]:
loss =nn.CrossEntropyLoss( )
optimizer = torch.optim.Adam(params = net.parameters(), lr=0.03, eps=0.01)
#optimizer = torch.optim.RMSprop(params = net.parameters(), lr=0.01)

In [23]:
n_steps = math.ceil(len(train_tokens)/16)
n_steps

363

In [24]:
import pdb
import sys

In [25]:
net.train()
n_epoch = 5
train_loss = 0
validation_loss = 0
training = []
val = []
for epoch in range(n_epoch):
    train_loss = 0
    validation_loss = 0  
    for i ,(tokens,tags,length) in enumerate(batches_generator(batch_size=16, tokens=train_tokens, tags=train_tags),1):
        #print(tags)
        net.zero_grad()
        tokens = tokens.cuda()
        tags = tags.cuda()
        prediction = net(tokens,length)
        #print(prediction.shape)
        #print([max(x.item()) for x in prediction])
        #pdb.set_trace()
        #training_loss = loss_fn(prediction,tags)
        
        training_loss = loss(prediction.view(-1,len(idx2tag)+1),tags.view(-1) )
        train_loss += training_loss
        # Get training statistics.
        if(i%300 == 0):
            print("training data","-"*20)
            progress = ((epoch-1)*n_steps + i) / (n_epoch*n_steps) * 100
            stats = '%.2f%% Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f\n' % (progress, epoch, n_epoch-1, i, n_steps, train_loss/i, np.exp(training_loss.item()))
            print('\r' + stats, end="")
        
        tokens,tags,lenght = next(iter(batches_generator(batch_size=16, tokens=validation_tokens, tags=validation_tags)))
        tags = tags.cuda()
        tokens = tokens.cuda()
        prediction = net(tokens,length)
        val_loss = loss(prediction.view(-1,len(idx2tag)+1),tags.view(-1))
        #val_loss = loss_fn(prediction,tags)
        # Get training statistics.
        validation_loss += val_loss
        if(i%300 == 0):
            print("validation data","-"*20)
            stats = '%.2f%% Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f\n' % (progress, epoch, n_epoch-1, i, n_steps, validation_loss/i, np.exp(val_loss.item()))
            print('\r' + stats, end="")
            print()
        training_loss.backward()
        torch.nn.utils.clip_grad_value_(net.parameters(), clip_value=0.5)
        optimizer.step()
        
        sys.stdout.flush()
    val.append(validation_loss/i)
    training.append(train_loss/i)
        
    
    

training data --------------------
-3.47% Epoch [0/4], Step [300/363], Loss: 0.4387, Perplexity: 1.3043
validation data --------------------
-3.47% Epoch [0/4], Step [300/363], Loss: 0.4183, Perplexity: 1.3999

training data --------------------
16.53% Epoch [1/4], Step [300/363], Loss: 0.3318, Perplexity: 1.4756
validation data --------------------
16.53% Epoch [1/4], Step [300/363], Loss: 0.3267, Perplexity: 1.2502

training data --------------------
36.53% Epoch [2/4], Step [300/363], Loss: 0.3018, Perplexity: 1.3000
validation data --------------------
36.53% Epoch [2/4], Step [300/363], Loss: 0.3004, Perplexity: 1.3789

training data --------------------
56.53% Epoch [3/4], Step [300/363], Loss: 0.2830, Perplexity: 1.1942
validation data --------------------
56.53% Epoch [3/4], Step [300/363], Loss: 0.2965, Perplexity: 1.5304

training data --------------------
76.53% Epoch [4/4], Step [300/363], Loss: 0.2589, Perplexity: 1.2635
validation data --------------------
76.53% Epoch 

In [26]:
from sklearn.metrics import f1_score

In [53]:
n_total = 0
true = 0
for tokens,tags,length in batches_generator(batch_size=16, tokens=test_tokens, tags=test_tags):
  tokens,tags = tokens.cuda() ,tags.cuda()
  prediction = net(tokens,length)
  a = torch.argmax(prediction,dim = 1)
  tags = tags.view(-1)
  for pred,tag in zip(a,tags):
    n_total+=1
    if pred == tag :
      true +=1

In [56]:
true/n_total

0.931328297715549

**The model have achieved 93% accuracy in 5 epochs and after 5 epoch the model starts to overfit. **

---

