In [1]:
import time
import random
import numpy
import spacy
import torch
import torch.optim as optim
from torchtext.legacy import data
from torchtext.legacy import datasets


  from .autonotebook import tqdm as notebook_tqdm


#### Preparing Data

In [2]:
TEXT= data.Field(lower= True)
UD_TAGS = data.Field(unk_token=None)
PTB_TAGS = data.Field(unk_token=None)

In [3]:
fields = (('text',TEXT),('udtags',UD_TAGS),('ptbtags',PTB_TAGS))

In [4]:
train_data, valid_data, test_data = datasets.UDPOS.splits(fields)

In [5]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 12543
Number of validation examples: 2002
Number of testing examples: 2077


In [6]:
print(vars(train_data.examples[1]))

{'text': ['[', 'this', 'killing', 'of', 'a', 'respected', 'cleric', 'will', 'be', 'causing', 'us', 'trouble', 'for', 'years', 'to', 'come', '.', ']'], 'udtags': ['PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'AUX', 'AUX', 'VERB', 'PRON', 'NOUN', 'ADP', 'NOUN', 'PART', 'VERB', 'PUNCT', 'PUNCT'], 'ptbtags': ['-LRB-', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'MD', 'VB', 'VBG', 'PRP', 'NN', 'IN', 'NNS', 'TO', 'VB', '.', '-RRB-']}


#### Build Vocabulary
`unk_init` - It is used to initialize the token embedding which are not in the pre trained embedding vocabulary. By default they are set to 0. So instead of initializing them all to same value, We would initalize them from a normal or gaussian distribution

In [7]:
TEXT.build_vocab(train_data,min_freq=2,vectors = "glove.6B.100d", unk_init = torch.Tensor.normal_)
UD_TAGS.build_vocab(train_data)
PTB_TAGS.build_vocab(train_data)

In [8]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 9076), ('.', 8640), (',', 7021), ('to', 5137), ('and', 5002), ('a', 3782), ('of', 3622), ('i', 3379), ('in', 3112), ('is', 2239), ('you', 2156), ('that', 2036), ('it', 1850), ('for', 1842), ('-', 1426), ('have', 1359), ('"', 1296), ('on', 1273), ('was', 1244), ('with', 1216)]


#### Creating Iterators

In [9]:

BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)

In [10]:
import torch.nn as nn

In [11]:
class LSTMPOSTagger(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx = pad_idx)
        
        self.LSTM = nn.LSTM(emb_dim, 
                            hidden_dim,
                            num_layers= n_layers,
                            bidirectional=True,
                            dropout= dropout if n_layers>1 else 0
                           )
        self.fc = nn.Linear(hidden_dim*2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        #text = [sent_len, batch_size]
        embedded = self.dropout(self.embedding(text))
        #embedded = [sent_len, batch_size,emb_dim]
        output,(hidden,cell) = self.LSTM(embedded)
        #output = [sent_len, batch_size, hid_dim*n_directions]
        #hidden = [n_layers*n_directions,batch_size,hid_dim]
        pred = self.fc(self.dropout(output))
        #pred = [sent_len, batch_size,output_dim]
        return pred

#### Train Model

In [12]:
input_dim = len(TEXT.vocab)
embdding_dim = 100
hidden_dim = 128
output_dim = len(UD_TAGS.vocab)
n_layers = 2
bidirectional = True
dropout = 0.25
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

In [13]:
model = LSTMPOSTagger(input_dim,embdding_dim,hidden_dim,output_dim,n_layers,bidirectional,dropout,pad_idx)

#### Weights Initialization

In [14]:
def init_weights(model):
    for name, param in model.named_parameters():
        nn.init.normal_(param.data,mean=0 , std=0.1)
model.apply(init_weights)

LSTMPOSTagger(
  (embedding): Embedding(8866, 100, padding_idx=1)
  (LSTM): LSTM(100, 128, num_layers=2, dropout=0.25, bidirectional=True)
  (fc): Linear(in_features=256, out_features=18, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [15]:
pretrained_embeddings = TEXT.vocab.vectors
pretrained_embeddings.shape

torch.Size([8866, 100])

In [16]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.8229, -0.4773,  0.2685,  ..., -0.0726, -0.7076,  0.3675],
        [-1.1936,  0.9956, -0.0168,  ..., -1.7080,  0.3651, -0.4950],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 1.3846, -1.0335,  0.8176,  ..., -0.3827, -0.1766,  0.3171],
        [-0.5972,  0.0471, -0.2406,  ..., -0.9446, -0.1126, -0.2260],
        [-0.8438,  0.4213,  0.5783,  ...,  2.4652, -0.2974, -0.4256]])

In [17]:
model.embedding.weight.data[pad_idx] = torch.zeros(embdding_dim)
model.embedding.weight.data

tensor([[-0.8229, -0.4773,  0.2685,  ..., -0.0726, -0.7076,  0.3675],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 1.3846, -1.0335,  0.8176,  ..., -0.3827, -0.1766,  0.3171],
        [-0.5972,  0.0471, -0.2406,  ..., -0.9446, -0.1126, -0.2260],
        [-0.8438,  0.4213,  0.5783,  ...,  2.4652, -0.2974, -0.4256]])

#### Define our optimizer

In [18]:
optimizer = optim.Adam(model.parameters())

In [19]:
TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=TAG_PAD_IDX)

In [20]:
model = model.to(device)
criterion = criterion.to(device)

In [22]:
def accuracy(preds,y,tag_pad_idx):
    max_prob = preds.argmax(dim=1,keepdim=True)
    non_pad_elements= (y!=tag_pad_idx).nonzero()
    correct = max_prob[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum()/y[non_pad_elements].shape[0]

In [23]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    epoch_loss= 0
    epoch_accuracy = 0
    model.train()
    for batch in iterator:
        #text=[seq_len,batch_size]
        text = batch.text
        tags = batch.udtags
        predictions = model(text)
        #preds =[seq_len,batch_size,output_dim]
        #tags = [sent_len,batch_size]
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)
        #Pytorch loss function can not handle 3-d preds, hence we need to reshape our preds with help of view()
        #preds =[seq_len*batch_size,output_dim]
        #tags = [sent_len*batch_size]
        loss = criterion(predictions,tags)
        acc = accuracy(predictions,tags,tag_pad_idx)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_accuracy += acc.item()
    
    return epoch_loss/len(iterator), epoch_accuracy/len(iterator)

#### Inside evaluate() we will not update our parameters

In [28]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_accuracy = 0
    model.eval()
    with torch.no_grad():
        
        for batch in iterator:
            # text = [sent_len, batch_size]
            text = batch.text
            tags = batch.udtags
            
            predictions = model(text)
            #precdictions = [sent_len, batch_size, output_dim]
            #tags = [sent_len,batch_size]
            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)
            #precdictions = [sent_len* batch_size, output_dim]
            #tags = [sent_len*batch_size]
            loss = criterion(predictions,tags)
            acc = accuracy(predictions,tags,tag_pad_idx)
            epoch_loss += loss.item()
            epoch_accuracy += acc.item()
            
    return epoch_loss / len(iterator), epoch_accuracy / len(iterator)

In [26]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [30]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 4s
	Train Loss: 0.595 | Train Acc: 83.29%
	 Val. Loss: 0.744 |  Val. Acc: 83.33%
Epoch: 02 | Epoch Time: 0m 4s
	Train Loss: 0.457 | Train Acc: 88.60%
	 Val. Loss: 0.654 |  Val. Acc: 82.42%
Epoch: 03 | Epoch Time: 0m 4s
	Train Loss: 0.425 | Train Acc: 88.49%
	 Val. Loss: 0.650 |  Val. Acc: 82.51%
Epoch: 04 | Epoch Time: 0m 4s
	Train Loss: 0.396 | Train Acc: 89.61%
	 Val. Loss: 0.655 |  Val. Acc: 84.79%
Epoch: 05 | Epoch Time: 0m 4s
	Train Loss: 0.379 | Train Acc: 89.58%
	 Val. Loss: 0.617 |  Val. Acc: 84.56%
Epoch: 06 | Epoch Time: 0m 4s
	Train Loss: 0.364 | Train Acc: 90.10%
	 Val. Loss: 0.569 |  Val. Acc: 85.61%
Epoch: 07 | Epoch Time: 0m 4s
	Train Loss: 0.353 | Train Acc: 90.11%
	 Val. Loss: 0.580 |  Val. Acc: 83.78%
Epoch: 08 | Epoch Time: 0m 4s
	Train Loss: 0.361 | Train Acc: 89.93%
	 Val. Loss: 0.606 |  Val. Acc: 83.86%
Epoch: 09 | Epoch Time: 0m 4s
	Train Loss: 0.362 | Train Acc: 90.35%
	 Val. Loss: 0.576 |  Val. Acc: 85.55%
Epoch: 10 | Epoch Time: 0m 4

In [31]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion, TAG_PAD_IDX)

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.590 |  Test Acc: 85.94%


#### Inference

In [90]:
def tag_sentence(model,device,sent,text_field,tag_field):
    model.eval()
    if isinstance(sent,str):
        nlp = spacy.load('en_core_web_sm')
        tokens = [token.text for token in nlp(sent)]
    else:
        tokens = [token.text for token in sent]
        
    if text_field.lower:
        tokens = [token.lower() for token in tokens]
        
    numericalized = [text_field.vocab.stoi[t] for t in tokens]
    unk_index =text_field.vocab.stoi[text_field.unk_token]
    unkowns = [t for t,n in zip(tokens,numericalized) if n==unk_index]
    # create token tensor to fed into model
    token_tensor = torch.LongTensor(numericalized)
    print("token_tensor",token_tensor)
    print("shape",token_tensor.shape)
    token_tensor = token_tensor.unsqueeze(-1).to(device)
    print("After token_tensor",token_tensor)
    print("shape",token_tensor.shape)
    predictions = model(token_tensor)
    top_predictions = predictions.argmax(-1)
    print(top_predictions)
    predicted_tags = [tag_field.vocab.itos[t.item()] for t in top_predictions]
    print(predicted_tags)
    return tokens,predicted_tags,unkowns

In [91]:
ex = "I am in love of NLP"
tokens,predicted_tags,unkowns = tag_sentence(model,device,ex, TEXT,UD_TAGS)

token_tensor tensor([  9,  77,  10, 225,   8,   0])
shape torch.Size([6])
After token_tensor tensor([[  9],
        [ 77],
        [ 10],
        [225],
        [  8],
        [  0]], device='cuda:0')
shape torch.Size([6, 1])
tensor([[4],
        [9],
        [5],
        [1],
        [5],
        [7]], device='cuda:0')
['PRON', 'AUX', 'ADP', 'NOUN', 'ADP', 'PROPN']


In [93]:
predicted_tags

['PRON', 'AUX', 'ADP', 'NOUN', 'ADP', 'PROPN']