In [4]:
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import torch
from torch import nn
from torchcrf import CRF
import unicodedata

In [5]:
emb = KeyedVectors.load_word2vec_format("cbow_s100.txt")

### Creating dictionary with index of words from pre-trained embedding

In [18]:
dic = {}
for j in emb.wv.index2word:
    num = emb.vocab[j].index
    word = unicodedata.normalize('NFKD', j).encode('ascii', 'ignore').decode('utf8')
    dic[word] = emb.vocab[j].index
print(len(dic))

  for j in emb.wv.index2word:


866822


## Creating the LSTM-CRF model

All words in a sentence pass through the LSTM and output of hidden layers are used as the encoding passed to the CRF for classification

In [80]:
class LSTM_CRF(nn.Module):
    def __init__(self, embedding_dim, word2idx_dict, num_tags, hidden_dim):
        super(LSTM_CRF, self).__init__()
        self.embedding_dim = 100
        self.hidden_dim = hidden_dim
        self.num_tags = num_tags
        self.word2idx_dict = word2idx_dict
        # Dictionaries to convert tags (labels) to indexes (integers) and vice-versa
        self.tag2idx_dict = {}
        self.idx2tag_dict = {}
        
        # Defining all the nn layers
        self.embed_layer  = nn.Embedding.from_pretrained(torch.FloatTensor(emb.vectors))
        self.embed_layer.weight[0] = 0
        self.word_LSTM_layer   = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.sent_LSTM_layer   = nn.LSTM(hidden_dim, hidden_dim//2, num_layers=1, batch_first=True)
        self.linear_layer      = nn.Linear(hidden_dim//2, num_tags)
        self.crf_layer         = CRF(num_tags, batch_first=True)
        
    def preprocess_input(self, batch):
        """ 
        Method to preprocess the sentences switching words for their embedding indexes and padding sentences
        input: sequence of sentences to be preprocessed | 
               shape: (batch_size)x(len_sequence)x(len_sentence)
        output: Preprocessed batch of sequences of sentences and mask for the CRF layer (ignore padded inputs)
        """
        # Find sentence with max length
        max_len = 0
        for seq in batch:
            for sentence in seq:
                max_len = max(max_len, len(sentence))
                
        # Pad sentences according to max_len        
        for i in range(len(batch)):
            for j in range(len(batch[i])):
                for k in range(len(batch[i][j])):
                    if batch[i][j][k] in self.word2idx_dict:
                        batch[i][j][k] = self.word2idx_dict[batch[i][j][k]]
                    else:
                        batch[i][j][k] = 0
                for k in range(max_len - len(batch[i][j])):
                    batch[i][j].append(0)
        
        # Find sequence of sentences with max length
        max_seq = 0
        for seq in batch:
            max_seq = max(max_seq, len(seq))
            
        # Create mask for the crf to ignore padded part of sequences
        mask_pad = np.ones((len(batch), max_seq))
        
        # Pad sequence of sentences
        pad_sentence = [1 for i in range(max_len)]
        for i in range(len(batch)):
            idx = -1
            for j in range(max_seq - len(batch[i])):
                batch[i].append(pad_sentence)
                mask_pad[i][idx] = 0
                idx -= 1
                
        return torch.LongTensor(batch), torch.ByteTensor(mask_pad)
        
    def preprocess_label(self, batch_y):
        """ 
        Method to preprocess the labels switching words for their one-hot encoding and padding sentences
        input: sequence of labels to be preprocessed | 
               shape: (batch_size)x(len_seq)
        output: Preprocessed batch of labels
        """
        # Creates tag to index dictionary
        tag2idx = {}
        for seq in batch_y:
            for tag in seq:
                if tag not in tag2idx:
                    tag2idx[tag] = len(tag2idx)
                    
        # Creates index to tag dictionary
        idx2tag = {}
        for tag in tag2idx:
            idx2tag[tag2idx[tag]] = tag
        
        self.tag2idx_dict = tag2idx
        self.idx2tag_dict = idx2tag
        
        # Sub tags for their index
        for i in range(len(batch_y)):
            for j in range(len(batch_y[i])):
                batch_y[i][j] = tag2idx[batch_y[i][j]]
        
        # Pad sequence of tags
        max_len = 0
        for i in batch_y:
            max_len = max(max_len, len(i))
        for i in range(len(batch_y)):
            for j in range(max_len -len(batch_y[i])):
                batch_y[i].append(-1)
                
        return torch.LongTensor(batch_y)
    
    def forward(self, batch_input, batch_tags, mask_pad):
        """
        Method to compute the forward pass of the LSTM_CRF model
        Input: (x) shape:        (batch_size) x (sequence_length) x (sentence_length)
               (y) shape:        (batch_size) x (sequence_length)
               (mask_pad) shape: (batch_size) x (sequence_length)
        output: log_likelihood  of the probability of the expected sequence of tags
        """
        batch_size = batch_input.shape[0]
        sequence_pad_size = batch_input.shape[1]
        sentence_pad_size = batch_input.shape[2]
        # Embedding Layer
        emb_out = model.embed_layer(batch_input)
        # Word level LSTM layer
        word_lstm_out, (hn, cn) = model.word_LSTM_layer(emb_out.view(batch_size*sequence_pad_size, sentence_pad_size, 100))
        # Sentence level LSTM layer
        sent_lstm_out, (hn, cn) = model.sent_LSTM_layer(hn.view(batch_size, sequence_pad_size, 128))
        # Linear (fully-connected) layer
        lin_out = model.linear_layer(sent_lstm_out.reshape(batch_size * sequence_pad_size, 64))
        # CRF layer
        return model.crf_layer(lin_out.view(batch_size, sequence_pad_size, self.num_tags), batch_tags)

    def fit(self, x, y, epoch):
        """ 
        Method to train the LSTM_CRF model 
        Input: (x) shape: (number of batches) x (batch_size) x (sequence_length) x (sentence_length)
               (y) shape: (number of batches) x (batch_size) x (sequence_length)
        output: 
        """
        optimizer = torch.optim.SGD(self.parameters(), lr=0.01, weight_decay=1e-4)
        masks = []
        for i in range(len(x)):
            x[i], mask_pad = self.preprocess_input(x[i])
            y[i] = self.preprocess_label(y[i])
            masks.append(mask_pad)
        for it in range(epoch):
            for batch_x, batch_y, mask in zip(x, y, masks):
                self.zero_grad()
                loss = -self.forward(batch_x, batch_y, mask)
                print(loss)
                loss.backward()
                optimizer.step()
            
    def predict(self, sequence):
        """ 
        Method to predict segmentation tags
        Input: sequence - shape:(batch_size)x(sequence_size)x(sentence_size)
        output: Predicted tags - shape: (batch_size)x(sequence_size)
        """
        with torch.no_grad():
            # Get indexes for word embeddings and pad both sentences and sequences (not yet)
            sequence, mask_pad = self.preprocess_input(sequence)
            batch_size = sequence.shape[0]
            sequence_pad_size = sequence.shape[1]
            sentence_pad_size = sequence.shape[2]
            # Embedding Layer
            emb_out = model.embed_layer(sequence)
            # Word level LSTM layer
            word_lstm_out, (hn, cn) = model.word_LSTM_layer(emb_out.view(batch_size*sequence_pad_size, sentence_pad_size, 100))
            # Sentence level LSTM layer
            sent_lstm_out, (hn, cn) = model.sent_LSTM_layer(hn.view(batch_size, sequence_pad_size, 128))
            # Linear (fully-connected) layer
            lin_out = model.linear_layer(sent_lstm_out.reshape(batch_size * sequence_pad_size, 64))
            print(lin_out.view(batch_size, sequence_pad_size, self.num_tags).shape)
            # CRF layer
            return model.crf_layer.decode(lin_out.view(batch_size, sequence_pad_size, self.num_tags), mask=mask_pad)
        

In [81]:
model = LSTM_CRF(embedding_dim=100, word2idx_dict=dic, num_tags=5, hidden_dim=128)

# Testes a partir daki

In [82]:
sample1 = [
    ["quem", "disse", "que", "eu", "nao", "sei", "nadar"],
    ["cara", "tu", "nao", "sabe", "o", "que", "eu", "falei"],
    ["eu", "gosto", "de", "jogar", "tenis"], 
    ["amanha", "eu", "vou", "correr"],
    ["nao", "quero", "voltar", "no", "tempo"]
]
label1 = [
    "B",
    "I", 
    "O", 
    "O",
    "O"
]
sample2 = [
    ["ola", "hoje", "eu", "quero", "nadar"],
    ["cara", "tu", "nao", "sabe", "o", "que", "eu", "falei"],
    ["eu", "gosto", "de", "jogar", "tenis"]
]
label2 = [
    "B", 
    "O",
    "O"
]
sample3 = [
    ["ola", "hoje", "eu", "quero", "nadar"],
    ["quem", "disse", "que", "eu", "nao", "sei", "nadar"],
    ["eu", "gosto", "de", "jogar", "tenis"]
]
label3 = [ 
    "O",
    "O",
    "O"
]
sample4 = [
    ["ola", "hoje", "eu", "quero", "nadar"],
    ["quem", "disse", "que", "eu", "nao", "sei", "nadar"]
]
label4 = [
    "B",
    "I"
]
sample = [[sample1, sample2, sample3, sample4], [sample2, sample4], [sample1]]
labels = [[label1, label2, label3, label4], [label2, label4], [label1]]
sample, labels

([[[['quem', 'disse', 'que', 'eu', 'nao', 'sei', 'nadar'],
    ['cara', 'tu', 'nao', 'sabe', 'o', 'que', 'eu', 'falei'],
    ['eu', 'gosto', 'de', 'jogar', 'tenis'],
    ['amanha', 'eu', 'vou', 'correr'],
    ['nao', 'quero', 'voltar', 'no', 'tempo']],
   [['ola', 'hoje', 'eu', 'quero', 'nadar'],
    ['cara', 'tu', 'nao', 'sabe', 'o', 'que', 'eu', 'falei'],
    ['eu', 'gosto', 'de', 'jogar', 'tenis']],
   [['ola', 'hoje', 'eu', 'quero', 'nadar'],
    ['quem', 'disse', 'que', 'eu', 'nao', 'sei', 'nadar'],
    ['eu', 'gosto', 'de', 'jogar', 'tenis']],
   [['ola', 'hoje', 'eu', 'quero', 'nadar'],
    ['quem', 'disse', 'que', 'eu', 'nao', 'sei', 'nadar']]],
  [[['ola', 'hoje', 'eu', 'quero', 'nadar'],
    ['cara', 'tu', 'nao', 'sabe', 'o', 'que', 'eu', 'falei'],
    ['eu', 'gosto', 'de', 'jogar', 'tenis']],
   [['ola', 'hoje', 'eu', 'quero', 'nadar'],
    ['quem', 'disse', 'que', 'eu', 'nao', 'sei', 'nadar']]],
  [[['quem', 'disse', 'que', 'eu', 'nao', 'sei', 'nadar'],
    ['cara', 'tu', '

In [83]:
model.fit(sample, labels, 3)

tensor(32.4487, grad_fn=<NegBackward>)
tensor(15.8243, grad_fn=<NegBackward>)
tensor(7.5577, grad_fn=<NegBackward>)
tensor(30.8805, grad_fn=<NegBackward>)
tensor(15.0480, grad_fn=<NegBackward>)
tensor(7.0387, grad_fn=<NegBackward>)
tensor(29.5492, grad_fn=<NegBackward>)
tensor(14.3930, grad_fn=<NegBackward>)
tensor(6.5929, grad_fn=<NegBackward>)


In [86]:
model.predict(sample[2])

torch.Size([1, 5, 5])


[[2, 2, 2, 2, 2]]

In [579]:
model.predict(sample)

torch.Size([4, 5, 5])


[[1, 2, 1, 2, 1], [1, 2, 1, 2, 1], [1, 2, 1, 2, 1], [1, 2, 1, 2, 1]]