Trying out the exercise from: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's global random number generator with a fixed value.
torch.manual_seed(1)

# Use the first CUDA GPU when PyTorch reports one as available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [2]:
# Each training example is a (words, tags) pair of equal-length lists.
# Tags are: DET - determiner; NN - noun; V - verb
# For example, the word "The" is a determiner
training_data = [
    (words.split(), tags.split())
    for words, tags in (
        ("The dog ate the apple", "DET NN V DET NN"),
        ("Everybody read that book", "NN V DET NN"),
    )
]

In [3]:
def prepare_sequence(seq, to_ix):
    """Convert a sequence of items into a 1-D ``torch.long`` tensor of ids.

    Args:
        seq: iterable of items (e.g. the words of a sentence).
        to_ix: mapping from item to integer id; it is indexed with ``to_ix[item]``.

    Returns:
        A ``torch.long`` tensor holding one id per item, in the order of ``seq``.

    Raises:
        KeyError: if ``to_ix`` is a dict and an item of ``seq`` is not one of its keys.
    """
    indices = []
    for token in seq:
        indices.append(to_ix[token])
    return torch.tensor(indices, dtype=torch.long)

In [4]:
# Build the word vocabulary: every distinct word in the training sentences
# gets the next free integer id, in order of first appearance.

vocabulary = {}

for sentence, _ in training_data:
    for word in sentence:
        # setdefault only inserts when the word is new, so existing ids are kept;
        # len(vocabulary) is evaluated before the insertion, giving 0, 1, 2, ...
        vocabulary.setdefault(word, len(vocabulary))

vocabulary

{'The': 0,
 'dog': 1,
 'ate': 2,
 'the': 3,
 'apple': 4,
 'Everybody': 5,
 'read': 6,
 'that': 7,
 'book': 8}

In [5]:
# Tag -> integer id, numbered in the order the tags are listed here.
tagset = {tag: index for index, tag in enumerate(("DET", "NN", "V"))}

In [6]:
# Build the character alphabet used for the char-sequence representation:
# every distinct character in the training words gets the next free integer id,
# in order of first appearance.
# Case is not normalised for now, so e.g. 'T' and 't' receive separate ids.
alphabet = {}


for sentence, _ in training_data:
    for word in sentence:
        # Iterating a string yields its characters one by one.
        for c in word:
            # Only inserts when the character has not been seen before.
            alphabet.setdefault(c, len(alphabet))

alphabet

{'T': 0,
 'h': 1,
 'e': 2,
 'd': 3,
 'o': 4,
 'g': 5,
 'a': 6,
 't': 7,
 'p': 8,
 'l': 9,
 'E': 10,
 'v': 11,
 'r': 12,
 'y': 13,
 'b': 14,
 'k': 15}

In [8]:
# model

EMBEDDING_DIM = 16
HIDDEN_DIM = 30

class CharacterBasedTagger(nn.Module):
    """LSTM tagger whose word representation is augmented with character-level features.

    For every word the model builds two vectors and concatenates them:

    * the word embedding (``embedding_dim`` values), and
    * the final hidden state of ``char_lstm`` after reading the word's
      characters (``hidden_dim`` values).

    The concatenated vectors (``embedding_dim + hidden_dim`` values per word)
    are run through ``tag_lstm`` and projected to ``tagset_size`` scores by ``fc``.

    NOTE: the default sizes are computed once, when this cell is executed, from
    the module-level ``vocabulary``, ``tagset`` and ``alphabet``. Re-run this
    cell after changing any of them, or pass the sizes explicitly.

    Args:
        embedding_dim: size of both the word and the character embeddings.
        hidden_dim: hidden size of both LSTMs.
        vocab_size: number of rows in the word embedding table.
        tagset_size: number of output scores per word.
        alphabet_size: number of rows in the character embedding table.
    """

    def __init__(self, 
                 embedding_dim=EMBEDDING_DIM,
                 hidden_dim=HIDDEN_DIM, 
                 vocab_size=len(vocabulary), 
                 tagset_size=len(tagset),
                 alphabet_size=len(alphabet)):
        super().__init__()

        self.hidden_dim = hidden_dim

        # NOTE: sub-modules are created in the same order as before so that,
        # under a fixed torch seed, their initial weights do not change.

        # Character-level representation: char ids -> embeddings -> LSTM.
        self.char_embedding = nn.Embedding(alphabet_size, embedding_dim)
        self.char_lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Word-level embedding.
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim)

        # The tagger LSTM consumes [word embedding ; char-LSTM hidden state].
        self.tag_lstm = nn.LSTM(embedding_dim + hidden_dim, hidden_dim)

        # Projection from the tagger LSTM's hidden state to tag scores.
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence, characters):
        """Compute per-word tag log-probabilities for one sentence.

        NOTE(review): the input format of ``characters`` is an assumption made
        by this implementation (no caller is visible here) -- confirm it
        matches how the training loop prepares its inputs.

        Args:
            sentence: 1-D ``torch.long`` tensor of word ids, one per word.
            characters: sequence with one entry per word of ``sentence``; entry
                ``i`` is a non-empty 1-D ``torch.long`` tensor holding the
                character ids of word ``i``.

        Returns:
            Tensor of shape ``(len(sentence), tagset_size)`` containing
            log-softmax scores over the tags for each word (the form expected
            by ``nn.NLLLoss``).

        Raises:
            ValueError: if ``characters`` does not have exactly one entry per
                word of ``sentence``.
        """
        n_words = sentence.shape[0]
        if len(characters) != n_words:
            raise ValueError(
                "characters must contain one tensor per word: got "
                f"{len(characters)} entries for {n_words} words"
            )

        # (n_words, embedding_dim)
        word_embeds = self.word_embedding(sentence)

        # Run the character LSTM over each word separately (fresh zero state
        # per word) and keep only its final hidden state.
        char_reps = []
        for word_chars in characters:
            char_embeds = self.char_embedding(word_chars)      # (n_chars, embedding_dim)
            # nn.LSTM expects (seq_len, batch, features); use a batch of 1.
            _, (h_n, _) = self.char_lstm(char_embeds.unsqueeze(1))
            char_reps.append(h_n[-1, 0])                       # (hidden_dim,)
        char_reps = torch.stack(char_reps)                     # (n_words, hidden_dim)

        # (n_words, embedding_dim + hidden_dim)
        combined = torch.cat((word_embeds, char_reps), dim=1)

        lstm_out, _ = self.tag_lstm(combined.unsqueeze(1))     # (n_words, 1, hidden_dim)
        tag_space = self.fc(lstm_out.squeeze(1))               # (n_words, tagset_size)
        return F.log_softmax(tag_space, dim=1)