In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

# What will happen here?
training_data = [
    # Tags are: DET - determiner; NN - noun; V - verb
    # For example, the word "The" is a determiner
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Jane walked into the room".split(), ["NN", "V", "DET","DET","NN"]),
    ("Everybody does machine learning nowadays".split(), ["NN", "V", "NN", "NN", "ADV"]),
    ("It was late in the day and everyone was walking home after a long day at work".split(),
     ["NN","V","ADV","DET","DET","NN","DET","NN","V","V","NN","ADV","DET","DET","NN","DET","NN"])
    ]
word_to_ix = {}
# For each words-list (sentence) and tags-list in each tuple of training_data
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:  # word has not been assigned an index yet
            word_to_ix[word] = len(word_to_ix)  # Assign each word with a unique index
            
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2, "ADV": 3}  # Assign each tag with a unique index

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Jane': 5, 'walked': 6, 'into': 7, 'room': 8, 'Everybody': 9, 'does': 10, 'machine': 11, 'learning': 12, 'nowadays': 13, 'It': 14, 'was': 15, 'late': 16, 'in': 17, 'day': 18, 'and': 19, 'everyone': 20, 'walking': 21, 'home': 22, 'after': 23, 'a': 24, 'long': 25, 'at': 26, 'work': 27}


In [3]:
EMBEDDING_DIM = 1 #The hidden dimension is basically the number of nodes in each layer (like in the Multilayer Perceptron for example)
HIDDEN_DIM = 11 #the number of nodes in each layer
VOCAB_SIZE = len(word_to_ix)
NUM_CLASSES = len(tag_to_ix)

In [4]:
def train(model, optimizer, criterion, epochs):
    epoch_loss = []
    for epoch in range(epochs):  # again, normally you would NOT do 300 epochs, it is toy data
        final_loss = 0
        for sentence, tags in training_data:
            
            model.zero_grad()

            # get inputs and targets ready for the network!
            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = prepare_sequence(tags, tag_to_ix)

            # get the tag scores
            tag_scores = model(sentence_in)
            
            loss = criterion(tag_scores, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            final_loss += loss.item()
        epoch_loss.append(final_loss)
    
    return epoch_loss



In [5]:
def evaluate(model, test_sequence):
    with torch.no_grad():
        inputs = prepare_sequence(training_data[test_sequence][0], word_to_ix)
        tag_scores = model(inputs)
        
        outputs = []
        
        print(tag_to_ix)
        print(training_data[test_sequence][0])
        print(training_data[test_sequence][1])
        
        for tag_score in tag_scores:
            outputs.append(tag_score.topk(1).indices.item())
            
        print(outputs)
        print("--------------")

## RNN tagger

In [6]:
class RNNTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(RNNTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The RNN takes word embeddings as inputs, and outputs hidden states and output
        self.rnn = nn.RNN(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        
        embeds = self.word_embeddings(sentence)
        rnn_out, _ = self.rnn(embeds.view(len(sentence),1, -1)) #The module is expecting [sentence_length, batch_size, embedding_dim]
        
        # in this case, rnn_out.view(len(sentence), -1) is the same as doing what function?
        tag_space = self.hidden2tag(rnn_out.view(len(sentence), -1))
        
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [7]:
model = RNNTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
losses = train(model, optimizer, loss_function, 200)
print(losses)
evaluate(model, 0)
evaluate(model, 1)
evaluate(model, 2)
evaluate(model, 3)

[5.247547745704651, 5.051419138908386, 4.938004374504089, 4.854490399360657, 4.783879637718201, 4.71931004524231, 4.657251954078674, 4.59552788734436, 4.532626986503601, 4.467418789863586, 4.399017095565796, 4.3267346024513245, 4.25009685754776, 4.168894469738007, 4.08325469493866, 3.993701219558716, 3.901167333126068, 3.8069222569465637, 3.7124056220054626, 3.618972599506378, 3.527613341808319, 3.4387372732162476, 3.3521270155906677, 3.2670783400535583, 3.182667315006256, 3.0980424284934998, 3.0126612782478333, 2.9263972342014313, 2.839506894350052, 2.752471089363098, 2.6657872200012207, 2.5797893404960632, 2.494567096233368, 2.410003423690796, 2.3258964717388153, 2.24210087954998, 2.1586291939020157, 2.075689345598221, 1.9936655759811401, 1.913066104054451, 1.8344506621360779, 1.7583566009998322, 1.6852358430624008, 1.615419052541256, 1.5491089075803757, 1.4863946586847305, 1.4272720664739609, 1.371665507555008, 1.319445863366127, 1.2704474478960037, 1.2244815975427628, 1.18135058134

In [8]:
b = torch.FloatTensor(losses)
perplexity  = torch.exp(b)
print("PP:", perplexity)

PP: tensor([190.0995, 156.2441, 139.4916, 128.3153, 119.5673, 112.0909, 105.3462,
         99.0404,  93.0026,  87.1315,  81.3709,  75.6967,  70.1122,  64.6439,
         59.3383,  54.2553,  49.4602,  45.0117,  40.9522,  37.2992,  34.0426,
         31.1476,  28.5634,  26.2346,  24.1110,  22.1545,  20.3415,  18.6603,
         17.1073,  15.6813,  14.3793,  13.1944,  12.1165,  11.1340,  10.2359,
          9.4131,   8.6593,   7.9700,   7.3424,   6.7738,   6.2617,   5.8029,
          5.3937,   5.0300,   4.7073,   4.4211,   4.1673,   3.9419,   3.7413,
          3.5624,   3.4024,   3.2588,   3.1295,   3.0126,   2.9068,   2.8105,
          2.7228,   2.6426,   2.5691,   2.5015,   2.4391,   2.3815,   2.3280,
          2.2782,   2.2317,   2.1881,   2.1472,   2.1087,   2.0723,   2.0378,
          2.0050,   1.9738,   1.9441,   1.9156,   1.8883,   1.8621,   1.8370,
          1.8128,   1.7895,   1.7671,   1.7454,   1.7245,   1.7043,   1.6848,
          1.6659,   1.6477,   1.6301,   1.6130,   1.5965,   

## LSTM tagger

In [9]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [13]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function =  nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
losses_lstm = train(model, optimizer, loss_function, 250)
print(losses_lstm)
evaluate(model, 0)
evaluate(model, 1)
evaluate(model, 2)
evaluate(model, 3)

[5.393147110939026, 5.308363199234009, 5.241724252700806, 5.189165472984314, 5.147485375404358, 5.114192605018616, 5.087366580963135, 5.065536022186279, 5.047575235366821, 5.032623887062073, 5.0200241804122925, 5.009269118309021, 4.999967694282532, 4.991815090179443, 4.984573721885681, 4.978056192398071, 4.972112774848938, 4.966624140739441, 4.961493730545044, 4.956643342971802, 4.9520087242126465, 4.947536826133728, 4.943183898925781, 4.938913345336914, 4.934693336486816, 4.930498123168945, 4.9263060092926025, 4.922096848487854, 4.917854905128479, 4.913565278053284, 4.909215807914734, 4.904795527458191, 4.900294303894043, 4.895703911781311, 4.891016244888306, 4.886224150657654, 4.8813217878341675, 4.876302361488342, 4.871161341667175, 4.865893363952637, 4.860492944717407, 4.854956388473511, 4.849279284477234, 4.843457579612732, 4.837486624717712, 4.831363201141357, 4.825083494186401, 4.8186434507369995, 4.812040209770203, 4.805269360542297, 4.798327326774597, 4.791211128234863, 4.7839

In [14]:
b = torch.FloatTensor(losses_lstm)
perplexity  = torch.exp(b)
print("PP:", perplexity)

PP: tensor([219.8943, 202.0192, 188.9957, 179.3189, 171.9984, 166.3664, 161.9628,
        158.4654, 155.6446, 153.3348, 151.4150, 149.7952, 148.4083, 147.2034,
        146.1413, 145.1918, 144.3315, 143.5415, 142.8069, 142.1159, 141.4588,
        140.8277, 140.2160, 139.6185, 139.0305, 138.4485, 137.8692, 137.2902,
        136.7090, 136.1239, 135.5331, 134.9353, 134.3293, 133.7141, 133.0887,
        132.4525, 131.8048, 131.1448, 130.4724, 129.7868, 129.0878, 128.3751,
        127.6484, 126.9074, 126.1519, 125.3818, 124.5969, 123.7971, 122.9823,
        122.1524, 121.3074, 120.4472, 119.5718, 118.6813, 117.7757, 116.8551,
        115.9195, 114.9691, 114.0039, 113.0242, 112.0301, 111.0218, 109.9997,
        108.9640, 107.9150, 106.8531, 105.7787, 104.6922, 103.5940, 102.4846,
        101.3646, 100.2345,  99.0949,  97.9464,  96.7897,  95.6254,  94.4542,
         93.2769,  92.0941,  90.9067,  89.7153,  88.5208,  87.3239,  86.1254,
         84.9261,  83.7267,  82.5281,  81.3308,  80.1358,  7

## GRU tagger

In [15]:
class GRUTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(GRUTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The GRU takes word embeddings as inputs, and outputs hidden states and output
        self.gru = nn.GRU(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        
        embeds = self.word_embeddings(sentence)
        gru_out, _ = self.gru(embeds.view(len(sentence), 1, -1)) #The module is expecting [sentence_length, batch_size, embedding_dim]
        
        # in this case, gru_out.view(len(sentence), -1) is the same as doing what function?
        tag_space = self.hidden2tag(gru_out.view(len(sentence), -1))
        
        tag_scores = F.softmax(tag_space, dim=1)
        return tag_scores

In [26]:
model_gru = GRUTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function =  nn.CrossEntropyLoss()
optimizer = optim.SGD(model_gru.parameters(), lr=0.1)
losses_gru = train(model_gru, optimizer, loss_function, 200)
print(losses_gru)
evaluate(model_gru, 0)
evaluate(model_gru, 1)
evaluate(model_gru, 2)
evaluate(model_gru, 3)

[5.447351574897766, 5.4371432065963745, 5.427152156829834, 5.417380928993225, 5.4078357219696045, 5.398521780967712, 5.389446496963501, 5.380617141723633, 5.372041344642639, 5.363725185394287, 5.35567581653595, 5.347898602485657, 5.340397834777832, 5.333177208900452, 5.326237797737122, 5.319580674171448, 5.313204288482666, 5.307107090950012, 5.301283955574036, 5.2957305908203125, 5.290440559387207, 5.285406947135925, 5.28062117099762, 5.276074647903442, 5.271758317947388, 5.267661929130554, 5.263775587081909, 5.260089635848999, 5.256593704223633, 5.253277540206909, 5.250131130218506, 5.247145652770996, 5.244310021400452, 5.24161696434021, 5.239056468009949, 5.236620783805847, 5.234301805496216, 5.232092261314392, 5.229984283447266, 5.2279709577560425, 5.226047158241272, 5.22420608997345, 5.2224414348602295, 5.2207489013671875, 5.2191232442855835, 5.217559218406677, 5.216053128242493, 5.214600563049316, 5.213197588920593, 5.211840510368347, 5.210526704788208, 5.209251761436462, 5.208013

In [27]:
b = torch.FloatTensor(losses_gru)
perplexity  = torch.exp(b)
print("PP:", perplexity)

PP: tensor([232.1425, 229.7848, 227.5004, 225.2883, 223.1482, 221.0794, 219.0820,
        217.1562, 215.3019, 213.5189, 211.8071, 210.1662, 208.5957, 207.0949,
        205.6628, 204.2982, 202.9996, 201.7657, 200.5942, 199.4833, 198.4308,
        197.4345, 196.4919, 195.6005, 194.7581, 193.9620, 193.2096, 192.4988,
        191.8270, 191.1919, 190.5913, 190.0231, 189.4850, 188.9755, 188.4922,
        188.0336, 187.5980, 187.1841, 186.7899, 186.4142, 186.0559, 185.7137,
        185.3863, 185.0727, 184.7721, 184.4834, 184.2057, 183.9383, 183.6805,
        183.4314, 183.1905, 182.9571, 182.7308, 182.5109, 182.2970, 182.0885,
        181.8851, 181.6864, 181.4919, 181.3013, 181.1143, 180.9306, 180.7498,
        180.5716, 180.3959, 180.2223, 180.0505, 179.8805, 179.7119, 179.5445,
        179.3781, 179.2126, 179.0478, 178.8833, 178.7192, 178.5552, 178.3911,
        178.2269, 178.0623, 177.8972, 177.7312, 177.5646, 177.3971, 177.2283,
        177.0583, 176.8869, 176.7139, 176.5392, 176.3627, 17