## Task 1 (5 points)

Consider the vowel reconstruction task -- i.e. inserting missing vowels (aeuioy) to obtain proper English text. For instance for the input sentence:

<pre>
h m gd smbd hs stln ll m vwls
</pre>

the best result is

<pre>
oh my god somebody has stolen all my vowels
</pre>

In this task both dev and test data come from the two books about Winnie-the-Pooh. You have to train two RNN Language Models on *pooh-train.txt*. For the first model use the code below, for the second choose different hyperparameters (different dropout, smaller number of units or layers, or just do any modification you want).

You can assume that only words from pooh_words.txt can occur in the reconstructed text. For decoding you have two options (choose one, or implement both ang get **+1** bonus point)

1. Sample reconstructed text several times (with quite a low temperature), choose the most likely result.
2. Perform beam search.

Of course in the sampling procedure you should consider only words matching the given consonants.

Report accuracy of your methods (for both language models). The accuracy should be computed by the following function, it should be *greater than 0.25*.

In [5]:
import random
from collections import Counter
from collections import defaultdict as dd

import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

SEQUENCE_LENGTH = 15

In [6]:
class PoohDataset(torch.utils.data.Dataset):
    def __init__(self, sequence_length, device):
        txt = open('/content/sample_data/pooh_train.txt').read()

        self.words = txt.lower().split()

        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]
        self.sequence_length = sequence_length
        self.device = device


    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.sequence_length

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length], device=self.device),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1], device=self.device))

In [7]:
class LSTMModel(nn.Module):
    def __init__(self, n_vocab, device):
        super(LSTMModel, self).__init__()
        self.lstm_size = 512
        self.embedding_dim = 100
        self.num_layers = 2
        self.device = device

        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(self.device),
                torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(self.device))

In [8]:
class MyLSTMModel(nn.Module):
    def __init__(self, n_vocab, device):
        super(MyLSTMModel, self).__init__()
        self.lstm_size = 512
        self.embedding_dim = 256
        self.num_layers = 3
        self.device = device

        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.1,
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(self.device),
                torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(self.device))

In [9]:
def train(dataset, model):

    batch_size = 512
    max_epochs = 20

    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        state_h, state_c = model.init_state(SEQUENCE_LENGTH)
        losses = []
        for batch, (x, y) in enumerate(dataloader):

            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            loss = criterion(y_pred.transpose(1, 2), y)

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()
            losses.append(loss.item())


        print({ 'epoch': epoch, 'loss: ': np.array(losses).mean() })

In [10]:
vowels = set("aoiuye'")
def devowelize(s):
    rv = ''.join(a for a in s if a not in vowels)
    if rv:
        return rv
    return '_' # Symbol for words without consonants

pooh_words = set(open('/content/sample_data/pooh_words.txt').read().split())
representation = dd(set)

for w in pooh_words:
    r = devowelize(w)
    representation[r].add(w)

hard_words = set()
for r, ws in representation.items():
    if len(ws) > 1:
        hard_words.update(ws)

In [11]:
def reconstruct_sentence_1(model, sentence, dataset, t=1.0):
    # using only last word
    words = sentence
    devowelized_sentence = [devowelize(w) for w in words]
    model.eval()

    state_h, state_c = model.init_state(1)

    reconstructed = []
    probabilities = []

    matching = representation[devowelized_sentence[0]]
    reconstructed.append(random.choice(list(matching)))

    for i in range(len(devowelized_sentence) - 1):
        try:
            x = torch.tensor([[pooh_dataset.word_to_index[reconstructed[-1]]]])
        except KeyError:
            pass

        x = x.to(device)
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))
        last_word_logits = y_pred[0][-1]

        matching = representation[devowelized_sentence[i + 1]]
        try:
            matching_idx = [pooh_dataset.word_to_index[match] for match in matching]
        except KeyError:
            reconstructed.append(random.choice(list(matching)))
            continue
        p = torch.nn.functional.softmax(last_word_logits/t, dim=0).detach().cpu().numpy()
        p[~np.isin(np.arange(len(p)), matching_idx)] = 0
        p = p/p.sum()
        word_index = np.random.choice(len(last_word_logits), p=p)
        reconstructed.append(dataset.index_to_word[word_index])
        probabilities.append(p[word_index])
    return reconstructed, probabilities

In [12]:
def reconstruct_sentence_2(model, words, dataset, words_to_remember, t=1.0):
    # using last words_to_remember
    devowelized_sentence = [devowelize(w) for w in words]
    model.eval()

    state_h, state_c = model.init_state(words_to_remember)

    reconstructed = words[:words_to_remember]
    probabilities = []

    for i in range(words_to_remember, len(devowelized_sentence)):
        try:
            x = torch.tensor([[pooh_dataset.word_to_index[word] for word in reconstructed[:words_to_remember]]])
        except KeyError:
            pass

        x = x.to(device)
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))
        last_word_logits = y_pred[0][-1]

        matching = representation[devowelized_sentence[i]]
        try:
            matching_idx = [pooh_dataset.word_to_index[match] for match in matching]
        except KeyError:
            reconstructed.append(random.choice(list(matching)))
            continue
        p = torch.nn.functional.softmax(last_word_logits/t, dim=0).detach().cpu().numpy()
        p[~np.isin(np.arange(len(p)), matching_idx)] = 0
        p = p/p.sum()

        word_index = np.random.choice(len(last_word_logits), p=p)
        reconstructed.append(dataset.index_to_word[word_index])
        probabilities.append(p[word_index])
    return reconstructed, probabilities

In [13]:
def accuracy(original_sequence, reconstructed_sequence):
    sa = original_sequence
    sb = reconstructed_sequence
    score = len([1 for (a,b) in zip(sa, sb) if a == b])
    return score / len(original_sequence)

In [14]:
def sample_text(model, test_dataset, pooh_dataset, iters):
    best_p, best_text = -np.inf, None
    for _ in tqdm(range(iters)):
        t, p = reconstruct_sentence_1(model, test_dataset, pooh_dataset)
        if np.log(p).sum() > best_p:
            best_p = np.log(p).sum()
            best_text = t
    return best_text, best_p

In [15]:
pooh_dataset = PoohDataset(SEQUENCE_LENGTH, device)
test_dataset = open('/content/sample_data/pooh_test.txt').read()
test_dataset = test_dataset.lower().split()

## Model 1

In [21]:
model = LSTMModel(len(pooh_dataset.uniq_words), device)
model.to(device)
# train(pooh_dataset, model)
model.load_state_dict(torch.load('/content/sample_data/pooh_2x512_30ep.model'))

<All keys matched successfully>

In [17]:
reconstruction_1, prob_1 = reconstruct_sentence_1(model, test_dataset, pooh_dataset)
reconstruction_2, prob_2 = reconstruct_sentence_2(model, test_dataset, pooh_dataset, words_to_remember=5)
print(accuracy(test_dataset, reconstruction_1))
print(accuracy(test_dataset, reconstruction_2))

0.7759918356933282
0.6648807245822171


In [18]:
best_text, best_p = sample_text(model, test_dataset, pooh_dataset, 20)

100%|██████████| 20/20 [01:25<00:00,  4.30s/it]


In [22]:
accuracy(test_dataset, best_text)

0.7794361525704809

In [24]:
print(best_p)

-1080.584


## Model 2

In [25]:
model = MyLSTMModel(len(pooh_dataset.uniq_words), device)
model.to(device)
# train(pooh_dataset, model)
# torch.save(model.state_dict(), '/content/sample_data/pooh_3x512_20ep.model')
# model.load_state_dict(torch.load('/home/maria/Documents/NLP/data/assignment_5/pooh_2x512_30ep.model'))

{'epoch': 0, 'loss: ': 5.699029926668134}
{'epoch': 1, 'loss: ': 5.499392442535936}
{'epoch': 2, 'loss: ': 5.486383848022997}
{'epoch': 3, 'loss: ': 5.477660266976607}
{'epoch': 4, 'loss: ': 5.47069087363126}
{'epoch': 5, 'loss: ': 5.466182424311052}
{'epoch': 6, 'loss: ': 5.462064868525455}
{'epoch': 7, 'loss: ': 5.458349035497298}
{'epoch': 8, 'loss: ': 5.4553016696059915}
{'epoch': 9, 'loss: ': 5.452650664145486}
{'epoch': 10, 'loss: ': 5.4497811417830615}
{'epoch': 11, 'loss: ': 5.321557111907423}
{'epoch': 12, 'loss: ': 4.792589262912148}
{'epoch': 13, 'loss: ': 4.364009411711442}
{'epoch': 14, 'loss: ': 4.0406886799293655}
{'epoch': 15, 'loss: ': 3.8146101361826847}
{'epoch': 16, 'loss: ': 3.6332992127067163}
{'epoch': 17, 'loss: ': 3.4835236637215865}
{'epoch': 18, 'loss: ': 3.3542638682482537}
{'epoch': 19, 'loss: ': 3.219769099302459}


In [26]:
reconstruction_1, prob_1 = reconstruct_sentence_1(model, test_dataset, pooh_dataset)
reconstruction_2, prob_2 = reconstruct_sentence_2(model, test_dataset, pooh_dataset, words_to_remember=5)
print(accuracy(test_dataset, reconstruction_1))
print(accuracy(test_dataset, reconstruction_2))

0.7858145171578007
0.6540375047837734


In [27]:
best_text, best_p = sample_text(model, test_dataset, pooh_dataset, 20)

100%|██████████| 20/20 [01:33<00:00,  4.67s/it]


In [28]:
accuracy(test_dataset, best_text)

0.7799464217374665

In [29]:
print(best_p)

-1401.2726
