# INF 265 - Deep learning: Project 3 - Sequence models

### Imports

In [1]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from datetime import datetime
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import time
import os
import re
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

seed = 123
torch.manual_seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)

device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))

## Task 2

### 2.1 Word Embeddings

1. Read txt files and tokenize them to obtain train/validation/test lists of words.

In [2]:
TOKENIZER_EN = get_tokenizer('basic_english')
PATH_GENERATED = './generated/'
# Minimum number of occurence of a word in the text to add it to the vocabulary
MIN_FREQ = 100


def read_files(datapath="./"):
    files = os.listdir(datapath)
    files = [datapath + f for f in files if f.endswith(".txt")]

    lines = []
    for f_name in files:
        with open(f_name, encoding='utf-8') as f:
            lines += f.readlines()
    return lines


def tokenize(lines, tokenizer=TOKENIZER_EN):
    """
    Tokenize the list of lines
    """
    list_text = []
    for line in lines:
        list_text += tokenizer(line)
    return list_text


def yield_tokens(lines, tokenizer=TOKENIZER_EN):
    """
    Yield tokens, ignoring names and digits to build vocabulary
    """

    no_digits = '\w*[0-9]+\w*'
    no_names = '\w*[A-Z]+\w*'
    no_spaces = '\s+'
    
    for line in lines:
        line = re.sub(no_digits, ' ', line)
        line = re.sub(no_names, ' ', line)
        line = re.sub(no_spaces, ' ', line)
        yield tokenizer(line)

def count_freqs(words, vocab):
    """
    Count occurrences of each word in vocabulary in the data
    
    Useful to get some insight on the data and to compute loss weights
    """
    freqs = torch.zeros(len(vocab), dtype=torch.int)
    for w in words:
        freqs[vocab[w]] += 1
    return freqs



def create_vocabulary(lines, min_freq=MIN_FREQ):
    """
    Create a vocabulary (list of known tokens) from a list of strings
    """
    # vocab contains the vocabulary found in the data, associating an index to each word
    vocab = build_vocab_from_iterator(yield_tokens(lines), min_freq=min_freq, specials=["<unk>"])
    # Since we removed all words with an uppercase when building the vocabulary, we skipped the word "I"
    vocab.append_token("i")
    # Value of default index. This index will be returned when OOV (Out Of Vocabulary) token is queried.
    vocab.set_default_index(vocab["<unk>"])
    return vocab


2. Define a vocabulary based on the training dataset. To avoid getting a too large vocabulary,
we keep only words that appear at least 100 times in the training dataset.
Report the total number of words in the training dataset, the number of distinct words in the
training dataset, and the size of the defined vocabulary.

In [3]:
lines_books_train = read_files('./inf265_v24_project03_data/data_train/')
lines_books_val = read_files('./inf265_v24_project03_data/data_val/')
lines_books_test = read_files('./inf265_v24_project03_data/data_test/')

# List of words contained in the dataset
words_train = tokenize(lines_books_train)
words_val = tokenize(lines_books_val)
words_test = tokenize(lines_books_test)


vocab = create_vocabulary(lines_books_train, min_freq=100)
VOCAB_SIZE = len(vocab)


print("Total number of words in the training dataset:     ", len(words_train))
print("Total number of words in the validation dataset:   ", len(words_val))
print("Total number of words in the test dataset:         ", len(words_test))
print("Number of distinct words in the training dataset:  ", len(set(words_train)))
print("Number of distinct words kept (vocabulary size):   ", VOCAB_SIZE)

freqs = count_freqs(words_train, vocab)
print("occurences:\n", [(f.item(), w) for (f, w)  in zip(freqs, vocab.lookup_tokens(range(VOCAB_SIZE)))])    

Total number of words in the training dataset:      2684706
Total number of words in the validation dataset:    49526
Total number of words in the test dataset:          124152
Number of distinct words in the training dataset:   52105
Number of distinct words kept (vocabulary size):    1880
occurences:
 [(433907, '<unk>'), (182537, ','), (151278, 'the'), (123727, '.'), (82289, 'and'), (65661, 'of'), (62763, 'to'), (49230, 'a'), (41477, 'in'), (31052, 'that'), (37167, 'he'), (29046, 'was'), (26508, 'his'), (26354, 'it'), (20862, 'with'), (20159, 'had'), (19965, 'is'), (15692, 'not'), (16593, 'as'), (15705, 'on'), (14464, 'him'), (15317, 'for'), (15838, 'at'), (15952, 'you'), (13255, 'be'), (12698, 'her'), (12798, 's'), (11924, 'which'), (11808, '!'), (11740, 'all'), (10338, '?'), (10205, 'have'), (10405, 'from'), (13251, 'but'), (11464, 'this'), (9439, 'by'), (11496, 'they'), (8797, 'said'), (8800, 'are'), (11055, 'she'), (9537, 'one'), (8219, 'were'), (8564, 'who'), (8345, 'so'), (9409

Comment on result:
The total number of words in the trainingset is a lot with 2 684 706 words. However the number goes down drasticaly when we limit it to distinct words in the trainingset and we can even narrow it down more to a vocabuary size of only 1880 when we limit words that occur over 100 times. 
The most common words in the text is special characters like "," and "."

If we look away from those special characters we see that the most common words are expected commonly used words like "the", "and", "of", "to" etc.

3. Define a continuous bag of words model architecture based on this vocabulary that contains
an embedding layer.

In [4]:
class CBOW(torch.nn.Module):
    def __init__(self, embedding_dim, vocab_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):        
        embeddings = self.embeddings(inputs)
        embeddings = torch.sum(embeddings, dim=1)        
        out = self.linear(embeddings)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

We define the training function, accuracy function and a function to create context target pairs so that we have data to train on. We use a context size of 3 in this project.

In [5]:
def create_dataset(text, vocab, context_size=3):
    """
    Create a pytorch dataset of context / target pairs from a text
    """
    n_text = len(text)
    
    # Transform the text as a list of integers.
    txt = [vocab[w] for w in text]

    # Start constructing the context / target pairs.
    contexts = []
    targets = []
    for i in range(n_text - context_size):
        # Word used to define target
        t = txt[i + context_size]
        # Context before the target
        c = txt[i:i + context_size]
        targets.append(t)
        contexts.append(torch.tensor(c))
            
    contexts = torch.stack(contexts)
    targets = torch.tensor(targets)
    return TensorDataset(contexts, targets)



def train(n_epochs, optimizer, model, loss_fn, train_loader):
    n_batch = len(train_loader)
    losses_train = []
    model.train()
    optimizer.zero_grad(set_to_none=True)

    for epoch in range(1, n_epochs + 1):

        loss_train = 0.0
        for contexts, targets in train_loader:

            contexts = contexts.to(device=device)
            targets = targets.to(device=device)
            outputs = model(contexts)

            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            loss_train += loss.item()

        losses_train.append(loss_train / n_batch)
        if epoch == 1 or epoch % 2 == 0 or epoch == n_epochs:
            print('{}  |  Epoch {}  |  Training loss {:.5f}'.format(
                datetime.now().time(), epoch, loss_train / n_batch))            
    #return losses_train


def compute_accuracy(model, loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for contexts, targets in loader:
            contexts = contexts.to(device=device)
            targets = targets.to(device=device)

            outputs = model(contexts)
            _, predicted = torch.max(outputs, dim=1)
            total += len(targets)
            correct += int((predicted == targets).sum())

    acc =  correct / total
    return acc

Training several models and picking best based on validation accuracy

In [6]:
# context size for this project
CONTEXT_SIZE = 3
train_data = create_dataset(words_train, vocab, context_size=CONTEXT_SIZE)
val_data = create_dataset(words_val, vocab, context_size=CONTEXT_SIZE)
test_data = create_dataset(words_test, vocab, context_size=CONTEXT_SIZE)

#Validation dataloader
val_loader = DataLoader(val_data, batch_size=64, shuffle=True)

#Global params
n_epochs = 5
loss_fn = nn.NLLLoss() 


#Hyperparams
batch_size = [32, 64]
lr_list = [0.1, 0.01]

#Initial best params
best_model = None
best_batch = 0
best_lr = 0
best_v_acc = 0

#Model selection
for bs in batch_size:
    for lr in lr_list:
        print(f"Training model with bach size: {bs}, and learning rate: {lr} ")
        model = CBOW(embedding_dim = 16, vocab_size=VOCAB_SIZE).to(device=device) 
        train_loader = DataLoader(train_data, batch_size=bs, shuffle=True)
        optimizer = optim.Adam(model.parameters(), lr = lr)
        #Train
        net_weights = train(n_epochs=n_epochs, optimizer=optimizer, model=model, loss_fn=loss_fn, train_loader=train_loader)
        #Compute Validation acc
        val_acc = compute_accuracy(model, val_loader, device)

        print(f"Model: validation accuracy:{val_acc}")
        if val_acc > best_v_acc:
            best_batch = bs
            best_lr = lr
            best_v_acc = val_acc
            best_model = model


print(f"Final best model is a model with bach size: {best_batch}, and learning rate: {best_lr} ")
print()
print(f"Best model: validation accuracy: {best_v_acc:.2%}")

Training model with bach size: 32, and learning rate: 0.1 
17:06:18.713324  |  Epoch 1  |  Training loss 7.67768
17:08:05.932954  |  Epoch 2  |  Training loss 7.71238
17:11:38.352978  |  Epoch 4  |  Training loss 7.71309
17:13:25.239821  |  Epoch 5  |  Training loss 7.71305
Model: validation accuracy:0.08448599640571047
Training model with bach size: 32, and learning rate: 0.01 
17:15:12.755205  |  Epoch 1  |  Training loss 4.61213
17:17:00.654419  |  Epoch 2  |  Training loss 4.58096
17:20:33.026902  |  Epoch 4  |  Training loss 4.57998
17:22:16.943694  |  Epoch 5  |  Training loss 4.57978
Model: validation accuracy:0.1690123780869495
Training model with bach size: 64, and learning rate: 0.1 
17:23:18.041192  |  Epoch 1  |  Training loss 6.12135
17:24:17.974190  |  Epoch 2  |  Training loss 6.12672
17:26:18.167468  |  Epoch 4  |  Training loss 6.12640
17:27:19.263966  |  Epoch 5  |  Training loss 6.12749
Model: validation accuracy:0.10740464026815823
Training model with bach size: 64,

Unbiased estimate using test accuracy

In [7]:
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)
tes_accuracy = compute_accuracy(best_model, test_loader, device=device)
print(f"Test accuracy of best model {tes_accuracy:.2%}")

Test accuracy of best model 21.14%


4.  Compute cosine similarity

In [8]:
net_emb_w = best_model.embeddings.weight.data
normalized_embeddings = F.normalize(net_emb_w)

#Cosine similarity matrix
cosine_similarity_matrix = torch.mm(normalized_embeddings, normalized_embeddings.transpose(0, 1))
words_to_check = ['me', 'white', 'man', 'have', 'be', 'child', 'yes', 'what']
word_indices = [vocab.get_stoi()[word] for word in words_to_check if word in vocab.get_stoi()]
top = 10
most_similar_words = {}

for index in word_indices:
    similarities = cosine_similarity_matrix[index]
    top_indices = torch.topk(similarities, top + 1).indices
    similar_words = [vocab.get_itos()[i] for i in top_indices]
    most_similar_words[vocab.get_itos()[index]] = similar_words


for word, similar_words in most_similar_words.items():
    print(f"Words most similar to '{word}':")
    for rank, similar_word in enumerate(similar_words, 1):
        print(f"  Nr. {rank}: {similar_word}")    


Words most similar to 'me':
  Nr. 1: me
  Nr. 2: yourself
  Nr. 3: die
  Nr. 4: ourselves
  Nr. 5: us
  Nr. 6: himself
  Nr. 7: myself
  Nr. 8: to-night
  Nr. 9: mine
  Nr. 10: him
  Nr. 11: matters
Words most similar to 'white':
  Nr. 1: white
  Nr. 2: thin
  Nr. 3: red
  Nr. 4: deep
  Nr. 5: bright
  Nr. 6: blue
  Nr. 7: grey
  Nr. 8: bare
  Nr. 9: green
  Nr. 10: large
  Nr. 11: black
Words most similar to 'man':
  Nr. 1: man
  Nr. 2: child
  Nr. 3: girl
  Nr. 4: dog
  Nr. 5: woman
  Nr. 6: doctor
  Nr. 7: officer
  Nr. 8: bondes
  Nr. 9: person
  Nr. 10: mother
  Nr. 11: enemy
Words most similar to 'have':
  Nr. 1: have
  Nr. 2: has
  Nr. 3: ve
  Nr. 4: nearly
  Nr. 5: had
  Nr. 6: having
  Nr. 7: hast
  Nr. 8: produce
  Nr. 9: give
  Nr. 10: paid
  Nr. 11: make
Words most similar to 'be':
  Nr. 1: be
  Nr. 2: being
  Nr. 3: art
  Nr. 4: been
  Nr. 5: were
  Nr. 6: remain
  Nr. 7: names
  Nr. 8: nearly
  Nr. 9: died
  Nr. 10: commander
  Nr. 11: sleep
Words most similar to 'child':

Not all the words seem to be similar, for example with the word 'child' only 'boy' and 'girl' seems to be words with similar meaning. This is probably a consequence of our small embedding dimension size, however we still get some similar words.
Like for example the most similar words to 'have' (has, ve, nearly, had, having, hast) or 'what' (what, whatever, how, although) shows very similar words. This shows that our embedding space does manage to show patterns of similarity.


6. Visualize the embedding space on https://projector.tensorflow.org/. 

In [9]:
import csv

with open("./embedding.tsv", "wt") as out_file:
    tsv_writer = csv.writer(out_file, delimiter="\t")
    for i in net_emb_w:
        tsv_writer.writerow(i.cpu().numpy())

with open("./vocab.tsv", "wt") as out_file:
    tsv_writer = csv.writer(out_file, delimiter="\t")
    vocab_dict = vocab.get_itos()
    for i in vocab_dict:
        tsv_writer.writerow([i])


We visualize the embedding space to see if we can find some meaningfull clusters.
The figures shows the 100 nearest neighbors using cosine similarity.

![Embedding of gives](Figures\Embedding_gives.png)

Cluster of word: 'gives'.

![Embedding of two](Figures\Embedding_two.png)

Cluster of word: 'two'.

![Embedding of gazed](Figures\Embedding_gazed.png)

Cluster of word: 'Gazed'

After visualizing the embedding space, we found three words that showed some meaningful clustering, 'gives', 'two' and 'gazed'

The word ‘gives’ has datapoints that are more spread but seem to be pointing in the same direction. The embedding shows words like ‘gave’, ‘give’, ‘offered’, ‘sent’, ‘giving’ that are very similar.

The word ‘two’ seems to have a cluster that is less spread, and we find that the cluster includes different numbers like ‘four’. ‘eight’, ‘fifteen’, ‘seven’, ‘nine’ etc. There are also words like ‘several’ and ‘many’ in this cluster, so this cluster seems to represent quantity.

The word ‘gazed’ gives a cluster more like the ‘gives’ cluster, where the points are more spread out but seem to be pointing in the same direction. Here we see similar words like ‘glanced’, ‘looked’, ‘gazing’, ‘looking’, ‘stared’, ‘glancing’ and ‘glance’.




### 2.2 Conjugating be and have

1. Use your trained word embedding and define a simple MLP architecture, an MLP architecture
that has first an attention layer (see section 4), as well as a RNN architecture to predict be
and have conjugation given the context around the missing target. Use the same context
size max len for both MLPs and RNNs, even though RNNs and attention layers could take a
context size of arbitrary length. You are not allowed to use nn.LazyLinear in this project.


Need to define 3 networks:
1. Simple MLP
2. MLP with attention layer
3. A recurrent neural network

Define our targets, we have 12 targets when we account for all conjuagtions of both 'be' and 'having'

In [10]:
conjugate_be =["be", "been", "being", "am", "is", "are", "was", "were"]
conjugate_have = ["have", "has", "had", "having"]
all_targets = (conjugate_be + conjugate_have)

Define our models and create our target dataset using our all_targets array 

In [11]:
# Simple MLP Model using the pretrained embeddings
class MLP(nn.Module):
    def __init__(self, pretrained_embeddings, num_classes, context_size, hidden_dim=128):
        super(MLP, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        (_, embedding_dim) = pretrained_embeddings.shape
        self.fc1 = nn.Linear(embedding_dim * context_size *2, hidden_dim) #Times 2 to get context around the target
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        embeddings = self.embeddings(inputs).view(inputs.shape[0], -1)
        x = self.relu(self.fc1(embeddings))
        x = self.fc2(x)
        return x


# MLP Model with attention layer using the pretrained embeddings

# 4.1 Positional encoding
def positional_encoding(max_len, emb_dim):
    P = torch.zeros((max_len, emb_dim))
    
    for i in range(max_len):
        for j in range(emb_dim // 2):
            # Apply sin to even indices in the array; 2i
            P[i, 2 * j] = np.sin(i / np.power(10000, (2 * j) / emb_dim))
            # Apply cos to odd indices in the array; 2i+1
            P[i, 2 * j + 1] = np.cos(i / np.power(10000, (2 * j) / emb_dim))
    return P

# 4.2 Simple-head dot-product self-attention
class SimpleAttention(torch.nn.Module):
    def __init__(self, emb_dim, p):
        super(SimpleAttention, self).__init__()
        self.emb_dim = emb_dim
        self.p = p
        # weight matrices
        self.Wq = torch.nn.Parameter(torch.randn(emb_dim, p))
        self.Wk = torch.nn.Parameter(torch.randn(emb_dim, p))
        self.Wv = torch.nn.Parameter(torch.randn(emb_dim, p))

    def forward(self, Xf):
        # Compute queries, keys, values
        Q = torch.matmul(Xf, self.Wq)
        K = torch.matmul(Xf, self.Wk)
        V = torch.matmul(Xf, self.Wv)

        # Compute the dot products of Q and K for the attention scores
        scores = torch.bmm(Q, K.transpose(-2, -1)) / np.sqrt(self.p)

        # Apply softmax to get the attention weights
        attention_weights = F.softmax(scores, dim=-1)

        # Compute the output of the attention layer
        h = torch.bmm(attention_weights, V)

        return h

# 4.3 Multi-head attention
class MultiHeadAttention(nn.Module):
    def __init__(self, emb_dim, p, n_head):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.attention_heads = nn.ModuleList([SimpleAttention(emb_dim, p) for _ in range(n_head)])
        self.Wo = nn.Linear(p * n_head, emb_dim)

    def forward(self, Xf):
        heads = [head(Xf) for head in self.attention_heads]
        heads_concat = torch.cat(heads, dim=-1)
        H = self.Wo(heads_concat) #Combine heads
        return H

# MLP Model with attention layer
class MLPAttention(nn.Module):
    def __init__(self, pretrained_embeddings, p, n_heads, max_len, num_classes):
        super(MLPAttention, self).__init__()
        # Pre-trained embeddings
        self.embeddings = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        (_, emb_dim) = pretrained_embeddings.shape
        #Positonal encoding with max_len*2 for context around target
        self.pos_encoding = positional_encoding(max_len*2, emb_dim).to(device)
        self.attention = MultiHeadAttention(emb_dim, p, n_heads)
        self.fc = nn.Linear(emb_dim, num_classes)

    def forward(self, input_ids):
        embeddings = self.embeddings(input_ids)
        # Add positional encoding
        embeddings += self.pos_encoding[:embeddings.size(1), :]
        #Attention
        attention_output = self.attention(embeddings)    
        x = F.relu((attention_output.mean(dim=1)))
        logits = self.fc(x)
        
        return logits


# RNN Model using pretrained embeddings
class RecurrentNN(nn.Module):
    def __init__(self, pretrained_embeddings, hidden_size, num_classes):
        super(RecurrentNN, self).__init__()
        (vocab_size, embedding_dim) = pretrained_embeddings.shape
        self.embeddings = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embeddings(x)
        output, _ = self.rnn(embedded)                                           
        out = output[:,-1,:]
        out = self.fc(out)
        return out


# Function to create context target dataset 
def create_target_dataset(text, vocab, context_size=3, target_words=all_targets):
    n_text = len(text)
    txt = [vocab[w] for w in text]
    contexts = []
    targets = []

    for i in range(context_size, n_text - context_size):
        if vocab.lookup_token(txt[i]) in target_words:
            c = txt[i-(context_size):i] + txt[i+1:i+1+(context_size)] #Context around the target
            targets.append(target_words.index(vocab.lookup_token(txt[i])))
            contexts.append(torch.tensor(c))

    contexts = torch.stack(contexts)
    targets = torch.tensor(targets)
    return TensorDataset(contexts, targets)


Train several models and select best based on validation performance

In [12]:
n_epochs_conj = 8 # Small number of epochs to speed up training time
loss_fn_conj = nn.CrossEntropyLoss() #Use crossEntropy since we do not do softmax on output
max_len = 3 # Same max_len for all

#Create validation and training dataset
val_data_conj = create_target_dataset(words_val, vocab, context_size=max_len, target_words=all_targets)
train_data_conj = create_target_dataset(words_train, vocab, context_size=max_len, target_words=all_targets)

# Validation loader with a batch size of 64
val_loader_conj = DataLoader(val_data_conj, batch_size=64, shuffle=False)

#Init best parametres
best_model = None
best_time = 0
best_batch = 0
best_lr = 0
best_v_acc = 0

#Hyperparams
model_architectures = ['SimpleMLP', 'MlpAttention', 'RecurrentNN']
batch_size = [32, 64]
lr_list = [0.1, 0.01]

for model_cls in model_architectures:
    for bs in batch_size:
        for lr in lr_list:
            print("----------------------------------------------------------------------------")
            print(f"Training {model_cls} with batch size: {bs}, and learning rate: {lr}")

            # Start timing
            start_time = time.time()
            if model_cls == 'SimpleMLP':
                model = MLP(pretrained_embeddings=net_emb_w, context_size=max_len,
                                 num_classes=len(all_targets)).to(device=device)
            elif model_cls == 'MlpAttention':
                model = MLPAttention(pretrained_embeddings=net_emb_w, p=32, n_heads = 12, 
                                     max_len=max_len, num_classes=len(all_targets)).to(device=device)

            elif model_cls == 'RecurrentNN':
                model = RecurrentNN(pretrained_embeddings=net_emb_w, hidden_size=12, 
                                        num_classes=len(all_targets)).to(device=device)


            train_loader = DataLoader(train_data_conj, batch_size=bs, shuffle=True)
            optimizer = optim.Adam(model.parameters(), lr=lr)
            
            # Train the model
            train(n_epochs=n_epochs_conj, model=model, optimizer=optimizer, loss_fn=loss_fn_conj, train_loader=train_loader)
            
            # Training time
            elapsed_time = time.time() - start_time

            # Validation Accuracy
            val_acc = compute_accuracy(model, val_loader_conj, device)

            print(f"{model._get_name()}: Validation accuracy: {val_acc:.2%}, Training Time: {elapsed_time:.2f} seconds")
            if val_acc > best_v_acc:
                best_batch = bs
                best_lr = lr
                best_v_acc = val_acc
                best_model = model
                best_time = elapsed_time

# Output the best model and hyperparams
print("-------------------- Modelselection complete --------------------")
print(f"Best Model is {best_model._get_name()} with batch size: {best_batch}, and learning rate: {best_lr}")
print(f"Best Model Validation Accuracy: {best_v_acc:.2%} And it's training time was {best_time:.2f} seconds")

----------------------------------------------------------------------------
Training SimpleMLP with batch size: 32, and learning rate: 0.1
17:32:31.142442  |  Epoch 1  |  Training loss 1.94580
17:32:36.244326  |  Epoch 2  |  Training loss 1.85612
17:32:46.330994  |  Epoch 4  |  Training loss 1.83992
17:32:56.311069  |  Epoch 6  |  Training loss 1.81631
17:33:07.083825  |  Epoch 8  |  Training loss 1.81488
MLP: Validation accuracy: 38.61%, Training Time: 41.05 seconds
----------------------------------------------------------------------------
Training SimpleMLP with batch size: 32, and learning rate: 0.01
17:33:12.418409  |  Epoch 1  |  Training loss 1.26009
17:33:17.955166  |  Epoch 2  |  Training loss 1.17409
17:33:28.839519  |  Epoch 4  |  Training loss 1.13768
17:33:39.547720  |  Epoch 6  |  Training loss 1.12298
17:33:49.602359  |  Epoch 8  |  Training loss 1.10968
MLP: Validation accuracy: 57.22%, Training Time: 42.48 seconds
-----------------------------------------------------

Unbiased estimate of best model

In [13]:
test_data_conj = create_target_dataset(words_val, vocab, context_size=max_len, target_words=all_targets)

test_loader_conj = DataLoader(test_data_conj, batch_size=64, shuffle=False)
tes_accuracy_conj = compute_accuracy(best_model, test_loader_conj, device=device)
print(f"Test accuracy of best model {tes_accuracy_conj:.2%}")

Test accuracy of best model 57.72%


We train the three different architectures with 8 epochs and with 4 different combinations of hyperparameters.
this gives us a total of 12 models. 

All the RNNs we train use the same hidden_size = 12. All of The MLPs with attention layer uses the same amount of heads n_heads=12 and a p value of p=32

The small number of epochs is due to time, as it takes quite a while to train the models if we increase it to much.


We then measure the validation accuracy after each training to find the best model. If we look at the training times we can see that the MLPs with a single
attention layer seemed to take the longest to train with around 220 seconds.
While the simple MLP and the RNN used much shorter time, with the RNN using least amount of time. We also notice that the training times shorten when we increase the batch size.


Our validation accuracies range all the way from 18.42% when using a large learning rate to our
best model with 57.72% accuracy. It does seem like the learning rate plays a big role in the accuracies of the models, idealy
we would test more learning rates but due to time constraints this was not possible. The final selected model was a simple MLP with batch size 64 and
learning rate 0.01, and a training time of around 22 seconds. This is a bit suprising as we assumed the MLP with attention would perform best, but this is presumably not the case since we only use one attention layer here.


## 2.3 Text Generation

1. Define a RNN architecture that can predict the next word given the context before the target

In [14]:
class TextRNN(nn.Module):
    def __init__(self, pretrained_embeddings, hidden_size):
        super(TextRNN, self).__init__()
        (vocab_size, embedding_dim) = pretrained_embeddings.shape
        self.embeddings = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, VOCAB_SIZE)
        self.softmax = nn.LogSoftmax(dim=-1) 

    def forward(self, x):
        embedded = self.embeddings(x)
        output, _ = self.rnn(embedded)
        out = output[:, -1, :]
        out = self.fc(out)
        return self.softmax(out)  # Returning log probabilities

2. Train several models

In [15]:
sequence_length = 15 

train_data = create_dataset(words_train, vocab, context_size=sequence_length)
val_data = create_dataset(words_val, vocab, context_size=sequence_length)

batch_size = [512, 1024]
lr_list = [0.01, 0.001]
hidden_size = 12

#Init best parametres
best_model = None
best_batch = 0
best_lr = 0
best_v_acc = 0

val_loader = DataLoader(val_data, batch_size=64, shuffle=False)

for bs in batch_size:
    for lr in lr_list:
        model = TextRNN(net_emb_w, hidden_size)
        train_loader = DataLoader(train_data, batch_size=bs, shuffle=True)
        optimizer = optim.Adam(model.parameters(), lr=lr)        
        model.to(device)        

        print(f'Training model with batch_size={bs} and lr={lr}')
        train(n_epochs=5, model=model, optimizer=optimizer, loss_fn=loss_fn, train_loader=train_loader)


        val_acc = compute_accuracy(model, val_loader, device)

        print(f"Model: validation accuracy:{val_acc}")
        if val_acc > best_v_acc:
            best_batch = bs
            best_lr = lr
            best_v_acc = val_acc
            best_model = model

print(f"Final best model is a model with batch size: {best_batch}, and learning rate: {best_lr} ")
print()
print(f"Best model: validation accuracy: {best_v_acc:.2%}")

Training model with batch_size=512 and lr=0.01
17:48:28.189295  |  Epoch 1  |  Training loss 4.28182
17:48:55.498761  |  Epoch 2  |  Training loss 4.18086
17:49:49.514675  |  Epoch 4  |  Training loss 4.16636
17:50:16.654495  |  Epoch 5  |  Training loss 4.15735
Model: validation accuracy:0.20759023247359173
Training model with batch_size=512 and lr=0.001
17:50:44.821699  |  Epoch 1  |  Training loss 4.62276
17:51:10.681419  |  Epoch 2  |  Training loss 4.29648
17:52:00.406645  |  Epoch 4  |  Training loss 4.20922
17:52:26.402983  |  Epoch 5  |  Training loss 4.19087
Model: validation accuracy:0.2063177879663105
Training model with batch_size=1024 and lr=0.01
17:52:48.621073  |  Epoch 1  |  Training loss 4.33052
17:53:09.872835  |  Epoch 2  |  Training loss 4.17568
17:53:52.083105  |  Epoch 4  |  Training loss 4.14742
17:54:14.112533  |  Epoch 5  |  Training loss 4.14223
Model: validation accuracy:0.20769122013289976
Training model with batch_size=1024 and lr=0.001
17:54:36.285085  |  

In [16]:
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)
tes_accuracy = compute_accuracy(best_model, test_loader, device=device)
print(f"Test accuracy of best model {tes_accuracy:.2%}")

Test accuracy of best model 24.61%


In [22]:
def beam_search(model, initial_words, vocab, n_words=10, beam_width=3):
    unk_index = vocab['<unk>']  
    initial_indices = [vocab[w] for w in initial_words]
    candidates = [(initial_indices, 0)]

    for _ in range(n_words):
        new_candidates = []
        for seq, score in candidates:
            input_tensor = torch.tensor([seq], dtype=torch.long).to(device)
            with torch.no_grad():
                logits = model(input_tensor)
                probabilities = torch.exp(logits).view(-1)  
                topk_probs, topk_indices = probabilities.topk(beam_width)

            for i in range(beam_width):
                if topk_indices[i].item() != unk_index:  
                    new_seq = seq + [topk_indices[i].item()]
                    new_score = score - np.log(topk_probs[i].item())  
                    new_candidates.append((new_seq, new_score))

        candidates = sorted(new_candidates, key=lambda x: x[1])[:beam_width]

    return ' '.join([vocab.get_itos()[i] for i in candidates[0][0]])


In [23]:
initial_words = ["the", "king", "and", "i"] 
generated_text = beam_search(best_model, initial_words, vocab, n_words=10, beam_width=10)
print(generated_text)

the king and i had been at the door , and he had been


In [24]:
initial_words = ["you", "and", "i"] 
generated_text = beam_search(best_model, initial_words, vocab, n_words=10, beam_width=10)
print(generated_text)

you and i know me . i shall not know that i have


In [30]:
initial_words = ["horses", "are", "big"] 
generated_text = beam_search(best_model, initial_words, vocab, n_words=10, beam_width=10)
print(generated_text)

horses are big , and he had been at the door , and


We tested with some words to try and make some sentences, they are generaly understandable but not the best. In the report we tested some more sentences.