In [55]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.nn import functional as F
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc
import warnings
import os
warnings.filterwarnings('ignore')
from torch.optim import Adam
import random
from collections import Counter
import copy
import pandas as pd

In [56]:
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

In [57]:
!wget https://www.cse.iitb.ac.in/~pjyothi/cs335/dataset-lab9.tar.gz

--2023-10-16 17:08:13--  https://www.cse.iitb.ac.in/~pjyothi/cs335/dataset-lab9.tar.gz
Resolving www.cse.iitb.ac.in (www.cse.iitb.ac.in)... 103.21.127.134
Connecting to www.cse.iitb.ac.in (www.cse.iitb.ac.in)|103.21.127.134|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 425206 (415K) [application/x-gzip]
Saving to: ‘dataset-lab9.tar.gz’


2023-10-16 17:08:15 (495 KB/s) - ‘dataset-lab9.tar.gz’ saved [425206/425206]



In [58]:
!mv "dataset-lab9.tar.gz" dataset.tar.gz

In [59]:
!tar -xvzf dataset.tar.gz

dataset/
dataset/test.csv
dataset/validation.txt
dataset/train.txt


## Training character-based LSTM language model

In [60]:
# load ascii text and convert to lowercase
train_file = "dataset/train.txt"
train_raw_text = open(train_file, 'r', encoding='utf-8').read()
train_raw_text = train_raw_text.lower()
print(len(train_raw_text))

val_file = "dataset/validation.txt"
val_raw_text = open(val_file, 'r', encoding='utf-8').read()
val_raw_text = val_raw_text.lower()
print(len(val_raw_text))

1016242
51726


In [61]:
# extract unique chars
train_chars = [char for char in train_raw_text[0:150000]]  # taking a subset to enable faster training times
val_chars = [char for char in val_raw_text[0:30000]]
train_chars_uniq = list(set(train_chars))
train_chars_uniq.append("[UNK]")

print(len(train_chars), len(val_chars))

150000 30000


In [62]:
# create mapping of unique chars to integers
char_to_int = {}
char_to_int["[UNK]"] = 0
cnt = 1
for char in train_chars:
  if char in train_chars_uniq and char not in char_to_int:
    char_to_int[char] = cnt
    cnt += 1
print(char_to_int)
int_to_char = dict((i, c) for c, i in char_to_int.items())

{'[UNK]': 0, 'f': 1, 'i': 2, 'r': 3, 's': 4, 't': 5, ' ': 6, 'c': 7, 'z': 8, 'e': 9, 'n': 10, ':': 11, '\n': 12, 'b': 13, 'o': 14, 'w': 15, 'p': 16, 'd': 17, 'a': 18, 'y': 19, 'u': 20, 'h': 21, ',': 22, 'm': 23, 'k': 24, '.': 25, 'l': 26, 'v': 27, '?': 28, "'": 29, 'g': 30, ';': 31, '!': 32, 'j': 33, '-': 34, 'q': 35, 'x': 36, '&': 37}


In [63]:

n_train_chars = len(train_chars)
n_val_chars = len(val_chars)

n_vocab = len(train_chars_uniq)
print("Total train chars: ", n_train_chars)
print("Total val chars: ", n_val_chars)
print("Total char vocab size: ", n_vocab)

Total train chars:  150000
Total val chars:  30000
Total char vocab size:  38


In [64]:
# Prepare the training dataset of input to output pairs encoded as integers
seq_length_char = 100
train_dataX = []
train_dataY = []

for i in range(0, n_train_chars - seq_length_char, 1):
    seq_in = train_raw_text[i:i + seq_length_char]
    seq_out = train_raw_text[i + seq_length_char]

    if seq_out not in char_to_int: continue

    train_dataX.append([char_to_int[char] if char in char_to_int else char_to_int["[UNK]"] for char in seq_in])
    train_dataY.append(char_to_int[seq_out])

n_patterns = len(train_dataX)
print("Total number of train patterns: ", n_patterns)

example = [int_to_char[char] for char in train_dataX[0]]
print("".join(example))

Total number of train patterns:  149900
first citizen:
before we proceed any further, hear me speak.

all:
speak, speak.

first citizen:
you


In [65]:
# Prepare the validation dataset of input to output pairs encoded as integers
val_dataX = []
val_dataY = []

for i in range(0, n_val_chars - seq_length_char, 1):
    seq_in = val_raw_text[i:i + seq_length_char]
    seq_out = val_raw_text[i+seq_length_char]
    assert len(seq_in) == seq_length_char
    if seq_out not in char_to_int: continue

    val_dataX.append([char_to_int[char] if char in char_to_int else char_to_int["[UNK]"] for char in seq_in])
    val_dataY.append(char_to_int[seq_out])

n_val_patterns = len(val_dataX)
print("Total number of validation patterns: ", n_val_patterns)
example = [int_to_char[char] for char in val_dataX[0]]
print("".join(example))

Total number of validation patterns:  29900
she vied so fast, protesting oath on oath,
that in a twink she won me to her love.
o, you are novice


In [66]:
# Randomizing the train and val sentences for better learning

all_X = train_dataX + val_dataX
all_Y = train_dataY + val_dataY
print(len(all_X), len(all_Y))

indexes = [i for i in range(len(all_X))]
random.seed(42)
random.shuffle(indexes)

train_X = [all_X[i] for i in indexes[0:int(0.9*len(indexes))]]
train_Y = [all_Y[i] for i in indexes[0:int(0.9*len(indexes))]]

val_X = [all_X[i] for i in indexes[int(0.9*len(indexes)):]]
val_Y = [all_Y[i] for i in indexes[int(0.9*len(indexes)):]]

print(len(train_X), len(val_X))

179800 179800
161820 17980


In [67]:
X_train, Y_train = torch.tensor(train_X, dtype=torch.int32), torch.tensor(train_Y)
X_val, Y_val = torch.tensor(val_X, dtype=torch.int32), torch.tensor(val_Y)

print(X_train.shape, Y_train.shape)
print(X_val.shape, Y_val.shape)

torch.Size([161820, 100]) torch.Size([161820])
torch.Size([17980, 100]) torch.Size([17980])


In [68]:
from torch.utils.data import DataLoader, TensorDataset

vectorized_train_dataset = TensorDataset(X_train, Y_train)
train_loader = DataLoader(vectorized_train_dataset, batch_size=128, shuffle=True)

vectorized_val_dataset = TensorDataset(X_val, Y_val)
val_loader = DataLoader(vectorized_val_dataset, batch_size=128, shuffle=False)

In [69]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [70]:

from torch import nn
from torch.nn import functional as F

class LSTMTextGeneratorChar(nn.Module):
    def __init__(self, n_vocab, embed_len, n_layers, hidden_dim):
    #     super(LSTMTextGeneratorChar, self).__init__()
    #     # TODO: Create an LSTM LM followed by a linear layer with dropout (p = 0.3)
    #     # n_vocab: vocabulary size
    #     # embed_len: dimensionality of the input embeddings
    #     # n_layers: number of LSTM layers
    #     # hidden_dim: dimensionality of the LSTM hidden states
    #     self.n_vocab=n_vocab
    #     self.embed_len=embed_len
    #     self.n_layers=n_layers
    #     self.hidden_dim=hidden_dim



    #     # self.word_embedding = nn.Embedding(n_vocab, embed_len)

    #     lstm = nn.LSTM(embed_len, hidden_dim, n_layers, batch_first=True)

    #     linear=nn.Linear(hidden_dim,n_vocab)
    #     dropout=nn.Dropout(0.3)
        super(LSTMTextGeneratorChar, self).__init__()

        self.n_vocab = n_vocab
        self.embed_len = embed_len
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Define the word embedding layer
        self.word_embedding = nn.Embedding(n_vocab, embed_len)

        # Define the LSTM layer
        self.lstm = nn.LSTM(embed_len, hidden_dim, n_layers, batch_first=True)

        # Define the linear layer followed by dropout
        self.linear = nn.Linear(hidden_dim, n_vocab)
        self.dropout = nn.Dropout(0.3)


    def forward(self, X_batch):
        # TODO: Make sure you go through and understand all the following lines of code
        embeddings = self.word_embedding(X_batch)


        hidden, carry = torch.randn(self.n_layers, len(X_batch), self.hidden_dim).to(device), torch.randn(self.n_layers, len(X_batch), self.hidden_dim).to(device)
        output, (hidden, carry) = self.lstm(embeddings, (hidden, carry))
        return self.linear(self.dropout(output[:,-1, :]))

In [71]:
def train(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
  set_seed(42)

  best_checkpoint = None
  val_prev_loss = 10000000.0

  for i in range(1, epochs+1):
      losses = []
      print("Current epoch: ", i)
      model.train()

      for X, Y in tqdm(train_loader):
        Y_preds = model(X.to(device))

        loss = loss_fn(Y_preds, Y.to(device))
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

      if (i%3) == 0:
          val_losses = []
          model.eval()
          with torch.no_grad():
            for X, Y in tqdm(val_loader):
              # TODO: Complete the implementation and populate val_losses
              Y_val_preds = model(X.to(device))
              val_loss = loss_fn(Y_val_preds, Y.to(device))
              val_losses.append(val_loss.item())

          training_perplexity = torch.exp(torch.tensor(losses).mean()).item()
          validation_perplexity = torch.exp(torch.tensor(val_losses).mean()).item()
          print(f"Training Perplexity: {training_perplexity:.2f}")
          print(f"Validation Perplexity: {validation_perplexity:.2f}")


          # TODO: Print training/validation perplexities

          if torch.tensor(val_losses).mean().item() < val_prev_loss:
            print("checkpointing current model")
            best_checkpoint = copy.deepcopy(model)
            val_prev_loss = torch.tensor(val_losses).mean().item()

  return best_checkpoint, model

In [72]:
%%time

from torch.optim import Adam

epochs = 30
learning_rate = 5e-3
embed_len = 100
hidden_dim = 128
n_layers=1

set_seed(42)
loss_fn = nn.CrossEntropyLoss().to(device)
text_generator = LSTMTextGeneratorChar(n_vocab, embed_len, n_layers, hidden_dim).to(device)
optimizer = Adam(text_generator.parameters(), lr=learning_rate)

CPU times: user 4.64 ms, sys: 0 ns, total: 4.64 ms
Wall time: 4.51 ms


In [73]:

best_checkpoint_char, last_checkpoint_char = train(text_generator, loss_fn, optimizer, train_loader, val_loader, epochs)

Current epoch:  1


100%|██████████| 1265/1265 [00:10<00:00, 124.30it/s]


Current epoch:  2


100%|██████████| 1265/1265 [00:09<00:00, 130.67it/s]


Current epoch:  3


100%|██████████| 1265/1265 [00:09<00:00, 126.74it/s]
100%|██████████| 141/141 [00:00<00:00, 280.59it/s]


Training Perplexity: 5.47
Validation Perplexity: 4.98
checkpointing current model
Current epoch:  4


100%|██████████| 1265/1265 [00:10<00:00, 125.36it/s]


Current epoch:  5


100%|██████████| 1265/1265 [00:10<00:00, 125.48it/s]


Current epoch:  6


100%|██████████| 1265/1265 [00:10<00:00, 124.70it/s]
100%|██████████| 141/141 [00:00<00:00, 279.87it/s]


Training Perplexity: 5.03
Validation Perplexity: 4.70
checkpointing current model
Current epoch:  7


100%|██████████| 1265/1265 [00:10<00:00, 123.76it/s]


Current epoch:  8


100%|██████████| 1265/1265 [00:10<00:00, 126.50it/s]


Current epoch:  9


100%|██████████| 1265/1265 [00:10<00:00, 124.97it/s]
100%|██████████| 141/141 [00:00<00:00, 281.01it/s]


Training Perplexity: 4.85
Validation Perplexity: 4.63
checkpointing current model
Current epoch:  10


100%|██████████| 1265/1265 [00:10<00:00, 123.61it/s]


Current epoch:  11


100%|██████████| 1265/1265 [00:10<00:00, 123.34it/s]


Current epoch:  12


100%|██████████| 1265/1265 [00:10<00:00, 123.37it/s]
100%|██████████| 141/141 [00:00<00:00, 250.83it/s]


Training Perplexity: 4.77
Validation Perplexity: 4.60
checkpointing current model
Current epoch:  13


100%|██████████| 1265/1265 [00:10<00:00, 123.58it/s]


Current epoch:  14


100%|██████████| 1265/1265 [00:10<00:00, 123.41it/s]


Current epoch:  15


100%|██████████| 1265/1265 [00:09<00:00, 127.40it/s]
100%|██████████| 141/141 [00:00<00:00, 231.42it/s]


Training Perplexity: 4.72
Validation Perplexity: 4.57
checkpointing current model
Current epoch:  16


100%|██████████| 1265/1265 [00:10<00:00, 120.24it/s]


Current epoch:  17


100%|██████████| 1265/1265 [00:10<00:00, 123.09it/s]


Current epoch:  18


100%|██████████| 1265/1265 [00:10<00:00, 123.80it/s]
100%|██████████| 141/141 [00:00<00:00, 284.89it/s]


Training Perplexity: 4.70
Validation Perplexity: 4.57
Current epoch:  19


100%|██████████| 1265/1265 [00:10<00:00, 123.66it/s]


Current epoch:  20


100%|██████████| 1265/1265 [00:10<00:00, 124.25it/s]


Current epoch:  21


100%|██████████| 1265/1265 [00:10<00:00, 125.93it/s]
100%|██████████| 141/141 [00:00<00:00, 214.69it/s]


Training Perplexity: 4.69
Validation Perplexity: 4.58
Current epoch:  22


100%|██████████| 1265/1265 [00:09<00:00, 127.36it/s]


Current epoch:  23


100%|██████████| 1265/1265 [00:10<00:00, 123.54it/s]


Current epoch:  24


100%|██████████| 1265/1265 [00:10<00:00, 123.32it/s]
100%|██████████| 141/141 [00:00<00:00, 277.27it/s]


Training Perplexity: 4.70
Validation Perplexity: 4.54
checkpointing current model
Current epoch:  25


100%|██████████| 1265/1265 [00:10<00:00, 123.38it/s]


Current epoch:  26


100%|██████████| 1265/1265 [00:10<00:00, 123.38it/s]


Current epoch:  27


100%|██████████| 1265/1265 [00:10<00:00, 124.30it/s]
100%|██████████| 141/141 [00:00<00:00, 276.78it/s]


Training Perplexity: 4.73
Validation Perplexity: 4.58
Current epoch:  28


100%|██████████| 1265/1265 [00:10<00:00, 123.97it/s]


Current epoch:  29


100%|██████████| 1265/1265 [00:10<00:00, 123.92it/s]


Current epoch:  30


100%|██████████| 1265/1265 [00:10<00:00, 119.37it/s]
100%|██████████| 141/141 [00:00<00:00, 281.71it/s]

Training Perplexity: 4.77
Validation Perplexity: 4.54





## Training word-based LSTM language model

In [74]:
# extract unique words
train_words = [word for word in train_raw_text.split(" ")]
val_words = [word for word in val_raw_text.split(" ")]

# make a list of train words
train_words_subset_vocab = list(set(train_words[0:int(0.8*len(train_words))]))
train_words_subset_vocab.append("[UNK]")
train_words_vocab = list(set(train_words))
train_words_subset_vocab_set = set(train_words_subset_vocab)

print(len(train_words), len(val_words))

155159 7833


In [75]:
# create mapping of unique words to integers
word_to_int = {}
word_to_int["[UNK]"] = 0
cnt = 1
for word in train_words:
  if word in train_words_subset_vocab_set and word not in word_to_int:
    word_to_int[word] = cnt
    cnt += 1

int_to_word = dict((i, w) for w, i in word_to_int.items())

In [76]:
n_train_words = len(train_words)
n_val_words = len(val_words)

n_vocab = len(train_words_subset_vocab)
print("Total train words: ", n_train_words)
print("Total val words: ", n_val_words)
print("Total vocab size: ", n_vocab)

Total train words:  155159
Total val words:  7833
Total vocab size:  31954


In [77]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 30
train_dataX = []
train_dataY = []

train_raw_words = train_raw_text.split()

for i in range(0, n_train_words - seq_length, 1):
    seq_in = train_raw_words[i:i + seq_length]
    seq_out = train_raw_words[i+seq_length]
    assert len(seq_in) == seq_length

    if seq_out not in word_to_int: continue

    train_dataX.append([word_to_int[word] if word in word_to_int else word_to_int["[UNK]"] for word in seq_in])
    train_dataY.append(word_to_int[seq_out])

n_patterns = len(train_dataX)
print("Total train Patterns: ", n_patterns)
example = [int_to_word[word] for word in train_dataX[0]]
print(" ".join(example))

example = [int_to_word[word] for word in train_dataX[1]]
print(" ".join(example))

Total train Patterns:  142965
first [UNK] before we proceed any further, hear me speak. all: speak, speak. first [UNK] you are all resolved rather to die than to [UNK] all: [UNK] [UNK] first [UNK]
[UNK] before we proceed any further, hear me speak. all: speak, speak. first [UNK] you are all resolved rather to die than to [UNK] all: [UNK] [UNK] first [UNK] first,


In [78]:
# prepare the validation dataset of input to output pairs encoded as integers
val_dataX = []
val_dataY = []

val_raw_words = val_raw_text.split()

for i in range(0, n_val_words - seq_length, 1):
    seq_in = val_raw_words[i:i + seq_length]
    #print(" ".join(seq_in))
    seq_out = val_raw_words[i+seq_length]
    assert len(seq_in) == seq_length
    if seq_out not in word_to_int: continue

    val_dataX.append([word_to_int[word] if word in word_to_int else word_to_int["[UNK]"] for word in seq_in])
    val_dataY.append(word_to_int[seq_out])

n_val_patterns = len(val_dataX)
print("Total val Patterns: ", n_val_patterns)
example = [int_to_word[word] for word in val_dataX[0]]
print(" ".join(example))

Total val Patterns:  6306
she [UNK] so fast, [UNK] oath on oath, that in a [UNK] she won me to her love. o, you are [UNK] 'tis a world to see, how [UNK] when


In [79]:
# randomizing the train and val sentences for better learning
all_X = train_dataX + val_dataX
all_Y = train_dataY + val_dataY
print(len(all_X), len(all_Y))

indexes = [i for i in range(len(all_X))]
random.seed(42)
random.shuffle(indexes)

train_X = [all_X[i] for i in indexes[0:int(0.95*len(indexes))]]
train_Y = [all_Y[i] for i in indexes[0:int(0.95*len(indexes))]]

val_X = [all_X[i] for i in indexes[int(0.95*len(indexes)):]]
val_Y = [all_Y[i] for i in indexes[int(0.95*len(indexes)):]]

print(len(train_X), len(val_X))

149271 149271
141807 7464


In [80]:
X_train, Y_train = torch.tensor(train_X, dtype=torch.int32), torch.tensor(train_Y)
X_val, Y_val = torch.tensor(val_X, dtype=torch.int32), torch.tensor(val_Y)

print(X_train.shape, Y_train.shape)
print(X_val.shape, Y_val.shape)

torch.Size([141807, 30]) torch.Size([141807])
torch.Size([7464, 30]) torch.Size([7464])


In [81]:
from torch.utils.data import DataLoader, TensorDataset

vectorized_train_dataset = TensorDataset(X_train, Y_train)
train_loader = DataLoader(vectorized_train_dataset, batch_size=128, shuffle=True)

vectorized_val_dataset = TensorDataset(X_val, Y_val)
val_loader = DataLoader(vectorized_val_dataset, batch_size=128, shuffle=False)

In [82]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [83]:
from torch import nn
from torch.nn import functional as F

from torch import nn
from torch.nn import functional as F

class LSTMTextGeneratorWord(nn.Module):
    def __init__(self, n_vocab, embed_len, n_layers, hidden_dim):
        # TODO: Complete the __init__ definition (as in char-based LSTMs)
        super(LSTMTextGeneratorWord, self).__init__()

        self.n_vocab = n_vocab
        self.embed_len = embed_len
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Define the word embedding layer
        self.word_embedding = nn.Embedding(n_vocab, embed_len)

        # Define the LSTM layer
        self.lstm = nn.LSTM(embed_len, hidden_dim, n_layers, batch_first=True)

        # Define the linear layer followed by dropout
        self.linear = nn.Linear(hidden_dim, n_vocab)
        self.dropout = nn.Dropout(0.3)


    def forward(self, X_batch):
        embeddings = self.word_embedding(X_batch)

        hidden, carry = torch.randn(self.n_layers, len(X_batch), self.hidden_dim).to(device), torch.randn(self.n_layers, len(X_batch), self.hidden_dim).to(device)
        output, (hidden, carry) = self.lstm(embeddings, (hidden, carry))
        # output, (hidden, carry) = self.lstm(embeddings)
        return self.linear(self.dropout(output[:,-1, :]))

In [84]:
%%time

from torch.optim import Adam

epochs = 10
learning_rate = 5e-3
embed_len = 300
hidden_dim = 256
n_layers=2

set_seed(42)
loss_fn = nn.CrossEntropyLoss().to(device)
text_generator = LSTMTextGeneratorWord(n_vocab, embed_len, n_layers, hidden_dim).to(device)
optimizer = Adam(text_generator.parameters(), lr=learning_rate)

CPU times: user 146 ms, sys: 33 ms, total: 179 ms
Wall time: 184 ms


In [85]:
best_checkpoint_word, last_checkpoint_word = train(text_generator, loss_fn, optimizer, train_loader, val_loader, epochs)

Current epoch:  1


100%|██████████| 1108/1108 [00:21<00:00, 51.41it/s]


Current epoch:  2


100%|██████████| 1108/1108 [00:21<00:00, 51.02it/s]


Current epoch:  3


100%|██████████| 1108/1108 [00:21<00:00, 50.48it/s]
100%|██████████| 59/59 [00:00<00:00, 170.34it/s]


Training Perplexity: 630.42
Validation Perplexity: 714.66
checkpointing current model
Current epoch:  4


100%|██████████| 1108/1108 [00:21<00:00, 50.70it/s]


Current epoch:  5


100%|██████████| 1108/1108 [00:21<00:00, 50.41it/s]


Current epoch:  6


100%|██████████| 1108/1108 [00:21<00:00, 50.81it/s]
100%|██████████| 59/59 [00:00<00:00, 168.12it/s]


Training Perplexity: 360.99
Validation Perplexity: 710.95
checkpointing current model
Current epoch:  7


100%|██████████| 1108/1108 [00:21<00:00, 50.80it/s]


Current epoch:  8


100%|██████████| 1108/1108 [00:21<00:00, 50.68it/s]


Current epoch:  9


100%|██████████| 1108/1108 [00:21<00:00, 50.80it/s]
100%|██████████| 59/59 [00:00<00:00, 135.24it/s]


Training Perplexity: 227.81
Validation Perplexity: 891.45
Current epoch:  10


100%|██████████| 1108/1108 [00:21<00:00, 50.77it/s]


## Generating text starting from a prompt

In [86]:
np.random.seed(48)
start = np.random.randint(0, len(val_raw_words)-seq_length)
prompt = val_raw_words[start:start+seq_length]
print("Prompt is: ", " ".join(prompt))
pattern = [word_to_int[w] if w in word_to_int else word_to_int["[UNK]"] for w in prompt]

last_checkpoint_word.eval()
set_seed(54)
print("Generation is:")
print()
with torch.no_grad():
    for i in range(10):
        # TODO: Generate the next ten words starting from prompt
        # Convert the pattern to a tensor
        pattern_tensor = torch.tensor(pattern, dtype=torch.int32).unsqueeze(0).to(device)

        # Use the model to generate the next word
        next_word_logits = last_checkpoint_word(pattern_tensor)

        # Apply a softmax function to get probabilities
        next_word_index = torch.argmax(next_word_logits,dim=-1).item()

        # Convert the word index back to a word
        next_word = int_to_word[next_word_index]

        # Print the generated word
        print(next_word, end=" ")
        # Update the pattern by adding the generated word's integer index
        pattern.append(next_word_index)

        # Remove the first word in the pattern to keep the length constant
        pattern = pattern[1:]


Prompt is:  bride and bridegroom coming home? gremio: a bridegroom say you? 'tis a groom indeed, a grumbling groom, and that the girl shall find. tranio: curster than she? why, 'tis impossible.
Generation is:

and i am a man of the house of the 

## Creating the submission file

In [87]:
# create the submission file
df = pd.read_csv("dataset/test.csv")
ids, sents = list(df["id"]), list(df["inputs"])
outputs = []

# making the predictions
last_checkpoint_word.eval()
softmax = nn.Softmax()

with torch.no_grad():
    for sent in sents:
        # format input array of int into PyTorch tensor
        sent_ids = [word_to_int[word] if word in word_to_int else word_to_int["[UNK]"] for word in sent.split()]
        x = torch.tensor(sent_ids, dtype=torch.int32).reshape(1, len(sent_ids))
        x = torch.tensor(x, dtype=torch.int32).detach()
        # generate logits as output from the model
        prediction = last_checkpoint_word(x.to(device))[0]
        # take softmax for probs
        # TODO: shape of outputs is (200, 100, 2)
        # For each of the 200 test sentences in test.csv, given the prefix in sent, outputs contains
        # the list of top 100 next-word predictions and its corresponding probabilities
        # Apply softmax to get probabilities
        prob = softmax(prediction)

        # Find the top 100 words with the highest probabilities
        top_words = torch.argsort(prob, descending=True)[:100]
        top_probabilities = prob[top_words].cpu().numpy()


        # Store the top words and their corresponding probabilities for this sentence
        X=[[int_to_word[w.item()],p.item()]for w,p in zip(top_words,top_probabilities)]
        outputs.append(X)


print(outputs[0])
print(np.shape(outputs))

# save the output file
np.save("outputs", outputs)
# TODO: Create new cells below for the extra credit part
# TODO: Also save the outputs for the extra credit part in a new file, np.save("ec-outputs", outputs)


[['i', 0.06877771019935608], ['and', 0.03624435141682625], ['my', 0.0319378636777401], ['ay,', 0.029673200100660324], ['what', 0.028927214443683624], ['the', 0.028812628239393234], ['why,', 0.025021085515618324], ['well,', 0.02181084267795086], ['o', 0.020405389368534088], ['a', 0.01868399977684021], ['he', 0.01843300089240074], ['o,', 0.01839754916727543], ['you', 0.01837000623345375], ['no,', 0.01834946870803833], ["'tis", 0.015607225708663464], ['nay,', 0.012782913632690907], ['king', 0.012461908161640167], ['but', 0.012355445884168148], ['how', 0.012009838595986366], ['what,', 0.01183494832366705], ['come,', 0.011769074015319347], ['if', 0.010838332585990429], ['good', 0.010691329836845398], ['not', 0.010589848272502422], ['so', 0.010029594413936138], ['for', 0.009642292745411396], ['it', 0.009501303546130657], ['in', 0.009496590122580528], ['then', 0.009235513396561146], ['we', 0.009130850434303284], ['with', 0.009009452536702156], ['now,', 0.00851100217550993], ['that', 0.0084535