In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.nn import functional as F
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc
import warnings
import os
warnings.filterwarnings('ignore')
from torch.optim import Adam
import random
from collections import Counter
import copy
import pandas as pd

In [None]:
def set_seed(seed):
  """Seed Python's `random`, NumPy and PyTorch (CPU plus every CUDA device) with `seed`."""
  # Applied in a fixed order: random -> numpy -> torch CPU -> torch CUDA.
  seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
  for apply_seed in seeders:
    apply_seed(seed)

In [None]:
!wget https://www.cse.iitb.ac.in/~pjyothi/cs335/dataset-lab9.tar.gz

In [None]:
!mv "dataset-lab9.tar.gz" dataset.tar.gz

In [None]:
!tar -xvzf dataset.tar.gz

## Training character-based LSTM language model

In [None]:
# load ascii text and convert to lowercase
# The files are read inside `with` blocks so the handles are closed as soon as
# the text is in memory instead of being left open for the kernel's lifetime.
train_file = "dataset/train.txt"
with open(train_file, 'r', encoding='utf-8') as train_handle:
    train_raw_text = train_handle.read().lower()
print(len(train_raw_text))

val_file = "dataset/validation.txt"
with open(val_file, 'r', encoding='utf-8') as val_handle:
    val_raw_text = val_handle.read().lower()
print(len(val_raw_text))

In [None]:
# Character streams: only a prefix of each split is used to keep training times short.
train_chars = list(train_raw_text[:150000])
val_chars = list(val_raw_text[:30000])

# Distinct training characters, followed by the placeholder for unseen characters.
train_chars_uniq = list(set(train_chars)) + ["[UNK]"]

print(len(train_chars), len(val_chars))

In [None]:
# Build the char -> id table. Id 0 is reserved for "[UNK]"; every other id is
# handed out in order of first appearance in the training characters.
char_to_int = {"[UNK]": 0}
cnt = 1
for char in train_chars:
    if char in char_to_int or char not in train_chars_uniq:
        continue
    char_to_int[char] = cnt
    cnt += 1
print(char_to_int)
# Inverse table, used to turn ids back into characters.
int_to_char = {idx: symbol for symbol, idx in char_to_int.items()}

In [None]:
# Corpus sizes and vocabulary size for the character-level model.
n_train_chars, n_val_chars = len(train_chars), len(val_chars)
n_vocab = len(train_chars_uniq)

for label, value in (
    ("Total train chars: ", n_train_chars),
    ("Total val chars: ", n_val_chars),
    ("Total char vocab size: ", n_vocab),
):
    print(label, value)

In [None]:
# Build the training pairs: each input is a window of `seq_length_char` character ids
# and the target is the id of the character that follows the window.
seq_length_char = 100
train_dataX = []
train_dataY = []

unk_id = char_to_int["[UNK]"]
for start in range(n_train_chars - seq_length_char):
    window = train_raw_text[start:start + seq_length_char]
    target = train_raw_text[start + seq_length_char]

    # Windows whose target is outside the vocabulary are dropped entirely.
    if target in char_to_int:
        train_dataX.append([char_to_int.get(ch, unk_id) for ch in window])
        train_dataY.append(char_to_int[target])

n_patterns = len(train_dataX)
print("Total number of train patterns: ", n_patterns)

# Decode the first window back to text as a sanity check.
example = [int_to_char[idx] for idx in train_dataX[0]]
print("".join(example))

In [None]:
# Build the validation pairs with the same windowing as the training pairs:
# `seq_length_char` input ids followed by the id of the next character.
val_dataX = []
val_dataY = []

unk_id = char_to_int["[UNK]"]
for start in range(n_val_chars - seq_length_char):
    stop = start + seq_length_char
    window, target = val_raw_text[start:stop], val_raw_text[stop]
    assert len(window) == seq_length_char
    # A target never seen in training has no id, so the whole window is skipped.
    if target not in char_to_int:
        continue

    val_dataX.append([char_to_int.get(ch, unk_id) for ch in window])
    val_dataY.append(char_to_int[target])

n_val_patterns = len(val_dataX)
print("Total number of validation patterns: ", n_val_patterns)
# Decode the first window back to text as a sanity check.
example = [int_to_char[idx] for idx in val_dataX[0]]
print("".join(example))

In [None]:
# Pool the train and validation windows, shuffle them with a fixed seed and
# re-split 90% / 10%, so both resulting splits mix windows from both files.

all_X = train_dataX + val_dataX
all_Y = train_dataY + val_dataY
print(len(all_X), len(all_Y))

indexes = list(range(len(all_X)))
random.seed(42)
random.shuffle(indexes)

# The first 90% of the shuffled positions train the model; the rest validate it.
split_at = int(0.9 * len(indexes))
train_positions, val_positions = indexes[:split_at], indexes[split_at:]

train_X = [all_X[pos] for pos in train_positions]
train_Y = [all_Y[pos] for pos in train_positions]

val_X = [all_X[pos] for pos in val_positions]
val_Y = [all_Y[pos] for pos in val_positions]

print(len(train_X), len(val_X))

In [None]:
# Inputs become int32 id matrices; targets use the dtype torch.tensor infers.
X_train = torch.tensor(train_X, dtype=torch.int32)
Y_train = torch.tensor(train_Y)
X_val = torch.tensor(val_X, dtype=torch.int32)
Y_val = torch.tensor(val_Y)

for features, targets in ((X_train, Y_train), (X_val, Y_val)):
    print(features.shape, targets.shape)

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Pair inputs with targets, then batch them 128 at a time.
vectorized_train_dataset, vectorized_val_dataset = (
    TensorDataset(X_train, Y_train),
    TensorDataset(X_val, Y_val),
)

# Only the training batches are reshuffled; validation keeps a fixed order.
train_loader = DataLoader(dataset=vectorized_train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(dataset=vectorized_val_dataset, batch_size=128, shuffle=False)

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

In [None]:
from torch import nn
from torch.nn import functional as F

class LSTMTextGeneratorChar(nn.Module):
    """Character-level LSTM language model: embedding -> LSTM -> dropout -> linear.

    Given a batch of id sequences, returns one row of unnormalised scores
    (logits) over the vocabulary per sequence, for the next character.
    """

    def __init__(self, n_vocab, embed_len, n_layers, hidden_dim):
        """Create the layers used by `forward`.

        Args:
            n_vocab: vocabulary size (rows of the embedding table and width of the output).
            embed_len: dimensionality of the input embeddings.
            n_layers: number of stacked LSTM layers.
            hidden_dim: dimensionality of the LSTM hidden states.
        """
        super().__init__()
        # Stored because `forward` needs them to size the initial LSTM state.
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.word_embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=embed_len)
        # batch_first=True: `forward` indexes the LSTM output as (batch, time, hidden).
        self.lstm = nn.LSTM(input_size=embed_len, hidden_size=hidden_dim,
                            num_layers=n_layers, batch_first=True)
        self.dropout = nn.Dropout(p=0.3)
        self.linear = nn.Linear(hidden_dim, n_vocab)

    def forward(self, X_batch):
        """Return next-character logits for each sequence in `X_batch`.

        Args:
            X_batch: integer id tensor indexed as (batch, time).

        Returns:
            Tensor of logits with one row per sequence and `n_vocab` columns.
        """
        embeddings = self.word_embedding(X_batch)

        # The initial hidden and cell states are drawn at random on every call
        # (also in eval mode), so predictions are only reproducible after seeding.
        # They are moved to the embeddings' device, so the module does not rely
        # on a notebook-level `device` variable.
        state_shape = (self.n_layers, len(X_batch), self.hidden_dim)
        hidden = torch.randn(state_shape).to(embeddings.device)
        carry = torch.randn(state_shape).to(embeddings.device)

        output, (hidden, carry) = self.lstm(embeddings, (hidden, carry))
        # Only the last time step feeds the classifier.
        return self.linear(self.dropout(output[:, -1, :]))

In [None]:
def train(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
  """Train `model` and keep a copy of the weights with the lowest validation loss.

  Every epoch runs one optimisation pass over `train_loader`. On every third
  epoch the model is also evaluated on `val_loader`, train/validation
  perplexities are printed, and the model is deep-copied if its mean
  validation loss is the lowest seen so far.

  Args:
    model: network to optimise; called as `model(X)` on batches moved to the
      notebook-level `device`.
    loss_fn: callable `(predictions, targets)` returning a scalar loss tensor.
    optimizer: optimizer already bound to `model`'s parameters.
    train_loader: iterable of `(X, Y)` training batches.
    val_loader: iterable of `(X, Y)` validation batches.
    epochs: number of passes over `train_loader`.

  Returns:
    `(best_checkpoint, model)`: the deep copy taken at the best validated
    epoch (None if no validation pass produced a finite improvement, e.g.
    when `epochs < 3`) and the model as it stands after the last epoch.
  """
  set_seed(42)

  best_checkpoint = None
  val_prev_loss = float("inf")

  for i in range(1, epochs+1):
    losses = []
    print("Current epoch: ", i)
    model.train()

    for X, Y in tqdm(train_loader):
      Y_preds = model(X.to(device))

      loss = loss_fn(Y_preds, Y.to(device))
      losses.append(loss.item())

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    # Validation is comparatively expensive, so it only runs on every third epoch.
    if (i%3) == 0:
      val_losses = []
      model.eval()
      with torch.no_grad():
        for X, Y in tqdm(val_loader):
          Y_preds = model(X.to(device))
          val_losses.append(loss_fn(Y_preds, Y.to(device)).item())

      # Perplexity is exp(mean cross-entropy). The mean is taken over per-batch
      # losses, so a smaller final batch is weighted slightly more than the rest.
      train_loss = torch.tensor(losses).mean()
      val_loss = torch.tensor(val_losses).mean()
      print("Train perplexity: ", torch.exp(train_loss).item())
      print("Validation perplexity: ", torch.exp(val_loss).item())

      if val_loss.item() < val_prev_loss:
        print("checkpointing current model")
        best_checkpoint = copy.deepcopy(model)
        val_prev_loss = val_loss.item()

  return best_checkpoint, model

In [None]:
%%time

from torch.optim import Adam

# Hyperparameters for the character-level model.
epochs, learning_rate = 30, 5e-3
embed_len, hidden_dim, n_layers = 100, 128, 1

# Seed right before construction so the initial weights are reproducible.
set_seed(42)
text_generator = LSTMTextGeneratorChar(n_vocab, embed_len, n_layers, hidden_dim).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = Adam(text_generator.parameters(), lr=learning_rate)

In [None]:
# Train the character-level model. `train` returns the copy with the lowest
# validation loss first and the model after the final epoch second.
best_checkpoint_char, last_checkpoint_char = train(text_generator, loss_fn, optimizer, train_loader, val_loader, epochs)

## Training word-based LSTM language model

In [None]:
# extract unique words
# Tokenise on any run of whitespace with split(), exactly like the cells that
# build the word windows, the generation prompt and the test-sentence ids.
# split(" ") would leave newlines glued to words and emit empty tokens for
# repeated spaces, so the vocabulary would not match the tokens looked up later.
train_words = train_raw_text.split()
val_words = val_raw_text.split()

# make a list of train words
# The vocabulary is limited to the first 80% of the training words, so some
# later training words are out-of-vocabulary and map to "[UNK]".
train_words_subset_vocab = list(set(train_words[0:int(0.8*len(train_words))]))
train_words_subset_vocab.append("[UNK]")
train_words_vocab = list(set(train_words))
train_words_subset_vocab_set = set(train_words_subset_vocab)

print(len(train_words), len(val_words))

In [None]:
# Build the word -> id table. Id 0 is reserved for "[UNK]"; in-vocabulary words
# get consecutive ids in order of first appearance in the training words.
word_to_int = {"[UNK]": 0}
cnt = 1
for word in train_words:
    if word in word_to_int or word not in train_words_subset_vocab_set:
        continue
    word_to_int[word] = cnt
    cnt += 1

# Inverse table, used to turn ids back into words.
int_to_word = {idx: token for token, idx in word_to_int.items()}

In [None]:
# Corpus sizes and vocabulary size for the word-level model.
n_train_words, n_val_words = len(train_words), len(val_words)
n_vocab = len(train_words_subset_vocab)

for label, value in (
    ("Total train words: ", n_train_words),
    ("Total val words: ", n_val_words),
    ("Total vocab size: ", n_vocab),
):
    print(label, value)

In [None]:
# prepare the dataset of input to output pairs encoded as integers
# Each input is a window of `seq_length` word ids; the target is the id of the
# word that follows the window.
seq_length = 30
train_dataX = []
train_dataY = []

train_raw_words = train_raw_text.split()

# Bound the loop by the list that is actually indexed. A word count taken from
# a different tokenisation can be larger (IndexError on the target lookup) or
# smaller (trailing windows silently dropped).
for i in range(0, len(train_raw_words) - seq_length, 1):
    seq_in = train_raw_words[i:i + seq_length]
    seq_out = train_raw_words[i+seq_length]
    assert len(seq_in) == seq_length

    # Windows whose target word is out-of-vocabulary are skipped.
    if seq_out not in word_to_int: continue

    train_dataX.append([word_to_int[word] if word in word_to_int else word_to_int["[UNK]"] for word in seq_in])
    train_dataY.append(word_to_int[seq_out])

n_patterns = len(train_dataX)
print("Total train Patterns: ", n_patterns)
# Decode the first two windows back to text as a sanity check.
example = [int_to_word[word] for word in train_dataX[0]]
print(" ".join(example))

example = [int_to_word[word] for word in train_dataX[1]]
print(" ".join(example))

In [None]:
# prepare the validation dataset of input to output pairs encoded as integers
# Same windowing as the training pairs: `seq_length` input ids plus the id of
# the next word.
val_dataX = []
val_dataY = []

val_raw_words = val_raw_text.split()

# Bound the loop by the list that is actually indexed, so the target lookup
# cannot run past the end and no trailing window is dropped.
for i in range(0, len(val_raw_words) - seq_length, 1):
    seq_in = val_raw_words[i:i + seq_length]
    seq_out = val_raw_words[i+seq_length]
    assert len(seq_in) == seq_length
    # Windows whose target word is out-of-vocabulary are skipped.
    if seq_out not in word_to_int: continue

    val_dataX.append([word_to_int[word] if word in word_to_int else word_to_int["[UNK]"] for word in seq_in])
    val_dataY.append(word_to_int[seq_out])

n_val_patterns = len(val_dataX)
print("Total val Patterns: ", n_val_patterns)
# Decode the first window back to text as a sanity check.
example = [int_to_word[word] for word in val_dataX[0]]
print(" ".join(example))

In [None]:
# Pool the train and validation windows, shuffle them with a fixed seed and
# re-split 95% / 5%, so both resulting splits mix windows from both files.
all_X = train_dataX + val_dataX
all_Y = train_dataY + val_dataY
print(len(all_X), len(all_Y))

indexes = list(range(len(all_X)))
random.seed(42)
random.shuffle(indexes)

# The first 95% of the shuffled positions train the model; the rest validate it.
split_at = int(0.95 * len(indexes))
train_positions, val_positions = indexes[:split_at], indexes[split_at:]

train_X = [all_X[pos] for pos in train_positions]
train_Y = [all_Y[pos] for pos in train_positions]

val_X = [all_X[pos] for pos in val_positions]
val_Y = [all_Y[pos] for pos in val_positions]

print(len(train_X), len(val_X))

In [None]:
# Inputs become int32 id matrices; targets use the dtype torch.tensor infers.
X_train = torch.tensor(train_X, dtype=torch.int32)
Y_train = torch.tensor(train_Y)
X_val = torch.tensor(val_X, dtype=torch.int32)
Y_val = torch.tensor(val_Y)

for features, targets in ((X_train, Y_train), (X_val, Y_val)):
    print(features.shape, targets.shape)

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Pair inputs with targets, then batch them 128 at a time.
vectorized_train_dataset, vectorized_val_dataset = (
    TensorDataset(X_train, Y_train),
    TensorDataset(X_val, Y_val),
)

# Only the training batches are reshuffled; validation keeps a fixed order.
train_loader = DataLoader(dataset=vectorized_train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(dataset=vectorized_val_dataset, batch_size=128, shuffle=False)

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

In [None]:
from torch import nn
from torch.nn import functional as F

from torch import nn
from torch.nn import functional as F

class LSTMTextGeneratorWord(nn.Module):
    """Word-level LSTM language model: embedding -> LSTM -> dropout -> linear.

    Given a batch of id sequences, returns one row of unnormalised scores
    (logits) over the vocabulary per sequence, for the next word.
    """

    def __init__(self, n_vocab, embed_len, n_layers, hidden_dim):
        """Create the layers used by `forward`.

        Args:
            n_vocab: vocabulary size (rows of the embedding table and width of the output).
            embed_len: dimensionality of the input embeddings.
            n_layers: number of stacked LSTM layers.
            hidden_dim: dimensionality of the LSTM hidden states.
        """
        super().__init__()
        # Stored because `forward` needs them to size the initial LSTM state.
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.word_embedding = nn.Embedding(num_embeddings=n_vocab, embedding_dim=embed_len)
        # batch_first=True: `forward` indexes the LSTM output as (batch, time, hidden).
        self.lstm = nn.LSTM(input_size=embed_len, hidden_size=hidden_dim,
                            num_layers=n_layers, batch_first=True)
        self.dropout = nn.Dropout(p=0.3)
        self.linear = nn.Linear(hidden_dim, n_vocab)

    def forward(self, X_batch):
        """Return next-word logits for each sequence in `X_batch`.

        Args:
            X_batch: integer id tensor indexed as (batch, time).

        Returns:
            Tensor of logits with one row per sequence and `n_vocab` columns.
        """
        embeddings = self.word_embedding(X_batch)

        # The initial hidden and cell states are drawn at random on every call
        # (also in eval mode), so predictions are only reproducible after seeding.
        # They are moved to the embeddings' device, so the module does not rely
        # on a notebook-level `device` variable.
        state_shape = (self.n_layers, len(X_batch), self.hidden_dim)
        hidden = torch.randn(state_shape).to(embeddings.device)
        carry = torch.randn(state_shape).to(embeddings.device)

        output, (hidden, carry) = self.lstm(embeddings, (hidden, carry))
        # Only the last time step feeds the classifier.
        return self.linear(self.dropout(output[:, -1, :]))

In [None]:
%%time

from torch.optim import Adam

# Hyperparameters for the word-level model.
epochs, learning_rate = 10, 5e-3
embed_len, hidden_dim, n_layers = 300, 256, 2

# Seed right before construction so the initial weights are reproducible.
set_seed(42)
text_generator = LSTMTextGeneratorWord(n_vocab, embed_len, n_layers, hidden_dim).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = Adam(text_generator.parameters(), lr=learning_rate)

In [None]:
# Train the word-level model. `train` returns the copy with the lowest
# validation loss first and the model after the final epoch second.
best_checkpoint_word, last_checkpoint_word = train(text_generator, loss_fn, optimizer, train_loader, val_loader, epochs)

## Generating text starting from a prompt

In [None]:
# Pick a reproducible window of `seq_length` validation words as the prompt.
np.random.seed(48)
start = np.random.randint(0, len(val_raw_words)-seq_length)
prompt = val_raw_words[start:start+seq_length]
print("Prompt is: ", " ".join(prompt))
pattern = [word_to_int[w] if w in word_to_int else word_to_int["[UNK]"] for w in prompt]

last_checkpoint_word.eval()
# Seeded because the model draws a random initial LSTM state on every call.
set_seed(54)
print("Generation is:")
print()
generated_words = []
with torch.no_grad():
    for i in range(10):
        # Score the next word given the current window of ids.
        x = torch.tensor([pattern], dtype=torch.int32).to(device)
        logits = last_checkpoint_word(x)[0]
        # Greedy decoding: always take the highest-scoring word.
        # NOTE(review): switch to sampling if the assignment expects it.
        next_id = int(torch.argmax(logits))
        generated_words.append(int_to_word[next_id])
        # Slide the window: drop the oldest id and append the prediction.
        pattern = pattern[1:] + [next_id]

print(" ".join(generated_words))

## Creating the submission file

In [None]:
# create the submission file
df = pd.read_csv("dataset/test.csv")
ids, sents = list(df["id"]), list(df["inputs"])
outputs = []

# making the predictions
last_checkpoint_word.eval()
# `dim` is explicit: the logits for one sentence form a 1-D vector, and relying
# on the implicit dimension triggers a deprecation warning.
softmax = nn.Softmax(dim=-1)
top_k = 100  # number of next-word candidates kept per test sentence

with torch.no_grad():
    for sent in sents:
        # format input array of int into PyTorch tensor
        sent_ids = [word_to_int[word] if word in word_to_int else word_to_int["[UNK]"] for word in sent.split()]
        x = torch.tensor(sent_ids, dtype=torch.int32).reshape(1, len(sent_ids))
        # generate logits as output from the model
        prediction = last_checkpoint_word(x.to(device))[0]
        # take softmax for probs
        probs = softmax(prediction)
        # Keep the `top_k` most probable next words with their probabilities, so
        # `outputs` ends up with shape (num_sentences, top_k, 2).
        # NOTE(review): each entry is stored as [word, probability]; confirm the
        # expected pair layout against the assignment statement.
        top_probs, top_ids = torch.topk(probs, k=min(top_k, probs.numel()))
        outputs.append([[int_to_word[word_id], prob]
                        for word_id, prob in zip(top_ids.tolist(), top_probs.tolist())])

print(outputs[0])
print(np.shape(outputs))

# save the output file
np.save("outputs", outputs)
# TODO: Create new cells below for the extra credit part
# TODO: Also save the outputs for the extra credit part in a new file, np.save("ec-outputs", outputs)
