In [None]:
import torch 
import torch.nn as nn 
import torch.optim as optim 
import torch.nn.functional as F 

import functools 
import operator 
import random 
import pickle 

import numpy as np 

from sklearn.metrics import confusion_matrix 
from torch.utils.data import TensorDataset 
from torch.utils.data import DataLoader 
from matplotlib import pyplot as plt 

In [None]:
# Exercise 1
# Load the PTB splits, append an <eos> token to every line, flatten each split
# into a list of words and map every word to an integer id.
keepNew = True  # keep the trailing '\n' on each line so it can be turned into <eos>


def _read_lines(path):
    """Return the lines of the text file at `path`, each keeping its line ending."""
    with open(path, mode = 'r', newline = None) as split_f:
        return split_f.read().splitlines(keepNew)


def _mark_eos(lines):
    """Replace every newline with an <eos> token.

    A space is inserted in front of <eos> so the marker can never fuse with the
    last word of a line that has no trailing whitespace; the later whitespace
    split makes the extra space harmless otherwise.
    """
    return [line.replace('\n', ' <eos>') for line in lines]


def _flatten_words(lines):
    """Split every line on whitespace and concatenate the words into one flat list."""
    return [word for line in lines for word in line.split()]


# Training / validation / testing data (raw lines, newline kept)
train_dat = _read_lines('PTB/ptb.train.txt')
valid_dat = _read_lines('PTB/ptb.valid.txt')
test_dat = _read_lines('PTB/ptb.test.txt')

# Printing examples phrases
# NOTE(review): indices are drawn from the first 1000 lines, so every split is
# assumed to contain at least 1000 lines.
n_examp = np.random.choice(1000, size = 6, replace = False)

print("Example training set phrase:", train_dat[n_examp[0]])
print("Example training set phrase:", train_dat[n_examp[1]])
print("Example validation set phrase:", valid_dat[n_examp[2]])
print("Example validation set phrase:", valid_dat[n_examp[3]])
print("Example test set phrase:", test_dat[n_examp[4]])
print("Example test set phrase:", test_dat[n_examp[5]])

# Adding <eos>
train_proc = _mark_eos(train_dat)
valid_proc = _mark_eos(valid_dat)
test_proc = _mark_eos(test_dat)

# Split up each line in individual words and flatten into a single list per split
train_words = _flatten_words(train_proc)
valid_words = _flatten_words(valid_proc)
test_words = _flatten_words(test_proc)

num_train = len(train_words)
num_valid = len(valid_words)
num_test = len(test_words)

print("Number of training words:", num_train)
print("Number of validation words:", num_valid)
print("Number of testing words:", num_test)

# Building a dictionary
all_words = train_words + valid_words + test_words
set_words = set(all_words)
num_unique = len(set_words)

print("Number of unique words in training + validation + testing splits:", num_unique)

# Ids are assigned in sorted word order so the word -> id mapping is identical
# on every run (set iteration order and a random permutation are not), which
# keeps saved weights and generated samples interpretable after a restart.
unique_dict = {word: idx for idx, word in enumerate(sorted(set_words))}

# Replacing all words in the training/validation/testing splits with their integer representation
train_ints = [unique_dict[word] for word in train_words]
valid_ints = [unique_dict[word] for word in valid_words]
test_ints = [unique_dict[word] for word in test_words]

In [None]:
# Check if CUDA is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# torch.cuda.get_device_name only accepts CUDA devices, so it must not be
# called when we fell back to the CPU.
device_name = torch.cuda.get_device_name(device) if device.type == "cuda" else "CPU"
print("Device:", device_name)

In [None]:
# Resetting model function 
# Credits: https://discuss.pytorch.org/t/reset-model-weights/19180/4 
def reset_model(model):
    """Re-initialise every direct child of `model` that exposes `reset_parameters`.

    Only the immediate children returned by `model.children()` are visited;
    children without a `reset_parameters` attribute are left untouched.
    Nothing is returned.
    """
    resettable = [child for child in model.children() if hasattr(child, 'reset_parameters')]
    for child in resettable:
        child.reset_parameters()

In [None]:
# Exercise 2
# Convert our training/validation/testing splits to Torch tensors
train_dat = torch.tensor(train_ints)
valid_dat = torch.tensor(valid_ints)
test_dat = torch.tensor(test_ints)

### Data loading
batch_size = 32   # sequences per training mini-batch
batch_eval = 512  # sequences per validation mini-batch
batch_test = 512  # sequences per test mini-batch
seq_train = 50
seq_valid = 50
seq_test = 50

# Number of whole sequences available in each split
s_train_l = num_train // seq_train
s_valid_l = num_valid // seq_valid
s_test_l = num_test // seq_test


def _make_sequences(tokens, n_seq, seq_len):
    """Cut a 1-D token tensor into `n_seq` input rows of `seq_len` tokens plus next-token labels.

    Returns `(inputs, flat_labels, labels)` where `inputs` and `labels` have
    shape (n_seq, seq_len) and `flat_labels` is the 1-D label tensor before
    reshaping. The label of a token is the token that follows it in `tokens`.
    """
    n_tok = n_seq * seq_len
    inputs = torch.narrow(tokens, 0, 0, n_tok)
    if tokens.shape[0] > n_tok:
        # A token exists after the trimmed span, so the final label is the
        # true next word rather than a wrap-around to the first token.
        flat_labels = torch.narrow(tokens, 0, 1, n_tok)
    else:
        # Nothing follows the last token; fall back to wrapping around.
        flat_labels = torch.roll(inputs, shifts = -1, dims = 0)
    return inputs.reshape(n_seq, seq_len), flat_labels, flat_labels.reshape(n_seq, seq_len)


# Trim training/validation/testing data and reshape into tensor of num_sequences by sequence_length
train_seq, train_lab, train_lab_seq = _make_sequences(train_dat, s_train_l, seq_train)
valid_seq, valid_lab, valid_lab_seq = _make_sequences(valid_dat, s_valid_l, seq_valid)
test_seq, test_lab, test_lab_seq = _make_sequences(test_dat, s_test_l, seq_test)

# Number of full mini-batches per split (informational only)
num_batches = train_seq.shape[0] // batch_size
valid_batches = valid_seq.shape[0] // batch_eval
test_batches = test_seq.shape[0] // batch_test

# Divide training and validation data into correct mini-batches.
# DataLoader's batch_size is the number of samples per batch, so the configured
# batch sizes are passed here (not the batch counts computed above).
training_set = TensorDataset(train_seq.to(device), train_lab_seq.to(device))
training_loader = DataLoader(training_set, shuffle = False, batch_size = batch_size)

valid_set = TensorDataset(valid_seq.to(device), valid_lab_seq.to(device))
valid_loader = DataLoader(valid_set, shuffle = False, batch_size = batch_eval)

test_set = TensorDataset(test_seq.to(device), test_lab_seq.to(device))
test_loader = DataLoader(test_set, shuffle = False, batch_size = batch_test)

In [None]:
### RNN code
# Size of each word-embedding vector
embed_dim = 500

# Recurrent-network hyperparameters
n_layers = 2          # stacked recurrent layers
hidden_dim = 500      # width of the recurrent hidden state
in_size = embed_dim   # recurrent input width equals the embedding width

# Vanilla RNN  
class ElmanRNN(nn.Module):
    """Word-level language model: embedding -> multi-layer Elman RNN -> linear decoder.

    Args:
        input_size: feature width fed to the RNN (must equal `embedding_dim`).
        hidden_size: width of the RNN hidden state.
        num_layers: number of stacked RNN layers.
        num_embeddings: number of rows in the embedding table (vocabulary size).
        embedding_dim: width of each embedding vector.
        hidden_dim: input width of the decoder; must equal `hidden_size`
            because the decoder is applied directly to the RNN output.
        num_unique: number of output classes (vocabulary size).
        drop_out: dropout probability between stacked RNN layers.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_embeddings, embedding_dim, hidden_dim, num_unique, drop_out):
        super(ElmanRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layer = num_layers
        self.drop_out = drop_out
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        # Stored so forward() does not depend on a module-level global.
        self.num_unique = num_unique

        self.embed = nn.Embedding(num_embeddings, embedding_dim)
        self.elman = nn.RNN(input_size, hidden_size, num_layers, dropout = drop_out, batch_first = True)
        self.linear = nn.Linear(hidden_dim, num_unique)

    def forward(self, mod_input):
        """Return `(logits, hidden)` for a batch of token ids.

        `mod_input` holds integer token ids laid out batch-first (batch, seq).
        `logits` is flattened to (batch * seq, num_unique), ready for
        cross-entropy against flattened targets; `hidden` is the final hidden
        state returned by `nn.RNN`.
        """
        word_embed = self.embed(mod_input)
        rnn_out, hidden_out = self.elman(word_embed)
        rnn_out = self.linear(rnn_out)
        # Use the instance's own vocabulary size, not the notebook global.
        rnn_out = rnn_out.reshape(-1, self.num_unique)

        return rnn_out, hidden_out
    
# LSTM 
class LSTMRNN(nn.Module):
    """Word-level language model: embedding -> multi-layer LSTM -> linear decoder.

    Args:
        input_size: feature width fed to the LSTM (must equal `embedding_dim`).
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_embeddings: number of rows in the embedding table (vocabulary size).
        embedding_dim: width of each embedding vector.
        hidden_dim: input width of the decoder; must equal `hidden_size`
            because the decoder is applied directly to the LSTM output.
        num_unique: number of output classes (vocabulary size).
        drop_out: dropout probability between stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_embeddings, embedding_dim, hidden_dim, num_unique, drop_out):
        super(LSTMRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layer = num_layers
        self.drop_out = drop_out
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        # Stored so forward() does not depend on a module-level global.
        self.num_unique = num_unique

        self.embed = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, dropout = drop_out, batch_first = True, bidirectional = False)
        self.linear = nn.Linear(hidden_dim, num_unique)

    def forward(self, mod_input):
        """Return `(logits, hidden)` for a batch of token ids.

        `mod_input` holds integer token ids laid out batch-first (batch, seq).
        `logits` is flattened to (batch * seq, num_unique); `hidden` is the
        `(h_n, c_n)` tuple returned by `nn.LSTM`.
        """
        word_embed = self.embed(mod_input)
        lstm_out, hidden_out = self.lstm(word_embed)
        lstm_out = self.linear(lstm_out)
        # Use the instance's own vocabulary size, not the notebook global.
        lstm_out = lstm_out.reshape(-1, self.num_unique)

        return lstm_out, hidden_out
    
# GRU 
class GRURNN(nn.Module):
    """Word-level language model: embedding -> multi-layer GRU -> linear decoder.

    Args:
        input_size: feature width fed to the GRU (must equal `embedding_dim`).
        hidden_size: width of the GRU hidden state.
        num_layers: number of stacked GRU layers.
        num_embeddings: number of rows in the embedding table (vocabulary size).
        embedding_dim: width of each embedding vector.
        hidden_dim: input width of the decoder; must equal `hidden_size`
            because the decoder is applied directly to the GRU output.
        num_unique: number of output classes (vocabulary size).
        drop_out: dropout probability between stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_embeddings, embedding_dim, hidden_dim, num_unique, drop_out):
        super(GRURNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layer = num_layers
        self.drop_out = drop_out
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        # Stored so forward() does not depend on a module-level global.
        self.num_unique = num_unique

        self.embed = nn.Embedding(num_embeddings, embedding_dim)
        self.gru = nn.GRU(input_size, hidden_size, num_layers, dropout = drop_out, batch_first = True, bidirectional = False)
        self.linear = nn.Linear(hidden_dim, num_unique)

    def forward(self, mod_input):
        """Return `(logits, hidden)` for a batch of token ids.

        `mod_input` holds integer token ids laid out batch-first (batch, seq).
        `logits` is flattened to (batch * seq, num_unique), ready for
        cross-entropy against flattened targets; `hidden` is the final hidden
        state returned by `nn.GRU`.
        """
        word_embed = self.embed(mod_input)
        gru_out, hidden_out = self.gru(word_embed)
        gru_out = self.linear(gru_out)
        # Use the instance's own vocabulary size, not the notebook global.
        gru_out = gru_out.reshape(-1, self.num_unique)

        return gru_out, hidden_out
        
# One network per recurrent cell type; the three share every size and differ
# only in cell type and dropout rate. Construction order: Elman, LSTM, GRU.
elman_rnn = ElmanRNN(input_size = in_size, hidden_size = hidden_dim, num_layers = n_layers,
                     num_embeddings = num_unique, embedding_dim = embed_dim,
                     hidden_dim = hidden_dim, num_unique = num_unique, drop_out = 0.5)
lstm_rnn = LSTMRNN(input_size = in_size, hidden_size = hidden_dim, num_layers = n_layers,
                   num_embeddings = num_unique, embedding_dim = embed_dim,
                   hidden_dim = hidden_dim, num_unique = num_unique, drop_out = 0.8)
gru_rnn = GRURNN(input_size = in_size, hidden_size = hidden_dim, num_layers = n_layers,
                 num_embeddings = num_unique, embedding_dim = embed_dim,
                 hidden_dim = hidden_dim, num_unique = num_unique, drop_out = 0.8)

# Selecting model
candidate_models = {"elman": elman_rnn, "lstm": lstm_rnn, "gru": gru_rnn}
model = candidate_models["gru"]

# Fresh parameters for the chosen network, then move it to the compute device
reset_model(model)
model.to(device)

# Optimizer: both candidates are built over the selected model's parameters
# with the same learning rate; plain SGD is the one in use.
l_rate = 10.0
sgd = optim.SGD(model.parameters(), lr = l_rate, momentum = 0.0, weight_decay = 0)
adam = optim.Adam(model.parameters(), lr = l_rate, betas = (0.9, 0.999), eps = 1e-08,
                  weight_decay = 0.0, amsgrad = False)

optimizer = sgd

# Cross entropy loss
loss = nn.CrossEntropyLoss()

# Training parameters
num_epochs = 50
train_epoch, valid_epoch = np.zeros(num_epochs), np.zeros(num_epochs)  # per-epoch mean losses

# Per-batch loss histories, appended to by the training/evaluation cell
train_loss, valid_loss, test_loss = [], [], []

# Gradient clipping
clipGrad = True

# Learning rate scheduler: both are attached to the active optimizer
# (exponential first, cosine second); the cosine schedule is the one stepped.
exp_sched = optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.5, last_epoch = -1, verbose = True)
cosine_sched = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = 500, eta_min = 0.1,
                                                    last_epoch = -1, verbose = True)
#plat_sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.1, patience = 10, threshold = 0.0001, threshold_mode = 'rel', cooldown = 0, min_lr = 0, eps = 1e-08, verbose = True)

sched = cosine_sched
useScheduler = True

In [None]:
# Training and evaluation on validation data
def _batch_losses(net, loader, criterion):
    """Return the per-batch loss values of `net` over `loader`.

    The network is switched to evaluation mode once and gradients are not
    tracked. Targets are flattened to match the flattened logits that the
    model returns as its first output.
    """
    net.eval()
    batch_values = []
    with torch.no_grad():
        for inputs, targets in loader:
            preds, _ = net(inputs)
            batch_values.append(criterion(preds, targets.flatten()).item())
    return batch_values


for epoch in range(num_epochs):
    train_losss = []  # per-batch training losses of the current epoch

    # Train mode only needs to be set once per epoch, not once per batch.
    model.train()
    for data, lab_train in training_loader:
        optimizer.zero_grad()

        pred_out, hid_out = model(data)

        ce_loss = loss(pred_out, lab_train.flatten())

        # Read the scalar once and record it in both the global and the
        # per-epoch history.
        loss_value = ce_loss.item()
        train_loss.append(loss_value)
        train_losss.append(loss_value)

        ce_loss.backward()

        # Using gradient clipping
        if clipGrad:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm = 0.5, norm_type = 2.0)

        optimizer.step()

        # Using learning rate scheduler (stepped once per mini-batch)
        if useScheduler:
            sched.step()

    train_epoch[epoch] = np.mean(train_losss)

    # Validation pass: per-batch losses of the current epoch
    valid_losss = _batch_losses(model, valid_loader, loss)
    valid_loss.extend(valid_losss)
    valid_epoch[epoch] = np.mean(valid_losss)

    print("Epoch: %s" % (epoch + 1))

# Evaluation on test data
test_loss.extend(_batch_losses(model, test_loader, loss))

In [None]:
def _draw_loss_curves(fig_id, train_series, valid_series, x_label, title, train_note, valid_note):
    """Plot a training (red) and validation (blue) curve on figure `fig_id`.

    Adds axis labels, a title, a legend and the two annotation strings, leaves
    the figure current, and returns the two line handles.
    """
    plt.figure(fig_id, figsize = (6.4, 4.8))
    train_line, = plt.plot(train_series, 'r')
    valid_line, = plt.plot(valid_series, 'b')
    plt.xlabel(x_label)
    plt.ylabel("Loss")
    plt.title(title)
    plt.legend([train_line, valid_line], ['Train loss', 'Validation loss'])
    plt.annotate(train_note, xycoords = 'figure fraction', xy = (0.25,0.55))
    plt.annotate(valid_note, xycoords = 'figure fraction', xy = (0.25,0.50))
    return train_line, valid_line


# Figure 1: mean loss per epoch (saved to disk)
train, valid = _draw_loss_curves(
    1, train_epoch, valid_epoch, "Epoch", "Averaged training and validation loss",
    "Final average training loss: %s" % (train_epoch[-1]),
    "Final average validation loss: %s" % (valid_epoch[-1]))
plt.savefig("train_valid_epoch", dpi = 500)
print("Final average training loss: %s." % train_epoch[-1])
print("Final average validation loss: %s." % valid_epoch[-1])
print()

# Figure 2: loss of every recorded batch across all epochs
train_it, valid_it = _draw_loss_curves(
    2, train_loss, valid_loss, "Iteration", "Training and validation loss",
    "Final training loss: %s" % (train_loss[-1]),
    "Final validation loss: %s" % (valid_loss[-1]))
#plt.savefig("train_valid_loss_iter", dpi = 500)
print("Final training loss: %s." % train_loss[-1])
print("Final validation loss: %s." % valid_loss[-1])
print()

# Figure 3: per-batch losses of the last epoch only
train_fin_ep, valid_fin_ep = _draw_loss_curves(
    3, train_losss, valid_losss, "Iteration", "Training and validation loss final epoch",
    "Training loss final epoch: %s" % (train_losss[-1]),
    "Validation loss final epoch: %s" % (valid_losss[-1]))
#plt.savefig("train_valid_final_epoch", dpi = 500)

# Perplexity is the exponential of the mean per-batch test cross-entropy
mean_test_loss = np.mean(test_loss)
perplexity = np.exp(mean_test_loss)
print("Test perplexity: %s." % (perplexity))

In [None]:
# Generate sample phrases
dictionary = unique_dict # number of unique words/tokens is 10000
dict_keys = list(dictionary.keys())
dict_values = list(dictionary.values())
# Reverse lookup built once, instead of a linear list search per generated word
id_to_word = {int(idx): word for word, idx in dictionary.items()}

# Words (candidate seed words with their integer ids)
w_1 = "there"
i_1 = dictionary[w_1]
w_2 = "is"
i_2 = dictionary[w_2]
w_3 = "a"
i_3 = dictionary[w_3]
w_4 = "why"
i_4 = dictionary[w_4]
w_5 = "on"
i_5 = dictionary[w_5]

# Input sequence
# The seed is ONE sequence of two tokens, shape (1, 2): the models are
# batch-first, so tokens must lie along dim 1 for the recurrent layers to carry
# context from word to word. A (2, 1) layout would be two unrelated length-1
# sequences.
# NOTE(review): as before, the first token is w_1 and the second is drawn at
# random — confirm the random second seed token is intended.
input_tens = torch.randint(num_unique, (1, 2)).long().to(device)
input_tens[0, 0] = int(i_1)
print("Input", input_tens)

# One single-item list per token, the layout consumed by the decoding loop below
inputs_list = [[tok] for tok in input_tens[0].tolist()]
print("List input", inputs_list)

softmax_layer = nn.Softmax(dim = 1)
num_words = 5 # how many words to predict
model.eval()
with torch.no_grad():
    for n in range(num_words):
        out, hidden = model(input_tens)
        probs = softmax_layer(out)

        # With a single sequence the rows of `probs` are the time steps, so the
        # last row is the distribution over the word following the sequence.
        # Greedy decoding picks the most probable token there.
        maxv, maxi = torch.max(probs[-1], dim = 0)
        next_word = int(maxi.item())

        inputs_list.append([next_word])

        # Extend the sequence along the time dimension for the next step
        seq_ids = input_tens[0].tolist()
        seq_ids.append(next_word)
        input_tens = torch.tensor([seq_ids]).long().to(device)


print(inputs_list)

# Map every token id back to its word
output_sentence = [id_to_word[i[0]] for i in inputs_list]

print(output_sentence)