In [None]:
from google.colab import drive
# Mount Google Drive through the public mount() entry point; the underscore-prefixed
# _mount helper is private to google.colab and not part of its documented interface.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%matplotlib inline

import numpy as np
from matplotlib import pyplot as plt
import time
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
# from tests import test_prediction, test_generation

In [None]:
import numpy as np

def log_softmax(x, axis):
    """Return the log-softmax of ``x`` computed along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating so large
    logits do not overflow. The result has the same shape as ``x``.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_total = np.exp(shifted).sum(axis=axis, keepdims=True)
    return shifted - np.log(exp_total)


def array_to_str(arr, vocab):
    """Look up every index of ``arr`` in ``vocab`` and join the entries with single spaces."""
    words = [vocab[idx] for idx in arr]
    return " ".join(words)


def test_prediction(out, targ):
    """Return the mean negative log-likelihood of ``targ`` under the logits ``out``.

    ``out`` is normalised with a log-softmax over axis 1, then the entry
    ``[row, targ[row]]`` is picked for every row and the picks are averaged.
    """
    log_probs = log_softmax(out, 1)
    rows = np.arange(log_probs.shape[0])
    picked = log_probs[rows, targ]
    return -np.mean(picked)

def test_generation(inp, pred, vocab):
    """Format every input row and its predicted row as one line of text.

    Row ``i`` of ``inp`` and row ``i`` of ``pred`` are both converted to
    space-separated words via ``array_to_str``; the lines are concatenated
    into a single string, each terminated by a newline.
    """
    lines = []
    for row in range(inp.shape[0]):
        prompt = array_to_str(inp[row], vocab)
        continuation = array_to_str(pred[row], vocab)
        lines.append(u"Input | Output #{}: {} | {}\n".format(row, prompt, continuation))
    return u"".join(lines)

In [None]:
# load all that we need

# Single place to change when the assignment folder lives somewhere else on Drive.
DATA_ROOT = '/content/gdrive/MyDrive/IDL-HW4-P1'
DATASET_DIR = os.path.join(DATA_ROOT, 'dataset')
FIXTURES_DIR = os.path.join(DATA_ROOT, 'fixtures')

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only use it
# on files from a trusted source.
dataset = np.load(os.path.join(DATASET_DIR, 'wiki.train.npy'), allow_pickle=True)
devset = np.load(os.path.join(DATASET_DIR, 'wiki.valid.npy'), allow_pickle=True)
fixtures_pred = np.load(os.path.join(FIXTURES_DIR, 'prediction.npz'))  # dev - 2 keys - inp,out
fixtures_gen = np.load(os.path.join(FIXTURES_DIR, 'generation.npy'))  # dev
fixtures_pred_test = np.load(os.path.join(FIXTURES_DIR, 'prediction_test.npz'))  # test
fixtures_gen_test = np.load(os.path.join(FIXTURES_DIR, 'generation_test.npy'))  # test
vocab = np.load(os.path.join(DATASET_DIR, 'vocab.npy'))

In [None]:
# data loader

class LanguageModelDataLoader(DataLoader):
    """Batches of fixed-length (input, target) token sequences for language modelling.

    All articles are concatenated into one token stream, which is cut into
    non-overlapping windows of ``sequence_len`` tokens. The target of a window
    is the same window shifted one token to the right.

    Args:
        dataset: sequence of 1-D token arrays (one per article).
        batch_size: number of sequences per yielded batch.
        shuffle: when True, the order of the windows is re-drawn on every
            iteration over the loader.
        sequence_len: number of tokens per sequence (defaults to 50).

    Each iteration yields ``(inputs, targets)``, two lists of ``batch_size``
    arrays of length ``sequence_len``. Windows that do not fill a complete
    batch are dropped.
    """
    def __init__(self, dataset, batch_size, shuffle=True, sequence_len=50):
        # The base DataLoader initializer is not invoked; iteration is
        # implemented entirely by this class.
        self.dataset = np.concatenate(dataset)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.sequence_len = sequence_len

    def _sequence_starts(self):
        """Start offsets of every window that still has a full shifted target."""
        # A window starting at s needs tokens up to index s + sequence_len (inclusive)
        # for its target, hence the exclusive bound of len - sequence_len.
        return np.arange(0, self.dataset.shape[0] - self.sequence_len, self.sequence_len)

    def __len__(self):
        """Number of complete batches produced by one pass over the loader."""
        return len(self._sequence_starts()) // self.batch_size

    def __iter__(self):
        starts = self._sequence_starts()
        if self.shuffle:
            starts = np.random.permutation(starts)

        # Only complete batches are emitted; batches are built lazily so nothing
        # accumulates over the epoch.
        usable = (len(starts) // self.batch_size) * self.batch_size
        for first in range(0, usable, self.batch_size):
            batch_starts = starts[first:first + self.batch_size]
            seq_data = [self.dataset[s:s + self.sequence_len] for s in batch_starts]
            seq_label = [self.dataset[s + 1:s + 1 + self.sequence_len] for s in batch_starts]
            yield (seq_data, seq_label)
                
        # raise NotImplemented

        
        

In [None]:
class LockedDropout(nn.Module):
    """Dropout that reuses one mask for every time step of a sequence.

    **Thank you** to Sales Force for their initial implementation of :class:`WeightDrop`. Here is
    their `License
    <https://github.com/salesforce/awd-lstm-lm/blob/master/LICENSE>`__.

    Args:
        p (float): Probability of an element in the dropout mask to be zeroed.
    """

    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x):
        """
        Args:
            x (:class:`torch.FloatTensor` [sequence length, batch size, rnn hidden size]): Input to
                apply dropout to.

        Returns:
            ``x`` itself when the module is in eval mode or ``p`` is falsy; otherwise a new
            tensor in which the same (batch, hidden) mask is applied at every position of
            the first dimension.
        """
        if self.training and self.p:
            keep_prob = 1 - self.p
            # One mask of shape (1, batch, hidden), rescaled so the expected value is kept.
            mask = x.new_empty(1, x.size(1), x.size(2), requires_grad=False)
            mask.bernoulli_(keep_prob).div_(keep_prob)
            return x * mask.expand_as(x)
        return x

    def __repr__(self):
        return '{}(p={})'.format(self.__class__.__name__, str(self.p))

In [None]:
# model
class LanguageModel(nn.Module):
    """Embedding followed by three stacked unidirectional LSTMs and a linear output layer.

    Dropout is applied before each LSTM: a LockedDropout before the first,
    regular ``nn.Dropout`` before the second and third.
    """
    def __init__(self, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size

        # Layer widths: embedding -> lstm1 -> lstm2 -> lstm3.
        emb_dim, width1, width2 = 32, 256, 512
        width3 = width2 * 2

        # Layers are created in the same order as they are applied in forward().
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.drop1 = LockedDropout(0.5)
        self.lstm1 = nn.LSTM(input_size=emb_dim, hidden_size=width1, num_layers=1, bidirectional=False)
        self.drop2 = nn.Dropout(0.5)
        self.lstm2 = nn.LSTM(input_size=width1, hidden_size=width2, num_layers=1, bidirectional=False)
        self.drop3 = nn.Dropout(0.5)
        self.lstm3 = nn.LSTM(input_size=width2, hidden_size=width3, num_layers=1, bidirectional=False)
        self.fc1 = nn.Linear(width3, vocab_size)

    def forward(self, x):
        """Compute next-token logits for every position of ``x``.

        ``x`` holds token indices; the code treats it as (batch, sequence).
        The returned logits have the vocabulary on dimension 1, i.e.
        (batch, vocab, sequence).
        """
        # The LSTMs are sequence-first, so swap batch and time after the embedding.
        hidden = self.emb(x).permute(1, 0, 2)
        stages = (
            (self.drop1, self.lstm1),
            (self.drop2, self.lstm2),
            (self.drop3, self.lstm3),
        )
        for dropout, lstm in stages:
            hidden = lstm(dropout(hidden))[0]
        # Back to batch-first for the projection, then move the vocabulary to dim 1.
        logits = self.fc1(hidden.permute(1, 0, 2))
        return logits.permute(0, 2, 1)


    


In [None]:
# model trainer

class LanguageModelTrainer:
    """Runs training epochs, evaluates on the fixtures and saves per-epoch artifacts."""

    def __init__(self, model, loader, max_epochs=1, run_id='exp'):
        """
            Use this class to train your model

            :param model: the network to train; it is moved to the GPU when one is available
            :param loader: iterable yielding (inputs, targets) batches
            :param max_epochs: total number of epochs, used only in log messages
            :param run_id: name of the sub-directory of ``experiments`` used by save()
        """
        # feel free to add any other parameters here
        # Fall back to the CPU when CUDA is unavailable instead of failing outright.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = model.to(self.device)
        self.loader = loader
        self.train_losses = []
        self.val_losses = []
        self.predictions = []
        self.predictions_test = []
        self.generated_logits = []
        self.generated = []
        self.generated_logits_test = []
        self.generated_test = []
        self.epochs = 0
        self.max_epochs = max_epochs
        self.run_id = run_id

        # TODO: Define your optimizer and criterion here
        self.optimizer = torch.optim.ASGD(self.model.parameters(), lr=0.1, weight_decay=0.1)
        # self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min')
        self.criterion = nn.CrossEntropyLoss()

    def train(self):
        """Train for one epoch and append the mean batch loss to ``train_losses``."""
        self.model.train() # set to training mode
        epoch_loss = 0.0
        num_batches = 0
        for batch_num, (inputs, targets) in enumerate(self.loader):
            inputs = torch.as_tensor(np.array(inputs), dtype=torch.long).to(self.device)
            targets = torch.as_tensor(np.array(targets), dtype=torch.long).to(self.device)
            if batch_num % 10 == 9:
              print('Processed %d batches'%(batch_num+1))
            epoch_loss += self.train_batch(inputs, targets)
            num_batches += 1
        # self.scheduler.step()
        # Guard against an empty loader, which would otherwise divide by an undefined count.
        epoch_loss = epoch_loss / max(num_batches, 1)
        self.epochs += 1
        # self.epochs already counts the epoch that just finished.
        print('[TRAIN]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs, self.max_epochs, epoch_loss))
        self.train_losses.append(epoch_loss)

    def train_batch(self, inputs, targets):
        """
            Run one optimisation step on a single batch.

            :param inputs: LongTensor of token indices fed to the model
            :param targets: LongTensor of target indices passed to the criterion
            :return: the batch loss as a Python float
        """
        # Clear gradients left over from the previous batch; without this they accumulate.
        self.optimizer.zero_grad()
        out = self.model(inputs)
        loss = self.criterion(out, targets)
        loss.backward()
        self.optimizer.step()
        # Return a plain float so the autograd graph is released and the value can be plotted.
        return loss.item()


    def test(self):
        """Evaluate on the dev/test fixtures, record the results and return the dev NLL."""
        # don't change these
        self.model.eval() # set to eval mode
        predictions = TestLanguageModel.prediction(fixtures_pred['inp'], self.model) # get predictions
        self.predictions.append(predictions)
        generated_logits = TestLanguageModel.generation(fixtures_gen, 10, self.model) # generated predictions for 10 words
        generated_logits_test = TestLanguageModel.generation(fixtures_gen_test, 10, self.model)
        nll = test_prediction(predictions, fixtures_pred['out'])
        generated = test_generation(fixtures_gen, generated_logits, vocab)
        generated_test = test_generation(fixtures_gen_test, generated_logits_test, vocab)
        self.val_losses.append(nll)

        self.generated.append(generated)
        self.generated_test.append(generated_test)
        self.generated_logits.append(generated_logits)
        self.generated_logits_test.append(generated_logits_test)

        # generate predictions for test data
        predictions_test = TestLanguageModel.prediction(fixtures_pred_test['inp'], self.model) # get predictions
        self.predictions_test.append(predictions_test)

        # Report the same epoch number as train(); only the logged number changed here.
        print('[VAL]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs, self.max_epochs, nll))
        return nll

    def save(self):
        """Write the model weights and the latest predictions/generations for this epoch."""
        # don't change these
        model_path = os.path.join('experiments', self.run_id, 'model-{}.pkl'.format(self.epochs))
        torch.save({'state_dict': self.model.state_dict()},
            model_path)
        np.save(os.path.join('experiments', self.run_id, 'predictions-{}.npy'.format(self.epochs)), self.predictions[-1])
        np.save(os.path.join('experiments', self.run_id, 'predictions-test-{}.npy'.format(self.epochs)), self.predictions_test[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-{}.npy'.format(self.epochs)), self.generated_logits[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-test-{}.npy'.format(self.epochs)), self.generated_logits_test[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}-test.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated_test[-1])


In [None]:
class TestLanguageModel:
    """Inference helpers used for evaluation: next-token logits and greedy generation."""

    @staticmethod
    def _as_model_input(inp, model):
        """Convert ``inp`` to a LongTensor on the device the model's parameters live on."""
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
        return torch.as_tensor(np.asarray(inp, dtype=np.int64)).to(device)

    @staticmethod
    def prediction(inp, model):
        """
            Score the token that follows each input sequence.

            :param inp: token indices (batch size, length)
            :param model: callable returning logits shaped (batch, vocab, length)
            :return: a np.ndarray of logits (batch size, vocab) for the last position
        """
        tokens = TestLanguageModel._as_model_input(inp, model)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            out = model(tokens)
        return out[:, :, -1].cpu().numpy()


    @staticmethod
    def generation(inp, forward, model):
        """
            Generate a sequence of words given a starting sequence.

            Decoding is greedy: at every step the most likely next token is
            appended to the running sequence, which is then fed back to the model.

            :param inp: Initial sequence of words (batch size, length)
            :param forward: number of additional words to generate
            :param model: callable returning logits shaped (batch, vocab, length)
            :return: generated words (batch size, forward)
        """
        tokens = TestLanguageModel._as_model_input(inp, model)
        generated = []
        with torch.no_grad():
            for _ in range(forward):
                # Logits at the final position predict the token that comes next.
                last_logits = model(tokens)[:, :, -1]
                next_token = last_logits.argmax(dim=1, keepdim=True)
                generated.append(next_token)
                tokens = torch.cat((tokens, next_token), dim=1)
        if not generated:
            return np.zeros((tokens.shape[0], 0), dtype=np.int64)
        return torch.cat(generated, dim=1).cpu().numpy()
        # raise NotImplemented
        

In [None]:
# TODO: define other hyperparameters here

# Number of passes of the training loop over the loader.
NUM_EPOCHS = 20
# Number of sequences per batch handed to LanguageModelDataLoader.
BATCH_SIZE = 256


In [None]:
# The run id is the current Unix time in whole seconds; all artifacts go under ./experiments/<run_id>.
run_id = str(int(time.time()))
# makedirs creates ./experiments when missing and, with exist_ok=True, does not fail
# if this cell is re-run within the same second.
os.makedirs('./experiments/%s' % run_id, exist_ok=True)
print("Saving models, predictions, and generated words to ./experiments/%s" % run_id)

Saving models, predictions, and generated words to ./experiments/1639073926


In [None]:
# dataset[:60].shape

In [None]:
# Training batches are drawn from the training articles loaded above.
loader = LanguageModelDataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# One output logit per vocabulary entry.
model = LanguageModel(len(vocab))
# NOTE(review): the trainer keeps its epoch counter across cells; re-run this cell before
# re-running the training loop, otherwise training continues past max_epochs.
trainer = LanguageModelTrainer(model=model, loader=loader, max_epochs=NUM_EPOCHS, run_id=run_id)

In [None]:
# Train for NUM_EPOCHS epochs; artifacts are saved only when the validation NLL improves.
best_nll = 1e30
for epoch in range(NUM_EPOCHS):
    trainer.train()
    nll = trainer.test()
    if not (nll < best_nll):
        # No improvement this epoch: keep the previously saved artifacts.
        continue
    best_nll = nll
    print("Saving model, predictions and generated output for epoch "+str(epoch)+" with NLL: "+ str(best_nll))
    trainer.save()
    

Processed 10 batches
Processed 20 batches
Processed 30 batches
Processed 40 batches
Processed 50 batches
Processed 60 batches
Processed 70 batches
Processed 80 batches
Processed 90 batches
Processed 100 batches
Processed 110 batches
Processed 120 batches
Processed 130 batches
Processed 140 batches
Processed 150 batches
Processed 160 batches
[TRAIN]  Epoch [22/20]   Loss: 6.0113
output shape in prediction is (128, 33278)
output shape in generation is (32, 10)
unique values [    1    72    73    76    79    87    88  1417  1419  1420  1424  1425
 13276 14549 14606 14658 15340 16134 16176 16802 21201 21626 22200 22968
 23592 23956 25821 31352 31353 31543 32747]
output shape in generation is (128, 10)
unique values [    1    72    73    76    79    86    88  1417  1419  1420  1423  1424
  1425  1821  7012  8204 13276 13774 14118 14549 14658 15219 15340 16134
 16176 16802 17154 18779 20243 20398 21201 21626 22200 22968 23592 23816
 23956 24452 24958 25639 25821 26190 29092 29294 30547 31348

In [None]:
                                                                                                                                                                                                                                                                                                                                                                                                                                                        # Don't change these
# plot training curves
plt.figure()
plt.plot(range(1, trainer.epochs + 1), trainer.train_losses, label='Training losses')
plt.plot(range(1, trainer.epochs + 1), trainer.val_losses, label='Validation losses')
plt.xlabel('Epochs')
plt.ylabel('NLL')
plt.legend()
plt.show()

In [None]:
# see generated output
# trainer.generated holds one formatted text block per call to trainer.test().
print (trainer.generated[-1]) # get last generated output