In [1]:
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
import torch.optim as optim
import torch.nn.utils as utils
import seaborn as sns
import matplotlib.pyplot as plt
import time
import random
from torch.utils import data

cuda = torch.cuda.is_available()
print(cuda, sys.version)
device = torch.device("cuda" if cuda else "cpu")
np.random.seed(420)
torch.manual_seed(420)

LETTER_LIST = ['<sos>', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 
               'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '-', "'", '.', '_', '+', ' ', '<eos>']

True 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]


In [2]:
from google.colab import drive
drive._mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from typing import List, Tuple

def create_dictionaries(letter_list: List[str]):# -> Tuple[Dict[str, int], Dict[int, str]]:
    '''
    Create dictionaries for letter2index and index2letter transformations
    '''
    letter2index = {letter_list[i]:i for i in range(len(letter_list))}
    index2letter = {j:i for (i,j) in letter2index.items()}
    return letter2index, index2letter
    
def transform_letter_to_index(raw_transcripts,letter2index):#: List[str]) -> List[int]:
    '''
    Transforms text input to numerical input by converting each letter 
    to its corresponding index from letter_list

    Args:
        raw_transcripts: Raw text transcripts with the shape of (N, )
    
    Return:
        transcripts: Converted index-format transcripts. This would be a list with a length of N
    '''  
    return [letter2index[i] for i in raw_transcripts]
    
# Create the letter2index and index2letter dictionary
letter2index, index2letter = create_dictionaries(LETTER_LIST)

In [4]:
cd '/content/drive/MyDrive/IDL-HW4-P2/hw4p2_toy_dataset'

/content/drive/MyDrive/IDL-HW4-P2/hw4p2_toy_dataset


In [5]:
# Load the training, validation and testing data
train_data = np.load('train.npz', allow_pickle=True, encoding='bytes')
valid_data = np.load('dev.npz', allow_pickle=True, encoding='bytes')
# test_data = np.load('test.npy', allow_pickle=True, encoding='bytes')

# Load the training, validation raw text transcripts
raw_train_transcript = np.load('train_transcripts.npz', allow_pickle=True,encoding='bytes')
raw_valid_transcript = np.load('dev_transcripts.npz', allow_pickle=True,encoding='bytes')

# TODO: Convert the raw text transcripts into indexes
train_transcript = [transform_letter_to_index(list(raw_train_transcript['data'][i]),letter2index) for i in range(len(raw_train_transcript['data']))] 
valid_transcript = [transform_letter_to_index(list(raw_valid_transcript['data'][i]),letter2index) for i in range(len(raw_valid_transcript['data']))]

In [6]:
class MyDataset(data.Dataset):
    def __init__(self, X, Y):
        self.X = X#.astype() if you want its data type converted
        self.Y = Y#.astype()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, index):
        # For testing set, return only x
        if self.Y is None:
            return torch.as_tensor(self.X[index])
        # For training and validation set, return x and y
        else:
            return torch.as_tensor(self.X[index]), torch.as_tensor(self.Y[index])

def collate_train_val(data):
    """
    Return:
        pad_x: the padded x (training/validation speech data) 
        pad_y: the padded y (text labels - transcripts)
        x_len: the length of x
        y_len: the length of y
    """
    # Consider batch_first = True
    x_batch = [x for (x,y) in data]
    y_batch = [y for (x,y) in data]

    x_len = []
    y_len = []

    for i in range(len(x_batch)):
        x_len.append(x_batch[i].shape[0])
    
    for i in range(len(y_batch)):
        y_len.append(y_batch[i].shape[0])

    x_len = torch.tensor(x_len)
    y_len = torch.tensor(y_len)

    pad_x = rnn_utils.pad_sequence(x_batch,batch_first=True)
    pad_y = rnn_utils.pad_sequence(y_batch,batch_first=True)
    
    return pad_x, pad_y, x_len, y_len
    

def collate_test(data):
    """
    Return:
        pad_x: the padded x (testing speech data) 
        x_len: the length of x
    """
    # Consider batch_first = True
    x_batch = [x for (x,y) in data]
    y_batch = [y for (x,y) in data]

    x_len = []
    y_len = []

    for i in range(10):
        x_len.append(x_batch[i].shape[0])
    
    for i in range(10):
        y_len.append(y_batch[i].shape[0])

    x_len = torch.tensor(x_len)
    y_len = torch.tensor(y_len)

    pad_x = rnn_utils.pad_sequence(x_batch,batch_first=True)
    pad_y = rnn_utils.pad_sequence(y_batch,batch_first=True)
    
    return pad_x, pad_y, x_len, y_len
    

In [7]:
# Create datasets
train_dataset = MyDataset(train_data['data'],train_transcript)
valid_dataset = MyDataset(valid_data['data'],valid_transcript)
test_dataset = MyDataset(valid_data['data'],valid_transcript) # fill this out

# Create data loaders
train_loader = torch.utils.data.DataLoader(train_dataset,batch_size=128,shuffle=True,num_workers=4,pin_memory=True,collate_fn=collate_train_val)
valid_loader = torch.utils.data.DataLoader(valid_dataset,batch_size=128,shuffle=False,num_workers=4,pin_memory=True,collate_fn=collate_train_val)
valid_loader = torch.utils.data.DataLoader(valid_dataset,batch_size=128,shuffle=False,num_workers=4,pin_memory=True,collate_fn=collate_train_val)

In [8]:
class Encoder(nn.Module):
    '''
    Encoder takes the utterances as inputs and returns the key, value and unpacked_x_len.

    '''
    def __init__(self, input_dim, encoder_hidden_dim, key_value_size=128):
        super(Encoder, self).__init__()
        # The first LSTM layer at the bottom
        self.lstm = nn.LSTM(input_dim,encoder_hidden_dim,batch_first=True,num_layers=2,bidirectional=True)
        self.key_network = nn.Linear(encoder_hidden_dim*2,key_value_size)
        self.value_network = nn.Linear(encoder_hidden_dim*2,key_value_size)

    def forward(self, x, x_len):
        """
        1. Pack your input and pass it through the first LSTM layer (no truncation)
        2. Pass it through the pyramidal LSTM layer
        3. Pad your input back to (B, T, *) or (T, B, *) shape
        4. Output Key, Value, and truncated input lens

        Key and value could be
            (i) Concatenated hidden vectors from all time steps (key == value).
            (ii) Linear projections of the output from the last pBLSTM network.
                If you choose this way, you can use the final output of
                your pBLSTM network.
        """
        pack_encoder = rnn_utils.pack_padded_sequence(x,batch_first=True,enforce_sorted=False,lengths=x_len)

        out_lstm, _ = self.lstm(pack_encoder)

        unpack_encoder, len_encoder = rnn_utils.pad_packed_sequence(out_lstm,batch_first=True)

        key = self.key_network(unpack_encoder)
        value = self.value_network(unpack_encoder)
        
        return key, value, len_encoder

In [9]:
def plot_attention(attention):
    # utility function for debugging
    plt.clf()
    sns.heatmap(attention, cmap='GnBu')
    plt.show()

class Attention(nn.Module):
    '''
    Attention is calculated using key and value from encoder and query from decoder.
    Here are different ways to compute attention and context:
    1. Dot-product attention
        energy = bmm(key, query) 
        # Optional: Scaled dot-product by normalizing with sqrt key dimension
        # Check "attention is all you need" Section 3.2.1
    * 1st way is what most TAs are comfortable with, but if you want to explore...
    2. Cosine attention
        energy = cosine(query, key) # almost the same as dot-product xD 
    3. Bi-linear attention
        W = Linear transformation (learnable parameter): d_k -> d_q
        energy = bmm(key @ W, query)
    4. Multi-layer perceptron
        # Check "Neural Machine Translation and Sequence-to-sequence Models: A Tutorial" Section 8.4
    
    After obtaining unnormalized attention weights (energy), compute and return attention and context, i.e.,
    energy = mask(energy) # mask out padded elements with big negative number (e.g. -1e9)
    attention = softmax(energy)
    context = bmm(attention, value)

    5. Multi-Head Attention
        # Check "attention is all you need" Section 3.2.2
        h = Number of heads
        W_Q, W_K, W_V: Weight matrix for Q, K, V (h of them in total)
        W_O: d_v -> d_v

        Reshape K: (B, T, d_k)
        to (B, T, h, d_k // h) and transpose to (B, h, T, d_k // h)
        Reshape V: (B, T, d_v)
        to (B, T, h, d_v // h) and transpose to (B, h, T, d_v // h)
        Reshape Q: (B, d_q)
        to (B, h, d_q // h)

        energy = Q @ K^T
        energy = mask(energy)
        attention = softmax(energy)
        multi_head = attention @ V
        multi_head = multi_head reshaped to (B, d_v)
        context = multi_head @ W_O
    '''
    def __init__(self):
        super(Attention, self).__init__()
        # Optional: dropout

    def forward(self, query, key, value, mask):
        """
        input:
            key: (batch_size, seq_len, d_k)
            value: (batch_size, seq_len, d_v)
            query: (batch_size, d_q)
        * Hint: d_k == d_v == d_q is often true if you use linear projections
        return:
            context: (batch_size, key_val_dim)
        
        """
        # query = torch.tensor(query)
        query = torch.unsqueeze(query,dim=2)

        energy = torch.unsqueeze(mask, 2) * torch.bmm(key,query)

        attention = torch.softmax(energy,dim=1)

        context = torch.bmm(torch.transpose(attention,1,2),value)

        context = torch.squeeze(context)
        return context, attention
        # we return attention weights for plotting (for debugging)

In [19]:
class Decoder(nn.Module):
    '''
    As mentioned in a previous recitation, each forward call of decoder deals with just one time step.
    Thus we use LSTMCell instead of LSTM here.
    The output from the last LSTMCell can be used as a query for calculating attention.
    Methods like Gumble noise and teacher forcing can also be incorporated for improving the performance.
    '''
    def __init__(self, vocab_size, decoder_hidden_dim, embed_dim, key_value_size=128):
        super(Decoder, self).__init__()
        # Hint: Be careful with the padding_idx
        self.embedding = nn.Embedding(vocab_size,embed_dim)
        # The number of cells is defined based on the paper
        self.lstm1 = nn.LSTMCell(key_value_size+embed_dim,decoder_hidden_dim)
        # self.lstm2 = nn.LSTMCell()
    
        self.attention = Attention()     
        self.vocab_size = vocab_size
        # Optional: Weight-tying
        self.character_prob = nn.Linear(key_value_size+embed_dim,vocab_size) #: d_v -> vocab_size
        self.key_value_size = key_value_size
        
        # Weight tying
        # self.character_prob.weight = self.embedding.weight

    def forward(self, key, value, encoder_len, y=None, mode='train'):
        '''
        Args:
            key :(B, T, d_k) - Output of the Encoder (possibly from the Key projection layer)
            value: (B, T, d_v) - Output of the Encoder (possibly from the Value projection layer)
            y: (B, text_len) - Batch input of text with text_length
            mode: Train or eval mode for teacher forcing
        Return:
            predictions: the character perdiction probability 
        '''

        B, key_seq_max_len, key_value_size = key.shape

        if mode == 'train':
            max_len =  y.shape[1]
            char_embeddings = self.embedding(y)
        else:
            max_len = 600

        # TODO: Create the attention mask here (outside the for loop rather than inside) to aviod repetition
        mask = torch.ones(B,key_seq_max_len) *1e-5
        
        for i in range(B):
            mask[i,:encoder_len[i]] = 1
        mask = mask.to(device)
        
        predictions = []
        # This is the first input to the decoder
        # What should the fill_value be?
        prediction = torch.full((B,), fill_value= 0, device=device)
        # The length of hidden_states vector should depend on the number of LSTM Cells defined in init
        # The paper uses 2
        hidden_states = [None, None] 
        
        # TODO: Initialize the context
        context = torch.zeros(B,key_value_size)
        context = context.to(device)

        attention_plot = [] # this is for debugging

        for i in range(max_len):
            if mode == 'train':
                # TODO: Implement Teacher Forcing
                """
                if using teacher_forcing:
                    if i == 0:
                        # This is the first time step
                        # Hint: How did you initialize "prediction" variable above?
                    else:
                        # Otherwise, feed the label of the *previous* time step
                else:
                    char_embed = embedding of the previous prediction
                """     
                if np.random.randint(0,10) >= 5:

                    if i == 0:

                        prediction = prediction.to(device)
                        char_embed = self.embedding(prediction)

                    else:

                        char_emb = char_embeddings[:,i,:]
                
                else:
                  
                    prediction = prediction.to(device)
                    char_embed = self.embedding(prediction)
            else:
                prediction = prediction.to(device)
                char_embed = self.embedding(prediction)

            # what vectors should be concatenated as a context?
            y_context = torch.cat([char_embed,context], dim=1)
            # context and hidden states of lstm 1 from the previous time step should be fed
            hidden_states,_ = self.lstm1(y_context)
            query = hidden_states
            
            # Compute attention from the output of the second LSTM Cell
            context, attention = self.attention(query, key, value, mask)
            #print('After Attention')

            # We store the first attention of this batch for debugging
            attention_plot.append(attention[:,:,0].detach().cpu())
          
            output_context = torch.cat([query,context], dim=1)
            prediction = self.character_prob(output_context)
            # store predictions
            predictions.append(torch.unsqueeze(prediction,dim=1))
            prediction = torch.argmax(F.softmax(prediction,dim=1),dim=1)
        # Concatenate the attention and predictions to return
        attentions = torch.stack(attention_plot, dim=0)
        predictions = torch.cat(predictions, dim=1)
        return predictions, attentions

In [20]:
class Seq2Seq(nn.Module):
    '''
    We train an end-to-end sequence to sequence model comprising of Encoder and Decoder.
    This is simply a wrapper "model" for your encoder and decoder.
    '''
    def __init__(self, input_dim, vocab_size, encoder_hidden_dim, decoder_hidden_dim, embed_dim, key_value_size=128):
        super(Seq2Seq,self).__init__()
        self.encoder = Encoder(input_dim,encoder_hidden_dim,key_value_size)
        self.decoder = Decoder(vocab_size,decoder_hidden_dim,embed_dim,key_value_size)

    def forward(self, x, x_len, y=None, mode='train'):
        key, value, encoder_len = self.encoder(x, x_len)
        predictions = self.decoder(key, value, encoder_len, y=y, mode=mode) 
        return predictions

In [21]:
def train(model, train_loader, criterion, optimizer, mode):
    model.train()
    running_loss = 0
    
    
    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()
        x, y, x_len, y_len = batch
        x,y = x.to(device),y.to(device)
        predictions, attentions = model(x, x_len, y, mode=mode)
        
        # Generate a mask based on target length. This is to mark padded elements
        # so that we can exclude them from computing loss
        mask = torch.ones(y.shape[0],y.shape[1])*1e-5
        
        for i in range(y.shape[0]):
            mask[i,:y_len[i]] = 1
        mask = torch.flatten(mask)
        mask = mask.to(device)

        predictions = torch.flatten(predictions,start_dim=0,end_dim=1)
        y = torch.flatten(y)

        # Make sure you have the correct shape of predictions when putting into criterion
        loss = criterion(predictions, y)
        # Use the mask you defined above to compute the average loss
        masked_loss = torch.mean(loss*mask)

        if (i+1)%5 == 0:
            print("Batch: {}, Loss: {}".format(i+1,masked_loss.item())) 

        # backprop
        running_loss += masked_loss.item() 
        # Optional: Gradient clipping
        masked_loss.backward()
        # When computing Levenshtein distance, make sure you truncate prediction/target
        optimizer.step()
        # Optional: plot your attention for debugging
        # plot_attention(attentions)
    print("Total Loss: ",running_loss/len(train_loader))
    plot_attention(attentions[:,0,:])


In [None]:
# TODO: Define your model and put it on the device here
# ...
model = Seq2Seq(input_dim=40,vocab_size=len(LETTER_LIST),encoder_hidden_dim=512,decoder_hidden_dim=512,embed_dim=512,key_value_size=512)
model = model.to('cuda')
n_epochs = 100
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer,step_size = 10,gamma=0.5,verbose=True)
# Make sure you understand the implication of setting reduction = 'none'
criterion = nn.CrossEntropyLoss(reduction='none')
mode = 'train'

for epoch in range(n_epochs):
    print("Epoch:{}\n".format(epoch))
    train(model, train_loader, criterion, optimizer, mode)
    PATH = "/content/drive/MyDrive/IDL-HW4-P2/saved_model_hw4P2/S2S_Model_{}.pt".format(epoch)
    val_loss = val(model, valid_loader)
    torch.save({'epoch': epoch,
               'model_state_dict':model.state_dict(),
               'scheduler_state_dict':scheduler.state_dict(),
               'optimizer_state_dict':optimizer.state_dict(),
               'val_loss':val_loss},PATH)
              
              
    scheduler.step()


In [22]:
model = Seq2Seq(input_dim=40,vocab_size=len(LETTER_LIST),encoder_hidden_dim=512,decoder_hidden_dim=512,embed_dim=512,key_value_size=512)
model = model.to('cuda')
n_epochs = 100
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer,step_size = 10,gamma=0.5,verbose=True)
# Make sure you understand the implication of setting reduction = 'none'
criterion = nn.CrossEntropyLoss(reduction='none')
mode = 'val'

Adjusting learning rate of group 0 to 1.0000e-03.


In [23]:
checkpoint = torch.load('/content/drive/MyDrive/IDL-HW4-P2/saved_model_hw4P2/S2S_Model_83.pt')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
epochs = checkpoint['epoch']

In [29]:
def val_func(model, valid_loader):
    model.eval()
    running_loss = 0

    for i, batch in enumerate(valid_loader):

        x, y, x_len, y_len = batch
        x = x[0:10]
        y = y[0:10]
        x_len = x_len[0:10]
        print('lengths', x_len)
        y_len = y_len[0:10]
        x,y = x.to(device),y.to(device)
        predictions, attentions = model(x, x_len, y, mode=mode)


        break
    print(attentions.shape)
    final_attn = []
    for i in range(10):
        print(attentions[0:x_len[i], i, 0:x_len[i]].shape)
        final_attn.append(np.transpose(attentions[0:x_len[i], i, 0:x_len[i]].cpu().detach().numpy()))

    final_attn_new = np.array(final_attn)
    print(final_attn_new[0].shape)
    np.save('/content/drive/MyDrive/IDL-HW4-P2/handin/attention.npy', final_attn_new)
    print("Total Validation Loss: ",running_loss/len(valid_loader))

In [30]:
val_func(model, valid_loader)

lengths tensor([29, 21, 21, 31, 26, 31, 20, 18, 34, 15])
torch.Size([600, 10, 34])
torch.Size([29, 29])
torch.Size([21, 21])
torch.Size([21, 21])
torch.Size([31, 31])
torch.Size([26, 26])
torch.Size([31, 31])
torch.Size([20, 20])
torch.Size([18, 18])
torch.Size([34, 34])
torch.Size([15, 15])
(29, 29)




Total Validation Loss:  0.0


In [32]:
# cd /content/drive/MyDrive/IDL-HW4-P2/

/content/drive/MyDrive/IDL-HW4-P2


In [34]:
# !tar -cf /content/drive/MyDrive/IDL-HW4-P2/handin.tar ./handin