In [None]:
# Colab only: mount Google Drive under /content/gdrive. The training loop below
# writes its checkpoints to a folder inside this mount.
from google.colab import drive
drive.mount('/content/gdrive')


In [None]:
# SECURITY: a real Kaggle username/API key used to be pasted in this cell.
# It is redacted below -- revoke the leaked key on kaggle.com and supply
# credentials through the environment or an uploaded kaggle.json instead.
# !pip install kaggle
# !mkdir .kaggle
# import json
# token = {"username":"<KAGGLE_USERNAME>","key":"<KAGGLE_API_KEY>"}
# with open('/content/.kaggle/kaggle.json', 'w') as file:
#     json.dump(token, file)
# !chmod 600 /content/.kaggle/kaggle.json
# !cp /content/.kaggle/kaggle.json /root/.kaggle/
# !kaggle config path -p /content

**Imports**

In [None]:
!pip install python-Levenshtein

In [None]:
import numpy as np
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import itertools
import torch.optim as optim
import pandas as pd
from torch.nn.utils.rnn import *
from statistics import mean 
import time
import matplotlib.pyplot as plt
import torch.nn.utils as utils
import operator
import Levenshtein
import seaborn as sns
from torch.autograd import Variable

In [None]:
# Training speech features. allow_pickle=True unpickles arbitrary objects, so
# only load files from a trusted source.
# NOTE(review): presumably an object array with one (frames, 40) feature matrix
# per utterance (the model is built with input_dim=40) -- confirm.
speech_train = np.load('train.npy', allow_pickle=True, encoding='bytes')

In [None]:
# Validation and test speech features (same trust caveat for allow_pickle=True).
speech_valid = np.load('dev.npy', allow_pickle=True, encoding='bytes')
speech_test = np.load('test.npy', allow_pickle=True, encoding='bytes')

# Transcripts for the train/dev splits; transform_letter_to_index below calls
# .decode() on every word, i.e. it expects the words to be bytes objects.
transcript_train = np.load('./train_transcripts.npy', allow_pickle=True,encoding='bytes')
transcript_valid = np.load('./dev_transcripts.npy', allow_pickle=True,encoding='bytes')

In [None]:
# Output vocabulary; a token's position in this list is its integer index.
# '<pad>' is deliberately first (index 0) so that the zero padding produced by
# pad_sequence in the collate functions maps onto the pad token.
LETTER_LIST = ['<pad>', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', \
               'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '-', "'", '.', '_', '+', ' ','<sos>','<eos>']    

**Helper functions**

In [None]:
def transform_letter_to_index(transcript, letter_list, letter2index):
    '''
    Convert word-level transcripts into lists of character indices.

    Every output sequence is ``<sos>``, followed by the characters of each word
    with a single space between consecutive words, followed by ``<eos>``.

    :param transcript: iterable of sentences; each sentence is a sequence of
        words given either as ``bytes`` (decoded with the default codec) or ``str``
    :param letter_list: vocabulary list; unused, kept for interface compatibility
    :param letter2index: dict mapping every character / special token to its index
    :return letter_to_index_list: list with one list of int indices per sentence
    :raises KeyError: if a character is missing from ``letter2index``
    '''
    letter_to_index_list = []
    for line in transcript:
        letter_indices = [letter2index['<sos>']]
        last_word = len(line) - 1

        for idx, word in enumerate(line):
            text = word.decode() if isinstance(word, (bytes, bytearray)) else word

            # A plain per-character lookup also copes with empty words, which
            # operator.itemgetter(*keys) rejects (it needs at least one key).
            letter_indices.extend(letter2index[char] for char in text)

            # Words are separated by a space, but no trailing space is added.
            if idx < last_word:
                letter_indices.append(letter2index[' '])

        letter_indices.append(letter2index['<eos>'])
        letter_to_index_list.append(letter_indices)

    return letter_to_index_list

def create_dictionaries(letter_list):
    '''
    Optional helper: build the letter2index and index2letter lookup tables.

    :param letter_list: sequence of vocabulary tokens; a token's position in
        the sequence is its index
    :return: tuple ``(letter2index, index2letter)`` of dicts
    '''
    letter2index = {}
    index2letter = {}
    for position, letter in enumerate(letter_list):
        letter2index[letter] = position
        index2letter[position] = letter
    return letter2index, index2letter

def transform_index_to_letter(index, letter2index, index2letter):
    '''
    Decode a batch of index sequences back into strings.

    Decoding of a sequence stops at the first ``<eos>`` or ``<pad>`` index;
    ``<sos>`` indices are skipped.

    :param index: batch of index sequences as a 2-D tensor (any device), a
        numpy array or a nested list
    :param letter2index: dict mapping tokens to indices (used for the special tokens)
    :param index2letter: dict mapping indices back to tokens
    :return index_to_letter_list: list with one decoded string per sequence
    '''
    if torch.is_tensor(index):
        # .numpy() raises for tensors that live on the GPU; detaching and
        # moving to the CPU first makes the helper safe for any tensor.
        index = index.detach().cpu().tolist()

    stop_tokens = {letter2index['<eos>'], letter2index['<pad>']}
    sos = letter2index['<sos>']

    index_to_letter_list = []
    for sequence in index:
        letters = []
        for i in sequence:
            if i in stop_tokens:
                break
            if i != sos:
                letters.append(index2letter[i])
        index_to_letter_list.append(''.join(letters))
    return index_to_letter_list

**Dataset Class**

In [None]:
class Speech2TextDataset(Dataset):
    '''
    Dataset pairing each speech utterance with its index-encoded transcript.

    :param speech: indexable collection of per-utterance feature arrays (each
        item must support ``astype``, e.g. a numpy array)
    :param text: indexable collection of index sequences aligned with
        ``speech``; may be omitted for inference
    :param isTrain: when true ``__getitem__`` returns ``(speech, text)`` tensors,
        otherwise only the speech tensor
    '''
    def __init__(self, speech, text=None, isTrain=True):
        self.speech = speech
        self.isTrain = isTrain
        # Always define the attribute so that a missing transcript is reported
        # with a clear error instead of an AttributeError deep inside a worker.
        self.text = text

    def __len__(self):
        # len() works for numpy arrays as well as plain lists of utterances.
        return len(self.speech)

    def __getitem__(self, index):
        '''
        :return: float32 speech tensor, plus the transcript index tensor when
            the dataset is in training mode
        :raises ValueError: in training mode when no transcripts were supplied
        '''
        features = torch.tensor(self.speech[index].astype(np.float32))
        if not self.isTrain:
            return features
        if self.text is None:
            raise ValueError("Speech2TextDataset was created with isTrain=True but without text")
        return features, torch.tensor(self.text[index])

def collate_train(batch_data):
    '''
    Collate ``(speech, transcript)`` pairs into zero-padded batch tensors.

    The first token of every transcript is dropped from the targets, so the
    reported text lengths are one shorter than the stored sequences.

    :param batch_data: list of ``(speech_tensor, transcript_tensor)`` tuples
    :return: ``(speech_pad, speech_lens, text_pad, text_lens)`` with both padded
        tensors laid out batch-first
    '''
    utterances = []
    targets = []
    for sample in batch_data:
        utterances.append(sample[0])
        targets.append(sample[1][1:])

    speech_lens = torch.tensor([len(utterance) for utterance in utterances])
    text_lens = torch.tensor([len(sample[1]) - 1 for sample in batch_data])

    speech_pad = pad_sequence(utterances, batch_first=True)
    text_pad = pad_sequence(targets, batch_first=True)

    return speech_pad, speech_lens, text_pad, text_lens


def collate_test(batch_data):
    '''
    Collate speech-only samples into a zero-padded, batch-first tensor.

    :param batch_data: list of speech tensors
    :return: ``(speech_pad, speech_lens)``
    '''
    utterances = list(batch_data)
    speech_lens = torch.tensor(list(map(len, utterances)))
    speech_pad = pad_sequence(utterances, batch_first=True)
    return speech_pad, speech_lens

In [None]:
# Module-level lookup tables; teacher_forcing, val and test read these globals.
letter2index, index2letter = create_dictionaries(LETTER_LIST)

**Transformation of train and validation transcripts to indices**

In [None]:
# Each transcript becomes [<sos>, characters..., <eos>] as a list of indices.
train_letter_to_index = transform_letter_to_index(transcript_train,LETTER_LIST,letter2index)
valid_letter_to_index = transform_letter_to_index(transcript_valid,LETTER_LIST,letter2index)

In [None]:
# Sanity check: round-trip a single training transcript (index 3). Only that
# one sequence is decoded instead of the whole training set.
print(transform_index_to_letter(train_letter_to_index[3:4], letter2index, index2letter)[0])

which was pretty good evidence frank thought that the wounded boy must take considerable interest in the discussion why who else would try to turn on mister darrel that way and burn his shanties down just when winter is setting in asked bluff


In [None]:
# Training-mode datasets: every item is a (speech, transcript) tensor pair.
train_data = Speech2TextDataset(speech_train,train_letter_to_index)
valid_data = Speech2TextDataset(speech_valid,valid_letter_to_index)

In [None]:
# Inference dataset: no transcripts, items are speech tensors only.
test_data = Speech2TextDataset(speech_test,isTrain=False)

In [None]:
# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(train_data,batch_size=64,shuffle=True,collate_fn=collate_train)
valid_loader = DataLoader(valid_data,batch_size=64,shuffle=False,collate_fn=collate_train)

In [None]:
# shuffle=False keeps predictions aligned with the row ids written by test().
test_loader = DataLoader(test_data,batch_size=64,shuffle=False,collate_fn=collate_test)

In [None]:
# Global compute device read by the model code and the train/val/test loops.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

**Model Definition**

1. **Attention**

In [None]:
class Attention(nn.Module):
    '''
    Dot-product attention between a decoder query and the encoder outputs.

        energy    = bmm(key, query)            -> (N, T)
        attention = softmax(masked energy)     -> (N, T)
        context   = bmm(attention, value)      -> (N, value_size)

    Positions at or beyond each sequence's length are masked out before the
    softmax so that padding receives (numerically) zero attention.
    '''
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value, lens):
        '''
        :param query: (N, key_size) output of the decoder's second LSTMCell
        :param key: (T, N, key_size) time-major key projection from the encoder
        :param value: (T, N, value_size) time-major value projection from the encoder
        :param lens: (N,) number of valid encoder time steps per batch item
        :return context: (N, value_size) attended context
        :return attention_mask: (N, T) attention weights that can be plotted
        '''
        key = key.transpose(0, 1)      # (N, T, key_size)
        value = value.transpose(0, 1)  # (N, T, value_size)

        energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2)  # (N, T)

        # Build the padding mask on the same device as the activations instead
        # of relying on a module-level DEVICE constant.
        lens = torch.as_tensor(lens, device=energy.device)
        positions = torch.arange(key.size(1), device=energy.device)
        padding = positions.unsqueeze(0) >= lens.unsqueeze(1)   # (N, T)

        # Large negative value rather than -inf: a fully masked row then still
        # yields a finite softmax instead of NaNs.
        energy = energy.masked_fill(padding, -1e9)

        attention_mask = F.softmax(energy, dim=1)

        context = torch.bmm(attention_mask.unsqueeze(1), value).squeeze(1)

        return context, attention_mask

In [None]:
# Scratch example (kept for reference): building a boolean padding mask from sequence lengths

# np.random.seed(1)
# X = np.random.random((4,6)).round(1) * 2 + 3
# X = torch.from_numpy(X)
# X_len = torch.LongTensor([4, 1, 6, 3])  # length of each sequence
# print(X)
# max_len = X.shape[1]
# mask = torch.arange(max_len)[None,:] >= X_len[:,None]
# X[mask] = float('-inf')
# print(X)
# print(F.softmax(X, dim=1))


**Locked Dropout**

In [None]:
class LockedDropout(nn.Module):
    '''
    Dropout that samples a single mask per call and reuses it along one axis.

    With the default ``batch_first=False`` the mask has shape
    ``(1, x.size(1), x.size(2))`` and is shared along dim 0, i.e. across time
    steps for time-major ``(T, N, D)`` input. With ``batch_first=True`` the
    mask has shape ``(x.size(0), 1, x.size(2))`` so that ``(N, T, D)`` input is
    also masked consistently across time.

    :param batch_first: layout of the tensors passed to ``forward``
    '''
    def __init__(self, batch_first=False):
        super().__init__()
        self.batch_first = batch_first

    def forward(self, x, dropout=0.5):
        '''
        :param x: 3-D input tensor
        :param dropout: probability of zeroing a unit; a falsy value disables dropout
        :return: ``x`` unchanged in eval mode or when dropout is disabled,
            otherwise ``x`` times the mask rescaled by ``1 / (1 - dropout)``
        '''
        if not self.training or not dropout:
            return x
        if self.batch_first:
            mask_shape = (x.size(0), 1, x.size(2))
        else:
            mask_shape = (1, x.size(1), x.size(2))
        keep_prob = 1 - dropout
        # new_empty gives a tensor with x's dtype/device that is not tracked by
        # autograd; broadcasting in the product replaces the explicit expand.
        mask = x.new_empty(mask_shape).bernoulli_(keep_prob) / keep_prob
        return mask * x

**2. Pyramidal BiLSTM**

In [None]:
class pBLSTM(nn.Module):
    '''
    Pyramidal BiLSTM layer.

    The length of an utterance (speech input) can be hundreds to thousands of
    frames. The paper reports that a plain LSTM encoder converges slowly and
    gives inferior results, mainly because the attend-and-spell decoder cannot
    extract relevant information from that many input steps. Each pBLSTM layer
    therefore halves the time resolution by concatenating every pair of
    consecutive frames before running a bidirectional LSTM.

    :param input_dim: feature size after pairing, i.e. twice the feature size
        of the incoming sequence
    :param hidden_dim: hidden size of each LSTM direction
    '''
    def __init__(self, input_dim, hidden_dim):
        super(pBLSTM, self).__init__()
        self.blstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.dropout = LockedDropout()

    def forward(self, x):
        '''
        :param x: PackedSequence holding the output of the previous layer
        :return output: PackedSequence with half as many time steps and
            ``2 * hidden_dim`` features per step
        '''
        out, out_lens = pad_packed_sequence(x, batch_first=True)  # (N, T, D)

        # LockedDropout shares its mask along dim 0. Present the tensor
        # time-major so the same units are dropped at every time step, then
        # restore the batch-first layout.
        out = self.dropout(out.transpose(0, 1), 0.2).transpose(0, 1)

        batch, length, dim = out.shape
        half = length // 2

        # Drop the trailing frame of odd-length batches, then merge each pair
        # of neighbouring frames into a single step of twice the width.
        out_reshaped = out[:, :half * 2, :].reshape(batch, half, dim * 2)

        # pack_padded_sequence rejects zero lengths; very short sequences keep
        # one (possibly padded) step. Lengths already live on the CPU.
        out_lens = torch.clamp(out_lens // 2, min=1)

        packed_x = pack_padded_sequence(out_reshaped, lengths=out_lens, batch_first=True, enforce_sorted=False)

        return self.blstm(packed_x)[0]

**Encoder**

In [None]:
class Encoder(nn.Module):
    '''
    Listener: encodes the utterances and returns attention keys and values.

    A bidirectional LSTM is followed by three pyramidal BiLSTM layers (an 8x
    reduction in time steps); keys and values are linear projections of the
    final layer's output.

    :param input_dim: number of features per input frame
    :param hidden_dim: hidden size of each LSTM direction
    :param value_size: output size of the value projection
    :param key_size: output size of the key projection
    '''
    def __init__(self, input_dim, hidden_dim, value_size=128, key_size=128):
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True)

        # Build three *independent* layers. `[layer] * 3` would register the
        # same module three times, making all pyramid levels share one set of
        # weights. The state_dict keys (pBLSTM.0/1/2) are the same either way,
        # so previously saved checkpoints should still load.
        self.pBLSTM = nn.Sequential(*[pBLSTM(hidden_dim * 4, hidden_dim) for _ in range(3)])

        # Each projection uses its own size (they were swapped before, which
        # only went unnoticed because both default to 128).
        self.key_network = nn.Linear(hidden_dim * 2, key_size)
        self.value_network = nn.Linear(hidden_dim * 2, value_size)

    def forward(self, x, lens):
        '''
        :param x: (N, T, input_dim) zero-padded, batch-first utterances
        :param lens: (N,) number of valid frames per utterance
        :return keys: (T', N, key_size) time-major key projection
        :return value: (T', N, value_size) time-major value projection
        :return lens: (N,) CPU tensor with the reduced sequence lengths
        '''
        # pack_padded_sequence requires the lengths on the CPU.
        lens = lens.cpu()
        rnn_inp = utils.rnn.pack_padded_sequence(x, lengths=lens, batch_first=True, enforce_sorted=False)
        outputs, _ = self.lstm(rnn_inp)

        outputs = self.pBLSTM(outputs)

        # No batch_first here: the decoder consumes time-major keys/values.
        linear_input, lens = utils.rnn.pad_packed_sequence(outputs)
        keys = self.key_network(linear_input)
        value = self.value_network(linear_input)

        return keys, value, lens


**Teacher forcing**

In [None]:
def teacher_forcing(prediction, iter, batch_size, embedding_layer, embeddings, percentage, isTrain):
    '''
    Choose the character embedding fed to the decoder at step ``iter``.

    * Step 0 always starts from ``<sos>``. Previously the training branch could
      fall back to the argmax of the still-empty ``prediction`` at step 0,
      which embedded index 0 (``<pad>``) instead.
    * During training, with probability ``percentage / 100`` the ground-truth
      embedding of the previous character is used (teacher forcing).
    * Otherwise (and always at inference) the embedding of the model's own
      previous prediction is used.

    :param prediction: (N, vocab_size) scores produced at the previous step
    :param iter: index of the current decoding step
    :param batch_size: number of sequences in the batch
    :param embedding_layer: the decoder's ``nn.Embedding``
    :param embeddings: (N, L, E) ground-truth embeddings; may be None at inference
    :param percentage: teacher-forcing probability in percent
    :param isTrain: whether teacher forcing may be applied
    :return char_embed: (N, E) input embedding for this step
    '''
    if iter == 0:
        sos = torch.full((batch_size,), letter2index['<sos>'], dtype=torch.long,
                         device=embedding_layer.weight.device)
        return embedding_layer(sos)

    if isTrain and np.random.random() < (percentage / 100):
        # embeddings[:, k] is the k-th target character, so the character
        # preceding step `iter` sits at position iter - 1.
        return embeddings[:, iter - 1, :]

    return embedding_layer(prediction.argmax(dim=-1))

**Decoder**

In [None]:
class Decoder(nn.Module):
    '''
    Speller: character-level decoder with optional attention.

    Each loop iteration handles a single time step, hence LSTMCell rather than
    LSTM. The output of the second LSTMCell is the attention query; the
    attended context is concatenated with that output to predict the next
    character and is fed back as input to the following step. Teacher forcing
    is applied during training (Gumbel noise is not implemented).

    The output projection shares its weight with the embedding (weight tying),
    which requires ``hidden_dim == key_size + value_size``.

    :param vocab_size: number of output tokens
    :param hidden_dim: embedding size and hidden size of the first LSTMCell
    :param value_size: size of the encoder values / attended context
    :param key_size: size of the encoder keys and of the second LSTMCell
    :param isAttended: use attention; when False the context at step ``i`` is
        simply the encoder value at time ``i`` (clamped to the last step)
    :raises ValueError: if the sizes are incompatible with weight tying
    '''
    # Number of decoding steps when no target text is available (inference).
    MAX_DECODE_LEN = 600

    def __init__(self, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=False):
        super(Decoder, self).__init__()
        if hidden_dim != key_size + value_size:
            # Fail early: the tied weight is (vocab_size, hidden_dim) but is
            # applied to inputs of size key_size + value_size in forward().
            raise ValueError("weight tying requires hidden_dim == key_size + value_size")

        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.lstm1 = nn.LSTMCell(input_size=hidden_dim + value_size, hidden_size=hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=key_size)

        self.isAttended = isAttended
        if isAttended:
            self.attention = Attention()

        self.character_prob = nn.Linear(key_size + value_size, vocab_size)

        # weight tying
        self.character_prob.weight = self.embedding.weight

    def forward(self, key, values, lens, batch_idx, text=None, isTrain=True, teacher_forcing_percent=90):
        '''
        :param key: (T, N, key_size) output of the encoder key projection
        :param values: (T, N, value_size) output of the encoder value projection
        :param lens: (N,) valid encoder lengths, used to mask the attention
        :param batch_idx: index of the current batch; attention is plotted for
            every 50th training batch
        :param text: (N, text_len) target indices, required when ``isTrain``
        :param isTrain: train (teacher forcing, ``text_len`` steps) or
            inference (``MAX_DECODE_LEN`` steps)
        :param teacher_forcing_percent: teacher-forcing probability in percent
        :return predictions: (N, steps, vocab_size) unnormalised character scores
        :raises ValueError: if ``isTrain`` is set but ``text`` is missing
        '''
        batch_size = key.shape[1]
        encoder_steps = values.shape[0]

        if isTrain:
            if text is None:
                raise ValueError("text is required when isTrain=True")
            max_len = text.shape[1]
            embeddings = self.embedding(text)
        else:
            max_len = self.MAX_DECODE_LEN
            embeddings = None

        record_attention = bool(self.isAttended and isTrain and batch_idx % 50 == 0)

        predictions = []
        hidden_states = [None, None]

        # One-hot <sos> "previous prediction": if step 0 is decoded from the
        # argmax of this tensor it yields <sos>, not index 0 (<pad>).
        prediction = torch.zeros(batch_size, self.character_prob.out_features, device=key.device)
        prediction[:, letter2index['<sos>']] = 1

        # initialization
        context = values[0, :, :]

        attentionPlot = []

        for i in range(max_len):
            char_embed = teacher_forcing(prediction, i, batch_size, self.embedding, embeddings,
                                         teacher_forcing_percent, isTrain=bool(isTrain))

            inp = torch.cat([char_embed, context], dim=1)
            hidden_states[0] = self.lstm1(inp, hidden_states[0])

            inp_2 = hidden_states[0][0]
            hidden_states[1] = self.lstm2(inp_2, hidden_states[1])

            # The output of the second LSTMCell is the attention query.
            output = hidden_states[1][0]

            if self.isAttended:
                context, attention = self.attention(output, key, values, lens)
                if record_attention:
                    # Only the first sequence of the batch is plotted.
                    attentionPlot.append(attention[0].detach().cpu())
            else:
                # No attention module: use the encoder value at the same time
                # index, clamped so long targets do not index out of range.
                context = values[min(i, encoder_steps - 1), :, :]

            prediction = self.character_prob(torch.cat([output, context], dim=1))
            predictions.append(prediction.unsqueeze(1))

        if record_attention and attentionPlot:
            self._save_attention_plot(attentionPlot)

        return torch.cat(predictions, dim=1)

    def _save_attention_plot(self, attention_steps, out_dir="./attention"):
        '''Save a heatmap (encoder steps x decoder steps) of the collected attention weights.'''
        import os  # local import: only needed for this diagnostic plot

        # savefig does not create missing directories.
        os.makedirs(out_dir, exist_ok=True)
        attentions = torch.stack(attention_steps, dim=1)

        plt.clf()
        sns.heatmap(attentions, cmap='GnBu')
        plt.savefig(os.path.join(out_dir, "heat_{}s.png".format(time.time())))


**Seq2Seq**

In [None]:
class Seq2Seq(nn.Module):
    '''
    End-to-end sequence-to-sequence model: a thin wrapper around the Encoder
    and the Decoder.

    :param input_dim: number of features per speech frame
    :param vocab_size: number of output tokens
    :param hidden_dim: hidden size shared by encoder and decoder
    :param value_size: size of the attention values
    :param key_size: size of the attention keys
    :param isAttended: whether the decoder uses attention
    '''
    def __init__(self, input_dim, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=False):
        super(Seq2Seq, self).__init__()
        # Forward the projection sizes to the encoder as well; otherwise it
        # silently keeps its own defaults and disagrees with the decoder
        # whenever non-default sizes are requested.
        self.encoder = Encoder(input_dim, hidden_dim, value_size, key_size)
        self.decoder = Decoder(vocab_size, hidden_dim, value_size, key_size, isAttended)

    def forward(self, speech_input, speech_len, batch_id, text_input=None, isTrain=True, tf_perc=90):
        '''
        :param speech_input: (N, T, input_dim) zero-padded utterances
        :param speech_len: (N,) number of valid frames per utterance
        :param batch_id: index of the current batch (forwarded to the decoder)
        :param text_input: (N, text_len) target indices, used when ``isTrain``
        :param isTrain: training (teacher forcing) or inference mode
        :param tf_perc: teacher-forcing probability in percent
        :return predictions: (N, steps, vocab_size) character scores
        '''
        key, value, lens = self.encoder(speech_input, speech_len)
        if isTrain:
            return self.decoder(key, value, lens, batch_id, text_input, teacher_forcing_percent=tf_perc)
        return self.decoder(key, value, lens, batch_id, text=None, isTrain=False)


**Train and Test**

In [None]:
def get_mask(lengths):
    '''
    Build a boolean mask marking the valid (non-padded) positions of each sequence.

    :param lengths: 1-D tensor or sequence of sequence lengths
    :return mask: bool tensor of shape ``(len(lengths), max(lengths))`` where
        ``mask[i, j]`` is True iff ``j < lengths[i]``. The mask is created on
        the device of ``lengths`` (CPU for plain Python sequences); an empty
        ``lengths`` yields an empty ``(0, 0)`` mask.
    '''
    # as_tensor avoids the copy-construct warning raised by torch.tensor(tensor).
    lengths = torch.as_tensor(lengths)
    if lengths.numel() == 0:
        return torch.zeros((0, 0), dtype=torch.bool, device=lengths.device)

    positions = torch.arange(int(lengths.max()), device=lengths.device)
    return positions.unsqueeze(0) < lengths.unsqueeze(1)

def get_Levenshtein(prediction, label):
    '''
    Sum the Levenshtein (edit) distances between paired strings.

    :param prediction: sequence of predicted strings
    :param label: sequence of reference strings, indexed in step with ``prediction``
    :return: total edit distance over all pairs (0 for empty input)
    '''
    return sum(
        Levenshtein.distance(prediction[position], label[position])
        for position in range(len(prediction))
    )





In [None]:
def train(model, train_loader, criterion, optimizer, epoch, tf_perc=90):
    '''
    Run one training epoch with a padding-masked cross-entropy loss.

    :param model: the Seq2Seq model
    :param train_loader: yields ``(speech, speech_lens, text, text_lens)`` batches
    :param criterion: loss function; it should return one loss per token
        (e.g. ``nn.CrossEntropyLoss(reduction='none')``). If it returns a
        reduced scalar, the per-token loss is recomputed with plain
        ``F.cross_entropy`` so that the padding mask can still be applied.
    :param optimizer: optimizer updating ``model``'s parameters
    :param epoch: epoch number, used for logging only
    :param tf_perc: teacher-forcing probability in percent
    :return: mean masked loss over the batches of this epoch
    '''
    model.train()
    model.to(DEVICE)
    start = time.time()

    running_loss = 0.0
    num_batches = 0

    # 1) Iterate through the loader
    for batch_idx, (x, xlens, y, ylens) in enumerate(train_loader):

        # 2) Set the inputs to the device.
        x, xlens, y, ylens = x.to(DEVICE), xlens.to(DEVICE), y.to(DEVICE), ylens.to(DEVICE)

        optimizer.zero_grad()

        # 3) Pass the inputs and the speech lengths into the model.
        preds = model(x, xlens, batch_idx, y, isTrain=True, tf_perc=tf_perc)

        # 4) Mask of the real (non-padded) target positions.
        mask = get_mask(ylens).to(DEVICE).reshape(-1)

        # 5) Flatten predictions and targets, 6) compute the per-token loss.
        logits = preds.reshape((-1, preds.shape[2]))
        targets = y.reshape(-1)
        token_loss = criterion(logits, targets)
        if token_loss.dim() == 0:
            # A mean-reduced criterion has already averaged over the padding,
            # which makes the mask a no-op; fall back to per-token losses.
            token_loss = F.cross_entropy(logits, targets, reduction='none')

        # 7) Average the loss over the real tokens only.
        masked_loss = torch.sum(token_loss * mask) / torch.sum(mask)

        Loss = masked_loss.item()
        Perplex = torch.exp(masked_loss).item()
        running_loss += Loss
        num_batches += 1

        # 8) Backward pass on the masked loss.
        masked_loss.backward()

        # 9) Clip gradients (clip_grad_norm_ is the non-deprecated in-place API).
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)

        # 10) Take a step with the optimizer
        optimizer.step()

        # 12) Print the running mean loss every 50 batches.
        if batch_idx % 50 == 0:
            print("Epoch:", epoch, "Batch_id:", batch_idx, "Loss: ", running_loss / num_batches, 'Perplexity:', Perplex, 'time elapsed: ', (time.time() - start))

        del x
        del xlens
        del y
        del ylens
        torch.cuda.empty_cache()

    return running_loss / max(num_batches, 1)

def val(model, valid_loader, criterion, optimizer, epoch):
    '''
    Decode the validation set without teacher forcing and report the average
    Levenshtein distance per utterance.

    :param model: the Seq2Seq model
    :param valid_loader: yields ``(speech, speech_lens, text, text_lens)`` batches
    :param criterion: unused, kept for interface compatibility
    :param optimizer: unused, kept for interface compatibility
    :param epoch: unused, kept for interface compatibility
    :return: total edit distance divided by the number of decoded utterances
    '''
    model.eval()
    model.to(DEVICE)

    leven_dist = 0
    total_seq_len = 0  # number of utterances decoded so far

    # Inference only: without no_grad every one of the (up to 600) decoding
    # steps would keep its autograd graph alive and waste GPU memory.
    with torch.no_grad():
        for batch_idx, (x, xlens, y, ylens) in enumerate(valid_loader):

            x, xlens, y = x.to(DEVICE), xlens.to(DEVICE), y.to(DEVICE)

            preds = model(x, xlens, batch_idx, y, isTrain=False)

            predn_nums = preds.argmax(-1).detach().cpu()

            predicted_text = transform_index_to_letter(predn_nums, letter2index, index2letter)
            true_text = transform_index_to_letter(y.detach().cpu(), letter2index, index2letter)

            leven_dist += get_Levenshtein(predicted_text, true_text)
            total_seq_len += len(predicted_text)

            if batch_idx % 20 == 0:
                print('Levenshtein: ', leven_dist / total_seq_len)
                print("Predicted text:", predicted_text[0], "\n\n", "True text:", true_text[0], "\n\n")

            del x
            del xlens
            del y
            del ylens
            torch.cuda.empty_cache()

    return leven_dist / total_seq_len

def test(model, test_loader, out_path='please_work_TF0.2.csv'):
    '''
    Greedy-decode the test set and write the predictions to a CSV file.

    :param model: the Seq2Seq model
    :param test_loader: yields ``(speech, speech_lens)`` batches in dataset order
    :param out_path: destination of the submission CSV (columns ``id``, ``label``)
    :return: the DataFrame that was written
    '''
    result = []
    model.eval()
    # Same device handling as train() and val().
    model.to(DEVICE)

    with torch.no_grad():
        for batch_idx, (x, xlens) in enumerate(test_loader):
            x, xlens = x.to(DEVICE), xlens.to(DEVICE)

            preds = model(x, xlens, batch_idx, None, isTrain=False)

            predn_nums = preds.argmax(-1).detach().cpu().numpy()

            result += transform_index_to_letter(predn_nums, letter2index, index2letter)

    # Row ids follow the loader order, so the loader must not shuffle.
    df = pd.DataFrame({"id": np.arange(len(result)), "label": np.array(result)})
    df.to_csv(out_path, index=False)

    return df

**Main**

In [None]:
model = Seq2Seq(input_dim=40,vocab_size=len(LETTER_LIST),
                hidden_dim = 256, isAttended = True)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# reduction='none' keeps one loss value per token. train() multiplies these by
# the padding mask; with the default mean reduction the loss is already a
# scalar averaged over padded positions and the mask has no effect.
criterion = nn.CrossEntropyLoss(reduction='none')
# The scheduler is stepped with the validation Levenshtein distance (lower is better).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.75, patience=1, verbose=True, threshold=1e-2)

In [None]:
# Move the model to the selected device and show its layer structure.
model.to(DEVICE)
print(model)

In [None]:
# tf is the teacher-forcing percentage handed to train(); it is lowered from
# 90 to 80 at epoch 27 and to 70 at epoch 50.
tf = 90
for epoch in range(60):
  if epoch == 27:
    tf = 80
  if epoch == 50:
    tf = 70
  startTime = time.time()
  train(model, train_loader, criterion, optimizer, epoch,tf_perc=tf)
  lev_dist = val(model, valid_loader, criterion, optimizer, epoch)
  # ReduceLROnPlateau lowers the learning rate when the validation distance stalls.
  scheduler.step(lev_dist)
  # One checkpoint per epoch on the mounted Drive; the target folder must already exist.
  file = "/content/gdrive/My Drive/18786/HW4/wt_ckpt/0.1_lockwtty_epoch{0}.pth".format(epoch+1)
  torch.save(model.state_dict(),file)

In [None]:
# Restore the epoch-44 checkpoint. map_location lets a checkpoint that was
# saved from a GPU run be loaded on a CPU-only runtime as well.
state_dict = torch.load('/content/gdrive/My Drive/18786/HW4/wt_ckpt/0.1_lockwtty_epoch44.pth', map_location=DEVICE)
model.load_state_dict(state_dict)

In [None]:
# Decode the test set; test() also writes the predictions to a CSV file.
test_pred = test(model,test_loader) 