In [17]:
## Import package
import numpy as np
import pandas as pd
from __future__ import print_function, division
import os
import torch
from skimage import io, transform
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils,datasets, models
import torchvision
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import time 
import copy
from collections import namedtuple

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

In [159]:
os.environ["WSJ_PATH"] = '/Users/thierryhuang/Desktop/2019Spring/Deep_Learning/CTC speech learning/hw3p2-data-V2'

import numpy as np
import os

class WSJ():
    """ Load the WSJ speech dataset
        
        Ensure WSJ_PATH is path to directory containing 
        all data files (.npy) provided on Kaggle.
        
        Example usage:
            loader = WSJ()
            trainX, trainY = loader.train
            assert(trainX.shape[0] == 24590)
            
    """
  
    def __init__(self):
        self.dev_set = None
        self.train_set = None
        self.test_set = None
  
    @property
    def dev(self):
        if self.dev_set is None:
            self.dev_set = load_raw(os.environ['WSJ_PATH'], 'wsj0_dev')
        return self.dev_set
    
    os.environ["WSJ_PATH"] = '/Users/thierryhuang/Desktop/2019Spring/Deep_Learning/CTC speech learning/hw3p2-data-V2'
    @property
    def train(self):
        if self.train_set is None:
            self.train_set = load_raw(os.environ['WSJ_PATH'], 'wsj0_train')
        return self.train_set
  
    @property
    def test(self):
        if self.test_set is None:
            self.test_set = (np.load(os.path.join(os.environ['WSJ_PATH'], 'transformed_test_data.npy'), encoding='bytes'), None)
        return self.test_set
    
def load_raw(path, name):
    return (
        np.load(os.path.join(path, '{}.npy'.format(name)), encoding='bytes'), 
        np.load(os.path.join(path, '{}_merged_labels.npy'.format(name)), encoding='bytes')
    )

In [160]:
loader = WSJ()
trainX, trainY = loader.train
trainX.shape[0]


24724

In [161]:
trainX, trainY = trainX[:7], trainY[:7]
trainX.shape[0]

7

In [162]:
devX, devY = loader.dev[:2]
devX.shape[0]

1106

In [163]:
testX = loader.test

In [164]:
devY[2]

array([36, 15, 16, 33, 34,  8, 26, 40, 35,  8, 28, 33, 23, 25, 42, 11, 17,
       44, 14, 22, 41, 16, 26,  8, 32, 27,  8, 28, 37,  8, 41, 15, 23, 43,
       40, 27,  8, 28, 25,  8, 32,  7, 34, 22, 37, 23, 19, 17, 34, 30, 35,
        8, 26, 22, 28, 37, 33,  8, 34, 37, 36])

In [165]:
trainX[1].shape,trainY[1].shape

((442, 40), (52,))

In [166]:
class SpeechDatasets(Dataset):

    def __init__(self, data, label):
#         self.data = data
        self.data = data
#         self.data = torch.tensor(data).view(-1,len(self.data))
#         self.label = label
        self.label= label

    def __getitem__(self, index):
        x, y = torch.tensor(self.data[index]),torch.tensor((self.label[index]))
        
        return x,y

    def __len__(self):
        return len(self.data)

trainDatasets = SpeechDatasets(trainX,trainY)
valDatasets = SpeechDatasets(devX,devY)

In [167]:
trainDatasets[0]

(tensor([[-9.2617, -9.6756, -9.6329,  ..., -1.9969, -2.1711, -2.7854],
         [-9.5377, -9.5849, -8.5210,  ..., -2.3204, -1.6032, -2.2624],
         [-8.8446, -9.2419, -8.8936,  ..., -2.4073, -2.4993, -2.6594],
         ...,
         [-5.2646, -5.2775, -6.3596,  ..., -3.6862, -3.9479, -4.0261],
         [-6.4038, -5.0855, -5.0489,  ..., -3.6954, -3.3632, -4.2306],
         [-4.8946, -4.5879, -5.1229,  ..., -3.3868, -2.9438, -3.5909]]),
 tensor([36, 15,  8, 19, 23, 27, 18, 26, 32, 33,  8, 14, 40, 34, 22, 44,  8, 26,
         22, 37, 17,  1,  8, 41, 37, 40, 37,  8, 19,  9, 33, 43,  8, 29,  2, 22,
         28, 28, 30, 41, 16, 27, 12, 17,  1,  7, 28, 14, 14, 22, 34, 16, 27, 12,
         17,  0, 36]))

In [168]:
import torch
import numpy as np

def padding(batch):
    # 1. Sort
    sorted_pairs = sorted(batch, key=lambda x: x[0].shape[0], reverse=True)
    sorted_sequences = [x[0] for x in sorted_pairs]
    
    # 2. Pad sequence
    sequences_padded = torch.nn.utils.rnn.pad_sequence(sorted_sequences, batch_first=True)
    length = torch.LongTensor([len(x) for x in sorted_sequences])

    labels = [x[1]+1 for x in sorted_pairs]
    labels_length = torch.LongTensor([len(x) for x in labels])
    
# #     labels = torch.LongTensor(np.array(map(lambda x: x[1], sorted_pairs)))
#     print(sequences_padded.shape)
# #     print(len(labels))
#     print(length)
# #     print(labels)
#     print(labels_length)

    return sequences_padded, length, labels, labels_length



In [169]:
# Add dataset to dataloader that handles batching
batch_size = 3
train_size = len(trainDatasets)
val_size = len(valDatasets)
# test_size = test_data.test_data.shape[0]

train_loader = torch.utils.data.DataLoader(trainDatasets, 
                                           shuffle = True,
                                           batch_size=batch_size,
                                           collate_fn=padding)

val_loader = torch.utils.data.DataLoader(valDatasets, 
                                           shuffle = True,
                                           batch_size=batch_size,
                                           collate_fn=padding)

In [170]:
len(trainDatasets),len(valDatasets)

(7, 1106)

In [171]:
for index,i in enumerate(train_loader):
    print(index)
#     print(i)
    x, x_lengths, label, labels_length = i
    x_pack = torch.nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True)
    
    label_pack = torch.cat(label)
    
    print(x_pack[0].shape,x_pack[1].shape)
    print(label_pack,labels_length)
    
    if index ==1:
        break

0
torch.Size([1478, 40]) torch.Size([559])
tensor([37, 37, 38, 41, 29, 17, 34, 31, 21, 19, 25, 34, 19, 27, 34, 31, 15, 45,
        20, 18, 28, 14, 12, 29,  9,  3, 17, 29, 38, 18, 16,  9, 35, 23, 38, 24,
        20, 18, 28, 16,  9, 29, 10, 34, 39, 24, 35, 38,  9, 29, 15, 29, 10, 34,
        39, 43, 17, 35, 38, 37, 37, 16,  9, 20, 24, 28, 19, 27, 33, 34,  9, 15,
        41, 35, 23, 45,  9, 27, 23, 38, 18,  2,  9, 42, 38, 41, 38,  9, 20, 10,
        34, 44,  9, 30,  3, 23, 29, 29, 31, 42, 17, 28, 13, 18,  2,  8, 29, 15,
        15, 23, 35, 17, 28, 13, 18,  1, 37, 37,  4, 29, 41, 28, 18,  9, 35, 43,
        18, 26, 35,  9, 42,  7, 34, 38, 37, 18, 13, 19, 35, 38,  7, 29, 16,  9,
        35, 38, 10, 34, 24,  9, 42, 16,  9, 35,  8, 26, 34,  9, 20, 12, 35,  9,
        42, 12, 45,  9, 26, 37, 37]) tensor([60, 57, 52])
1
torch.Size([1589, 40]) torch.Size([563])
tensor([37, 37, 23, 29, 34, 24, 35,  9, 29, 38, 44, 18, 45,  2, 36, 24, 22,  9,
        45, 33, 34, 12, 28, 17, 34,  9, 27, 24,  9, 33, 

In [172]:
from torch.nn.utils.rnn import pack_padded_sequence 

class CTCSpeech(nn.Module):
    def __init__(self):
        super(CTCSpeech, self).__init__()
        
        self.hidden_dim = 256
        self.embedding_dim = 40
        self.feature_size = 47
        
        self.lstm = nn.LSTM(input_size = self.embedding_dim, 
                            hidden_size = self.hidden_dim, 
                            num_layers=3, 
                            bidirectional=True,
                            batch_first=True)
        
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(self.hidden_dim * 2, self.feature_size)
#         self.hidden = self.init_hidden()

    def forward(self, batch):
        data, data_lengths, labels, labels_length = batch
        x_pack = pack_padded_sequence(x, x_lengths, batch_first=True)
        output, self.hidden = self.lstm(x_pack)
        output = nn.utils.rnn.pad_packed_sequence(output)
        output = self.hidden2tag(output[0])
        
        return output, self.hidden


In [173]:
print(CTCSpeech())
speechmodel = CTCSpeech()

CTCSpeech(
  (lstm): LSTM(40, 256, num_layers=3, batch_first=True, bidirectional=True)
  (hidden2tag): Linear(in_features=512, out_features=47, bias=True)
)


In [174]:
import torch.optim as optim

criterion = nn.CTCLoss()

optimizer = optim.Adam(speechmodel.parameters(), lr=0.001)
# optimizer = optim.SGD(speechmodel.parameters(), lr = 0.001, momentum = 0.9, weight_decay = 0.001)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Decay LR by a factor of 0.1 every 7 epochs
# scheduler = lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1)

In [175]:
def train_epoch(model, train_loader, val_loader, criterion, optimizer, epochs,train_size,val_size):
    ini_time = time.time()
    
    # initial weight
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    
    # metrics record statistics
    metrics = []
    
    # loop through each epoch
    for epoch in range(epochs):
            # set model to train model
        model.train()
        model.to(device)
        
#         scheduler.step()
        
        print('Epoch {}/{}'.format(epoch, epochs - 1))
        print('-' * 10)
        
        # initialize the running loss to 0
        epoch_loss = 0.0
        correct = 0
        start_time = time.time()
        
        for batch_idx, (batch) in enumerate(train_loader):
            print(batch_idx)
            if batch_idx == len(train_loader)-1:
                break
            
            data, data_lengths, label, label_length  = batch
            
            # refresh the parameter gradients
            optimizer.zero_grad()
#             if batch_idx % 20000 == 0:
#                 print(' Progress %s: %d/%d'%(epoch,batch_idx,len(train_loader)))
#             data = data.to(device)
#             label = label.long().to(device)

            # forward + backward + optimize
            outputs,hidden = model(batch)
            outputs = outputs.log_softmax(2).detach().requires_grad_()
            
            label_pack = torch.cat(label)
#             _, predicted = torch.max(outputs,1)
#             correct += torch.sum(predicted == label)
            print(label_pack)
            print(label_length)
            print(len(data_lengths))
            loss = criterion(outputs, label_pack, data_lengths, label_length)
            
            loss.backward()
            optimizer.step()
            
            # accumulate loss
            epoch_loss += loss.item() 
        
        # end of an epoch
        end_time = time.time()
        print('Epoch %d Training Loss: '%(epoch+1), epoch_loss, 'Time: ',end_time - start_time, 's')
        
#         # print statistics
#         total_loss = epoch_loss/train_size
#         train_error = 1.0 - correct/train_size
#         train_acc = correct/train_size
        
        
#         # validation process
#         val_correct = 0
#         model.eval()
        
#         with torch.no_grad():
#             for batch_idx, (val_data, val_label) in enumerate(val_loader):

#                 val_data = val_data.to(device)
#                 val_label = val_label.long().to(device)
#                 out = model(val_data)
#                 _, val_predicted = torch.max(out,1)
#                 val_correct += torch.sum(val_predicted == val_label)
                
#             val_acc = val_correct.double() / val_size
#             val_error = 1.0 - val_acc

#             # record best weights
#             if val_acc > best_acc:
#                 best_acc = val_acc
#                 best_model_wts = copy.deepcopy(model.state_dict())
                
# #         print("epoch: {0}, loss: {1:.8f}".format(epoch+1, total_loss))
#         print("epoch: {0}, acc: {1:.8f}".format(epoch+1, val_acc))
#         metrics.append(Metric(loss=total_loss, 
#                               train_error=train_error,
#                               val_error=val_error))
    # end of total training
    time_elapsed = time.time() - ini_time
    
    print('Training complete in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    
    return model

In [176]:
model_ft = train_epoch(speechmodel, train_loader, val_loader, criterion, optimizer, 12,train_size,val_size)

Epoch 0/11
----------
0
tensor([37, 37, 38, 41, 29, 17, 34, 31, 21, 19, 25, 34, 19, 27, 34, 31, 15, 45,
        20, 18, 28, 14, 12, 29,  9,  3, 17, 29, 38, 18, 16,  9, 35, 23, 38, 24,
        20, 18, 28, 16,  9, 29, 10, 34, 39, 24, 35, 38,  9, 29, 15, 29, 10, 34,
        39, 43, 17, 35, 38, 37, 37, 37, 16,  9, 33, 34,  7, 35, 17, 35, 13, 12,
        43, 23, 14, 16,  9, 27, 17, 29, 45, 20, 31, 26, 23, 35, 23, 45,  7, 29,
        23, 26, 35, 38, 18, 29,  9, 27,  7, 13, 25, 17, 26, 38, 35, 23, 45, 26,
        10, 27, 15,  9, 26,  7, 28,  9, 15, 19, 36,  9, 29,  4, 37, 37,  4, 29,
        41, 28, 18,  9, 35, 43, 18, 26, 35,  9, 42,  7, 34, 38, 37, 18, 13, 19,
        35, 38,  7, 29, 16,  9, 35, 38, 10, 34, 24,  9, 42, 16,  9, 35,  8, 26,
        34,  9, 20, 12, 35,  9, 42, 12, 45,  9, 26, 37, 37])
tensor([60, 63, 52])
3
1
tensor([37, 22, 23, 45, 28, 31, 35, 35, 23, 21, 29, 23, 20, 23, 26,  9, 29, 38,
        35, 12,  9, 29, 38, 23, 20, 23, 26, 33,  9, 13, 27, 23, 26, 19, 36,  9,
        29

tensor([37, 37, 23, 29, 34, 24, 35,  9, 29, 38, 44, 18, 45,  2, 36, 24, 22,  9,
        45, 33, 34, 12, 28, 17, 34,  9, 27, 24,  9, 33, 23, 34, 15, 23, 29, 38,
        17, 27,  9, 42, 23, 46,  9, 29, 20, 23, 27, 28, 45, 35,  9, 14, 17, 45,
        27, 23, 38,  9, 27, 21, 27, 10, 34, 24,  9,  6, 37, 37, 22, 23, 45, 28,
        31, 35, 35, 23, 21, 29, 23, 20, 23, 26,  9, 29, 38, 35, 12,  9, 29, 38,
        23, 20, 23, 26, 33,  9, 13, 27, 23, 26, 19, 36,  9, 29, 45, 37, 43, 18,
        35, 38,  9, 15, 24, 45,  9, 42, 13, 18, 15, 45,  4,  8, 29, 15,  3,  8,
        29,  9, 28,  9, 27, 45,  4, 37, 37, 16,  9, 20, 24, 28, 19, 27, 33, 34,
         9, 15, 41, 35, 23, 45,  9, 27, 23, 38, 18,  2,  9, 42, 38, 41, 38,  9,
        20, 10, 34, 44,  9, 30,  3, 23, 29, 29, 31, 42, 17, 28, 13, 18,  2,  8,
        29, 15, 15, 23, 35, 17, 28, 13, 18,  1, 37])
tensor([67, 67, 57])
3
1
tensor([37, 37, 38, 41, 29, 17, 34, 31, 21, 19, 25, 34, 19, 27, 34, 31, 15, 45,
        20, 18, 28, 14, 12, 29,  9,  3, 17

KeyboardInterrupt: 

In [None]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True,batch_first=True)  # <- change here

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
        return (torch.autograd.Variable(torch.zeros(2, 1, self.hidden_dim)),   
                torch.autograd.Variable(torch.zeros(2, 1, self.hidden_dim)))    # <- change here: first dim of hidden needs to be doubled

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores