In [1]:
import argparse
import os
import math
import random
import json
import numpy as np

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from encoder import Encoder
from attention import Attention
from decoder import Decoder
from generator import Generator

from data_loader import SpeechDataset, Padding, ToTensor

Using TensorFlow backend.


In [None]:
def _build_arg_parser():
    """Assemble the command-line interface for RSR-GAN training.

    Every option is registered in the same order, with the same flags,
    defaults, types and help strings as before.

    Returns:
        argparse.ArgumentParser: the fully configured parser.
    """
    cli = argparse.ArgumentParser(description='RSR-GAN training')
    opt = cli.add_argument  # short alias; keeps each option on one or two lines

    # --- data loading and spectrogram extraction ---
    opt('--train-manifest', metavar='DIR', default='data/train_manifest.csv',
        help='path to train manifest csv')
    opt('--val-manifest', metavar='DIR', default='data/val_manifest.csv',
        help='path to validation manifest csv')
    opt('--sample-rate', type=int, default=16000, help='Sample rate')
    opt('--batch-size', type=int, default=20, help='Batch size for training')
    opt('--num-workers', type=int, default=4, help='Number of workers used in data-loading')
    opt('--labels-path', default='labels_dict.json', help='Contains all characters for transcription')
    opt('--window-size', type=float, default=.02, help='Window size for spectrogram in seconds')
    opt('--window-stride', type=float, default=.01, help='Window stride for spectrogram in seconds')
    opt('--window', default='hamming', help='Window type for spectrogram generation')

    # --- model dimensions ---
    opt('--enc-hid-dim', type=int, default=256, help='Encoder hidden dimension')
    opt('--dec-hid-dim', type=int, default=256, help='Decoder hidden dimension')
    opt('--dec-emb-dim', type=int, default=256, help='Decoder embedding dimension')
    opt('--dropout-rate', type=float, default=0.2, help='Dropout rate')

    # --- optimisation, logging and checkpointing ---
    opt('--epochs', type=int, default=500, help='Number of training epochs')
    opt('--cuda', action='store_true', dest='cuda', help='Use cuda to train model')
    opt('--lr', '--learning-rate', type=float, default=3e-4, help='initial learning rate')
    opt('--momentum', type=float, default=0.9, help='momentum')
    opt('--max-norm', type=int, default=400, help='Norm cutoff to prevent explosion of gradients')
    opt('--learning-anneal', type=float, default=1.1, help='Annealing applied to learning rate every epoch')
    opt('--silent', action='store_true', dest='silent', help='Turn off progress tracking per iteration')
    opt('--checkpoint', action='store_true', dest='checkpoint', help='Enables checkpoint saving of model')
    opt('--checkpoint-per-batch', type=int, default=0, help='Save checkpoint per batch. 0 means never save')
    opt('--tensorboard', action='store_true', dest='tensorboard', help='Turn on tensorboard graphing')
    opt('--log-dir', default='visualize/rsrgan_final', help='Location of tensorboard log')
    opt('--log-params', action='store_true', dest='log_params', help='Log parameter values and gradients')
    opt('--id', default='Deepspeech training', help='Identifier for visdom/tensorboard run')
    opt('--save-folder', default='models/', help='Location to save epoch models')
    opt('--model-path', default='models/rsrgan_final.pth',
        help='Location to save best validation model')
    opt('--continue-from', default='', help='Continue from checkpoint model')
    opt('--finetune', action='store_true', dest='finetune',
        help='Finetune the model from checkpoint "continue_from"')

    # --- sampling order ---
    opt('--no-shuffle', action='store_true', dest='no_shuffle',
        help='Turn off shuffling and sample from dataset based on sequence length (smallest to largest)')

    return cli


parser = _build_arg_parser()

# Parsing is left disabled: parse_args() with no arguments reads sys.argv.
# NOTE(review): presumably switched off because the notebook kernel's own
# argv would be rejected — confirm before re-enabling.
#args = parser.parse_args()

In [2]:
# Read the label dictionary from disk; it is later indexed by key
# (e.g. labels['pad']).
with open('labels_dict.json', 'r') as labels_file:
    labels = json.load(labels_file)

# Bare last expression: the notebook displays the number of entries.
len(labels)

32

In [3]:
# Fixed sizes shared by the padding transform and the model cells.
SIGNAL_INPUT_SIZE = 1500
# NOTE(review): TXT_INPUT_SIZE is not referenced in the visible cells.
TXT_INPUT_SIZE = 135
# NOTE(review): OUTPUT_DIM is handed to Padding here and to Decoder in the
# model cell, while the label dictionary displays 32 entries — confirm that
# one value is meant to serve both roles.
OUTPUT_DIM = 135

# Spectrogram settings passed through to SpeechDataset.
audio_conf = {
    'window': 'hamming',
    'window_size': 0.02,
    'window_stride': 0.01,
    'sampling_rate': 16000,
}


def _make_padded_dataset(manifest_csv):
    """Build a SpeechDataset over ``manifest_csv`` with the Padding transform.

    Args:
        manifest_csv: path of the manifest CSV handed to SpeechDataset.

    Returns:
        SpeechDataset whose ``transform`` is a Compose holding one Padding
        instance configured with SIGNAL_INPUT_SIZE, OUTPUT_DIM and the
        labels file.
    """
    padding = Padding(SIGNAL_INPUT_SIZE, OUTPUT_DIM, 'labels_dict.json')
    return SpeechDataset(manifest_csv,
                         'labels_dict.json',
                         audio_conf,
                         transform=transforms.Compose([padding]))


train_dataset = _make_padded_dataset(
    '../../SpeechRecognition.EN/deepspeech.cv/data/cv-valid-dev_manifest.csv')

val_dataset = _make_padded_dataset(
    '../../SpeechRecognition.EN/deepspeech.cv/data/cv-other-dev_manifest.csv')

# Both loaders: batches of 4, shuffled, 4 worker processes.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=True, num_workers=4)

In [49]:
# Model hyper-parameters.
NUM_GRU = 6          # passed to Encoder; the printed model shows six GRU entries in rnn_stack
ENC_HID_DIM = 256
DEC_HID_DIM = 256
DEC_EMB_DIM = 256
DROPOUT_RATE = 0.2

# The auto-detected device used to be overwritten by an unconditional
# torch.device('cpu') on the following line, which made the CUDA check a dead
# store. FORCE_CPU keeps that CPU-only behaviour as the default while making
# the override explicit; set it to False to use CUDA when it is available.
FORCE_CPU = True
device = torch.device('cuda' if torch.cuda.is_available() and not FORCE_CPU else 'cpu')

# Assemble the generator from its parts. The same Attention instance is
# handed to the Decoder, and the finished model is moved onto `device`.
encoder = Encoder(SIGNAL_INPUT_SIZE, NUM_GRU, ENC_HID_DIM, DEC_HID_DIM, DROPOUT_RATE)
attention = Attention(enc_hid_dim=ENC_HID_DIM, dec_hid_dim=DEC_HID_DIM)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DROPOUT_RATE, attention)
model = Generator(encoder, decoder, device).to(device)

In [50]:
# Bare expression: the notebook renders the module's repr as this cell's output.
model

Generator(
  (encoder): Encoder(
    (rnn_stack): ModuleList(
      (0): GRU(1500, 256, batch_first=True, dropout=0.2, bidirectional=True)
      (1): GRU(512, 256, batch_first=True, dropout=0.2, bidirectional=True)
      (2): GRU(512, 256, batch_first=True, dropout=0.2, bidirectional=True)
      (3): GRU(512, 256, batch_first=True, dropout=0.2, bidirectional=True)
      (4): GRU(512, 256, batch_first=True, dropout=0.2, bidirectional=True)
      (5): GRU(512, 256, batch_first=True, dropout=0.2, bidirectional=True)
    )
    (fc): Linear(in_features=512, out_features=256, bias=True)
    (pool): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=768, out_features=256, bias=True)
    )
    (embedding): Embedding(135, 256)
    (gru): GRU(768, 256, batch_first=True)
    (fc): Linear(in_features=1024, out_features=135, bias=True)
    (dropout): Dropout(p=0.2)
  )
)

In [45]:
# Index stored under the 'pad' key of the label dictionary.
pad_idx = labels['pad']

# Cross-entropy loss configured to ignore targets equal to pad_idx.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Adam with the library's default hyper-parameters over all of `model`'s
# parameters.
# NOTE(review): re-run this cell whenever `model` is rebuilt — an optimizer
# created earlier keeps references to the previous model's parameters.
optimizer = optim.Adam(model.parameters())

In [51]:
def train(model, iterator, optimizer, criterion, clip):
    """Debug pass: run only the first batch of ``iterator`` through ``model``.

    The function puts the model in training mode, feeds one batch forward,
    prints the size of the third value the model returns, and stops. The
    loss, backward pass, gradient clipping and optimizer step are all
    disabled (kept below as comments), so no parameters are updated.

    Args:
        model: object with ``train()`` that is called as ``model(src, trg)``
            and yields exactly three values; the last must support ``.size()``.
        iterator: iterable of samples indexable by 'signal' and 'transcript'.
        optimizer: only ``zero_grad()`` is invoked.
        criterion: unused while the loss computation is disabled.
        clip: unused while gradient clipping is disabled.

    Returns:
        None.
    """
    model.train()

    epoch_loss = 0  # only consumed by the disabled accumulation below

    for batch in iterator:
        # Cast to torch.FloatTensor, then move to the notebook-level `device`.
        # NOTE(review): the transcript is cast to float as well — confirm the
        # model expects that before re-enabling the criterion below.
        signal = batch['signal'].type(torch.FloatTensor).to(device)
        transcript = batch['transcript'].type(torch.FloatTensor).to(device)

        optimizer.zero_grad()

        _, _, result = model(signal, transcript)
        print(result.size())

        # Stop after the first batch; everything below is the disabled
        # training step.
        break

        #loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1))

        #loss.backward()

        #torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        #optimizer.step()

        #epoch_loss += loss.item()

    #return epoch_loss / len(iterator)

In [52]:
# Run configuration for the training loop.
N_EPOCHS = 5
CLIP = 10
SAVE_DIR = 'models'
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'rsr_gan.pt')

# Only read by the disabled validation/checkpoint step inside the loop.
best_valid_loss = float('inf')

# exist_ok=True replaces the earlier "isdir check, then makedirs" pair, which
# could fail if the directory appeared between the check and the creation.
os.makedirs(SAVE_DIR, exist_ok=True)

for epoch in range(N_EPOCHS):
    
    # train() currently has no return statement, so its result is not captured.
    #train_loss = 
    train(model, train_dataloader, optimizer, criterion, CLIP)
    # The string literal below is an inert, disabled validation/checkpoint step.
    # NOTE(review): it refers to `evaluate` and `valid_iterator`, neither of
    # which is defined in the visible cells — confirm before re-enabling.
    """
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
    
    print('Epoch: ', epoch+1, '| Train Loss: ', train_loss, '| Train PPL: ', math.exp(train_loss),
          '| Val. Loss: ', valid_loss, '| Val. PPL: ', math.exp(valid_loss))
    """
    #print('Train loss - ', train_loss)

torch.Size([4, 1, 135])
torch.Size([4, 1, 135])
torch.Size([4, 1, 135])
torch.Size([4, 1, 135])
torch.Size([4, 1, 135])


In [None]:
# Shell escape: list the directory that the manifest paths used for the datasets point into.
!ls '../../SpeechRecognition.EN/deepspeech.cv/data/'

In [32]:
from tqdm import tqdm

In [None]:
# Scan every sample and keep the largest size of the signal's second axis and
# of the transcript's first axis.
# NOTE(review): `speech_dataset` is not defined in any visible cell — it must
# exist in the kernel before this runs (presumably a SpeechDataset built
# without the Padding transform; confirm).
max_audio_len = 0
max_txt_len = 0
for i in tqdm(range(len(speech_dataset))):
    sample = speech_dataset[i]
    max_audio_len = max(max_audio_len, sample['signal'].shape[1])
    max_txt_len = max(max_txt_len, sample['transcript'].shape[0])

# Bare tuple so the notebook displays both maxima.
max_audio_len, max_txt_len

In [None]:
# Maximum lengths recorded for cv-valid-dev_manifest
# (presumably the result of the length scan — confirm).
train_max_audio_len, train_max_txt_len = 1477, 135