In [1]:
# NOTE(review): `!cmd` executes in a separate shell process, so this `export`
# presumably never reaches the kernel's own environment (`%env` would).
# Switching to `%env` would also renumber the visible GPU to index 0, so the
# hard-coded device index used elsewhere in this notebook would need revisiting.
!export CUDA_VISIBLE_DEVICES=1

In [2]:
import argparse
import os
import math
import random
import json
import numpy as np
from tqdm import tqdm

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from encoder import Encoder
from attention import Attention
from decoder import Decoder
from generator import Generator

from data_loader import SpeechDataset, Padding, ToTensor

Using TensorFlow backend.


In [3]:
# Make GPU index 1 the current CUDA device for this process.
# NOTE(review): the index is hard-coded; presumably this fails on hosts that
# do not expose a second GPU -- confirm before running elsewhere.
torch.cuda.set_device(1)

In [4]:
#args = parser.parse_args()

In [None]:
# Parse the label dictionary straight from the file handle; the dictionary
# size is shown as the cell output.
with open('labels_dict.json', 'r') as labels_file:
    labels = json.load(labels_file)

len(labels)

32

In [None]:
# List what is available in the sibling DeepSpeech data directory.
!ls '../../SpeechRecognition.EN/deepspeech.cv/data/'

cv-invalid_manifest.csv      cv-valid-test_manifest.csv   __pycache__
cv-other-dev_manifest.csv    cv-valid-train_manifest.csv  test_manifest.csv
cv-other-test_manifest.csv   data_loader.py		  train_manifest.csv
cv-other-train_manifest.csv  distributed.py		  utils.py
cv-valid-dev_manifest.csv    __init__.py


In [None]:
# Fixed lengths handed to the Padding transform, plus batching parameters.
SIGNAL_SEQ_LEN = 1100
TXT_SEQ_LEN = 189
OUTPUT_DIM = len(labels)
BATCH_SIZE = 12
LABELS_PATH = 'labels_dict.json'

# Audio front-end settings forwarded unchanged to SpeechDataset.
audio_conf = {'window': 'hamming',
              'window_size': 0.02,
              'window_stride': 0.01,
              'sampling_rate': 16000}


def _build_dataset(manifest_path):
    """Create a SpeechDataset for ``manifest_path`` with the shared settings.

    Each call builds its own ``Padding`` transform (as the original cell did),
    so the train and validation datasets do not share a transform instance.
    """
    padding = Padding(SIGNAL_SEQ_LEN, TXT_SEQ_LEN, LABELS_PATH)
    return SpeechDataset(manifest_path,
                         LABELS_PATH,
                         audio_conf,
                         transform=transforms.Compose([padding]))


train_dataset = _build_dataset('train_manifest.csv')
val_dataset = _build_dataset('val_manifest.csv')

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, num_workers=4)
# Validation batches are not shuffled: every batch loss is a mean over its own
# non-ignored targets, so a fixed batch composition keeps the epoch validation
# loss (used for checkpoint selection) comparable between epochs and runs.
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            shuffle=False, num_workers=4)

In [None]:
# Architecture hyper-parameters.
SIGNAL_FEATURE = 161
NUM_GRU = 4
ENC_HID_DIM = 256
DEC_HID_DIM = 256
DEC_EMB_DIM = 256
DROPOUT_RATE = 0.2

# Use CUDA when it is available, otherwise fall back to the CPU.
# (To force CPU execution, assign torch.device('cpu') here instead.)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

encoder = Encoder(
    seq_len=SIGNAL_SEQ_LEN,
    input_size=SIGNAL_FEATURE,
    enc_hid_dim=ENC_HID_DIM,
    num_gru=NUM_GRU,
    dec_hid_dim=DEC_HID_DIM,
    dropout_rate=DROPOUT_RATE,
    device=device,
    use_pooling=False,
)

attention = Attention(
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
)

decoder = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    enc_hid_dim=ENC_HID_DIM,
    dec_hid_dim=DEC_HID_DIM,
    dropout_rate=DROPOUT_RATE,
    attention=attention,
)

# Combine encoder and decoder, then move the whole model to the chosen device.
model = Generator(encoder, decoder, device).to(device)

In [None]:
#model

In [None]:
# `model.cuda` is a bound method and therefore always truthy, so testing it
# prints True even for a CPU model. Inspect where a parameter actually lives;
# a model without parameters is reported as not on CUDA.
_first_param = next(model.parameters(), None)
print(_first_param is not None and _first_param.is_cuda)

True


In [None]:
# Targets equal to the 'pad' label index are ignored by the loss.
pad_idx = labels['pad']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Adam over all model parameters, library-default hyper-parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Args:
        model: network called as ``model(src, trg)``; the third element of the
            returned tuple is used as the prediction tensor.
        iterator: sized iterable of batches, each a mapping holding
            ``'signal'`` and ``'transcript'`` tensors.
        optimizer: optimizer; ``zero_grad()`` and ``step()`` run once per batch.
        criterion: loss called as ``criterion(predictions, targets)``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.

    Relies on the notebook-level ``device`` and ``TXT_SEQ_LEN``.
    """
    model.train()

    epoch_loss = 0

    # Wrap the loader itself (not ``enumerate(loader)``) so tqdm can read its
    # length and display a total / ETA instead of a bare iteration counter.
    for sample in tqdm(iterator):
        src = sample['signal'].float().to(device)
        src = src.permute(0, 2, 1)  # swap axes 1 and 2 before feeding the model
        trg = sample['transcript'].long().to(device)
        trg = trg.view(-1, TXT_SEQ_LEN)  # one row of TXT_SEQ_LEN label ids per sample

        optimizer.zero_grad()

        _, _, output = model(src, trg)

        # Flatten to (N, num_classes) predictions vs. (N,) targets.
        loss = criterion(output.view(-1, output.shape[2]), trg.view(-1))
        loss.backward()

        # Clip after backward() and before step() so the update uses clipped gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    Args:
        model: network called as ``model(src, trg, 0)``; the third element of
            the returned tuple is used as the prediction tensor.
        iterator: sized iterable of batches, each a mapping holding
            ``'signal'`` and ``'transcript'`` tensors.
        criterion: loss called as ``criterion(predictions, targets)``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.

    Relies on the notebook-level ``device`` and ``TXT_SEQ_LEN``.
    """
    model.eval()

    epoch_loss = 0

    with torch.no_grad():
        # Wrap the loader itself (not ``enumerate(loader)``) so tqdm can read
        # its length and display a total / ETA.
        for sample in tqdm(iterator):
            src = sample['signal'].float().to(device)
            src = src.permute(0, 2, 1)  # swap axes 1 and 2 before feeding the model
            trg = sample['transcript'].long().to(device)
            trg = trg.view(-1, TXT_SEQ_LEN)  # one row of TXT_SEQ_LEN label ids per sample

            # Third argument 0: per the original note, this turns off teacher forcing.
            _, _, output = model(src, trg, 0)

            # Flatten to (N, num_classes) predictions vs. (N,) targets.
            loss = criterion(output.view(-1, output.shape[2]), trg.view(-1))

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [None]:
N_EPOCHS = 25
CLIP = 10
SAVE_DIR = 'models'
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'rsr_gan.pt')


def _perplexity(loss):
    """Return ``exp(loss)``, or ``inf`` when the result does not fit in a float.

    ``math.exp`` raises ``OverflowError`` for large arguments; a diverging
    validation loss must not abort the whole training run just because its
    perplexity cannot be printed.
    """
    try:
        return math.exp(loss)
    except OverflowError:
        return float('inf')


best_valid_loss = float('inf')

# exist_ok=True replaces the separate isdir() check and is safe on re-runs.
os.makedirs(SAVE_DIR, exist_ok=True)

for epoch in range(N_EPOCHS):
    with torch.cuda.device(1):
        train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
        valid_loss = evaluate(model, val_dataloader, criterion)

        # Checkpoint only when the validation loss improves on the best so far.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), MODEL_SAVE_PATH)
            print('Saved Model.')

        print('Epoch: ', epoch+1, '| Train Loss: ', train_loss, '| Train PPL: ', _perplexity(train_loss),
              '| Val. Loss: ', valid_loss, '| Val. PPL: ', _perplexity(valid_loss))

417it [14:53,  2.16s/it]
84it [00:38,  2.51it/s]


Saved Model.
Epoch:  1 | Train Loss:  2.669001927478708 | Train PPL:  14.42556424612975 | Val. Loss:  7.439688052449908 | Val. PPL:  1702.219135230629


417it [15:03,  2.15s/it]
84it [00:38,  2.30it/s]

Epoch:  2 | Train Loss:  2.5587644617048673 | Train PPL:  12.919844487423243 | Val. Loss:  54.036529132298064 | Val. PPL:  2.9360701203938645e+23



417it [14:58,  2.18s/it]
84it [00:38,  2.43it/s]

Saved Model.
Epoch:  3 | Train Loss:  2.5277490495777815 | Train PPL:  12.52528059579386 | Val. Loss:  5.6017361879348755 | Val. PPL:  270.8963263077023



417it [15:06,  2.19s/it]
84it [00:38,  2.61it/s]

Epoch:  4 | Train Loss:  2.5028719267399193 | Train PPL:  12.217531479379172 | Val. Loss:  25.934956414358957 | Val. PPL:  183403854666.54712



417it [15:07,  2.16s/it]
84it [00:38,  2.29it/s]

Epoch:  5 | Train Loss:  2.4842220564826216 | Train PPL:  11.991787691701834 | Val. Loss:  31.017987069629488 | Val. PPL:  29576080803712.4



417it [15:19,  2.17s/it]
84it [00:38,  2.55it/s]

Epoch:  6 | Train Loss:  2.466608320780509 | Train PPL:  11.782416827210849 | Val. Loss:  9.2996346950531 | Val. PPL:  10934.024225377074



417it [15:21,  2.22s/it]
84it [00:39,  2.29it/s]

Saved Model.
Epoch:  7 | Train Loss:  2.4668463202689193 | Train PPL:  11.78522137011461 | Val. Loss:  5.438976759002323 | Val. PPL:  230.20650616882773



417it [15:22,  2.20s/it]
84it [00:38,  2.33it/s]

Saved Model.
Epoch:  8 | Train Loss:  2.4495893616756375 | Train PPL:  11.583589076846476 | Val. Loss:  5.104474323136466 | Val. PPL:  164.75743858987065



417it [15:26,  2.25s/it]
84it [00:38,  2.31it/s]

Epoch:  9 | Train Loss:  2.4487693155412193 | Train PPL:  11.574093893177759 | Val. Loss:  5.109905657314119 | Val. PPL:  165.65472582473487



417it [15:32,  2.22s/it]
84it [00:38,  2.32it/s]

Epoch:  10 | Train Loss:  2.431105112286209 | Train PPL:  11.371441858850378 | Val. Loss:  7.168887053217206 | Val. PPL:  1298.3987496632476



207it [07:48,  2.25s/it]

In [None]:
#torch.cuda.device(1)

In [None]:
"""
train_dataset = SpeechDataset('../../SpeechRecognition.EN/deepspeech.cv.i.dvd/data/train_manifest.csv', 
                            'labels_dict.json',
                            audio_conf 
                              )
seq_len = 0
transcript_len = 0
for i in tqdm(range(len(train_dataset))):
    sample = train_dataset[i]
    if sample['signal'].shape[1] > seq_len:
        seq_len = sample['signal'].shape[1]
        
    if sample['transcript'].shape[0] > transcript_len:
        transcript_len = sample['transcript'].shape[0] 
    
seq_len, transcript_len

"""
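# Result of the (disabled) scan above: seq_len = 1100, transcript_len = 189.
# These are the values used for SIGNAL_SEQ_LEN and TXT_SEQ_LEN.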