In [1]:
data_path = "/home/widen-desktop2/Desktop/tacotron/LJSpeech-1.1"

In [8]:
from hyperparams import Hyperparams as hp
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from utils import *
import re
import codecs
import os
import unicodedata
import librosa
hp.Tx = 190

In [37]:
def load_data(mode="train"):
    # Load vocabulary
    char2idx, idx2char = load_vocab()

    if mode == "train":
        # Parse
        fpaths, texts = [], []
        transcript = os.path.join(hp.data, 'metadata.csv')
        lines = codecs.open(transcript, 'r', 'utf-8').readlines()

        for line in lines:
            fname, _, text = line.strip().split("|")
            fpath = os.path.join(hp.data, "wavs", fname + ".wav")

            fpaths.append(fpath)

            text = text_normalize(text) + "E" # ␃: EOS
            text = [char2idx[char] for char in text]
            texts.append(np.array(text, np.int32).tostring())
        return fpaths, texts
    else:
        # Parse
        lines = codecs.open(hp.test_data, 'r', 'utf-8').readlines()[1:]
        sents = [text_normalize(line.split(" ", 1)[-1]).strip() + u"␃" for line in lines]  # text normalization, E: EOS
        texts = np.zeros((len(lines), hp.Tx), np.int32)
        for i, sent in enumerate(sents):
            texts[i, :len(sent)] = [char2idx[char] for char in sent]
        return texts
    
def collate_fn(batch):
    fpath = [d['wav_name'] for d in batch]
    text = [d['text'] for d in batch]
    wav = [d['wav'] for d in batch]

    text = _prepare_data(text).astype(np.int32)
    fname = []
    mel = []
    mag = []

    for i in fpath:
        fname_i, mel_i, mag_i = load_spectrograms(i)
        fname.append(fname_i)
        mel.append(mel_i)
        mag.append(mag_i)
    
    mel = __prepare_data(mel)
    mag = __prepare_data(mag)
    
    return fname, text, mel, mag

def _pad_data(x, length):
    return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=0)

def __pad_data(x, length):
    return np.pad(x, ((0,length - x.shape[0]), (0, 0)), mode='constant', constant_values=0)

def _prepare_data(inputs):
    max_len = max((len(x) for x in inputs))
    return np.stack([_pad_data(x, max_len) for x in inputs])

def __prepare_data(inputs):
    max_len = max((x.shape[0] for x in inputs))
    return np.stack([__pad_data(x, max_len) for x in inputs])

def load_vocab():
    char2idx = {char: idx for idx, char in enumerate(hp.vocab)}
    idx2char = {idx: char for idx, char in enumerate(hp.vocab)}
    return char2idx, idx2char

def text_normalize(text):
    text = ''.join(char for char in unicodedata.normalize('NFD', text)
                           if unicodedata.category(char) != 'Mn') # Strip accents

    text = re.sub("[^{}]".format(hp.vocab), " ", text)
    text = re.sub("[ ]+", " ", text)
    return text

In [38]:
class get_Dataset(Dataset):
    def __init__(self, csv_file, wav_file):
        
        self.metadata = pd.read_csv(csv_file, sep='|', header=None)
        self.wav_file = wav_file
    
    def __len__(self):
        return len(self.metadata)

    def load_wav(self, filename):
        return librosa.load(filename, sr=22050)

    def __getitem__(self, idx):
        char2idx, idx2char = load_vocab()
                
        wav_name = os.path.join(self.wav_file, self.metadata.iloc[idx,0]) + '.wav'
        text = self.metadata.iloc[idx, 1]
        text = text_normalize(text) + "E"
        text = [char2idx[char] for char in text]

        text = np.asarray(text, dtype=np.int32)
        wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
        
        sample = {'wav_name':wav_name, 'text':text, 'wav':wav}
        return sample

In [39]:
data = get_Dataset(os.path.join(data_path,'metadata.csv'), os.path.join(data_path,'wavs'))

In [40]:
data[0]

{'wav_name': '/home/widen-desktop2/Desktop/tacotron/LJSpeech-1.1/wavs/LJ001-0001.wav',
 'text': array([26, 54, 45, 50, 56, 45, 50, 43,  5,  2, 45, 50,  2, 56, 44, 41,  2,
        51, 50, 48, 61,  2, 55, 41, 50, 55, 41,  2, 59, 45, 56, 44,  2, 59,
        44, 45, 39, 44,  2, 59, 41,  2, 37, 54, 41,  2, 37, 56,  2, 52, 54,
        41, 55, 41, 50, 56,  2, 39, 51, 50, 39, 41, 54, 50, 41, 40,  5,  2,
        40, 45, 42, 42, 41, 54, 55,  2, 42, 54, 51, 49,  2, 49, 51, 55, 56,
         2, 45, 42,  2, 50, 51, 56,  2, 42, 54, 51, 49,  2, 37, 48, 48,  2,
        56, 44, 41,  2, 37, 54, 56, 55,  2, 37, 50, 40,  2, 39, 54, 37, 42,
        56, 55,  2, 54, 41, 52, 54, 41, 55, 41, 50, 56, 41, 40,  2, 45, 50,
         2, 56, 44, 41,  2, 15, 60, 44, 45, 38, 45, 56, 45, 51, 50, 15],
       dtype=int32),
 'wav': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ...,
         7.3242188e-04,  2.1362305e-04,  6.1035156e-05], dtype=float32)}

In [41]:
dataloader = DataLoader(data, batch_size=32, shuffle=False, collate_fn=collate_fn, drop_last=True)

In [44]:
print('text (N, Tx)  mel (N, Ty//r, n_mel*r)  mag (N, Ty, n_fft//2+1)')
a = 0
for i, data in enumerate(dataloader):
    fname, text, mel, mag = data
    print('batch :',i, text.shape, mel.shape, mag.shape)
    if text.shape[1] > a:
        a = text.shape[1]
        
print('a',a)
   # if i == 2 :
    #    break

text (N, Tx)  mel (N, Ty//r, n_mel*r)  mag (N, Ty, n_fft//2+1)
batch : 0 (32, 169) (32, 160, 400) (32, 800, 1025)
batch : 1 (32, 152) (32, 156, 400) (32, 780, 1025)
batch : 2 (32, 150) (32, 156, 400) (32, 780, 1025)
batch : 3 (32, 142) (32, 161, 400) (32, 805, 1025)
batch : 4 (32, 158) (32, 151, 400) (32, 755, 1025)
batch : 5 (32, 150) (32, 161, 400) (32, 805, 1025)
batch : 6 (32, 156) (32, 161, 400) (32, 805, 1025)
batch : 7 (32, 159) (32, 160, 400) (32, 800, 1025)
batch : 8 (32, 150) (32, 157, 400) (32, 785, 1025)
batch : 9 (32, 167) (32, 157, 400) (32, 785, 1025)
batch : 10 (32, 130) (32, 161, 400) (32, 805, 1025)
batch : 11 (32, 152) (32, 159, 400) (32, 795, 1025)
batch : 12 (32, 149) (32, 160, 400) (32, 800, 1025)
batch : 13 (32, 159) (32, 161, 400) (32, 805, 1025)
batch : 14 (32, 170) (32, 162, 400) (32, 810, 1025)
batch : 15 (32, 153) (32, 162, 400) (32, 810, 1025)
batch : 16 (32, 172) (32, 162, 400) (32, 810, 1025)
batch : 17 (32, 157) (32, 151, 400) (32, 755, 1025)
batch : 18 

batch : 156 (32, 153) (32, 162, 400) (32, 810, 1025)
batch : 157 (32, 136) (32, 158, 400) (32, 790, 1025)
batch : 158 (32, 151) (32, 154, 400) (32, 770, 1025)
batch : 159 (32, 150) (32, 155, 400) (32, 775, 1025)
batch : 160 (32, 142) (32, 154, 400) (32, 770, 1025)
batch : 161 (32, 155) (32, 162, 400) (32, 810, 1025)
batch : 162 (32, 173) (32, 162, 400) (32, 810, 1025)
batch : 163 (32, 153) (32, 162, 400) (32, 810, 1025)
batch : 164 (32, 150) (32, 152, 400) (32, 760, 1025)
batch : 165 (32, 135) (32, 143, 400) (32, 715, 1025)
batch : 166 (32, 148) (32, 159, 400) (32, 795, 1025)
batch : 167 (32, 146) (32, 160, 400) (32, 800, 1025)
batch : 168 (32, 145) (32, 162, 400) (32, 810, 1025)
batch : 169 (32, 156) (32, 161, 400) (32, 805, 1025)
batch : 170 (32, 166) (32, 162, 400) (32, 810, 1025)
batch : 171 (32, 161) (32, 162, 400) (32, 810, 1025)
batch : 172 (32, 146) (32, 159, 400) (32, 795, 1025)
batch : 173 (32, 158) (32, 162, 400) (32, 810, 1025)
batch : 174 (32, 161) (32, 161, 400) (32, 805,

batch : 311 (32, 153) (32, 156, 400) (32, 780, 1025)
batch : 312 (32, 157) (32, 161, 400) (32, 805, 1025)
batch : 313 (32, 148) (32, 160, 400) (32, 800, 1025)
batch : 314 (32, 174) (32, 160, 400) (32, 800, 1025)
batch : 315 (32, 170) (32, 160, 400) (32, 800, 1025)
batch : 316 (32, 163) (32, 157, 400) (32, 785, 1025)
batch : 317 (32, 156) (32, 161, 400) (32, 805, 1025)
batch : 318 (32, 155) (32, 156, 400) (32, 780, 1025)
batch : 319 (32, 162) (32, 160, 400) (32, 800, 1025)
batch : 320 (32, 149) (32, 158, 400) (32, 790, 1025)
batch : 321 (32, 157) (32, 162, 400) (32, 810, 1025)
batch : 322 (32, 169) (32, 161, 400) (32, 805, 1025)
batch : 323 (32, 132) (32, 161, 400) (32, 805, 1025)
batch : 324 (32, 164) (32, 162, 400) (32, 810, 1025)
batch : 325 (32, 149) (32, 158, 400) (32, 790, 1025)
batch : 326 (32, 183) (32, 162, 400) (32, 810, 1025)
batch : 327 (32, 150) (32, 160, 400) (32, 800, 1025)
batch : 328 (32, 139) (32, 161, 400) (32, 805, 1025)
batch : 329 (32, 157) (32, 159, 400) (32, 795,

In [98]:
a = np.array([[1,2],[3,4]])
np.pad(a, ((0, 10), (0, 0)), mode = 'constant', constant_values=0)

array([[1, 2],
       [3, 4],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0]])

In [112]:
a = np.zeros([hp.batch_size, hp.n_mels, 1])
print(a.shape)

(32, 80, 1)


In [29]:
import torch
x = torch.tensor([[1,1,1,1],[2,2,2,2]])

In [30]:
x1 = x.repeat(1, 10)

In [33]:
x1 = x1.view(2, 10, -1)

In [35]:
x1.shape

torch.Size([2, 10, 4])

In [36]:
x = torch.tensor([[1,1,1,1], [2,2,2,2]])
x = x.repeat(1, 10).view(2, 10, -1)
print(x)
print(x.shape)

tensor([[[1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1]],

        [[2, 2, 2, 2],
         [2, 2, 2, 2],
         [2, 2, 2, 2],
         [2, 2, 2, 2],
         [2, 2, 2, 2],
         [2, 2, 2, 2],
         [2, 2, 2, 2],
         [2, 2, 2, 2],
         [2, 2, 2, 2],
         [2, 2, 2, 2]]])
torch.Size([2, 10, 4])


In [21]:
x3 = torch.unsqueeze(x2, 1)

In [23]:
x3.shape

torch.Size([10, 1, 4])

In [24]:
x3

tensor([[[1, 1, 1, 1]],

        [[2, 2, 2, 2]],

        [[1, 1, 1, 1]],

        [[2, 2, 2, 2]],

        [[1, 1, 1, 1]],

        [[2, 2, 2, 2]],

        [[1, 1, 1, 1]],

        [[2, 2, 2, 2]],

        [[1, 1, 1, 1]],

        [[2, 2, 2, 2]]])

In [10]:
import torch