In [1]:
import sys
sys.path.append("..")

In [2]:
"""Contains data generator for orgnaizing various audio data preprocessing
pipeline and offering data reader interface of PaddlePaddle requirements.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random
import tarfile
import multiprocessing
import numpy as np
import pandas as pd
from threading import local

from data_utils.utility import read_manifest
from data_utils.utility import xmap_readers_mp
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
from data_utils.speech import SpeechSegment
from data_utils.normalizer import FeatureNormalizer

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence, pad_sequence
from torchvision import transforms, utils

## dataset format

In [3]:
class SpecgramDataset(Dataset):
    def __init__(self,
                 manifest_file,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 specgram_type='linear',
                 use_dB_normalization=True,
                 num_threads=multiprocessing.cpu_count() // 2,
                 random_seed=0,
                 keep_transcription_text=False):
            self._max_duration = max_duration
            self._min_duration = min_duration
            self._keep_transcription_text = keep_transcription_text

            self.manifest = pd.read_csv(manifest_file)
                            # read_manifest(manifest_path=manifest_path,
                            #               max_duration=self._max_duration,
                            #               min_duration=self._min_duration)
            self.manifest = self.manifest.sort_values(by=["duration"])

            self._normalizer = FeatureNormalizer(mean_std_filepath)

            self._augmentation_pipeline = AugmentationPipeline(
                augmentation_config=augmentation_config, random_seed=random_seed)

            self._speech_featurizer = SpeechFeaturizer(
                vocab_filepath=vocab_filepath,
                specgram_type=specgram_type,
                stride_ms=stride_ms,
                window_ms=window_ms,
                max_freq=max_freq,
                use_dB_normalization=use_dB_normalization)
            
    def __len__(self):
        return len(self.manifest)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        instance = self.manifest.iloc[idx]

        specgram, transcript = self.process_utterance(instance["audio_filepath"], instance["text"])
        uttids = instance["uttid"]

        sample = {"uttid": uttids, "specgrams":specgram, "text": transcript}

        return sample
    
    
    def process_multi_utterances(self, audio_files, transcripts):
        specgrams = []
        transcripts = []
        for audio,trans in zip(audio_files, transcript):
            spec, trans = self.process_utterance(audio, trans)
            specgrams.append(spec)
            transcripts.append(trans)
        return specgrams, transcripts
    

    def process_utterance(self, audio_file, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of audio file.
        :type audio_file: basestring | file
        :param transcript: Transcription text.
        :type transcript: basestring
        :return: Tuple of audio feature tensor and data of transcription part,
                 where transcription part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), transcript)
        else:
            speech_segment = SpeechSegment.from_file(audio_file, transcript)
            
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        specgram = self._normalizer.apply(specgram)
        return specgram, transcript_part
    
    
def padding_batch(batch, padding_to=-1):
    """
    Padding audio features with zeros to make them have the same shape (or
    a user-defined shape) within one bach.

    If ``padding_to`` is -1, the maximun shape in the batch will be used
    as the target shape for padding. Otherwise, `padding_to` will be the
    target shape (only refers to the second axis).

    If `flatten` is True, features will be flatten to 1darray.
    """
    # get target shape
    spec_lengths = [i["specgrams"].shape[1] for i in batch]
    max_length = max(spec_lengths)
    sorted_index = np.argsort(spec_lengths, )

    if padding_to != -1:
        if padding_to < max_length:
            raise ValueError("If padding_to is not -1, it should be larger "
                             "than any instance's shape in the batch")
        max_length = padding_to

    # padding
    new_batch = {"uttid":[], "specgrams":[], "text":[], "length_spec":[], "length_text":[]}
    for i in sorted_index:
        sample = batch[i]
        audio = sample["specgrams"]
        text = sample["text"]
        uttid = sample["uttid"]

        new_batch["uttid"].append(uttid)
        new_batch["specgrams"].append(torch.tensor(audio).transpose(1,0))
        new_batch["length_spec"].append(audio.shape[1])
        new_batch["text"].append(torch.tensor(text))
        new_batch["length_text"].append(len(text))
   
    temp_padded_specgrams = pad_sequence(new_batch["specgrams"], batch_first=True)
    # make the specgrams fit the CNN layer
    new_batch["specgrams"] = torch.unsqueeze(temp_padded_specgrams, dim=1).type(torch.float32)
    new_batch["text"] = pad_sequence(new_batch["text"], batch_first=True).type(torch.int32)
    new_batch["length_spec"] = torch.tensor(new_batch["length_spec"], dtype=torch.int32)
    new_batch["length_text"] = torch.tensor(new_batch["length_text"], dtype=torch.int32)
    return new_batch

In [4]:
test_dataset = SpecgramDataset(manifest_file="test.manifest",
                               vocab_filepath="../models/baidu_en8k/vocab.txt",
                               mean_std_filepath="../models/baidu_en8k/mean_std.npz")

In [5]:
for i in range(len(test_dataset)):
    sample = test_dataset[i]
    print(sample["specgrams"].shape)

(161, 2451)
(161, 4786)
(161, 5089)
(161, 5463)


In [6]:
dataloader = DataLoader(test_dataset, batch_size=3,
                        shuffle=True, num_workers=1,
                       collate_fn=padding_batch)

for i_batch, sample_batched in enumerate(dataloader):
    print(sample_batched.keys())
    print(i_batch, sample_batched["specgrams"], sample_batched["length_spec"])
    print(sample_batched["text"])
    break

dict_keys(['uttid', 'specgrams', 'text', 'length_spec', 'length_text'])
0 tensor([[[[ 1.7291,  1.2213,  0.7237,  ..., -0.3093, -0.3313,  0.0226],
          [ 1.7331,  1.2487,  0.5848,  ..., -0.4447, -0.5539, -0.6474],
          [ 1.6575,  1.4555,  0.9296,  ..., -0.5921, -0.1915, -0.2112],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],


        [[[ 2.2365,  1.6285,  0.9088,  ...,  2.3858,  3.2304,  3.6974],
          [ 2.0503,  1.7781,  1.0357,  ...,  2.3867,  2.3240,  3.4075],
          [ 1.5397,  1.5822,  1.1622,  ...,  3.2143,  3.1977,  3.2684],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],


        [[[ 1.4829,  1.4

In [14]:
def flatten_paded_seq(text, length):
    assert isinstance(text, torch.IntTensor), "{}".format(text.type())
    assert isinstance(length, torch.IntTensor), "{}".format(length.type())
    
    flattened_text = torch.cat([text[i][:length[i]] for i in range(text.shape[0])])
    return flattened_text

In [15]:
text = sample_batched["text"]
length = sample_batched["length_text"]

flattened_text = flatten_paded_seq(text, length)
assert len(flattened_text) == sum(length)

In [17]:
text.shape

torch.Size([3, 382])

# hard code mask

In [7]:
test = sample_batched["length"]

In [11]:
(test - 1) // 3 + 1  

tensor([[ 817],
        [1596],
        [1821]])

In [12]:
test

tensor([[2451],
        [4786],
        [5463]])