In [65]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
import numpy as np
import json
import soundfile as sf
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from keras.preprocessing.sequence import pad_sequences
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [72]:
class SpeechDataset(Dataset):
    """Speech dataset pairing audio files with character-level transcripts.

    Every row of the manifest csv names an audio file (column 0) and a
    transcript text file (column 1).
    """

    def __init__(self, csv_file, labels, transform=None):
        """
        Args:
            csv_file (string): Path to the header-less csv file with
                annotations; column 0 is the audio path, column 1 the
                transcript path.
            labels (string): Path to a JSON file mapping each character to
                its label (assumed to be an integer index -- confirm against
                the label file).
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.speech_frame = pd.read_csv(csv_file, header=None)
        with open(labels, 'r') as f:
            self.labels = json.load(f)
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the manifest."""
        return len(self.speech_frame)

    def __getitem__(self, idx):
        """Load the audio and the encoded transcript for manifest row ``idx``.

        Returns:
            dict: ``{'signal': ndarray, 'transcript': ndarray}``, or whatever
            ``transform`` returns for that dict when a transform is set.
        """
        wav_file = self.speech_frame.iloc[idx, 0]
        transcript_file = self.speech_frame.iloc[idx, 1]

        # The sampling rate reported by soundfile is not needed here.
        signal, _ = sf.read(wav_file)

        with open(transcript_file, 'r') as f:
            transcript = f.read().strip()

        # Characters missing from the label mapping are skipped.
        transcript_idx = [self.labels[char] for char in transcript
                          if char in self.labels]

        # An explicit integer dtype keeps an empty transcript from silently
        # becoming a float64 array (NumPy's default for an empty list); for a
        # non-empty list of ints this is the dtype NumPy infers anyway.
        sample = {'signal': signal,
                  'transcript': np.array(transcript_idx, dtype=int)}
        if self.transform:
            sample = self.transform(sample)

        return sample

In [73]:
class Padding(object):
    """Pad or truncate the audio signal and transcript to fixed lengths.

    Inputs longer than the target length are cut at the end; shorter inputs
    are zero-padded at the end. Both outputs have shape ``(1, size)``.

    Args:
        signal_size (int): Desired output length of signal.
        transcript_size (int): Desired output length of transcript.
        signal_dtype (optional): dtype of the padded signal. Defaults to
            ``'float32'``.
    """

    def __init__(self, signal_size, transcript_size, signal_dtype='float32'):
        assert isinstance(signal_size, (int))
        assert isinstance(transcript_size, (int))
        self.signal_size = signal_size
        self.transcript_size = transcript_size
        self.signal_dtype = signal_dtype

    @staticmethod
    def _fit_length(values, size, dtype):
        """Return a ``(1, size)`` array holding ``values`` cut or zero-padded at the end."""
        # Flatten first; a multi-dimensional input is laid out in row-major
        # order (NOTE(review): multi-channel audio would be interleaved --
        # confirm inputs are mono).
        flat = np.asarray(values).reshape(-1)[:size]
        fitted = np.zeros((1, size), dtype=dtype)
        fitted[0, :flat.shape[0]] = flat
        return fitted

    def __call__(self, sample):
        """Return a new sample dict with both arrays brought to their target length."""
        signal, transcript = sample['signal'], sample['transcript']

        # The signal must stay floating point: padding it through an integer
        # dtype (the int32 default of Keras' pad_sequences) truncates float
        # audio samples toward zero and wipes out the waveform.
        signal = self._fit_length(signal, self.signal_size, self.signal_dtype)
        # Transcript labels stay int32.
        transcript = self._fit_length(transcript, self.transcript_size, np.int32)

        return {'signal': signal, 'transcript': transcript}

In [74]:
class ToTensor(object):
    """Turn the ndarrays stored in a sample into torch Tensors."""

    def __call__(self, sample):
        """Return a new dict whose 'signal' and 'transcript' entries are Tensors."""
        # Look both entries up first, then convert each with torch.from_numpy.
        arrays = [(key, sample[key]) for key in ('signal', 'transcript')]
        return {key: torch.from_numpy(array) for key, array in arrays}

In [75]:
# Build the training dataset: each sample is padded/truncated to 30000 signal
# values and 100 transcript labels, then converted to tensors.
speech_dataset = SpeechDataset(
    csv_file='/media/ai/SpeechRecognition.EN/codebase/DVD/train_manifest.csv',
    labels='label_dict.json',
    transform=transforms.Compose([
        Padding(signal_size=30000, transcript_size=100),
        ToTensor(),
    ]),
)

In [79]:
# Inspect the tensor shapes of the first sample.
sample = speech_dataset[0]
tuple(sample[key].shape for key in ('signal', 'transcript'))

(torch.Size([1, 30000]), torch.Size([1, 100]))

In [77]:
# Print the tensor shapes of the first four samples (fewer if the dataset is smaller).
for i in range(min(4, len(speech_dataset))):
    sample = speech_dataset[i]
    print(i, sample['signal'].size(), sample['transcript'].size())

0 torch.Size([1, 30000]) torch.Size([1, 100])
1 torch.Size([1, 30000]) torch.Size([1, 100])
2 torch.Size([1, 30000]) torch.Size([1, 100])
3 torch.Size([1, 30000]) torch.Size([1, 100])
