# Checks the size of the dataset in terms of samples

In [1]:
# Allows loading modules from the parent directory
import inspect, sys
from os.path import dirname, abspath
sys.path.append(dirname(dirname(abspath(inspect.getfile(inspect.currentframe())))))

from data.data_loader import AudioDataLoader, SpectrogramAccentDataset, BucketingSampler, DistributedBucketingSampler
from data.data_loader import create_binarizer
from tqdm import tqdm_notebook as tqdm

In [52]:
# Manifest (CSV) describing the training split of the CommonVoice dataset.
train_manifest = './data/CommonVoice_dataset/splits/train.csv'

# Character set handed to the dataset as `labels`
# (presumably the transcript alphabet -- confirm against data_loader).
labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ "

# Spectrogram extraction settings passed through to the dataset.
# NOTE(review): window_size / window_stride look like seconds
# (20 ms window, 10 ms hop at 16 kHz) -- confirm in data_loader.
audio_conf = {
    "sample_rate": 16000,
    "window_size": 0.02,
    "window_stride": 0.01,
    "window": "hamming",
}

# The accent binarizer is built from the same manifest the dataset reads.
accent_binarizer = create_binarizer(train_manifest)

# Training dataset: normalization on, augmentation off, no Kaldi features.
train_dataset = SpectrogramAccentDataset(
    audio_conf=audio_conf,
    manifest_filepath=train_manifest,
    labels=labels,
    normalize=True,
    augment=False,
    accent_binarizer=accent_binarizer,
    kaldi=False,
)



# One utterance per batch, so every batch's tensor shape reflects a single
# utterance rather than a padded group.
train_sampler = BucketingSampler(train_dataset, batch_size=1)

# Loader that pulls the sampler's batches using 4 worker processes.
train_loader = AudioDataLoader(
    train_dataset,
    num_workers=4,
    batch_sampler=train_sampler,
)
   

# Walk the whole training loader once and record, for every batch, the size
# of dimension 3 of the input tensor.  The sampler was built with
# batch_size=1, so each recorded value belongs to a single utterance.
# NOTE(review): dimension 3 is presumably the time axis of the spectrogram,
# i.e. the values are frame counts rather than raw audio samples -- confirm
# against the collate function in data_loader.
num_samples = []
total_batches = len(train_sampler)  # hoisted: invariant across the loop
print('Tot len:', total_batches)
for i, data in enumerate(train_loader):
    # Coarse progress indicator.
    if i % 3000 == 0:
        print(i)

    # Defensive stop in case the loader ever yields more batches than the
    # sampler reports.
    if i == total_batches:
        break

    # Only the input tensor is needed here.  The previous in-place
    # `input_percentages.mul_(...)` computation was dropped: its result
    # (`input_sizes`) was never used and it mutated the batch for nothing.
    inputs, *_ = data
    num_samples.append(inputs.shape[3])
            

Tot len: 30896
0
3000
6000
9000
12000
15000
18000
21000
24000
27000
30000


In [53]:
# Report how many utterances were measured and the sum of their lengths.
utterance_count = len(num_samples)
total_length = sum(num_samples)
print('num utt:', utterance_count)
print('tot sam:', total_length)

num utt: 30896
tot sam: 12403138


In [55]:
import numpy as np

# Array view of the per-utterance lengths, used for the statistics that follow.
a = np.asarray(num_samples)

In [56]:
# Standard deviation of the per-utterance lengths (numpy default, ddof=0).
np.std(a)

151.44958044886553

In [59]:
# Mean per-utterance length.
np.mean(a)

401.44801916105644