In [1]:
import sys
import pandas as pd
from collections import defaultdict
sys.path.append("..")
from model_utils.model import DeepSpeech2Model
from data_utils.dataloader import SpecgramGenerator, DummyGenerator
from torch.utils.data import DataLoader

In [2]:
# One-row manifest used as the fixture for the tests below.
# "st"/"et" are later passed as the "start"/"end" of segments_info, and
# "duration" is what the max_duration/min_duration filters are checked against
# (presumably all in seconds -- TODO confirm against the data loader).
utterance_record = {
    "uttid": "test",
    "audio_path": "test_audio/file_example_WAV_1MG.wav",
    "st": 10,
    "et": 20,
    "text": "test",
    "duration": 33.529,
}
base_df = pd.DataFrame([utterance_record])

In [3]:
import json

In [4]:
# test1: online audio segmentation
with open("../conf/augmentation.config", 'r') as f:
    augmentation_config = f.read()
test_dataset = SpecgramGenerator(manifest=base_df,
                                 vocab_filepath="../models/baidu_en8k/vocab.txt",
                                 mean_std_filepath="../models/baidu_en8k/mean_std.npz",
                                 augmentation_config=augmentation_config,
                                 segmented=True)

for index, row in base_df.iterrows():
    %time spec1, _ = test_dataset.process_utterance(row["audio_path"], row["text"], segments_info=None)
    
spec2 = test_dataset[0]
assert spec2["specgrams"].shape == spec1.shape
    
# segment audio based on start and end time    
test_dataset = SpecgramGenerator(manifest=base_df,
                                 vocab_filepath="../models/baidu_en8k/vocab.txt",
                                 mean_std_filepath="../models/baidu_en8k/mean_std.npz",
                                 augmentation_config=augmentation_config,
                                 segmented=False)
for index, row in base_df.iterrows():
    %time spec3, _ = test_dataset.process_utterance(row["audio_path"], row["text"], segments_info={"start":row.st, "end":row.et})
spec4 = test_dataset[0]
print(spec3)
assert spec4["specgrams"].shape == spec3.shape
assert len(spec4["text"]) == 4

CPU times: user 2.54 s, sys: 148 ms, total: 2.69 s
Wall time: 1.12 s
CPU times: user 1.28 s, sys: 63.8 ms, total: 1.34 s
Wall time: 168 ms
[[ 2.46418279  1.70024656  2.09008356 ...  1.96108698  1.99895312
   1.51750683]
 [ 2.12802768  2.44341125  2.09300444 ...  1.09625724  1.21529109
   1.59623817]
 [ 0.97176926  1.78964533  0.88834509 ...  1.00696693  0.9041591
   1.32906537]
 ...
 [-0.79207478 -0.79207692 -0.79207642 ... -0.79207984 -0.79208046
  -0.79208113]
 [-0.78492186 -0.78493023 -0.78492647 ... -0.7849321  -0.784934
  -0.78493469]
 [-0.65423738 -0.65423965 -0.65424063 ... -0.65424202 -0.65424159
  -0.65424211]]


In [5]:
# test2: data selection based on duration
# Each case pairs the duration-filter keyword arguments with the number of
# manifest rows expected to remain. base_df holds a single utterance whose
# "duration" is 33.529, so an upper bound of 30 or a lower bound of 35 should
# leave nothing, while no bound at all should keep the row.
duration_cases = [
    ({"max_duration": 30}, 0),
    ({"min_duration": 35}, 0),
    ({}, 1),
]

for duration_filter, expected_rows in duration_cases:
    test_dataset = SpecgramGenerator(
        manifest=base_df,
        vocab_filepath="../models/baidu_en8k/vocab.txt",
        mean_std_filepath="../models/baidu_en8k/mean_std.npz",
        **duration_filter,
    )
    assert len(test_dataset.manifest) == expected_rows

In [6]:
# test3: test dummy dataloader
# Build a synthetic dataset and display the shape of the first sample's
# "specgrams" entry. In the recorded output the first dimension equals
# feat_size (161).
test_dataset = DummyGenerator(
    datasize=32,
    feat_size=161,
    num_class=28,
    max_len=1200,
    min_len=750,
)
first_sample = test_dataset[0]
first_sample["specgrams"].shape

(161, 832)