In [1]:
from typing import Any

from asr_eval.datasets.datasets import *


# Loader callables to summarize. Each one is called with no arguments by the
# loop below, and its name minus the `load_` prefix is used as the test set name.
# NOTE(review): these names are presumably provided by the wildcard import from
# `asr_eval.datasets.datasets` — confirm.
load_fns = (
    load_multivariant_v1_200,
    load_youtube_lectures,
    load_golos_farfield,
    load_rulibrispeech,
    load_podlodka,
    load_podlodka_full,
    load_sova_rudevices,
    load_resd,
    load_fleurs,
    load_speech_massive,
    load_common_voice_17_0
)

def get_audio_len(sample: dict[str, Any]):
    """Return the duration of a sample's audio.

    The duration is the number of elements in ``sample['audio']['array']``
    divided by ``sample['audio']['sampling_rate']`` (i.e. seconds when the
    sampling rate is expressed in samples per second).
    """
    audio = sample['audio']
    n_frames = len(audio['array'])
    return n_frames / audio['sampling_rate']

# Per-dataset summary records: name, number of samples, total audio hours.
infos = []

for load_fn in load_fns:
    # The test set name is the loader's function name without the `load_` prefix.
    dataset_name = load_fn.__name__.removeprefix('load_')

    print(f'Loading {dataset_name}')

    # Best-effort: a dataset that fails to load or iterate is reported and
    # skipped so the remaining datasets are still summarized.
    try:
        dataset = load_fn()

        # A generator expression avoids building a throwaway list of durations.
        total_size_sec = sum(get_audio_len(sample) for sample in dataset) # type: ignore

        infos.append(info := { # type: ignore
            'testset_name': dataset_name,
            'n_samples': len(dataset),
            'n_hours': total_size_sec / 3600,
        })
        print(info)
    except Exception as e:
        # str(e) alone can be empty or ambiguous (a KeyError prints only the
        # key), so report the dataset name and the exception type as well.
        print(f'Failed to process {dataset_name}: {type(e).__name__}: {e}')

Loading multivariant_v1_200
{'testset_name': 'multivariant_v1_200', 'n_samples': 200, 'n_hours': 7.306692579994961}
Loading youtube_lectures
{'testset_name': 'youtube_lectures', 'n_samples': 7, 'n_hours': 2.8012888194444443}
Loading golos_farfield
{'testset_name': 'golos_farfield', 'n_samples': 1916, 'n_hours': 1.407983611111111}
Loading rulibrispeech


Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

{'testset_name': 'rulibrispeech', 'n_samples': 1352, 'n_hours': 2.6462166666666667}
Loading podlodka
{'testset_name': 'podlodka', 'n_samples': 20, 'n_hours': 0.1351415798611111}
Loading podlodka_full
{'testset_name': 'podlodka_full', 'n_samples': 107, 'n_hours': 0.6855049066987907}
Loading sova_rudevices


Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

{'testset_name': 'sova_rudevices', 'n_samples': 5799, 'n_hours': 5.809085399305556}
Loading resd
{'testset_name': 'resd', 'n_samples': 280, 'n_hours': 0.46099137703924165}
Loading fleurs
{'testset_name': 'fleurs', 'n_samples': 775, 'n_hours': 2.498283333333333}
Loading speech_massive


Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

{'testset_name': 'speech_massive', 'n_samples': 2974, 'n_hours': 3.4382683333333333}
Loading common_voice_17_0
{'testset_name': 'common_voice_17_0', 'n_samples': 10203, 'n_hours': 15.871770254629629}


In [2]:
import pandas as pd
# Tabulate the per-dataset records collected in `infos` and print the table.
infos_table = pd.DataFrame(infos)
print(infos_table)

           testset_name  n_samples    n_hours
0   multivariant_v1_200        200   7.306693
1      youtube_lectures          7   2.801289
2        golos_farfield       1916   1.407984
3         rulibrispeech       1352   2.646217
4              podlodka         20   0.135142
5         podlodka_full        107   0.685505
6        sova_rudevices       5799   5.809085
7                  resd        280   0.460991
8                fleurs        775   2.498283
9        speech_massive       2974   3.438268
10    common_voice_17_0      10203  15.871770


In [None]:
# Re-save the YouTube lectures test set to disk as a Hugging Face dataset
# built with an explicit feature schema.
from datasets import load_dataset, Audio
from asr_eval.datasets.datasets import *
# `Dataset` is imported explicitly instead of relying on the wildcard import
# above to happen to provide it.
from datasets import Dataset, Features, Value

import numpy as np

dataset = load_youtube_lectures()

# Materialize the samples so each waveform can be replaced in place.
samples = list(dataset)
for sample in samples:
    # Convert the waveform to a NumPy array before rebuilding the dataset.
    sample['audio']['array'] = np.array(sample['audio']['array'])

dataset = Dataset.from_list(samples, features=Features({
    'audio': Audio(decode=True),
    'name': Value('string'),
    'transcription': Value('string'),
    'noise': Value('string'),
    'domain': Value('string'),
}))

dataset.save_to_disk('/asr_datasets/long_audio_youtube_lectures')

In [3]:
# IPython line magic: change the kernel's working directory to the parent directory.
%cd ..

/home/oleg/asr-eval


In [8]:
from datasets import Dataset, Features, Value, Sequence, Audio, load_from_disk
import librosa

# Sampling rate passed to librosa when loading every waveform.
SAMPLING_RATE = 16_000

# Toy records: an audio file path, the full transcription, and per-utterance
# start / end / text lists.
samples = [
    dict(
        audio='audio.wav',
        transcription='чипи чипи чапа чапа',
        utterances=dict(
            start=[0, 10],
            end=[5, 15],
            text=['чипи чипи', 'чапа чапа'],
        ),
    ),
    dict(
        audio='audio copy.wav',
        transcription='прибыть в 314 кабинет',
        utterances=dict(
            start=[2],
            end=[5],
            text=['прибыть в 314 кабинет'],
        ),
    ),
]

# Replace each record's file path with the loaded waveform and its sampling rate.
for sample in samples:
    audio_path = sample['audio']
    waveform, _ = librosa.load(audio_path, sr=SAMPLING_RATE) # type: ignore
    sample['audio'] = dict(array=waveform, sampling_rate=SAMPLING_RATE)

# Schema: decoded audio, a transcription string, and a sequence of utterances.
utterances_feature = Sequence(dict(
    start=Value('float'),
    end=Value('float'),
    text=Value('string'),
))
features = Features(dict(
    audio=Audio(decode=True),
    transcription=Value('string'),
    utterances=utterances_feature,
))
dataset = Dataset.from_list(samples, features=features) # type: ignore

dataset.save_to_disk('dataset1')

# Round-trip check: reload what was just written and confirm both records exist.
dataset = load_from_disk('dataset1')
assert len(dataset) == 2

Saving the dataset (0/1 shards):   0%|          | 0/2 [00:00<?, ? examples/s]