Pack samples from Common Voice so that we fit as many as possible into clips of less than 30 seconds. This can improve the efficiency of training, as we have fewer empty samples. It also speeds up the dataset download time, as the HuggingFace Common Voice repositories contain many large files that we don't need.

In [None]:
!pip install -q datasets
!pip install -q wget
!pip install -q soundfile
!pip install -q librosa
!pip install -q pydub

In [None]:
import pandas as pd
import datasets
import numpy as np
import subprocess
import multiprocessing
from tqdm.notebook import tqdm
import os
import wget
import huggingface_hub
import string
from IPython import display
import pydub

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably needed both to read the Common Voice dataset loaded
# below and for the private push_to_hub call at the end -- confirm.
huggingface_hub.notebook_login()

In [None]:
# Common Voice configuration name passed as `name=` when loading the dataset.
CV_SUBSET = 'lg'
# Language tag used for the output folder, the MP3 file names and the
# `language` column of the exported metadata.
LANGUAGE_CODE = 'lug'

In [None]:
# Load the Common Voice training split for the chosen subset and have its
# audio column cast to a 16 kHz sampling rate in one chained expression.
ds = datasets.load_dataset(
    'mozilla-foundation/common_voice_16_0',
    name=CV_SUBSET,
    split='train',
    trust_remote_code=True,
    num_proc=10,
).cast_column("audio", datasets.Audio(sampling_rate=16000))

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import numpy as np
import pydub
from tqdm import tqdm
import os

# Upper bound on the duration of one packed clip, in seconds.
max_length_seconds = 30.0
# The same bound expressed in samples at 16 kHz (the rate the audio column is
# cast to above). This is a float because the duration is a float.
max_length_samples = 16000 * max_length_seconds

def process_audio_batch(index, packed, language_code):
    """Concatenate a packed batch of audio and save it as an MP3 file.

    Args:
        index: Sequence number of the batch; zero-padded to five digits in
            the output file name.
        packed: Pair ``(samples, transcripts)``: a list of audio arrays and
            the list of matching sentences. The samples are assumed to be
            16 kHz mono floats in [-1, 1], since the export hard-codes that
            frame rate and scales by 32767.
        language_code: Language tag used for the output folder and file name.

    Returns:
        Tuple ``(transcript, file_name)``. ``transcript`` is the sentences
        joined by single spaces; ``file_name`` is the MP3 path relative to
        ``audio_folder_<language_code>``.
    """
    samples, sentences = packed[0], packed[1]
    transcript = ' '.join(sentences)
    # Use the argument rather than the notebook-level global so the function
    # honours whatever language the caller asks for.
    file_name = f"data/{language_code}_{index:05}.mp3"
    file_path = f"audio_folder_{language_code}/{file_name}"
    # An existing file is left untouched, so a re-run does not re-encode it.
    if not os.path.exists(file_path):
        audio_data = np.concatenate(samples)
        # Clip before scaling: values slightly outside [-1, 1] would otherwise
        # wrap around when cast to int16 and produce loud artefacts.
        audio_data = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
        pydub_audio = pydub.AudioSegment(
            audio_data.tobytes(), frame_rate=16000, sample_width=2, channels=1)
        os.makedirs(os.path.dirname(file_path), exist_ok=True)  # Ensure output folder exists
        pydub_audio.export(file_path, format="mp3")
    return transcript, file_name

def sentence_case(input_string):
    """Capitalise the first character and make sure the text ends in punctuation.

    Strings shorter than three characters are returned unchanged. Otherwise
    the first character is upper-cased and a full stop is appended unless the
    last character is already in ``string.punctuation``.
    """
    if len(input_string) >= 3:
        head, tail = input_string[:1], input_string[1:]
        capitalised = head.upper() + tail
        ends_with_punctuation = capitalised[-1] in string.punctuation
        return capitalised if ends_with_punctuation else capitalised + '.'
    return input_string
    
def sample_packing_generator(test_run=False):
    """Greedily group consecutive dataset examples into packed batches.

    Examples are read from the module-level ``ds`` in order. An example is
    added to the current batch as long as the total number of audio samples
    stays within ``max_length_samples``; otherwise the current batch is
    yielded and a new one is started with that example. An example that is
    longer than the limit on its own ends up in a batch by itself.

    Args:
        test_run: When true, stop after 11 batches have been yielded (the
            same cut-off as the ``num_yielded > 10`` check always used).

    Yields:
        Tuples ``(samples, transcripts)``: the list of audio arrays in the
        batch and the list of their sentence-cased transcripts.
    """
    current_samples = []
    current_transcripts = []
    current_length = 0  # running sample count, avoids re-summing every step
    num_yielded = 0
    for i in range(len(ds)):
        # Only the per-example read is guarded. Keeping the `yield` outside
        # the try block means GeneratorExit and KeyboardInterrupt are no
        # longer swallowed.
        try:
            example = ds[i]  # sometimes fails with mp3 format error
            audio = example['audio']['array']
            sentence = sentence_case(example['sentence'])
        except Exception:
            print(f'problem decoding {i}')
            continue

        # `current_samples` is checked so that an over-long first example
        # does not cause an empty batch to be emitted.
        if current_samples and current_length + len(audio) > max_length_samples:
            yield current_samples, current_transcripts
            num_yielded += 1
            if test_run and num_yielded > 10:
                return
            current_samples = []
            current_transcripts = []
            current_length = 0

        current_samples.append(audio)
        current_transcripts.append(sentence)
        current_length += len(audio)

    # Flush the last, partially filled batch, which would otherwise be lost.
    if current_samples:
        yield current_samples, current_transcripts

# Encode every packed batch to MP3 in worker processes and collect the
# resulting (transcript, file name) pairs for the metadata table.
transcripts = []
file_paths = []

# Maps each submitted future to its batch index so that results, which arrive
# in completion order, can be put back into a deterministic order afterwards.
futures = {}
completed = []
progress_bar = tqdm(total=0, unit="task", position=0)
try:
    with ProcessPoolExecutor(max_workers=16) as executor:
        # Dynamically add tasks to the executor
        for i, packed in enumerate(sample_packing_generator()):
            future = executor.submit(process_audio_batch, i, packed, LANGUAGE_CODE)
            futures[future] = i
            progress_bar.total += 1  # Increase the total task count dynamically
            progress_bar.refresh()   # Update the progress bar display

        # Wait for tasks to complete and update the progress bar
        for future in as_completed(futures):
            try:
                transcript, file_path = future.result()
            except Exception as e:
                # A failed batch is reported and left out of the metadata.
                print(f"Task failed: {e}")
            else:
                completed.append((futures[future], transcript, file_path))
            progress_bar.update(1)  # Increment completed task count
finally:
    # Close the bar even if submission or collection raised.
    progress_bar.close()

# Order by batch index so row order (and the ids derived from it) is the same
# on every run.
for _, transcript, file_path in sorted(completed, key=lambda item: item[0]):
    transcripts.append(transcript)
    file_paths.append(file_path)

In [None]:
# Build one metadata row per packed clip: relative file name, running id,
# joined transcript and the language tag.
metadata = pd.DataFrame({
    'file_name': file_paths,
    'id': range(len(file_paths)),
    'text': transcripts,
    'language': LANGUAGE_CODE,
})

# Keep only the rows whose audio file is actually present on disk.
audio_root = f'audio_folder_{LANGUAGE_CODE}'
has_audio = metadata['file_name'].apply(
    lambda name: os.path.exists(os.path.join(audio_root, name)))
metadata = metadata[has_audio]

metadata.to_csv(f'audio_folder_{LANGUAGE_CODE}/metadata.csv', index=False)

In [None]:
# Reload the exported folder with the generic "audiofolder" builder.
# NOTE(review): per the datasets documentation this builder pairs each audio
# file with its row in metadata.csv through the `file_name` column -- confirm
# for the installed version.
dataset = datasets.load_dataset("audiofolder", data_dir=f"audio_folder_{LANGUAGE_CODE}")

In [None]:
# Spot-check a single packed example. The index is an arbitrary hard-coded
# choice and must be smaller than the number of rows in the train split.
index = 6543
dataset['train'][index]

In [None]:
# Play back the same example; 16000 matches the frame rate the clips were
# exported with.
display.Audio(dataset['train'][index]['audio']['array'], rate=16000)

In [None]:
# Upload the packed dataset as a private config. The config name is derived
# from LANGUAGE_CODE so that packing a different language does not overwrite
# the Luganda ('lug') config.
dataset.push_to_hub(
    'Sunbird/external-speech-data',
    config_name=f'common-voice-sample-packed-{LANGUAGE_CODE}',
    private=True)