In [None]:
!pip install -q datasets
!pip install -q wget
!pip install -q soundfile
!pip install librosa

In [None]:
import pandas as pd
import datasets
import numpy as np
import subprocess
import multiprocessing
import tqdm.notebook as tqdm
import os
import wget
import huggingface_hub

In [None]:
# Authenticate with the Hugging Face Hub; the dataset is pushed to a private
# repository and read back from it at the end of this notebook.
huggingface_hub.notebook_login()

In [113]:
# Per-run language settings.
# `language_code` selects the metadata CSV and prefixes every generated file
# name; `language` is the folder name used in the audio download URL.
language_code = 'myx'
language = 'Lumasaba'

In [115]:
# Read the "transcribed and reviewed" metadata table for the selected language
# straight from the Yogera metadata repository on GitHub.
metadata_csv_url = (
    'https://raw.githubusercontent.com/AI-Lab-Makerere/Yogera-Dataset-Metadata/'
    f'main/version_5.0.1/{language_code}/transcribed_and_reviewed.csv'
)
df = pd.read_csv(metadata_csv_url)

In [116]:
# Whisper training examples are limited to 30 seconds, so keep only clips that
# are strictly shorter than that. The threshold is expressed as 30 / 3600,
# i.e. `duration` is presumably stored in hours - TODO confirm against the CSV.
max_clip_duration = 30 / 3600
df = df.loc[df['duration'] < max_clip_duration]

In [None]:
# Display the duration-filtered metadata table.
df

In [118]:
# Base URL of the audio clips for the selected language.
AUDIO_PATH_PREFIX = f'https://storage.googleapis.com/yogera_voices_backup/{language}/'

def download_and_compress_audio(i, verbose=False):
  """Download one voice clip as WAV and convert it to MP3 with ffmpeg.

  Args:
    i: Index label of a row in the global `df`. That row's `voice_clip` value
      is appended to `AUDIO_PATH_PREFIX` to form the download URL, and `i` is
      zero-padded to five digits in the output file names.
    verbose: If true, download failures are printed. Conversion failures are
      always printed.

  Returns:
    None. The WAV file is written to the current working directory and the
    MP3 to `audio_folder_{language_code}/data/`. Download and conversion
    errors are reported rather than raised, so a single bad clip does not
    abort a batch run.
  """
  audio_path = AUDIO_PATH_PREFIX + df.voice_clip[i]
  destination_filename_uncompressed = f'{language_code}_{i:05}.wav'
  destination_filename_compressed = f'audio_folder_{language_code}/data/{language_code}_{i:05}.mp3'

  # Create the output directory in-process; exist_ok makes this safe when many
  # workers call it concurrently and avoids spawning a shell for every clip.
  os.makedirs(os.path.dirname(destination_filename_compressed), exist_ok=True)

  # Skip the download when the WAV is already present from an earlier run.
  if not os.path.exists(destination_filename_uncompressed):
    try:
      wget.download(audio_path, out=destination_filename_uncompressed)
    except Exception as e:
      # Best effort: a missing or unreachable clip is skipped, but Ctrl-C and
      # interpreter shutdown are no longer swallowed as a bare `except` would.
      if verbose:
        print(f'Failed to download {audio_path}: {e}')

  if os.path.exists(destination_filename_uncompressed):
    # Compress the downloaded WAV file to MP3; the output file name is the only
    # format hint given to ffmpeg, and '-y' overwrites an existing output.
    try:
      subprocess.run(
          [
              'ffmpeg', '-hide_banner', '-loglevel', 'error',
              '-y', '-i', destination_filename_uncompressed,
              destination_filename_compressed
          ],
          check=True)
    except FileNotFoundError:
      print("ffmpeg not found. Please install ffmpeg.")
    except subprocess.CalledProcessError as e:
      print(f"Conversion failed with error: {e}")
      # Drop any partial output so that later "does the MP3 exist?" checks do
      # not mistake a failed conversion for a usable audio file.
      if os.path.exists(destination_filename_compressed):
        os.remove(destination_filename_compressed)


# Fan the per-clip work out over 20 worker processes. The results carry no
# information, so the loop only drains the iterator to advance the progress bar.
with multiprocessing.Pool(processes=20) as pool:
  completed = pool.imap_unordered(download_and_compress_audio, df.index)
  for _ in tqdm.tqdm(completed, total=len(df)):
    pass

  0%|          | 0/8502 [00:00<?, ?it/s]

In [119]:
# Assemble one metadata row per clip in `df`; `file_name` is relative to the
# audio folder, matching where the MP3 files were written.
metadata = pd.DataFrame({
    'file_name': [f'data/{language_code}_{i:05}.mp3' for i in df.index],
    'id': list(df.index),
    'text': list(df.transcript),
    'audio_language': language_code,
    'is_studio': False,
    'speaker_id': list(df.contributor_id),
})

# Keep only the rows whose MP3 actually exists on disk (clips that failed to
# download or convert have no file).
audio_present = metadata['file_name'].apply(
    lambda name: os.path.exists(os.path.join(f'audio_folder_{language_code}', name)))
metadata = metadata[audio_present]

# Write the table next to the data/ directory, without the DataFrame index.
metadata.to_csv(f'audio_folder_{language_code}/metadata.csv', index=False)

In [120]:
# Display the metadata rows that survived the "audio file exists" filter.
metadata

Unnamed: 0,file_name,id,text,audio_language,is_studio,speaker_id
60,data/myx_00063.mp3,63,Biseela bili bye kumuusi bulalo bwe bibyuma bw...,myx,False,691
61,data/myx_00064.mp3,64,"Ano ndi kubonawo mezi magaali naabi, naboonemo...",myx,False,568
62,data/myx_00065.mp3,65,Liguje limali lyahambile shindu mukhono kyehul...,myx,False,691
63,data/myx_00066.mp3,66,"Umusetsa ali khu pikiki ali khu fuga, khupikik...",myx,False,691
64,data/myx_00067.mp3,67,Gano manyonyi gesi balanga kaloli magali gabus...,myx,False,594
...,...,...,...,...,...,...
8497,data/myx_09073.mp3,9073,Bapangile tsibendela tsingali tse kamanambo ka...,myx,False,2899
8498,data/myx_09074.mp3,9074,Tsimotokha tsingali tsili mu'town ni babandu b...,myx,False,2525
8499,data/myx_09075.mp3,9075,Syinywinywi namwe ingosi isi balanga bari naka...,myx,False,2617
8500,data/myx_09076.mp3,9076,Bakhaasi babili bali mukhatale mutwela ali khu...,myx,False,2525


In [121]:
# Build a Hugging Face dataset with the "audiofolder" loader, pointed at the
# local folder that holds data/ and the metadata.csv written earlier.
audio_folder = f"audio_folder_{language_code}"
dataset = datasets.load_dataset("audiofolder", data_dir=audio_folder)

Resolving data files:   0%|          | 0/7744 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/7744 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [122]:
# Display the loaded dataset's splits, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'id', 'text', 'audio_language', 'is_studio', 'speaker_id'],
        num_rows: 7743
    })
})

In [123]:
# Upload the dataset to the shared Hub repository as a private, per-language
# configuration named after the language code.
hub_config_name = f'makerere-yogera-{language_code}'
dataset.push_to_hub(
    'Sunbird/external-speech-data',
    config_name=hub_config_name,
    private=True,
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/7743 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/78 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Sunbird/external-speech-data/commit/c75b11283a2ae58581918e4283983ed3dc352c67', commit_message='Upload dataset', commit_description='', oid='c75b11283a2ae58581918e4283983ed3dc352c67', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Sunbird/external-speech-data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Sunbird/external-speech-data'), pr_revision=None, pr_num=None)

In [66]:
# Read the uploaded configuration back from the Hub as a sanity check.
# NOTE(review): the saved output of this cell reports 4496 examples and has an
# execution count of 66, while the split pushed above has 7743 rows -
# presumably stale output from an earlier run; re-run this cell to confirm.
test_dataset = datasets.load_dataset(
    'Sunbird/external-speech-data',
    f'makerere-yogera-{language_code}',
    split='train',
)

README.md:   0%|          | 0.00/530 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/278M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4496 [00:00<?, ? examples/s]