In [None]:
!pip install -q datasets
!pip install -q wget
!pip install -q soundfile
!pip install librosa

In [None]:
import pandas as pd
import datasets
import numpy as np
import subprocess
import multiprocessing
import tqdm.notebook as tqdm
import os
import wget
import huggingface_hub

In [None]:
# Interactive Hugging Face login. The dataset is pushed to the Hub as a private
# repo later in this notebook, which presumably requires these credentials.
huggingface_hub.notebook_login()

In [None]:
# Language name; it forms the folder part of the audio download URL.
language = 'Lumasaba'
# Language code; used in the metadata URL, local file names and the Hub config
# name.
language_code = 'myx'

In [None]:
# Location of the reviewed-transcript table for the selected language.
transcripts_csv_url = (
    'https://raw.githubusercontent.com/AI-Lab-Makerere/'
    f'Yogera-Dataset-Metadata/main/version_5.0.1/{language_code}/'
    'transcribed_and_reviewed.csv'
)
df = pd.read_csv(transcripts_csv_url)

In [None]:
# Whisper can only be trained with examples of 30 seconds or less.
# NOTE(review): dividing by 3600 presumably means `duration` is stored in
# hours -- confirm against the metadata schema.
max_duration = 30 / 3600
df = df[df['duration'] < max_duration]

In [None]:
# Display the metadata table that remains after the duration filter.
df

In [None]:
# Base URL of the storage folder for this language; each row's `voice_clip`
# value is appended to it to form the download URL.
AUDIO_PATH_PREFIX = f'https://storage.googleapis.com/yogera_voices_backup/{language}/'

def download_and_compress_audio(i, verbose=False):
  """Download one voice clip as WAV and transcode it to MP3 with ffmpeg.

  Reads the clip name for row label `i` from the notebook-level `df`, fetches
  it from `AUDIO_PATH_PREFIX` into the working directory (skipped when the WAV
  is already there), then converts it into `audio_folder_{language_code}/data/`.
  Failures are printed or ignored rather than raised, so a single bad clip
  does not abort a pool run over the whole table.

  Args:
    i: Index label of the row in `df` to process.
    verbose: When True, print a message if the download fails.

  Returns:
    None. The outcome is the MP3 file on disk, if every step succeeded.
  """
  audio_path = AUDIO_PATH_PREFIX + df.voice_clip[i]
  destination_filename_uncompressed = f'{language_code}_{i:05}.wav'
  output_directory = f'audio_folder_{language_code}/data'
  destination_filename_compressed = (
      f'{output_directory}/{language_code}_{i:05}.mp3')

  # Standard-library equivalent of `mkdir -p`: no shell is spawned per clip and
  # the function no longer depends on IPython's `!` syntax.
  os.makedirs(output_directory, exist_ok=True)

  if not os.path.exists(destination_filename_uncompressed):
    try:
      wget.download(audio_path, out=destination_filename_uncompressed)
    except Exception:
      # Best effort: a clip that cannot be fetched is skipped. Catching
      # Exception instead of using a bare except lets KeyboardInterrupt and
      # SystemExit still stop the worker.
      if verbose:
        print(f'Failed to download {audio_path}')

  if os.path.exists(destination_filename_uncompressed):
    # Compress the downloaded WAV file to MP3
    try:
      subprocess.run(
          [
              'ffmpeg', '-hide_banner', '-loglevel', 'error',
              '-y', '-i', destination_filename_uncompressed,
              destination_filename_compressed
          ],
          check=True)
    except FileNotFoundError:
      print("ffmpeg not found. Please install ffmpeg.")
    except subprocess.CalledProcessError as e:
      print(f"Conversion failed with error: {e}")
      # Remove any partial output so that a later "does the MP3 exist" check
      # does not treat a broken file as a valid clip.
      if os.path.exists(destination_filename_compressed):
        os.remove(destination_filename_compressed)


# Process every remaining row with 20 worker processes. Results are consumed in
# completion order purely to advance the progress bar; the workers return
# nothing.
with multiprocessing.Pool(processes=20) as pool:
  completed = pool.imap_unordered(download_and_compress_audio, df.index)
  for _ in tqdm.tqdm(completed, total=len(df)):
    pass

In [None]:
# Root of the local audio folder; `file_name` entries are relative to it.
audio_folder = f'audio_folder_{language_code}'

# One row per clip in `df`, with paths matching the MP3 names written earlier.
metadata = pd.DataFrame({
    'file_name': [f'data/{language_code}_{i:05}.mp3' for i in df.index],
    'id': list(df.index),
    'text': list(df.transcript),
    'audio_language': language_code,
    'is_studio': False,
    'speaker_id': list(df.contributor_id),
})

# Filter out the entries with no audio
has_audio = metadata['file_name'].apply(
    lambda relative_path: os.path.exists(
        os.path.join(audio_folder, relative_path)))
metadata = metadata[has_audio]

metadata.to_csv(f'audio_folder_{language_code}/metadata.csv', index=False)

In [None]:
# Display the metadata rows that have a matching MP3 on disk.
metadata

In [None]:
# Build a dataset from the local audio folder (MP3 files plus metadata.csv)
# using the "audiofolder" loader.
dataset = datasets.load_dataset("audiofolder", data_dir=f"audio_folder_{language_code}")

In [None]:
# Display the loaded dataset object as a quick check before uploading.
dataset

In [None]:
# Upload the dataset as a per-language config of the shared repo, kept private.
dataset.push_to_hub(
    'Sunbird/external-speech-data',
    config_name=f'makerere-yogera-{language_code}',
    private=True,
)

In [None]:
# Load the uploaded config back from the Hub to check that the push worked.
test_dataset = datasets.load_dataset(
    'Sunbird/external-speech-data',
    f'makerere-yogera-{language_code}',
    split='train',
)