In [7]:
!pip install -q librosa
!pip install -q datasets
!pip install -q wget
!pip install -q soundfile
!pip install ipywidgets widgetsnbextension

In [9]:
from IPython import display
import pandas as pd
import librosa
import json
#import tqdm
import tqdm.notebook as tqdm
import numpy as np
import warnings
import os
import datasets
warnings.filterwarnings("ignore", message="PySoundFile failed. Trying audioread instead.")
warnings.filterwarnings("ignore", category=FutureWarning, 
                       message="librosa.core.audio.__audioread_load.*")

The data files are a mixture of different formats (ogg, mp3, webm) so first convert everything to .ogg.

In [None]:
import os
import subprocess
import multiprocessing
from tqdm.notebook import tqdm


def list_files_no_extension(folder):
    """List all files in folder with no extension."""
    return [
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, f)) and '.' not in f
    ]


def convert_to_ogg(file_path, output_dir):
    """Convert a single file to .ogg using ffmpeg."""
    base_name = os.path.basename(file_path)
    output_path = os.path.join(output_dir, f"{base_name}.ogg")

    try:
        result = subprocess.run(
            ["ffmpeg", "-y", "-i", file_path, "-c:a", "libvorbis", output_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL
        )
        if result.returncode == 0:
            return (file_path, True)
        else:
            return (file_path, False)
    except Exception:
        return (file_path, False)


def batch_convert(folder, output_dir, num_workers=8):
    os.makedirs(output_dir, exist_ok=True)
    files = list_files_no_extension(folder)

    valid = []
    invalid = []

    with multiprocessing.Pool(num_workers) as pool:
        jobs = [pool.apply_async(convert_to_ogg, args=(f, output_dir)) for f in files]
        for job in tqdm(jobs, desc="Converting files to .ogg"):
            path, success = job.get()
            if success:
                valid.append(path)
            else:
                invalid.append(path)

    return valid, invalid


# === USAGE ===
input_folder = "audio"
output_folder = "audio_ogg"
num_workers = 16  # adjust to match your CPU

valid_files, invalid_files = batch_convert(input_folder, output_folder, num_workers=num_workers)

print(f"✅ Successfully converted: {len(valid_files)}")
print(f"❌ Invalid or failed files: {len(invalid_files)}")

In [171]:
data_dir = 'audio_folder_test'
with open('test.json') as f:
    d = json.load(f)

In [164]:
file_names = []
ids = []
texts = []
audio_language = []
prompts = []
durations = []

for key in d.keys():
    example = d[key]
    has_duration = example['duration'] is not None
    valid_duration = has_duration and example['duration'] < 30.0
    has_transcription = example['transcription'] is not None and len(example['transcription']) > 10
    # Filter training data
    # if valid_duration and has_transcription:
    # No filter for test data
    if 'test' in data_dir or (valid_duration and has_transcription):
        file_names.append(example['audio_path'] + '.ogg')
        ids.append(key)
        durations.append(example['duration'])
        # If test data, empty transcription
        if data_dir == 'audio_folder_test':
            texts.append(' ')
        else:
            texts.append(example['transcription'])
        prompts.append(example['image_category'] + ', ' + example['image_sub_category'])

In [165]:
len(ids)

9263

In [166]:
df = pd.DataFrame()
df['id'] = ids
df['file_name'] = file_names
df['audio_language'] = 'kin'
df['text'] = texts
df['prompt'] = prompts
df['duration'] = durations
# Filter out any entries with no audio
df = df[df['file_name'].apply(
    lambda x: os.path.exists(data_dir + '/' + x))]

In [167]:
df

Unnamed: 0,id,file_name,audio_language,text,prompt,duration
0,KsVMmVra4WbPUm7bvUVI,audio/1737470193-FMS8HawISoYIuCLQt2QjIoz1d8H2.ogg,kin,Umugore wambaye umupira w'akazi mpuzankano iri...,"Financial Services, Mobile_money_kiosks",18.090667
1,b9rjWQGuU5NG1it3ADOd,audio/1737663811-rVzBmu7JpsUV2uUrsTHz1HMf5LF3.ogg,kin,Uburyo emutiyene yatangije wishyura amafaranga...,"Financial Services, Mobile_money_kiosks",16.277333
2,cgqGF3vpjdPiQdM550f2,audio/1739893092-o2i2UlqzLJPMnM7ZfKzhq0bbY9z2.ogg,kin,"Umudamu uhagaze mu iduka, w'inzobe, uri guseka...","Financial Services, Boutiques",23.317333
3,vEQyHWCzaKM11GdXykKi,audio/1737640745-LpoUVdUVxjM6P6JkrJcEIzmCI562.ogg,kin,Ikusanyirizo ry'amata yitwa miriki zone y'Inya...,"Financial Services, Dairy_product_shops",19.080000
4,6bl9pichoKbYa0bdTeZ3,audio/1737222297-edDS7WSeKCS6O2zFNySBoWwMtpX2.ogg,kin,"Imashini zitunganya amata, ubushakashatsi bwak...","Financial Services, Dairy_product_shops",15.120000
...,...,...,...,...,...,...
9258,PFY45NMyoZQQe34P5VlP,audio/1742980214-LqSkNILaTYZN3XpRAIr5ai2hS012.ogg,kin,"Ndabona ikimera, ibimera dukundaga gukoresha m...","Health, Traditional Medecine",17.400000
9259,sCrx0TjAhT4SooTZ9tik,audio/1743019968-TuAJCA584lflQUPELwwF1utx9N63.ogg,kin,Mu bihingwa gakondo byo mu Rwanda hagaragaramo...,"Health, Traditional Medecine",28.680000
9260,HcXKvWL353OurJXMMUBc,audio/1743420987-vDkghm2DOngAvsYZo9vqrpSP3El2.ogg,kin,"Uruzitiro rw'ibiti bitandukanye by'icatsi, has...","Health, Traditional Medecine",28.380000
9261,JZp1X68iFY3RuzjfDW3u,audio/1743423425-D0ftP87QZSXYJx1rUaBprSgzcUU2.ogg,kin,Umugwegwe ni icatsi ciza kigira uruti ruhera h...,"Health, Traditional Medecine",17.580000


In [168]:
np.sum(df['duration']) / 3600

np.float64(50.40407329325651)

In [67]:
!mkdir {data_dir}
!mkdir {data_dir}/audio

mkdir: cannot create directory ‘audio_folder_test’: File exists
mkdir: cannot create directory ‘audio_folder_test/audio’: File exists


In [60]:
import shutil
for file_name in tqdm.tqdm(file_names):
    shutil.copy2(file_name, f"{data_dir}/{file_name}")

  0%|          | 0/9265 [00:00<?, ?it/s]

In [169]:
df.to_csv(f'{data_dir}/metadata.csv', index=False)

After carrying out the above for three splits (train, dev_test and test), then upload to a private HuggingFace repo

In [None]:
# Reload each split from their respective directories
test_ds = datasets.load_dataset("audiofolder", data_dir="audio_folder_test", split="train")  # becomes 'test'
val_ds = datasets.load_dataset("audiofolder", data_dir="audio_folder_dev_test", split="train")    # optional
train_ds = datasets.load_dataset("audiofolder", data_dir="audio_folder", split="train")

# Rename splits and combine
dataset = datasets.DatasetDict({
    "train": train_ds,
    "test": test_ds,
    "dev_test": val_ds,
})

# Push all splits together
dataset.push_to_hub("jq/kinyarwanda-speech-hackathon", private=True)

In [134]:
dataset['train'][0]

{'id': '0000nf4gVJZuLc93PruO',
 'audio': {'path': '/home/user/Downloads/audio_folder/audio/1739639348-1JxgqY2zCWOuXdSJZqUeJ9qiRry2.ogg',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00302154,
         -0.00219331, -0.00154569], shape=(884160,)),
  'sampling_rate': 48000},
 'audio_language': 'kin',
 'text': ' ',
 'prompt': 'Government Services, E-Government Services',
 'duration': 18.42}