In [None]:
!pip install -q datasets
!pip install -q soundfile

In [None]:
import datasets
import numpy as np
import pandas as pd
import re
import string
import os
import subprocess
from tqdm.notebook import tqdm
import multiprocessing
import huggingface_hub

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; the last cell of
# this notebook pushes the finished dataset to a private Hub repository.
huggingface_hub.notebook_login()

In [None]:
!wget https://huggingface.co/datasets/allandclive/MakerereRadioSpeech_20Hrs/resolve/main/audio.zip
!wget https://huggingface.co/datasets/allandclive/MakerereRadioSpeech_20Hrs/raw/main/uncleaned.csv
!unzip -q audio.zip

In [None]:
# Load the raw transcript table; later cells read its 'transcript' and
# 'wav_filename' columns.
df = pd.read_csv('uncleaned.csv')

In [None]:
def remove_multiple_spaces(text):
  """Collapse every run of whitespace in `text` into a single space."""
  whitespace_run = re.compile(r'\s+')
  return whitespace_run.sub(' ', text)

def sentence_case(input_string):
  """Upper-case the first character and make sure the text ends in punctuation.

  Strings shorter than three characters are returned untouched. Otherwise the
  first character is upper-cased and, unless the last character is already in
  `string.punctuation`, a full stop is appended.
  """
  if len(input_string) >= 3:
    head, tail = input_string[0], input_string[1:]
    result = head.upper() + tail
    if result[-1] in string.punctuation:
      return result
    return result + '.'
  return input_string

def remove_spaces_between_capitals(text):
    """Delete whitespace that sits directly between two uppercase ASCII letters.

    This joins spelled-out initialisms ('U S A' -> 'USA'). Note that it applies
    to any such pair, so 'A Big' also becomes 'ABig'.
    """
    between_capitals = r'(?<=[A-Z])\s+(?=[A-Z])'
    return re.sub(between_capitals, '', text)

def format_single_line(text):
  """Clean one transcript line.

  Removes the speaker labels ('A:', 'B:') and the '[um]', '[part]' and '[en]'
  annotation tags, trims surrounding whitespace, applies sentence casing and
  finally merges spaced-out capital letters.
  """
  for marker in ('A:', 'B:', '[um]', '[part]', '[en]'):
    text = text.replace(marker, '')
  return remove_spaces_between_capitals(sentence_case(text.strip()))

def format_lines(text):
  """Clean a multi-line raw transcript into a single line of text.

  Each newline-separated line is cleaned with `format_single_line`; lines that
  end up with at most one character are dropped; the rest are joined with
  spaces and runs of whitespace are collapsed.

  Args:
    text: The raw transcript. Non-string values (for example the NaN that
      pandas produces for an empty CSV cell) are treated as a missing
      transcript.

  Returns:
    The cleaned transcript, or '' when the transcript is missing.
  """
  # A missing transcript has no .split(); return an empty string so that
  # callers filtering on transcript length simply skip the entry.
  if not isinstance(text, str):
    return ''
  cleaned = (format_single_line(line) for line in text.split('\n'))
  kept = [line for line in cleaned if len(line) > 1]
  return remove_multiple_spaces(' '.join(kept))

def get_language(text):
  """Return the language code for a raw transcript.

  A transcript containing the '[en]' marker is labelled 'eng'; anything else
  is labelled 'lug'. Non-string values (for example the NaN that pandas
  produces for an empty CSV cell) carry no marker, so they also map to 'lug'
  instead of raising TypeError on the membership test.
  """
  if isinstance(text, str) and '[en]' in text:
    return 'eng'
  return 'lug'

# Derive the cleaned text and the language tag from the raw transcript column.
raw_transcripts = df['transcript']
df['cleaned_transcript'] = raw_transcripts.apply(format_lines)

df['audio_language'] = raw_transcripts.apply(get_language)

# Print a handful of cleaned transcripts as a spot check.
for row_label in range(2000, 2010):
  print(df.loc[row_label, 'cleaned_transcript'])

In [None]:
# Show the rows whose transcript was tagged as Luganda.
df.loc[df['audio_language'] == 'lug']

In [None]:
!rm audio_folder/data/*.mp3

In [None]:
# Create the output directory that compress_audio writes its MP3 files into
# (-p: create parent directories, no error if it already exists).
!mkdir -p audio_folder/data

def compress_audio(i):
  """Convert the recording for row `i` of `df` to MP3 using ffmpeg.

  The source is `audio/<wav_filename>` and the result is written to
  `audio_folder/data/<i zero-padded to 5 digits>.mp3`. Nothing is done when
  the source file does not exist or the cleaned transcript has five characters
  or fewer. ffmpeg failures are printed rather than raised.
  """
  wav_path = f'audio/{df["wav_filename"][i]}'
  mp3_path = f'audio_folder/data/{i:05}.mp3'

  # Omit any entries with short or missing transcripts
  if not os.path.exists(wav_path):
    return
  if len(df.cleaned_transcript[i]) <= 5:
    return

  ffmpeg_command = [
      'ffmpeg', '-hide_banner', '-loglevel', 'error',
      '-y', '-i', wav_path,
      mp3_path,
  ]
  try:
    subprocess.run(ffmpeg_command, check=True)
  except FileNotFoundError:
    print("ffmpeg not found. Please install ffmpeg.")
  except subprocess.CalledProcessError as e:
    print(f"Conversion failed with error: {e}")

# Convert all rows using 20 worker processes. The results are consumed only to
# advance the progress bar; compress_audio writes its output to disk.
with multiprocessing.Pool(processes=20) as pool:
  conversions = pool.imap_unordered(compress_audio, df.index)
  for _ in tqdm(conversions, total=len(df)):
    pass

In [None]:
# One metadata row per dataframe row, keyed by the MP3 path that
# compress_audio would have written for it.
metadata = pd.DataFrame({
    'file_name': [f'data/{i:05}.mp3' for i in df.index],
    'id': list(df.index),
    'text': list(df.cleaned_transcript),
    'audio_language': list(df.audio_language),
})

# Filter out the entries with no audio
has_audio = metadata['file_name'].apply(
    lambda x: os.path.exists(os.path.join('audio_folder', x)))
metadata = metadata[has_audio]

# Write the table next to the data/ directory, without the pandas index column.
metadata.to_csv('audio_folder/metadata.csv', index=False)

In [None]:
# Load the MP3 files and metadata.csv from audio_folder with the
# "audiofolder" dataset builder.
dataset = datasets.load_dataset(
    "audiofolder",
    data_dir="audio_folder",
)

In [None]:
# Upload the dataset to the Hub as a private repository, under its own config name.
dataset.push_to_hub(
    'Sunbird/external-speech-data',
    config_name='makerere-radio-speech',
    private=True,
)