# **Data Collection**

This is a Yoruba-language speech dataset with English translations, culled from the Google FLEURS dataset. The original Google FLEURS dataset is available [here](https://huggingface.co/datasets/google/fleurs).

In [None]:
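# NOTE: a Hugging Face access token was pasted here in plain text; it has been removed and should be revoked.
# Authenticate through notebook_login() (or an environment variable) instead of storing tokens in the notebook.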

In [1]:
%%capture

!pip install transformers datasets librosa tqdm split-folders


In [34]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget so that a token is available to the
# Hub calls made later in this notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the FLEURS dataset for English and Yoruba languages
# Here we merge the 3 splits "train+validation+test", but you can download them separately

In [31]:
from datasets import load_dataset, Audio, DatasetDict
from librosa import load, get_duration
from tqdm.notebook import tqdm
import numpy as np
import IPython.display as ipd
import os
import glob

In [None]:
# Keyword arguments common to both language downloads.
fleurs_load_options = dict(
    split="train+validation+test",
    token=True,
    trust_remote_code=True,
)

# English (en_us) and Yoruba (yo_ng) configurations of google/fleurs.
fleurs_en = load_dataset("google/fleurs", name="en_us", **fleurs_load_options)
fleurs_yo = load_dataset("google/fleurs", name="yo_ng", **fleurs_load_options)

In [5]:
# Summary of the English dataset: column names and row count.
print(fleurs_en)


Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 3643
})


In [6]:
# Summary of the Yoruba dataset: column names and row count.
print(fleurs_yo)

Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 3548
})


In [7]:
# Inspect the first English record to see what each field holds.
fleurs_en[0]

{'id': 903,
 'num_samples': 108800,
 'path': '/root/.cache/huggingface/datasets/downloads/extracted/be467d88ba270014363a9d0aaae3893b4701a2710e0a55c6091a6d4fa56a9d84/10004088536354799741.wav',
 'audio': {'path': 'train/10004088536354799741.wav',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         -3.15904617e-06, -3.03983688e-06, -3.27825546e-06]),
  'sampling_rate': 16000},
 'transcription': 'a tornado is a spinning column of very low-pressure air which sucks the surrounding air inward and upward',
 'raw_transcription': 'A tornado is a spinning column of very low-pressure air, which sucks the surrounding air inward and upward.',
 'gender': 1,
 'lang_id': 19,
 'language': 'English',
 'lang_group_id': 0}

In [8]:
# Inspect one Yoruba record (index 10) to see what each field holds.
fleurs_yo[10]

{'id': 1332,
 'num_samples': 144000,
 'path': '/root/.cache/huggingface/datasets/downloads/extracted/dd0627a72260482a2d2931e59cddd06784011a6b9266a50ad34c2c89f40e7f21/10069972170045106105.wav',
 'audio': {'path': 'train/10069972170045106105.wav',
  'array': array([0.        , 0.        , 0.        , ..., 0.00096989, 0.00099474,
         0.0012356 ]),
  'sampling_rate': 16000},
 'transcription': 'àwọn ìbásòpọ̀ wọ́nyí ma ń gbìyànjú fún ètò ṣíṣe àrà àkànṣe àrà àti ìlò ohun èlò ojú ọ̀nà',
 'raw_transcription': 'Àwọn ìbásòpọ̀ wọ́nyí ma ń gbìyànjú fún ètò ṣíṣe, àrà àkànṣe àrà, àti ìlò ohun èlò ojú ọ̀nà.',
 'gender': 1,
 'lang_id': 99,
 'language': 'Yoruba',
 'lang_group_id': 3}

In [9]:
# English side: keep the id and the raw transcription, exposed as 'text_en'.
fleurs_en_dataset = (
    fleurs_en
    .select_columns(['id', 'raw_transcription'])
    .rename_column('raw_transcription', 'text_en')
)

# Yoruba side: keep the id, the audio and the raw transcription, exposed as 'text_yo'.
fleurs_yo_dataset = (
    fleurs_yo
    .select_columns(['id', 'audio', 'raw_transcription'])
    .rename_column('raw_transcription', 'text_yo')
)


In [10]:
# Cast the audio column to the Audio feature with a 16 kHz sampling rate

fleurs_yo_dataset = fleurs_yo_dataset.cast_column('audio', Audio(sampling_rate=16000))

# To inspect one decoded clip: fleurs_yo_dataset[2]['audio']

In [11]:
# Count the Yoruba examples whose id also occurs in the English dataset

# Read the English id column once and keep it as a set: each membership test
# is then constant time instead of a rescan of the whole column.
english_ids = set(fleurs_en_dataset['id'])

count = sum(1 for idx in fleurs_yo_dataset['id'] if idx in english_ids)

print(count)

3500


In [12]:
# Merge the English and Yoruba datasets based on the id

# Build the id -> English text lookup once, instead of re-reading both columns
# and searching them linearly for every example. setdefault keeps the first
# text seen for an id, the same row that list.index() would have picked.
en_text_by_id = {}
for en_id, en_text in zip(fleurs_en_dataset['id'], fleurs_en_dataset['text_en']):
  en_text_by_id.setdefault(en_id, en_text)


def merge_examples(example):
  """Attach the English text that shares this example's id.

  Args:
    example: A record with an 'id' key.

  Returns:
    The same record with a 'text_en' key added: the English text for that id,
    or None when the id does not occur in the English dataset.
  """
  example["text_en"] = en_text_by_id.get(example['id'])

  return example

In [13]:
# Attach the English text to every Yoruba example
merged_dataset = fleurs_yo_dataset.map(merge_examples)


def _has_english_text(example):
  """Tell whether an English text was found for this example."""
  return example['text_en'] is not None


# Drop the examples for which no English text was found
fleurs_final_dataset = merged_dataset.filter(_has_english_text)

Map:   0%|          | 0/3548 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3548 [00:00<?, ? examples/s]

In [14]:
# Summary of the merged dataset: column names and row count.
fleurs_final_dataset

Dataset({
    features: ['id', 'audio', 'text_yo', 'text_en'],
    num_rows: 3500
})

In [15]:
# Spot-check one merged record (index 50): audio, Yoruba text and English text.
fleurs_final_dataset[50]

{'id': 575,
 'audio': {'path': 'train/10356240013910784492.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.0054782 ,
         -0.005463  , -0.00666481]),
  'sampling_rate': 16000},
 'text_yo': 'Ìletò náà bẹ̀rẹ̀ ìgbéayé má a kó kiri látibìkan sí ibòmíràn ńgbàti àdínkù bá oúnjẹ tó wà nílẹ̀, ìletò náà se ìtẹ́ ẹyẹ fúngbà kanná, èyí tí wọ́n má ń pààrọ̀ lójoójúmọ́.',
 'text_en': 'The colony begins a nomadic phase when available food has decreased. During this phase, the colony makes temporary nests that are changed everyday.'}

In [22]:
# Fixed seed so that re-running the notebook reproduces the same split
# (without it, train_test_split shuffles differently on every run).
SPLIT_SEED = 42

# 90% train, 10% test + validation
fleurs_yo_en_final = fleurs_final_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
# Split the 10% test + valid in half test, half valid
test_valid = fleurs_yo_en_final['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# gather everyone if you want to have a single DatasetDict
fleurs_yo_en_final_split = DatasetDict({
    'train': fleurs_yo_en_final['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})

In [41]:
# Upload all three splits to a private dataset repository on the Hub.
push_options = {"data_dir": "data", "private": True}
fleurs_yo_en_final_split.push_to_hub("Bloomcode/fleurs_yo_en", **push_options)

Uploading the dataset shards:   0%|          | 0/7 [00:00<?, ?it/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/175 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/175 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Bloomcode/fleurs_yo_en/commit/a10bff0c9183fbdd566240b03d9e9e8a62e37252', commit_message='Upload dataset', commit_description='', oid='a10bff0c9183fbdd566240b03d9e9e8a62e37252', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Bloomcode/fleurs_yo_en', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Bloomcode/fleurs_yo_en'), pr_revision=None, pr_num=None)

Load the Yoruba-English translation dataset so that we can calculate the total duration of its audio.

In [42]:

# Repository id of the dataset pushed above.
dataset_id = "Bloomcode/fleurs_yo_en"

# Load only the train split.
dataset = load_dataset(dataset_id,
                       split="train")

README.md:   0%|          | 0.00/1.82k [00:00<?, ?B/s]

train-00000-of-00007.parquet:   0%|          | 0.00/455M [00:00<?, ?B/s]

train-00001-of-00007.parquet:   0%|          | 0.00/451M [00:00<?, ?B/s]

train-00002-of-00007.parquet:   0%|          | 0.00/452M [00:00<?, ?B/s]

train-00003-of-00007.parquet:   0%|          | 0.00/444M [00:00<?, ?B/s]

train-00004-of-00007.parquet:   0%|          | 0.00/455M [00:00<?, ?B/s]

train-00005-of-00007.parquet:   0%|          | 0.00/463M [00:00<?, ?B/s]

train-00006-of-00007.parquet:   0%|          | 0.00/460M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/175M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/171M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3150 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/175 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/175 [00:00<?, ? examples/s]

In [43]:
# Show the column types of the loaded train split.
print(dataset.features)

{'id': Value(dtype='int32', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'text_yo': Value(dtype='string', id=None), 'text_en': Value(dtype='string', id=None)}


In [44]:
def get_dataset_duration(dataset):
  """Return the total playing time of a collection of audio clips as "H:MM:SS".

  Args:
    dataset: A sized iterable of audio records. Each record is indexed with
      "array" for the samples and "sampling_rate" for the rate.

  Returns:
    The summed duration, rounded to the nearest second, formatted as hours,
    zero-padded minutes and zero-padded seconds.
  """
  total_seconds = 0
  for audio in tqdm(dataset, total=len(dataset)):
    total_seconds += get_duration(y=audio["array"], sr=audio["sampling_rate"])

  # Round once, on the grand total, before splitting into fields. Rounding the
  # seconds field after divmod can print "60" (e.g. 59.6 s would give "0:0:60").
  whole_seconds = int(round(total_seconds))
  minutes, seconds = divmod(whole_seconds, 60)
  hours, minutes = divmod(minutes, 60)

  return f"{hours}:{minutes:02d}:{seconds:02d}"

In [45]:
# Total duration of the train split's audio.
train_duration = get_dataset_duration(dataset["audio"])
train_duration

  0%|          | 0/3150 [00:00<?, ?it/s]

'13:48:32'

In [46]:
# Load only the test split of the same dataset.
test_dataset =  load_dataset(dataset_id,
                       split="test")

In [47]:
# Load only the validation split of the same dataset.
validation_dataset =  load_dataset(dataset_id,
                       split="validation")

In [48]:
# Total duration of the test split's audio.
test_duration = get_dataset_duration(test_dataset["audio"])
test_duration

  0%|          | 0/175 [00:00<?, ?it/s]

'0:45:27'

In [49]:
# Total duration of the validation split's audio.
validation_duration = get_dataset_duration(validation_dataset["audio"])
validation_duration

  0%|          | 0/175 [00:00<?, ?it/s]

'0:44:32'