#### Importing packages

In [31]:
import os
import pandas as pd
from pydub import AudioSegment
import json
from pydub.utils import mediainfo
import wave

#### Defining the directory paths

In [None]:
# Root folder of the Yoruba dataset: the split .tsv files and the "clips" folder
# are looked up inside it. The location can be overridden with the
# YORUBA_DATA_DIR environment variable so the notebook also runs on machines
# without this exact drive layout; the original path stays the default.
base_dir = os.environ.get("YORUBA_DATA_DIR") or "D:/ProgrammingWork/XRI-Global-Internship/Yoruba"
# Folder holding the source audio clips named in the "path" column of the splits.
clips_dir = os.path.join(base_dir, "clips")

#### Defining the output directories

In [None]:
# Destination folder for each dataset split, keyed by split name; every folder
# lives under "<base_dir>/structured".
output_dirs = {
    split: os.path.join(base_dir, "structured/" + split)
    for split in ("train", "dev", "test")
}

#### Function to process a dataset split by converting audio files and saving transcriptions

In [None]:
def process_split(file_path, split_name):
    """Convert one dataset split to 16 kHz mono WAV files with matching transcripts.

    For every row of the tab-separated split file, the clip named in the
    ``path`` column is loaded from ``clips_dir`` as MP3, resampled to 16 kHz
    mono and exported as ``<stem>.wav`` into ``output_dirs[split_name]``. The
    row's ``sentence`` is written next to it as ``<stem>.txt`` (UTF-8).

    Rows whose ``path`` or ``sentence`` is empty are reported and skipped,
    because they cannot yield a usable audio/transcript pair.

    Args:
        file_path: Path to the split's ``.tsv`` file; it must provide the
            ``path`` and ``sentence`` columns.
        split_name: Key into the module-level ``output_dirs`` mapping.

    Raises:
        KeyError: If ``split_name`` is not a key of ``output_dirs`` or a
            required column is missing from the file.
    """
    # Read every cell as text and keep empty cells as "" rather than NaN, so a
    # blank or numeric-looking sentence never reaches f.write() as a float/int.
    data = pd.read_csv(file_path, sep='\t', dtype=str, keep_default_na=False)  # Load .tsv file
    output_dir = output_dirs[split_name]

    # The exports below fail if the destination folder does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    # Iterate over the two needed columns directly instead of building a Series per row.
    for clip_id, transcription in zip(data['path'], data['sentence']):
        has_clip = isinstance(clip_id, str) and bool(clip_id)
        has_text = isinstance(transcription, str) and bool(transcription.strip())
        if not (has_clip and has_text):
            print(f"Skipping row with missing path or sentence in {file_path}: {clip_id!r}")
            continue

        # Input clip and the shared output stem for the .wav/.txt pair
        clip_path = os.path.join(clips_dir, clip_id)
        output_stem = os.path.join(output_dir, os.path.splitext(clip_id)[0])

        # Convert MP3 to WAV at 16kHz, mono
        audio = AudioSegment.from_mp3(clip_path)
        audio = audio.set_frame_rate(16000).set_channels(1)
        audio.export(output_stem + ".wav", format="wav")

        # Save transcription to a text file
        with open(output_stem + ".txt", 'w', encoding='utf-8') as f:
            f.write(transcription)

#### Process each split

In [None]:
# Convert the three splits in turn; each split file is "<base_dir>/<split>.tsv".
for split in ("train", "dev", "test"):
    process_split(os.path.join(base_dir, f"{split}.tsv"), split)

#### Creating the JSON manifests (audio path, transcription, duration)

In [29]:
def get_duration(file_path):
    """Get the duration of an audio file in seconds.

    Files whose name ends in ``.wav`` are measured from their header with the
    standard-library ``wave`` module (frame count divided by frame rate). This
    is exact and avoids one ``mediainfo`` call per file. Any other file, or a
    WAV that ``wave`` cannot parse, falls back to pydub's ``mediainfo``.

    Args:
        file_path: Path of the audio file (``str`` or path-like).

    Returns:
        float: Duration in seconds.

    Raises:
        KeyError: If the ``mediainfo`` fallback yields no ``duration`` entry.
    """
    path = os.fspath(file_path)
    if isinstance(path, str) and path.lower().endswith(".wav"):
        try:
            with wave.open(path, "rb") as wf:
                frame_rate = wf.getframerate()
                if frame_rate > 0:
                    return wf.getnframes() / float(frame_rate)
        except (wave.Error, EOFError):
            # Not a WAV layout the wave module understands (or truncated);
            # let mediainfo have a go instead of failing here.
            pass

    # NOTE(review): mediainfo is understood to rely on an external
    # ffprobe/avprobe binary — confirm it is installed where this runs.
    info = mediainfo(file_path)
    return float(info['duration'])

def create_asr_json(audio_dir, output_json_path):
    """Write a JSON manifest describing every transcribed ``.wav`` in ``audio_dir``.

    Each ``<name>.wav`` with a sibling ``<name>.txt`` contributes one entry:

    * ``audio_filepath``: the WAV path relative to the module-level
      ``base_dir`` as it is bound when this function is called;
    * ``text``: the transcript with surrounding whitespace stripped;
    * ``duration``: length in seconds as returned by ``get_duration``.

    WAV files without a transcript file are skipped. Entries are emitted in
    sorted file-name order so repeated runs produce the same manifest.

    Args:
        audio_dir: Directory containing the ``.wav``/``.txt`` pairs
            (not searched recursively).
        output_json_path: Destination of the manifest, written as a single
            UTF-8 JSON array with 4-space indentation.
    """
    json_data = []

    # os.listdir() returns names in arbitrary order; sort for reproducible output.
    for wav_file in sorted(os.listdir(audio_dir)):
        if not wav_file.endswith(".wav"):
            continue

        wav_path = os.path.join(audio_dir, wav_file)
        txt_path = os.path.join(audio_dir, os.path.splitext(wav_file)[0] + ".txt")

        # Only audio that has a transcript belongs in the manifest.
        if not os.path.exists(txt_path):
            continue

        # Read the transcription
        with open(txt_path, 'r', encoding='utf-8') as f:
            transcription = f.read().strip()

        # Construct JSON entry
        json_data.append({
            "audio_filepath": os.path.relpath(wav_path, base_dir),
            "text": transcription,
            "duration": get_duration(wav_path)
        })

    # Write JSON data to file (ensure_ascii=False keeps the text human-readable)
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=4)

In [30]:
# NOTE: this rebinds the global base_dir to "<cwd>/structured". create_asr_json
# reads that global, so manifest paths come out relative to the "structured" folder.
base_dir = os.path.join(os.getcwd(), "structured")
for split in ("train", "dev", "test"):
    create_asr_json(os.path.join(base_dir, split), os.path.join(base_dir, split + ".json"))

#### Checking the audio format of the converted files (sample rate, channels, sample width)

In [35]:
def check_audio_properties(audio_dir, sample_rate=16000, channels=1, bit_depth=2):
    """Verify that every ``.wav`` file in ``audio_dir`` has the expected format.

    Args:
        audio_dir: Directory whose ``.wav`` files are inspected (not recursive).
        sample_rate: Required frame rate in Hz.
        channels: Required number of channels.
        bit_depth: Required sample width in BYTES, as reported by
            ``getsampwidth()`` (2 bytes == 16-bit), despite the parameter name.

    Raises:
        AssertionError: For the first file, in sorted name order, whose frame
            rate, channel count or sample width differs from the expected
            value. The message states both the actual and the expected value.
    """
    # Sorted so the first reported failure is the same on every run.
    for wav_file in sorted(os.listdir(audio_dir)):
        if not wav_file.endswith(".wav"):
            continue

        with wave.open(os.path.join(audio_dir, wav_file), 'rb') as wf:
            actual_rate = wf.getframerate()
            actual_channels = wf.getnchannels()
            actual_width = wf.getsampwidth()

        # Explicit raises instead of assert statements, so the checks are not
        # stripped when Python runs with -O; the exception type is unchanged.
        if actual_rate != sample_rate:
            raise AssertionError(
                f"{wav_file} has a sample rate of {actual_rate}, expected {sample_rate}"
            )
        if actual_channels != channels:
            raise AssertionError(
                f"{wav_file} has {actual_channels} channel(s), expected {channels}"
            )
        if actual_width != bit_depth:
            raise AssertionError(
                f"{wav_file} has a sample width of {actual_width} bytes "
                f"({actual_width * 8}-bit), expected {bit_depth} bytes ({bit_depth * 8}-bit)"
            )

In [37]:
# Folders of converted audio, given relative to the current working directory.
audio_dir_train = "structured/train"
audio_dir_test = "structured/test"
audio_dir_dev = "structured/dev"

# Validate the splits in the order train, test, dev; the first mismatch raises.
for split_audio_dir in (audio_dir_train, audio_dir_test, audio_dir_dev):
    check_audio_properties(split_audio_dir)

### Checking the size of each data split

In [None]:
def summarize_dataset(json_path):
    """Print the number of entries and the total audio length of a manifest.

    Args:
        json_path: Path to a UTF-8 JSON file holding a list of entries, each
            with a numeric ``"duration"`` value in seconds.

    The summary line gives the entry count and the summed duration in hours,
    rounded to two decimals.
    """
    with open(json_path, 'r', encoding='utf-8') as manifest:
        entries = json.load(manifest)

    # Accumulate the per-entry durations (seconds) in file order.
    seconds = 0
    for item in entries:
        seconds += item["duration"]

    hours = seconds / 3600
    print(f"Total samples: {len(entries)}, Total duration: {hours:.2f} hours")

# Report each manifest in the order train, test, dev (paths relative to the cwd).
for split in ("train", "test", "dev"):
    summarize_dataset(f"structured/{split}.json")


Total samples: 1246, Total duration: 2.10 hours
Total samples: 1014, Total duration: 1.74 hours
Total samples: 874, Total duration: 1.29 hours
