# Split up long audio into segments

Whisper can only be trained on audio segments of at most 30 seconds. This notebook cuts long recordings into consecutive 30-second segments, uses an ASR model to transcribe the start of each segment, and aligns that transcription with the original transcript to find the portion of text that belongs to each segment.

In [None]:
pip install -q transformers datasets==3.6.0 evaluate jiwer librosa soundfile torch torchcodec torchaudio torchvision accelerate editdistance pydub mutagen

In [None]:
# Clone the SunbirdAI SALT repository into the working directory.
# NOTE(review): presumably this is what makes `import salt.constants` resolvable — confirm.
!git clone https://github.com/SunbirdAI/salt.git

In [1]:
# Standard library
import json
import os
import string
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

# Third-party
import datasets
import editdistance
import evaluate
import huggingface_hub
import numpy as np
import pandas as pd
import pydub
import torch
import transformers
from evaluate import load
from IPython import display
from mutagen.oggvorbis import OggVorbis
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# `tqdm` is bound three times; the LAST binding wins, and the notebook calls
# `tqdm.tqdm(...)`, so the plain `import tqdm` must stay last.
from tqdm.notebook import tqdm
import tqdm.notebook as tqdm
import tqdm

# Project-local
import salt.constants

In [2]:
# Select PyTorch's 'high' float32 matmul precision mode
# (see the torch.set_float32_matmul_precision docs for the speed/accuracy trade-off).
torch.set_float32_matmul_precision('high')

In [None]:
# Authenticate with the Hugging Face Hub. Called without arguments, so no token is hardcoded here.
huggingface_hub.login()

In [3]:
# Checkpoint used for the feature extractor, the processor and the model.
config = dict(pretrained_model='akera/whisper-large-v3-kik-32h')

feature_extractor = transformers.WhisperFeatureExtractor.from_pretrained(config['pretrained_model'])

# The language is left unset on the processor; it is supplied at generation time.
processor = transformers.WhisperProcessor.from_pretrained(
    config['pretrained_model'], language=None, task="transcribe")

# `.eval()` returns the module itself, so it can be chained onto the load.
model = transformers.WhisperForConditionalGeneration.from_pretrained(
    config['pretrained_model'], device_map='auto').eval()

In [4]:
# Start from the stock whisper-large-v3 generation settings, clear the forced
# decoder ids, enable the static cache, then attach the result to the model.
generation_config = transformers.GenerationConfig.from_pretrained("openai/whisper-large-v3")
generation_config.forced_decoder_ids = None
generation_config.cache_implementation = "static"

model.generation_config = generation_config
model.config.forced_decoder_ids = None

# Optional, currently disabled:
# model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

In [5]:
# Keyword arguments forwarded to `model.generate` for every segment.
# NOTE(review): 50348 is presumably the tokenizer id of the language token to decode
# with — confirm. An earlier variant looked it up via
# salt.constants.SALT_LANGUAGE_TOKENS_WHISPER['kin'] instead of hardcoding the id.
generate_kwargs = dict(
    language=processor.tokenizer.decode(50348),
    num_beams=5,
    max_length=400,
)

In [None]:
# Load the chosen split and have the audio column decoded at 16 kHz.
SPLIT = 'train'
ds = (
    datasets.load_dataset('evie-8/kikuyu-data', 'all', split=SPLIT)
    .cast_column("audio", datasets.Audio(sampling_rate=16000))
)

In [106]:
# Demonstrate the fixed-length split on a single record.
# `example` was previously never assigned before this cell, so it failed on a
# fresh kernel; row 0 is an arbitrary choice for the demonstration.
example = ds[0]
audio_array = example['audio']['array']
max_segment_length = 30 * 16000  # 30 seconds at 16 kHz, in samples

num_samples = len(audio_array)

# Consecutive slices of at most `max_segment_length` samples; the last may be shorter.
segments = [
    audio_array[start:start + max_segment_length]
    for start in range(0, num_samples, max_segment_length)
]

In [113]:
# Maps dataset row index -> list of character offsets into that row's transcript,
# one entry per 30-second audio segment (the first entry is always 0).
# NOTE: re-running this cell discards every offset already computed by the long-running loop.
offsets_by_row = {}

The next cell takes about 14 hours to process the full Kikuyu train set (24,840 records) on a single RTX 4090.

In [None]:
def _best_transcript_offset(segment_transcription, full_transcription):
    """Locate where a segment starts inside the full transcript.

    Slides a window over `full_transcription` and picks the start index whose
    window has the smallest edit distance to `segment_transcription` (the ASR
    output for the first seconds of the segment).

    Returns:
        The best-matching character offset, or None when even the best match
        is poor (minimum edit distance above half the window length).
    """
    # The window is 5 characters shorter than the ASR output and never longer
    # than the transcript minus one character.
    # NOTE(review): L becomes negative when the ASR output has fewer than 5
    # characters; the slices below then count from the end of the string.
    # Behaviour for that case is left unchanged — confirm it is intended.
    L = min(len(segment_transcription) - 5, len(full_transcription) - 1)
    scores = [
        editdistance.eval(
            segment_transcription,
            full_transcription[character_index:character_index + L])
        for character_index in range(len(full_transcription) - L)
    ]
    offset = int(np.argmin(scores))
    # The previous code tested `np.min(score)` — an undefined name — so this
    # check either raised NameError or compared against stale kernel state.
    if L > 10 and np.min(scores) > L / 2:
        # Suspicious transcript: there may not be an alignment
        return None
    return offset


max_segment_length = 30 * 16000  # 30 seconds at 16 kHz, in samples

for i in tqdm.tqdm(range(len(ds))):
    example = ds[i]
    audio_array = example['audio']['array']

    # Consecutive slices of at most 30 s; the last one may be shorter.
    segments = [
        audio_array[start:start + max_segment_length]
        for start in range(0, len(audio_array), max_segment_length)
    ]

    full_transcription = example['text']
    offsets = [0]  # the first segment always starts at the beginning of the text

    # Every later segment needs its starting position in the transcript located.
    for segment in segments[1:]:
        input_features = processor(
            segment[:(5 * 16000)],  # Just use the first few seconds
            sampling_rate=16000,
            return_tensors="pt").input_features
        # Follow the model's device instead of assuming 'cuda'
        # (the model is loaded with device_map='auto').
        input_features = input_features.to(model.device)

        predicted_ids = model.generate(
            input_features,
            **generate_kwargs,
        )
        segment_transcription = processor.batch_decode(
            predicted_ids, skip_special_tokens=True)[0]

        offsets.append(
            _best_transcript_offset(segment_transcription, full_transcription))

    offsets_by_row[i] = offsets

In [121]:
# Number of dataset rows for which offsets have been computed.
len(offsets_by_row)

24840

In [117]:
# Spot-check one segment of the row most recently handled by the alignment loop.
# This reads `offsets`, `full_transcription` and `segments` as left behind by that loop.
segment_index = 1  # named explicitly so the row index `i` is not overwritten
boundaries = offsets + [len(full_transcription)]
text_start, text_end = boundaries[segment_index], boundaries[segment_index + 1]

if text_start is None or text_end is None:
    # A None offset marks a failed alignment; slicing with it would print the wrong text.
    print('[no reliable alignment for this segment]')
else:
    print(full_transcription[text_start:text_end])

display.Audio(segments[segment_index], rate=16000)

aharĩria mũgũnda waku na wahanda, thutha wa mĩeri ĩrĩ nĩwagĩrĩirwo gũcoka ũthiĩ ũrĩme mũgũnda waku nĩguo weherie, weherie ria mũgũnda-inĩ ũtige kũrĩa tĩri, kũhũthĩra ũnoru wa tĩri, na thutha ũcio nĩ ũbatie gwĩkĩra mĩmera yaku bataraitha nĩguo wongerere, hinya wa tĩri, nĩguo mĩmera yaku ĩthie na mbere, kana nĩguo mbembe ĩthiĩ na mbere na gũkũra na thutha wa kahinda nĩwagĩrĩirwo kũrĩmĩra rita rĩa kerĩ nĩguo,ihinda


Generate an .ogg file for each segment of 30 seconds or less. There are about 50k files to create, so the export is parallelised across a pool of worker threads.

In [None]:
# Serialises creation of the output directory across worker threads.
dir_lock = Lock()

def process_segment(args):
    """Export one audio segment of a dataset row to an .ogg file.

    The audio is read from the module-level `ds` inside this function; it is
    not pre-loaded by the caller.

    Args:
        args: tuple `(i, segment_index, offsets, max_segment_length, split)`.
            `i` is the row index into `ds`; `segment_index` selects the slice
            starting at `segment_index * max_segment_length` samples.
            `offsets` and `split` are part of the tuple but unused here.

    Returns:
        "Skipped existing: <path>" if the file already exists, otherwise
        "Processed: <path>" after writing it.
    """
    i, segment_index, _offsets, max_segment_length, _split = args

    file_path = f'audiofolder/data/{i:05}-segment-{segment_index + 1}.ogg'
    if os.path.exists(file_path):
        # Makes the export resumable: finished files are not regenerated.
        return f"Skipped existing: {file_path}"

    audio_array = ds[i]['audio']['array']
    start = segment_index * max_segment_length
    audio_segment = audio_array[start:start + max_segment_length]

    # Assumes float samples nominally in [-1, 1] — TODO confirm. Clip before
    # scaling so slightly out-of-range samples saturate instead of wrapping
    # around in the int16 cast.
    audio_data = (np.clip(audio_segment, -1.0, 1.0) * 32767).astype(np.int16)
    pydub_audio = pydub.AudioSegment(
        audio_data.tobytes(), frame_rate=16000, sample_width=2, channels=1)

    with dir_lock:
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

    pydub_audio.export(file_path, format="ogg")
    return f"Processed: {file_path}"

def process_audio_row_parallel(ds, offsets_by_row, split, max_workers=None):
    """Export every audio segment listed in `offsets_by_row` using a thread pool.

    Rows are walked in order and one task per segment is queued; queued tasks
    are handed to `process_segment` in batches, and each batch is awaited
    before more tasks are queued. Failures of individual segments are printed
    and do not stop the run.

    Args:
        ds: the dataset; only its length is used here (`process_segment`
            reads the audio itself).
        offsets_by_row: mapping of row index -> list of offsets; one segment
            is exported per list entry.
        split: passed through to `process_segment` in each task tuple.
        max_workers: thread pool size, forwarded to ThreadPoolExecutor.
    """
    max_segment_length = 16000 * 30  # 30 seconds at 16 kHz, in samples
    batch_size = 100

    print(f"Processing {len(ds)} audio files with segment parallelization...")

    total_segments = sum(len(row_offsets) for row_offsets in offsets_by_row.values())

    def _run_batch(executor, batch, pbar):
        """Submit one batch of tasks and block until all of them have finished."""
        futures = [executor.submit(process_segment, task) for task in batch]
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                # Best effort: report the failure and keep going.
                print(f"Error processing segment: {e}")
            finally:
                pbar.update(1)

    with tqdm.tqdm(total=total_segments, desc="Processing segments") as pbar:
        # One pool for the whole run instead of a new pool per batch.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            tasks = []
            for i in range(len(ds)):
                offsets = offsets_by_row[i]
                tasks.extend(
                    (i, segment_index, offsets, max_segment_length, split)
                    for segment_index in range(len(offsets)))

                # Process a batch once enough tasks have accumulated
                if len(tasks) > batch_size:
                    _run_batch(executor, tasks, pbar)
                    tasks = []

            # Flush the tail: previously the last (<= 100) tasks were queued
            # but never executed, so the final segments were never written.
            if tasks:
                _run_batch(executor, tasks, pbar)
            
# Run the segment export with a pool of 32 worker threads.
process_audio_row_parallel(ds, offsets_by_row, SPLIT, max_workers=32)

In [283]:
# Spot-check one recording: print each segment's slice of the transcript and
# play the matching exported .ogg file.
i = 18
example = ds[i]
offsets = offsets_by_row[i]
boundaries = offsets + [len(example['text'])]

for segment_index, (text_start, text_end) in enumerate(zip(boundaries[:-1], boundaries[1:])):
    if text_start is None or text_end is None:
        # A None offset marks a failed alignment; slicing with it would print the wrong text.
        print('[no reliable alignment for this segment]')
    else:
        print(example['text'][text_start:text_end])
    display.display(display.Audio(filename=f'audiofolder/data/{i:05}-segment-{segment_index+1}.ogg'))

Ũrĩmi nĩ waagĩrĩire mũno harĩ ũtonga wa bũrũri witũ wa Kenya. Nĩ gũkorwo hĩndĩ ĩrĩa twakorwo tũkĩrĩma tũkarĩma indo ta kahũa, tũkarĩma indo ta waru, no mũno mũno nĩ macani na mahũa. Icio nĩcio tũtuĩkaga a kwendia mabũrũri ma na nja na ithuĩ tũkona na mbeca na [inaudible]. Ũ


guo hĩndĩ ĩrĩa twatuĩka a kwendia indo ici twĩ ta bũrũri wa Kenya nĩ tũtuĩkaga a kũgĩa mbeca nyingĩ mũno, na twagĩa mbeca icio cigatũteithĩrĩria na maũndũ marĩa maingĩ marabatarania gũkũ gwitũ. Na ũndũ ũcio ũgatuĩka nĩ ũrateithĩrĩria mũno. Ũguo maũndũ maitũ no marĩkoragwo magĩthiĩ na mbere, na gũtirĩ ũndũ ũngĩtuĩka wa kũhĩngĩcana atĩ tondũ bũrũri ndũrĩ na mbeca


 cia kũigana.


Construct the dataset with metadata. Filter out examples where the alignment does not seem to have worked, i.e. where the ratio of transcript length to audio length differs too much between the segments of one recording.

In [None]:
ids = []
texts = []
filenames = []
speaker_ids = []
file_paths = []
durations = []

# When the split is wrong, characters-per-second differs strongly between the
# segments of one recording. A ratio of 2.5 seems to weed out most bad examples.
MAX_LENGTH_DISCREPANCY = 2.5

# Only text fields are needed here; dropping the audio column avoids decoding
# every recording again just to read its transcript.
ds_text = ds.select_columns(['id', 'text', 'speaker_id'])

for i in tqdm.tqdm(range(len(ds_text))):
    offsets = offsets_by_row[i]

    # Skip rows without segments, and rows where a segment start could not be
    # aligned (None): the boundary arithmetic below cannot handle None and the
    # text slices would silently be wrong.
    if len(offsets) == 0 or any(offset is None for offset in offsets):
        continue

    example = ds_text[i]
    transcript = example['text']
    boundaries = offsets + [len(transcript)]

    segment_audio_durations = []
    segment_text_lengths = []
    segment_file_paths = []
    segment_transcripts = []
    segment_ids = []
    segment_speaker_ids = []

    for segment_index in range(len(offsets)):
        text_start = boundaries[segment_index]
        text_end = boundaries[segment_index + 1]
        file_path = f'data/{i:05}-segment-{segment_index + 1}.ogg'

        duration_seconds = OggVorbis('audiofolder/' + file_path).info.length
        if duration_seconds == 0:
            # The f-prefix was missing, so the path was never interpolated.
            raise ValueError(f'Found empty audio file: {file_path}')

        segment_ids.append(f"{example['id']}_segment_{segment_index + 1}")
        segment_transcripts.append(transcript[text_start:text_end])
        segment_file_paths.append(file_path)
        segment_audio_durations.append(duration_seconds)
        segment_text_lengths.append(text_end - text_start)
        segment_speaker_ids.append(example['speaker_id'])

    # Characters of transcript per second of audio, per segment.
    chars_per_second = np.array(segment_text_lengths) / np.array(segment_audio_durations)
    slowest, fastest = chars_per_second.min(), chars_per_second.max()

    # A zero or negative rate (empty or out-of-order text span) is always a bad
    # alignment; checking it first also avoids dividing by zero.
    lengths_look_ok = bool(slowest > 0 and fastest / slowest < MAX_LENGTH_DISCREPANCY)

    if lengths_look_ok:
        ids.extend(segment_ids)
        speaker_ids.extend(segment_speaker_ids)
        durations.extend(segment_audio_durations)
        file_paths.extend(segment_file_paths)
        texts.extend(segment_transcripts)

In [None]:
# Spot-check up to the first ten retained segments: transcript text plus audio player.
# Slicing (rather than indexing 0..9) keeps this working when fewer than ten segments survive.
for preview_text, preview_path in zip(texts[:10], file_paths[:10]):
    print(preview_text)
    display.display(display.Audio(filename='audiofolder/' + preview_path))

In [297]:
# Total audio duration of the retained segments, converted from seconds to hours.
np.sum(durations) / 3600

np.float64(242.25255932291668)

Upload to the Hugging Face Hub

In [None]:
# One row per retained segment; `file_name` is relative to the audiofolder root.
metadata = pd.DataFrame({
    'file_name': file_paths,
    'id': ids,
    'text': texts,
    'language': 'kik',
    'duration': durations,
    'speaker_ids': speaker_ids,
})

# Define the minimum file size in bytes (1 KB = 1024 bytes)
MIN_FILE_SIZE = 1024


def _has_usable_audio(relative_path):
    """True when the segment file exists and is at least MIN_FILE_SIZE bytes."""
    full_path = os.path.join('audiofolder', relative_path)
    return os.path.exists(full_path) and os.path.getsize(full_path) >= MIN_FILE_SIZE


# Drop entries whose audio file is missing or smaller than 1 KB
metadata = metadata[metadata['file_name'].apply(_has_usable_audio)]

metadata.to_csv('audiofolder/metadata.csv', index=False)

In [None]:
# Load the exported audio files plus metadata.csv through the "audiofolder" loader.
dataset = datasets.load_dataset("audiofolder", data_dir="audiofolder")

In [None]:
# Publish the segmented dataset as a public repository on the Hub.
dataset.push_to_hub('kikuyu-data-segmented', private=False)