In [1]:
#install library
!pip install torch torchaudio transformers librosa torchcodec soundfile


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import torch
import librosa
from transformers import pipeline
import json
import os
import soundfile as sf

In [None]:
# Configuration: everything a reader may want to tune lives in this cell.

# Folder scanned for the source recordings.
AUDIO_DIR = "./audio_data"
# Folder that receives the per-segment WAV cuts.
OUTPUT_DIR = "./audio_data/dataset1"
# File the Label Studio tasks are exported to.
OUTPUT_JSON = "ls_predictions.json"

# Device index handed to the ASR pipeline: 0 when CUDA is available, -1 otherwise.
if torch.cuda.is_available():
    DEVICE = 0
else:
    DEVICE = -1


In [14]:
# Load the Silero VAD model together with its helper functions via torch.hub.

vad_model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')

# Only two of the five helpers in the tuple are used by this notebook:
# the speech-timestamp detector (1st) and the audio reader (3rd).
get_speech_timestamps, _, read_audio, _, _ = utils

Using cache found in /Users/shin1am/.cache/torch/hub/snakers4_silero-vad_master


In [15]:
# Build the speech-recognition pipeline on the Thonburian Whisper checkpoint.
# chunk_length_s=30 sets the pipeline's chunk length in seconds; DEVICE comes
# from the configuration cell.

asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="biodatlab/whisper-th-medium-combined",
    device=DEVICE,
    chunk_length_s=30,
)

Loading weights: 100%|██████████| 948/948 [00:00<00:00, 1257.85it/s, Materializing param=proj_out.weight]                                       


In [None]:
# Functions that cut the recordings into speech segments, transcribe each
# segment and describe it as a Label Studio task.
def _list_audio_files(audio_dir):
    """Return the names of the .wav/.mp3 entries in ``audio_dir``.

    The extension check is case-insensitive (``CLIP.WAV`` is accepted) and the
    result is sorted so repeated runs emit tasks in the same order.
    """
    return sorted(
        name for name in os.listdir(audio_dir)
        if name.lower().endswith(('.wav', '.mp3'))
    )


def _build_ls_task(segment_filename, transcription):
    """Build one Label Studio task dict with the transcription as a prediction.

    Args:
        segment_filename: File name of the saved segment; it is appended to the
            fixed ``/data/local-files/?d=files/dataset1/`` URL prefix.
        transcription: Text produced by the ASR pipeline for that segment.
    """
    return {
        "data": {
            "audio": f"/data/local-files/?d=files/dataset1/{segment_filename}"
        },
        "predictions": [{
            "model_version": "VAD-Whisper-Thai-v1",
            "result": [
                {
                    "from_name": "transcription",
                    "to_name": "audio",
                    "type": "textarea",
                    "value": { "text": [transcription] }
                }
            ]
        }]
    }


def process_audio_to_ls():
    """Segment, save and transcribe every recording found in ``AUDIO_DIR``.

    For each .wav/.mp3 file the VAD model yields speech timestamps; every
    detected segment is written to ``OUTPUT_DIR`` as ``<stem>_cut_<i>.wav``,
    transcribed with ``asr_pipe`` and turned into a Label Studio task.

    Returns:
        list[dict]: One task per detected speech segment. The list is empty
        when ``AUDIO_DIR`` does not exist (an error line is printed) or when
        no audio file / speech segment is found, so callers always receive a
        list that can be serialised directly.
    """
    if not os.path.isdir(AUDIO_DIR):
        print(f"Error: Folder {AUDIO_DIR} not found.")
        return []

    # exist_ok avoids the separate existence check and makes re-runs safe.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Single source of truth for the rate used by VAD, loading and saving.
    sample_rate = 16000
    all_tasks = []

    for file_name in _list_audio_files(AUDIO_DIR):
        print(f"Processing: {file_name}")
        path = os.path.join(AUDIO_DIR, file_name)
        stem = os.path.splitext(file_name)[0]

        # Detect speech regions; timestamps are requested in seconds.
        wav = read_audio(path, sampling_rate=sample_rate)
        speech_timestamps = get_speech_timestamps(
            wav, vad_model, sampling_rate=sample_rate, return_seconds=True
        )

        # Waveform used for cutting the segments and for transcription.
        full_audio, sr = librosa.load(path, sr=sample_rate)

        for i, segment in enumerate(speech_timestamps):
            # Convert the second-based boundaries into sample indices.
            start_sample = int(segment['start'] * sr)
            end_sample = int(segment['end'] * sr)
            audio_chunk = full_audio[start_sample:end_sample]

            # Persist the cut so Label Studio can serve it next to the text.
            segment_filename = f"{stem}_cut_{i}.wav"
            segment_path = os.path.join(OUTPUT_DIR, segment_filename)
            sf.write(segment_path, audio_chunk, sr)

            # Transcribe the segment and wrap it as a pre-annotated task.
            transcription = asr_pipe(audio_chunk)["text"]
            all_tasks.append(_build_ls_task(segment_filename, transcription))

    return all_tasks

In [20]:
# Run the pipeline. Fall back to an empty list when nothing is returned
# (e.g. the audio folder is missing) so the export is always a JSON array.
final_task = process_audio_to_ls() or []

# Label Studio imports a flat JSON array of task objects, so the task list is
# dumped as-is; wrapping it in another list would produce a nested array.
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(final_task, f, ensure_ascii=False, indent=4)

Processing: Example1.mp3


Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> to see related `.generate()` flags.
A custom logits pr