# segmenting audio files into small chunks by putting a silence threshold.

In [None]:
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence

In [6]:
def segment_audio(input_file, output_dir, silence_thresh=-30, min_silence_len=2200, padding=100):
    
    audio = AudioSegment.from_file(input_file)
    segments = split_on_silence(
        audio,
        min_silence_len=min_silence_len,  
        silence_thresh=silence_thresh,    
        keep_silence=padding            
    )

    base_filename = os.path.splitext(os.path.basename(input_file))[0]
    segment_files = []
    for i, segment in enumerate(segments):
        segment_filename = f"{base_filename}_chunk{i+1}.wav"
        segment_file_path = os.path.join(output_dir, segment_filename)
        segment.export(segment_file_path, format="wav")
        segment_files.append(segment_file_path)

    return segment_files

def process_directory(input_directory, output_directory):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    for filename in os.listdir(input_directory):
        if filename.endswith(".wav"):
            file_path = os.path.join(input_directory, filename)
            print(f"Processing {file_path}...")
            segments = segment_audio(file_path, output_directory)
            print(f"Created {len(segments)} segments: {segments}")


input_directory = "siva_audio_val" 
output_directory = "siva_audio_val_segments"  
process_directory(input_directory, output_directory)


Processing siva_audio_val/siva_val1.wav...
Created 152 segments: ['siva_audio_val_segments/siva_val1_chunk1.wav', 'siva_audio_val_segments/siva_val1_chunk2.wav', 'siva_audio_val_segments/siva_val1_chunk3.wav', 'siva_audio_val_segments/siva_val1_chunk4.wav', 'siva_audio_val_segments/siva_val1_chunk5.wav', 'siva_audio_val_segments/siva_val1_chunk6.wav', 'siva_audio_val_segments/siva_val1_chunk7.wav', 'siva_audio_val_segments/siva_val1_chunk8.wav', 'siva_audio_val_segments/siva_val1_chunk9.wav', 'siva_audio_val_segments/siva_val1_chunk10.wav', 'siva_audio_val_segments/siva_val1_chunk11.wav', 'siva_audio_val_segments/siva_val1_chunk12.wav', 'siva_audio_val_segments/siva_val1_chunk13.wav', 'siva_audio_val_segments/siva_val1_chunk14.wav', 'siva_audio_val_segments/siva_val1_chunk15.wav', 'siva_audio_val_segments/siva_val1_chunk16.wav', 'siva_audio_val_segments/siva_val1_chunk17.wav', 'siva_audio_val_segments/siva_val1_chunk18.wav', 'siva_audio_val_segments/siva_val1_chunk19.wav', 'siva_audio_

# Transcribing these audio segments into texts using speechRecognition.
# Considering only those files which has less than 30 seconds duration

In [8]:
import os
import speech_recognition as sr
from pydub import AudioSegment

def transcribe_audio(audio_file):
    recognizer = sr.Recognizer()

    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source) 
    
    try:
        # Use Google Web Speech API for transcription in Hindi (language code: hi-IN)
        text = recognizer.recognize_google(audio, language="en-IN")
        return text
    except sr.UnknownValueError:
        return "Audio could not be understood"
    except sr.RequestError as e:
        return f"Error with the API service: {e}"

def process_directory(input_directory, valid_chunks_directory, valid_texts_directory):
    if not os.path.exists(valid_chunks_directory):
        os.makedirs(valid_chunks_directory)
    
    if not os.path.exists(valid_texts_directory):
        os.makedirs(valid_texts_directory)

    for filename in os.listdir(input_directory):
        if filename.endswith(".wav"):
            file_path = os.path.join(input_directory, filename)
            print(f"Processing {file_path}...")
            audio = AudioSegment.from_file(file_path)
            duration_in_seconds = len(audio) / 1000  
            
            if duration_in_seconds > 30:
                print(f"Skipping {filename} (duration: {duration_in_seconds} seconds, longer than 30 seconds)")
                continue  
            
            valid_audio_path = os.path.join(valid_chunks_directory, filename)
            audio.export(valid_audio_path, format="wav")
            
            transcript = transcribe_audio(valid_audio_path)
            
            transcription_filename = os.path.splitext(filename)[0] + ".txt"
            transcription_path = os.path.join(valid_texts_directory, transcription_filename)
            
            with open(transcription_path, 'w', encoding='utf-8') as transcription_file:
                transcription_file.write(transcript)
            
            print(f"Valid chunk and transcription saved: {valid_audio_path} and {transcription_path}")

input_directory = "siva_audio_val_segments"  
valid_chunks_directory = "Siva_validation_audio"  
valid_texts_directory = "Siva_validation_texts"  
process_directory(input_directory, valid_chunks_directory, valid_texts_directory)


Processing siva_audio_val_segments/siva_val1_chunk148.wav...
Valid chunk and transcription saved: Siva_validation_audio/siva_val1_chunk148.wav and Siva_validation_texts/siva_val1_chunk148.txt
Processing siva_audio_val_segments/siva_val1_chunk74.wav...
Valid chunk and transcription saved: Siva_validation_audio/siva_val1_chunk74.wav and Siva_validation_texts/siva_val1_chunk74.txt
Processing siva_audio_val_segments/siva_val1_chunk15.wav...
Valid chunk and transcription saved: Siva_validation_audio/siva_val1_chunk15.wav and Siva_validation_texts/siva_val1_chunk15.txt
Processing siva_audio_val_segments/siva_val1_chunk29.wav...
Valid chunk and transcription saved: Siva_validation_audio/siva_val1_chunk29.wav and Siva_validation_texts/siva_val1_chunk29.txt
Processing siva_audio_val_segments/siva_val1_chunk94.wav...
Valid chunk and transcription saved: Siva_validation_audio/siva_val1_chunk94.wav and Siva_validation_texts/siva_val1_chunk94.txt
Processing siva_audio_val_segments/siva_val1_chunk48

# Creating a CSV file to Load data into Hugging face
# csv file has two colomns, one for audio files paths and another is for transcrptions.

In [9]:
import os
import pandas as pd

def get_file_mapping(folder_path, file_extension):
    """
    Gathers all files with a given extension in a folder (without subfolders).
    Returns a dictionary mapping file names (without extension) to their full paths.
    """
    file_mapping = {}
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        if os.path.isfile(file_path) and file.endswith(file_extension):
            file_name = os.path.splitext(file)[0] 
            file_mapping[file_name] = file_path
    return file_mapping

def create_csv_mapping(audio_folder, text_folder, output_csv):
    """
    Maps audio files to their corresponding text files and exports the result to a CSV file.
    """
    audio_mapping = get_file_mapping(audio_folder, ".wav")
    text_mapping = get_file_mapping(text_folder, ".txt")

    data = []

    print("Audio Files:")
    print(audio_mapping.keys())
    print("Text Files:")
    print(text_mapping.keys())

    for audio_name, audio_path in audio_mapping.items():
        if audio_name in text_mapping:
            with open(text_mapping[audio_name], 'r', encoding='utf-8') as f:
                transcription = f.read().strip()
            data.append({
                "audio": audio_path,
                "transcription": transcription
            })
        else:
            print(f"❗ Warning: No transcription found for {audio_name}")

    if data:
        df = pd.DataFrame(data)
        df.to_csv(output_csv, index=False)
        print(f"✅ CSV file created successfully: {output_csv}")
    else:
        print("❗ No matching audio and text files found.")

if __name__ == "__main__":
    audio_folder = "Siva_validation_audio"   
    text_folder = "Siva_validation_texts"     
    output_csv = "siva_validation.csv"        

    create_csv_mapping(audio_folder, text_folder, output_csv)


Audio Files:
dict_keys(['siva_val1_chunk148', 'siva_val1_chunk74', 'siva_val1_chunk15', 'siva_val1_chunk29', 'siva_val1_chunk94', 'siva_val1_chunk48', 'siva_val1_chunk138', 'siva_val1_chunk132', 'siva_val1_chunk61', 'siva_val1_chunk60', 'siva_val1_chunk71', 'siva_val1_chunk9', 'siva_val1_chunk126', 'siva_val1_chunk86', 'siva_val1_chunk117', 'siva_val1_chunk56', 'siva_val1_chunk116', 'siva_val1_chunk122', 'siva_val1_chunk121', 'siva_val1_chunk92', 'siva_val1_chunk13', 'siva_val1_chunk79', 'siva_val1_chunk81', 'siva_val1_chunk106', 'siva_val1_chunk69', 'siva_val1_chunk36', 'siva_val1_chunk149', 'siva_val1_chunk28', 'siva_val1_chunk12', 'siva_val1_chunk129', 'siva_val1_chunk46', 'siva_val1_chunk124', 'siva_val1_chunk109', 'siva_val1_chunk125', 'siva_val1_chunk105', 'siva_val1_chunk123', 'siva_val1_chunk63', 'siva_val1_chunk99', 'siva_val1_chunk143', 'siva_val1_chunk134', 'siva_val1_chunk98', 'siva_val1_chunk96', 'siva_val1_chunk130', 'siva_val1_chunk11', 'siva_val1_chunk127', 'siva_val1_c

# This is simple code to load our data into hugging face.

In [10]:
from datasets import DatasetDict, Audio

#train_dataset = DatasetDict.from_csv({"train": "hindi_train.csv"})
validation_dataset = DatasetDict.from_csv({"validation": "siva_validation.csv"})
dataset = DatasetDict({
    #"train": train_dataset["train"],
    "validation": validation_dataset["validation"]
})

dataset = dataset.cast_column("audio", Audio())
dataset.push_to_hub("siva-voice-validation")


Generating validation split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sivakgp/siva-voice-validation/commit/c075d8d11892536df3bf650d173882af17f31070', commit_message='Upload dataset', commit_description='', oid='c075d8d11892536df3bf650d173882af17f31070', pr_url=None, pr_revision=None, pr_num=None)