In [None]:
import os
from pydub import AudioSegment
import io
import webrtcvad
import numpy as np
import wave


def flac_to_raw(flac_file):
    """Decode a FLAC file into mono, 16-bit, 16 kHz PCM samples.

    Args:
        flac_file: Path or file-like object handed to
            ``AudioSegment.from_file`` with ``format="flac"``.

    Returns:
        tuple: ``(sample_rate, samples)`` where ``sample_rate`` is the frame
        rate of the converted audio and ``samples`` is a one-dimensional
        ``np.int16`` array holding the mono PCM samples.
    """
    audio = AudioSegment.from_file(flac_file, format="flac")
    # Normalise to the layout the VAD stage consumes: mono, 16-bit, 16 kHz.
    audio = audio.set_channels(1)
    audio = audio.set_sample_width(2)
    audio = audio.set_frame_rate(16000)

    # The segment already holds the PCM bytes, so read them directly instead
    # of encoding an in-memory WAV file and parsing it back with ``wave``.
    samples = np.frombuffer(audio.raw_data, dtype=np.int16)

    return audio.frame_rate, samples


def vad_segment(samples, sample_rate, aggressiveness=3, frame_duration=30):
    """Split PCM audio into contiguous runs of speech using WebRTC VAD.

    The signal is cut into fixed-length frames; each frame is classified as
    speech or non-speech, and consecutive speech frames are concatenated
    into one segment. A non-speech frame closes the current segment.

    Args:
        samples: One-dimensional numpy array of PCM samples (cast to
            ``np.int16`` before classification).
        sample_rate: Sample rate of ``samples`` in Hz.
        aggressiveness: Mode passed to ``webrtcvad.Vad``. Defaults to 3.
        frame_duration: Frame length in milliseconds. Defaults to 30.
            NOTE: per the py-webrtcvad documentation only 10/20/30 ms frames
            at 8/16/32/48 kHz are accepted; other values make ``is_speech``
            raise.

    Returns:
        list[bytes]: One raw 16-bit PCM byte string per detected speech run,
        in order of occurrence.
    """
    vad = webrtcvad.Vad(aggressiveness)
    # Work in samples throughout; bytes only appear when a frame is encoded.
    samples_per_frame = int(sample_rate * frame_duration / 1000)

    segments = []
    voiced = []  # encoded frames of the speech run currently being collected

    # Only whole frames are classified: the VAD rejects short frames, so a
    # trailing partial frame is dropped.
    last_start = len(samples) - samples_per_frame
    for start in range(0, last_start + 1, samples_per_frame):
        frame = samples[start:start + samples_per_frame]
        frame_bytes = frame.astype(np.int16).tobytes()
        if vad.is_speech(frame_bytes, sample_rate):
            voiced.append(frame_bytes)
        elif voiced:
            # Joining once per run avoids the quadratic cost of ``bytes +=``.
            segments.append(b''.join(voiced))
            voiced = []
    if voiced:
        # Audio ended while still inside a speech run.
        segments.append(b''.join(voiced))

    print(f"Detected {len(segments)} speech chunks")
    return segments

def save_chunks(chunks, sample_rate, output_dir, base_filename='chunk',
                silence_duration_ms=500, min_duration_seconds=2):
    """Write speech chunks to FLAC files, padded with silence on both sides.

    Each chunk is wrapped in ``silence_duration_ms`` of silence at the start
    and end. Chunks whose padded duration does not exceed
    ``min_duration_seconds`` are discarded. Files are named
    ``{base_filename}_{i}.flac`` where ``i`` is the chunk's index in
    ``chunks``, so indices of discarded chunks leave gaps in the numbering.

    Args:
        chunks: Iterable of raw mono 16-bit PCM byte strings.
        sample_rate: Sample rate of the PCM data in Hz.
        output_dir: Directory to write into; created if missing.
        base_filename: Prefix for the output file names.
        silence_duration_ms: Length of the silence added to each side, in
            milliseconds. Defaults to 500.
        min_duration_seconds: Padded duration a chunk must exceed to be
            saved. Defaults to 2.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    silence_chunk = AudioSegment.silent(duration=silence_duration_ms, frame_rate=sample_rate)

    for i, chunk in enumerate(chunks):
        with io.BytesIO(chunk) as raw_data:
            audio_segment = AudioSegment.from_raw(raw_data, sample_width=2, frame_rate=sample_rate, channels=1)

        # Add silence padding to the beginning and end
        padded_audio_segment = silence_chunk + audio_segment + silence_chunk
        # len() of an AudioSegment is its duration in milliseconds.
        duration_seconds = len(padded_audio_segment) / 1000.0

        if duration_seconds > min_duration_seconds:
            output_path = os.path.join(output_dir, f'{base_filename}_{i}.flac')
            padded_audio_segment.export(output_path, format='flac')
            print(f"Saved chunk {i} as {output_path}, duration: {duration_seconds} seconds")
        else:
            print(f"Discarded chunk {i}, duration: {duration_seconds} seconds")


def process_file(flac_file, output_base_dir):
    """Run the full decode -> VAD -> save pipeline on one FLAC file.

    Chunks are written to a sub-directory of ``output_base_dir`` named after
    the input file (without its extension); the same name is used as the
    prefix of every chunk file.

    Args:
        flac_file: Path of the FLAC file to process.
        output_base_dir: Directory under which the per-file output directory
            is placed.
    """
    stem, _extension = os.path.splitext(os.path.basename(flac_file))
    target_dir = os.path.join(output_base_dir, stem)

    print(f"Processing file: {flac_file}")

    rate, pcm = flac_to_raw(flac_file)
    voiced_chunks = vad_segment(pcm, rate)
    save_chunks(voiced_chunks, rate, target_dir, stem)

def process_input(input_path, output_base_dir):
    """Process a single FLAC file or every FLAC file directly in a directory.

    Extension matching is case-insensitive, so ``.FLAC`` files are handled
    as well. Directory entries are processed in sorted order so runs are
    reproducible (``os.listdir`` returns entries in arbitrary order).
    Sub-directories are not searched.

    Args:
        input_path: Path to a FLAC file or to a directory containing FLAC
            files.
        output_base_dir: Directory under which per-file output directories
            are created.
    """
    if os.path.isdir(input_path):
        flac_files = sorted(
            os.path.join(input_path, name)
            for name in os.listdir(input_path)
            if name.lower().endswith('.flac')
        )
        for flac_file in flac_files:
            process_file(flac_file, output_base_dir)
    elif os.path.isfile(input_path) and input_path.lower().endswith('.flac'):
        process_file(input_path, output_base_dir)
    else:
        print("Invalid input. Please provide a directory containing FLAC files or a single FLAC file.")

# Example usage
if __name__ == "__main__":
    # Build the path with os.path.join: the previous literal contained a bare
    # backslash ("\S" is an invalid escape sequence) and only worked on
    # Windows.
    input_path = os.path.join('RFI', 'Speaker_10_vocals.flac')
    output_base_dir = 'output_chunks'
    process_input(input_path, output_base_dir)

Processing file: RFI\Speaker_10_vocals.flac
Detected 82 speech chunks
Saved chunk 0 as output_chunks\Speaker_10_vocals\Speaker_10_vocals_0.flac, duration: 6.73 seconds
Discarded chunk 1, duration: 1.15 seconds
Saved chunk 2 as output_chunks\Speaker_10_vocals\Speaker_10_vocals_2.flac, duration: 2.29 seconds
Saved chunk 3 as output_chunks\Speaker_10_vocals\Speaker_10_vocals_3.flac, duration: 5.02 seconds
Saved chunk 4 as output_chunks\Speaker_10_vocals\Speaker_10_vocals_4.flac, duration: 5.29 seconds
Discarded chunk 5, duration: 1.12 seconds
Saved chunk 6 as output_chunks\Speaker_10_vocals\Speaker_10_vocals_6.flac, duration: 2.92 seconds
Saved chunk 7 as output_chunks\Speaker_10_vocals\Speaker_10_vocals_7.flac, duration: 2.38 seconds
Discarded chunk 8, duration: 1.09 seconds
Discarded chunk 9, duration: 1.45 seconds
Saved chunk 10 as output_chunks\Speaker_10_vocals\Speaker_10_vocals_10.flac, duration: 2.2 seconds
Saved chunk 11 as output_chunks\Speaker_10_vocals\Speaker_10_vocals_11.flac