In [None]:
from pydub import AudioSegment
import io
import webrtcvad
import numpy as np
import wave
import os

# Convert FLAC to raw audio data
def flac_to_raw(flac_file):
    """Decode a FLAC file into 16 kHz, mono, 16-bit PCM samples.

    Args:
        flac_file: Path (or file-like object) handed to
            ``AudioSegment.from_file`` with ``format="flac"``.

    Returns:
        A ``(sample_rate, samples)`` tuple: the sample rate in Hz of the
        converted audio (16000) and a read-only 1-D ``numpy.int16`` array
        holding the PCM samples.
    """
    audio = AudioSegment.from_file(flac_file, format="flac")
    audio = audio.set_channels(1)  # Convert to mono
    audio = audio.set_sample_width(2)  # 16-bit samples
    audio = audio.set_frame_rate(16000)

    # The segment already holds headerless PCM bytes, so read them directly
    # instead of exporting to an in-memory WAV and parsing it back.
    samples = np.frombuffer(audio.raw_data, dtype=np.int16)

    return audio.frame_rate, samples


def vad_segment(samples, sample_rate, frame_duration_ms=30, padding_duration_ms=575, max_chunk_duration=9):
    """Split PCM samples into speech chunks using WebRTC voice activity detection.

    The signal is cut into fixed-size frames and each frame is classified as
    speech or non-speech. A chunk opens on the first speech frame (prefixed
    with the most recent non-speech frames, up to the padding length) and is
    closed on a non-speech frame once either more than the padding length of
    consecutive non-speech frames has been seen or the chunk has reached
    ``max_chunk_duration`` seconds. The length limit is only checked on
    non-speech frames, so uninterrupted speech can exceed it.

    Args:
        samples: 1-D array of samples; frames are sliced from it and passed
            to the detector via ``tobytes()``.
        sample_rate: Sample rate of ``samples`` in Hz.
        frame_duration_ms: Length of one analysis frame in milliseconds.
        padding_duration_ms: Amount of non-speech audio, in milliseconds, kept
            before a chunk and tolerated inside it before it is closed.
        max_chunk_duration: Chunk length in seconds after which the next
            non-speech frame closes the chunk.

    Returns:
        A list of 1-D arrays, one per detected speech chunk.
    """
    detector = webrtcvad.Vad(3)  # 3 is the most aggressive filtering mode
    samples_per_frame = int(sample_rate * frame_duration_ms / 1000)
    chunk_sample_limit = sample_rate * max_chunk_duration
    padding_frame_count = int(padding_duration_ms / frame_duration_ms)

    chunks = []
    current_chunk = []   # frames of the chunk currently being built
    pre_roll = []        # latest non-speech frames seen while no chunk is open
    in_speech = False
    trailing_silence = 0

    for offset in range(0, len(samples), samples_per_frame):
        frame = samples[offset:offset + samples_per_frame]

        # The detector needs full frames; a short frame is dropped.
        if len(frame) < samples_per_frame:
            print("Skipping frame due to incorrect size.")
            continue

        payload = frame.tobytes()
        try:
            speech = detector.is_speech(payload, sample_rate)
        except Exception as e:
            print(f"Error processing frame: {e}")
            continue

        if speech:
            if not in_speech:
                # Open a new chunk and lead in with the buffered pre-roll.
                in_speech = True
                current_chunk.extend(pre_roll)
                pre_roll = []
            current_chunk.append(frame)
            trailing_silence = 0
        elif in_speech:
            current_chunk.append(frame)
            trailing_silence += 1
            too_quiet = trailing_silence > padding_frame_count
            too_long = len(current_chunk) * samples_per_frame >= chunk_sample_limit
            if too_quiet or too_long:
                in_speech = False
                chunks.append(np.concatenate(current_chunk))
                current_chunk = []
        else:
            # Keep only the newest `padding_frame_count` non-speech frames.
            pre_roll.append(frame)
            if len(pre_roll) > padding_frame_count:
                del pre_roll[0]

    # Flush a chunk that was still open when the input ended.
    if current_chunk:
        chunks.append(np.concatenate(current_chunk))

    print(f"Detected {len(chunks)} speech chunks")
    return chunks

# Check if the audio segment contains significant speech
def contains_significant_speech(audio_segment, silence_threshold_db=-40, min_duration_ms=1000):
    """Report whether a segment is both long enough and loud enough to keep.

    Args:
        audio_segment: Object whose ``len()`` is compared against
            ``min_duration_ms`` and whose ``dBFS`` attribute is compared
            against ``silence_threshold_db``.
        silence_threshold_db: Loudness the segment's ``dBFS`` must exceed.
        min_duration_ms: Minimum accepted ``len(audio_segment)``.

    Returns:
        ``False`` when the segment is shorter than ``min_duration_ms``;
        otherwise whether its ``dBFS`` is strictly above the threshold.
    """
    long_enough = len(audio_segment) >= min_duration_ms
    # Short-circuit: loudness is only read for segments that pass the length check.
    return long_enough and audio_segment.dBFS > silence_threshold_db

# # Function to add silence padding to the beginning and end of audio segments
# def add_silence_padding(audio_segment, duration_ms=575):
#     silence = AudioSegment.silent(duration=duration_ms)
#     return silence + audio_segment + silence





def _find_split_point(audio_segment, max_duration_ms, min_silence_db, search_window_ms=3000):
    """Return the millisecond offset at which to cut an over-long segment.

    Scans backwards from ``max_duration_ms`` over at most ``search_window_ms``
    milliseconds and returns the first offset whose surrounding 2 ms slice is
    quieter than ``min_silence_db``. Falls back to ``max_duration_ms`` when no
    such offset exists. The result is always at least 1 for a positive
    ``max_duration_ms``, so the caller always makes progress.
    """
    # Clamp the window so that neither the offset nor `offset - 1` goes negative.
    lowest = max(max_duration_ms - search_window_ms, 0)
    for offset in range(max_duration_ms - 1, lowest, -1):
        if audio_segment[offset - 1:offset + 1].dBFS < min_silence_db:
            return offset
    return max_duration_ms


def save_chunks(chunks, sample_rate, output_dir='chunks', base_filename='chunk', max_duration_ms=9800, min_silence_db=-40):
    """Write speech chunks to ``output_dir`` as FLAC files.

    Each non-empty chunk is turned into a mono 16-bit ``AudioSegment``.
    Chunks rejected by ``contains_significant_speech`` are skipped. Chunks
    longer than ``max_duration_ms`` are split, preferring a quiet point in
    the last 3 seconds before the limit. Output files are named
    ``{base_filename}_{k}.flac`` where ``k`` is a running counter over the
    files actually written, so every file gets a unique, gap-free index.

    Args:
        chunks: Iterable of sample arrays; ``tobytes()`` of each is read as
            16-bit mono PCM.
        sample_rate: Sample rate of the chunks in Hz.
        output_dir: Directory to write into; created if missing.
        base_filename: Prefix of the generated file names.
        max_duration_ms: Maximum length of a written file in milliseconds.
        min_silence_db: dBFS threshold used both for the speech check and for
            locating a quiet split point.

    Raises:
        ValueError: If ``max_duration_ms`` is not positive (splitting could
            never terminate).
    """
    if max_duration_ms <= 0:
        raise ValueError("max_duration_ms must be positive")

    os.makedirs(output_dir, exist_ok=True)

    # Counter for written files. It is kept separate from the input index so
    # that the pieces of a split chunk never reuse an index taken by another
    # chunk (which would overwrite that file).
    next_index = 0

    for chunk_number, chunk in enumerate(chunks):
        if len(chunk) == 0:
            continue

        with io.BytesIO(chunk.tobytes()) as raw_data:
            audio_segment = AudioSegment.from_raw(raw_data, sample_width=2, frame_rate=sample_rate, channels=1)

        if not contains_significant_speech(audio_segment, silence_threshold_db=min_silence_db):
            print(f"Chunk {chunk_number} is silent and will be skipped.")
            continue

        # Cut the segment into pieces no longer than max_duration_ms.
        pieces = []
        while len(audio_segment) > max_duration_ms:
            split_point = _find_split_point(audio_segment, max_duration_ms, min_silence_db)
            pieces.append(audio_segment[:split_point])
            audio_segment = audio_segment[split_point:]
        if len(audio_segment) > 0:
            pieces.append(audio_segment)

        for piece in pieces:
            path = os.path.join(output_dir, f'{base_filename}_{next_index}.flac')
            piece.export(path, format='flac')
            print(f"Saved chunk {next_index} as {path}")
            next_index += 1


def main(flac_file, output_dir='output_chunks'):
    """Run the full pipeline: decode a FLAC file, detect speech, save chunks.

    Args:
        flac_file: Input FLAC file passed to ``flac_to_raw``.
        output_dir: Directory the chunk files are written to. Defaults to
            ``'output_chunks'``, the location previously hard-coded here.
    """
    sample_rate, samples = flac_to_raw(flac_file)
    speech_chunks = vad_segment(samples, sample_rate)
    save_chunks(speech_chunks, sample_rate, output_dir=output_dir)


if __name__ == "__main__":
    # Run the pipeline on the default input when executed directly.
    flac_file = "audio3_vocals.flac"
    main(flac_file)


Skipping frame due to incorrect size.
Detected 16 speech chunks
Saved chunk 0 as output_chunks\chunk_0.flac
Saved chunk 1 as output_chunks\chunk_1.flac
Saved chunk 2 as output_chunks\chunk_2.flac
Saved chunk 3 as output_chunks\chunk_3.flac
Saved chunk 3 as output_chunks\chunk_3.flac
Saved chunk 4 as output_chunks\chunk_4.flac
Saved chunk 5 as output_chunks\chunk_5.flac
Saved chunk 6 as output_chunks\chunk_6.flac
Saved chunk 7 as output_chunks\chunk_7.flac
Saved chunk 7 as output_chunks\chunk_7.flac
Saved chunk 8 as output_chunks\chunk_8.flac
Saved chunk 8 as output_chunks\chunk_8.flac
Saved chunk 9 as output_chunks\chunk_9.flac
Saved chunk 10 as output_chunks\chunk_10.flac
Saved chunk 10 as output_chunks\chunk_10.flac
Saved chunk 11 as output_chunks\chunk_11.flac
Saved chunk 11 as output_chunks\chunk_11.flac
Saved chunk 12 as output_chunks\chunk_12.flac
Saved chunk 13 as output_chunks\chunk_13.flac
Saved chunk 14 as output_chunks\chunk_14.flac
Saved chunk 14 as output_chunks\chunk_14.f