In [None]:
import os
from yt_dlp import YoutubeDL


# Mount Google Drive into the Colab filesystem so the notebook can read and
# write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Drive folder passed to download_youtube_audio() as the download destination.
output_dir = '/content/drive/MyDrive/a'


def download_youtube_audio(youtube_url, output_dir):
    """Download the best available audio stream of a video and convert it to FLAC.

    Args:
        youtube_url: URL handed to yt-dlp for downloading.
        output_dir: Directory in which the file is created; the file name is
            built by yt-dlp from the video title and the final extension.
    """
    filename_template = os.path.join(output_dir, '%(title)s.%(ext)s')

    # Post-processing step that re-encodes the downloaded stream with ffmpeg.
    flac_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'flac',
        'preferredquality': '192',
    }

    options = dict(
        format='bestaudio/best',
        outtmpl=filename_template,
        postprocessors=[flac_extractor],
    )

    with YoutubeDL(options) as downloader:
        downloader.download([youtube_url])


# Video whose audio track is downloaded and stored as FLAC inside `output_dir`.
youtube_url = 'https://youtu.be/Ov6RRFDR5X8?si=937vuOF6U6dxDH1v'

download_youtube_audio(youtube_url=youtube_url, output_dir=output_dir)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[youtube] Extracting URL: https://youtu.be/Ov6RRFDR5X8?si=937vuOF6U6dxDH1v
[youtube] Ov6RRFDR5X8: Downloading webpage
[youtube] Ov6RRFDR5X8: Downloading ios player API JSON
[youtube] Ov6RRFDR5X8: Downloading web creator player API JSON
[youtube] Ov6RRFDR5X8: Downloading player 7fe34188
[youtube] Ov6RRFDR5X8: Downloading m3u8 information
[info] Ov6RRFDR5X8: Downloading 1 format(s): 251
[download] Destination: /content/drive/MyDrive/a/ទស្សនយុវជនជុំវិញកិច្ចសហប្រតិបត្តិការតំបន់ត្រីកោណអភិវឌ្ឍន៍ កម្ពុជា-ឡាវ-វៀតណាម (CLV-DTA).webm
[download] 100% of   37.18MiB in 00:00:03 at 10.41MiB/s  
[ExtractAudio] Destination: /content/drive/MyDrive/a/ទស្សនយុវជនជុំវិញកិច្ចសហប្រតិបត្តិការតំបន់ត្រីកោណអភិវឌ្ឍន៍ កម្ពុជា-ឡាវ-វៀតណាម (CLV-DTA).flac
Deleting original file /content/drive/MyDrive/a/ទស្សនយុវជនជុំវិញកិច្ចសហប្រតិបត្តិការតំបន់ត្រីកោណអភិវឌ្ឍន៍ កម្ពុជា-ឡាវ-វៀតណាម (CLV-DTA).webm

In [None]:
script_code = """
import soundfile as sf
import torch
import os
import librosa
import numpy as np
import onnxruntime as ort
from pathlib import Path
from argparse import ArgumentParser
from tqdm import tqdm
import glob


class ConvTDFNet:
    '''STFT / inverse-STFT helper that packs audio into the layout fed to the ONNX model.

    Waveform chunks of chunk_size samples are turned into tensors shaped
    (batch, dim_c, dim_f, dim_t), where real and imaginary parts are folded
    into the channel axis; istft() reverses that packing.
    '''

    def __init__(self, target_name, L, dim_f, dim_t, n_fft, hop=1024):
        super().__init__()
        self.target_name = target_name
        self.dim_c = 4
        self.dim_f = dim_f
        # dim_t is given as an exponent: the number of STFT frames is 2**dim_t.
        self.dim_t = 2 ** dim_t
        self.n_fft = n_fft
        self.hop = hop
        self.n_bins = n_fft // 2 + 1
        # Number of samples that yields exactly self.dim_t frames with center=True.
        self.chunk_size = hop * (self.dim_t - 1)
        self.window = torch.hann_window(window_length=n_fft, periodic=True)
        self.n = L // 2

        if target_name == "*":
            out_c = self.dim_c * 4
        else:
            out_c = self.dim_c

        # Zeros that refill the frequency bins dropped by stft() before inverting.
        missing_bins = self.n_bins - dim_f
        self.freq_pad = torch.zeros([1, out_c, missing_bins, self.dim_t])

    def stft(self, x):
        '''Return the spectrogram of x, keeping only the lowest dim_f frequency bins.'''
        waves = x.reshape([-1, self.chunk_size])
        spec = torch.stft(
            waves,
            n_fft=self.n_fft,
            hop_length=self.hop,
            window=self.window,
            center=True,
            return_complex=True,
        )
        # complex (rows, bins, frames) -> real (rows, 2, bins, frames)
        spec = torch.view_as_real(spec).permute([0, 3, 1, 2])
        grouped = spec.reshape([-1, 2, 2, self.n_bins, self.dim_t])
        packed = grouped.reshape([-1, self.dim_c, self.n_bins, self.dim_t])
        return packed[:, :, : self.dim_f]

    def istft(self, x, freq_pad=None):
        '''Inverse of stft(): rebuild waveform chunks from a packed spectrogram.

        freq_pad replaces the default zero padding of the missing frequency
        bins when given.
        '''
        if freq_pad is None:
            freq_pad = self.freq_pad.repeat([x.shape[0], 1, 1, 1])
        c = 4 * 2 if self.target_name == "*" else 2

        full = torch.cat([x, freq_pad], -2)
        full = full.reshape([-1, c, 2, self.n_bins, self.dim_t])
        full = full.reshape([-1, 2, self.n_bins, self.dim_t])
        # real (rows, 2, bins, frames) -> complex (rows, bins, frames)
        spec = torch.view_as_complex(full.permute([0, 2, 3, 1]).contiguous())
        waves = torch.istft(
            spec, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True
        )
        return waves.reshape([-1, c, self.chunk_size])

class Predictor:
    '''Runs an ONNX separation model over an audio file, chunk by chunk.

    args is a dict-like object that must provide the keys "model_path",
    "dim_f", "dim_t", "n_fft", "margin", "chunks" and "denoise".
    '''

    # Sample rate files are resampled to on load; "chunks" is expressed in
    # seconds of audio at this rate.
    SAMPLE_RATE = 44100

    def __init__(self, args):
        '''Build the STFT helper and open the ONNX inference session.

        Raises:
            ValueError: if args["model_path"] is None.
        '''
        self.args = args
        self.model_ = ConvTDFNet(
            target_name="vocals",
            L=11,
            dim_f=args["dim_f"],
            dim_t=args["dim_t"],
            n_fft=args["n_fft"],
        )

        model_path = args["model_path"]
        if model_path is None:
            raise ValueError("model_path is required: pass the ONNX model with -m/--model_path")

        # Ask ONNX Runtime itself whether the CUDA provider exists, and always
        # keep the CPU provider as a fallback so a CPU-only onnxruntime build
        # on a GPU machine still works.
        providers = ["CPUExecutionProvider"]
        if torch.cuda.is_available() and "CUDAExecutionProvider" in ort.get_available_providers():
            providers.insert(0, "CUDAExecutionProvider")
        self.model = ort.InferenceSession(str(model_path), providers=providers)

    def demix(self, mix):
        '''Split mix (channels, samples) into overlapping segments and separate them.

        Segments are keyed by their start offset. Each one is extended by
        "margin" samples on both sides (except at the very beginning and end
        of the signal); demix_base() trims those margins again.

        Returns:
            The array produced by demix_base().

        Raises:
            ValueError: if mix contains no samples.
        '''
        samples = mix.shape[-1]
        margin = self.args["margin"]
        chunk_size = self.args["chunks"] * self.SAMPLE_RATE

        assert not margin == 0, "margin cannot be zero!"
        if samples == 0:
            # Without this check range() below fails with an opaque
            # "arg 3 must not be zero" error.
            raise ValueError("cannot separate an empty audio signal")

        if margin > chunk_size:
            margin = chunk_size

        # chunks == 0 means "process the whole signal as a single segment".
        if self.args["chunks"] == 0 or samples < chunk_size:
            chunk_size = samples

        segmented_mix = {}
        for counter, skip in enumerate(range(0, samples, chunk_size)):
            s_margin = 0 if counter == 0 else margin
            end = min(skip + chunk_size + margin, samples)
            start = skip - s_margin
            segmented_mix[skip] = mix[:, start:end].copy()
            if end == samples:
                break

        return self.demix_base(segmented_mix, margin_size=margin)

    def demix_base(self, mixes, margin_size):
        '''Run the model on every segment and stitch the results together.

        Args:
            mixes: dict mapping segment start offset to a (2, n) array, in
                playback order.
            margin_size: number of overlap samples to drop at segment borders.

        Returns:
            Array of shape (1, 2, total_samples) with the model output.
        '''
        model = self.model_
        session = self.model
        trim = model.n_fft // 2
        # Samples per model window that survive after dropping `trim` at both ends.
        gen_size = model.chunk_size - 2 * trim
        last_key = list(mixes)[-1] if mixes else None

        chunked_sources = []
        with tqdm(total=len(mixes)) as progress_bar:
            progress_bar.set_description("Processing")

            for key, cmix in mixes.items():
                n_sample = cmix.shape[1]
                pad = gen_size - n_sample % gen_size
                mix_p = np.concatenate(
                    (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1
                )
                # Overlapping windows of chunk_size samples, advanced by gen_size.
                mix_waves = [
                    mix_p[:, i : i + model.chunk_size]
                    for i in range(0, n_sample + pad, gen_size)
                ]
                mix_waves = torch.tensor(np.array(mix_waves), dtype=torch.float32)

                with torch.no_grad():
                    spek = model.stft(mix_waves).cpu().numpy()
                    if self.args["denoise"]:
                        # Average the prediction with the negated prediction of
                        # the negated input.
                        spec_pred = (
                            -session.run(None, {"input": -spek})[0] * 0.5
                            + session.run(None, {"input": spek})[0] * 0.5
                        )
                    else:
                        spec_pred = session.run(None, {"input": spek})[0]
                    tar_waves = model.istft(torch.tensor(spec_pred))

                    tar_signal = (
                        tar_waves[:, :, trim:-trim]
                        .transpose(0, 1)
                        .reshape(2, -1)
                        .numpy()[:, :-pad]
                    )

                # Drop the overlap added by demix(); the first segment has no
                # leading margin and the last one no trailing margin. A slice
                # end of -0 would select nothing, hence the explicit None.
                start = 0 if key == 0 else margin_size
                if key == last_key or margin_size == 0:
                    end = None
                else:
                    end = -margin_size

                # Each entry is a one-element list so the concatenated result
                # keeps a leading axis of length 1, which predict() indexes.
                chunked_sources.append([tar_signal[:, start:end]])
                progress_bar.update(1)

        return np.concatenate(chunked_sources, axis=-1)

    def predict(self, file_path):
        '''Load file_path, separate it and return (residual, model_output, rate).

        Both signals are shaped (samples, 2); residual is the input mix minus
        the model output. Mono files are duplicated to two channels.
        '''
        mix, rate = librosa.load(file_path, mono=False, sr=self.SAMPLE_RATE)

        if mix.ndim == 1:
            mix = np.asfortranarray([mix, mix])

        sources = self.demix(mix)
        opt = sources[0].T

        return (mix.T - opt, opt, rate)

def main(argv=None):
    '''Command-line entry point: separate every given file or folder of FLAC files.

    Args:
        argv: Optional list of command-line arguments; defaults to sys.argv[1:]
            when None.
    '''
    parser = ArgumentParser(description="Ultimate Vocal Remover Inference CLI")
    parser.add_argument("inputs", nargs="+", type=Path, help="Source audio path or folder containing audio files")
    parser.add_argument("-o", "--output", type=Path, default=Path("separated"), help="Output folder")
    # The model is mandatory: fail at argument parsing instead of crashing later
    # while opening the inference session.
    parser.add_argument("-m", "--model_path", type=Path, required=True, help="MDX Net ONNX Model path")
    parser.add_argument("-d", "--denoise", action="store_true", help="Enable denoising")
    parser.add_argument("-M", "--margin", type=int, default=44100, help="Margin")
    parser.add_argument("-c", "--chunks", type=int, default=15, help="Chunk size")
    parser.add_argument("-F", "--n_fft", type=int, default=6144)
    parser.add_argument("-t", "--dim_t", type=int, default=8)
    parser.add_argument("-f", "--dim_f", type=int, default=2048)

    args = parser.parse_args(argv)
    dict_args = vars(args)

    os.makedirs(args.output, exist_ok=True)

    # Each input is either a single file or a folder of FLAC files.
    for input_path in args.inputs:
        if input_path.is_dir():
            # Path.glob only interprets the pattern, so folder names containing
            # glob metacharacters such as "[" are handled correctly; sorting
            # makes the processing order deterministic.
            for file_path in sorted(input_path.glob("*.flac")):
                process_file(file_path, dict_args)
        else:
            process_file(input_path, dict_args)

def process_file(file_path, args):
    '''Separate one audio file and write "<name>_vocals.flac" into args["output"].

    Only the first signal returned by Predictor.predict() is saved.
    '''
    separator = Predictor(args=args)
    vocals, _no_vocals, sampling_rate = separator.predict(file_path)

    stem, _extension = os.path.splitext(os.path.basename(file_path))
    target = os.path.join(args["output"], stem + "_vocals.flac")
    sf.write(target, vocals, sampling_rate)

# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()
"""

# Save the script held in `script_code` to Drive so it can be run with `!python`.
# The encoding is stated explicitly so the written file does not depend on the
# runtime's locale settings.
with open('/content/drive/MyDrive/a/separate.py', 'w', encoding='utf-8') as f:
    f.write(script_code)


In [None]:
# Run the separation script on one FLAC file: -o is the output folder and
# -m the ONNX model file.
!python /content/drive/MyDrive/a/separate.py "/content/drive/MyDrive/a/ហេតុអ្វីបានជាក្រុមប្រឆាំងក្រៅស្រុក.flac" -o "/content/drive/MyDrive/e" -m "/content/drive/MyDrive/a/UVR-MDX-NET-Inst_Main.onnx"


Processing: 100% 217/217 [1:30:39<00:00, 25.06s/it]


In [None]:
from pydub import AudioSegment
import io
import webrtcvad
import numpy as np
import wave
import os
from google.colab import drive


# Make Google Drive available under /content/drive for reading the input
# file and writing the chunks.
drive.mount('/content/drive')


def flac_to_raw(flac_file):
    """Decode a FLAC file into 16 kHz, mono, 16-bit PCM samples.

    The samples are taken straight from the converted segment's PCM buffer;
    no intermediate in-memory WAV file is written and parsed.

    Args:
        flac_file: Path (or file-like object) of the FLAC file to decode.

    Returns:
        Tuple ``(sample_rate, samples)`` where ``sample_rate`` is the frame
        rate of the converted audio (16000) and ``samples`` is a 1-D
        ``np.int16`` array.
    """
    audio = AudioSegment.from_file(flac_file, format="flac")
    # Format expected downstream: one channel, 2 bytes per sample, 16 kHz.
    audio = audio.set_channels(1)
    audio = audio.set_sample_width(2)
    audio = audio.set_frame_rate(16000)

    samples = np.frombuffer(audio.raw_data, dtype=np.int16)
    return audio.frame_rate, samples


def vad_segment(samples, sample_rate, frame_duration_ms=30, padding_duration_ms=500, max_chunk_duration=9):
    """Cut a sample array into speech segments using WebRTC voice activity detection.

    The samples are processed in fixed-size frames. A segment opens on the
    first speech frame (prefixed with the most recent non-speech frames) and
    closes on a non-speech frame once either more than the padding's worth of
    consecutive non-speech frames has been seen, or the segment has reached
    ``max_chunk_duration`` seconds.

    Args:
        samples: 1-D array of samples; each frame is passed to the detector
            via ``tobytes()``.
        sample_rate: Sample rate of ``samples`` in Hz.
        frame_duration_ms: Length of one analysis frame in milliseconds.
        padding_duration_ms: Amount of non-speech audio kept around speech.
        max_chunk_duration: Segment length, in seconds, after which the next
            non-speech frame closes the segment.

    Returns:
        List of arrays, one per detected segment.
    """
    detector = webrtcvad.Vad(3)
    samples_per_frame = int(sample_rate * frame_duration_ms / 1000)
    chunk_limit = sample_rate * max_chunk_duration
    padding_frames = int(padding_duration_ms / frame_duration_ms)

    segments = []
    current = []     # frames of the segment being assembled
    pre_roll = []    # latest non-speech frames seen while no segment is open
    in_speech = False
    quiet_run = 0    # consecutive non-speech frames inside the open segment

    for offset in range(0, len(samples), samples_per_frame):
        frame = samples[offset:offset + samples_per_frame]
        if len(frame) < samples_per_frame:
            print("Skipping frame due to incorrect size.")
            continue

        frame_bytes = frame.tobytes()
        try:
            speech = detector.is_speech(frame_bytes, sample_rate)
        except Exception as e:
            print(f"Error processing frame: {e}")
            continue

        if speech:
            if not in_speech:
                # Open a segment and start it with the buffered lead-in.
                in_speech = True
                current.extend(pre_roll)
                pre_roll = []
            current.append(frame)
            quiet_run = 0
            continue

        if not in_speech:
            # Keep only the newest `padding_frames` frames as lead-in.
            pre_roll.append(frame)
            if len(pre_roll) > padding_frames:
                del pre_roll[0]
            continue

        current.append(frame)
        quiet_run += 1
        if quiet_run > padding_frames or len(current) * samples_per_frame >= chunk_limit:
            in_speech = False
            segments.append(np.concatenate(current))
            current = []

    # Flush a segment that was still open when the input ended.
    if current:
        segments.append(np.concatenate(current))

    print(f"Detected {len(segments)} speech chunks")
    return segments


def contains_significant_speech(audio_segment, silence_threshold_db=-40, min_duration_ms=1000):
    """Tell whether a segment is both long enough and loud enough to keep.

    Args:
        audio_segment: Object whose ``len()`` is its duration in milliseconds
            and whose ``dBFS`` attribute is its loudness.
        silence_threshold_db: Loudness the segment must exceed.
        min_duration_ms: Minimum duration in milliseconds.

    Returns:
        ``True`` if the duration is at least ``min_duration_ms`` and the
        loudness is strictly above ``silence_threshold_db``; ``False`` otherwise.
    """
    long_enough = len(audio_segment) >= min_duration_ms
    # Loudness is only inspected for segments that pass the duration check.
    return long_enough and audio_segment.dBFS > silence_threshold_db

def _find_quiet_split_point(audio_segment, max_duration_ms, min_silence_db, search_window_ms=3000):
    """Return where to cut: the latest quiet spot before the limit, else the limit itself."""
    earliest = max(max_duration_ms - search_window_ms, 0)
    for position in range(max_duration_ms - 1, earliest, -1):
        # Loudness of the 2 ms around the candidate position.
        if audio_segment[position - 1:position + 1].dBFS < min_silence_db:
            return position
    return max_duration_ms


def save_chunks(chunks, sample_rate, output_dir='chunks', base_filename='p', max_duration_ms=9800, min_silence_db=-40):
    """Write speech chunks as numbered FLAC files, splitting the ones that are too long.

    Every kept chunk is padded with 500 ms of silence on both sides. Pieces
    longer than ``max_duration_ms`` are cut repeatedly, preferably at a quiet
    position within the last 3 seconds before the limit.

    Files are numbered with one running counter over everything exported, so a
    split chunk can no longer take a file name that a later chunk overwrites.

    Args:
        chunks: Iterable of 16-bit mono sample arrays.
        sample_rate: Sample rate of the chunks in Hz.
        output_dir: Folder to write to; created if missing.
        base_filename: Prefix of the ``<prefix>_<n>.flac`` file names.
        max_duration_ms: Maximum length of one exported file in milliseconds.
        min_silence_db: Loudness below which audio counts as silence.

    Returns:
        List of the file paths written, in order.

    Raises:
        ValueError: if ``max_duration_ms`` is not positive.
    """
    if max_duration_ms <= 0:
        # A non-positive limit would make the splitting loop run forever.
        raise ValueError("max_duration_ms must be positive")

    os.makedirs(output_dir, exist_ok=True)

    # Same frame rate as the chunks, so adding the padding never resamples the speech.
    padding_silence = AudioSegment.silent(duration=500, frame_rate=sample_rate)
    saved_paths = []

    def export_piece(piece):
        index = len(saved_paths)
        filename = f'{base_filename}_{index}.flac'
        path = os.path.join(output_dir, filename)
        piece.export(path, format='flac')
        print(f"Saved chunk {index} as {filename}")
        saved_paths.append(path)

    for chunk_number, chunk in enumerate(chunks):
        if len(chunk) == 0:
            continue

        with io.BytesIO(chunk.tobytes()) as raw_data:
            audio_segment = AudioSegment.from_raw(raw_data, sample_width=2, frame_rate=sample_rate, channels=1)

        if not contains_significant_speech(audio_segment, silence_threshold_db=min_silence_db):
            print(f"Chunk {chunk_number} is silent and will be skipped.")
            continue

        audio_segment = padding_silence + audio_segment + padding_silence

        while len(audio_segment) > max_duration_ms:
            split_point = _find_quiet_split_point(audio_segment, max_duration_ms, min_silence_db)
            export_piece(audio_segment[:split_point])
            audio_segment = audio_segment[split_point:]

        if len(audio_segment) > 0:
            export_piece(audio_segment)

    return saved_paths


def main(flac_file, output_dir='/content/drive/MyDrive/l'):
    """Split one FLAC file into speech chunks and save them as FLAC files.

    Args:
        flac_file: Path of the FLAC file to segment.
        output_dir: Folder that receives the chunk files; defaults to the
            Drive folder this notebook originally wrote to.
    """
    sample_rate, samples = flac_to_raw(flac_file)
    speech_chunks = vad_segment(samples, sample_rate)
    save_chunks(speech_chunks, sample_rate, output_dir=output_dir)

# Example usage: segment the FLAC file downloaded earlier into speech chunks.
# In a notebook cell __name__ is "__main__", so this runs when the cell is executed.
if __name__ == "__main__":
    flac_file = '/content/drive/MyDrive/a/ទស្សនយុវជនជុំវិញកិច្ចសហប្រតិបត្តិការតំបន់ត្រីកោណអភិវឌ្ឍន៍ កម្ពុជា-ឡាវ-វៀតណាម (CLV-DTA).flac'  # Replace with your FLAC file path
    main(flac_file)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Skipping frame due to incorrect size.
Detected 493 speech chunks
Saved chunk 0 as p_0.flac
Saved chunk 1 as p_1.flac
Saved chunk 1 as p_1.flac
Saved chunk 2 as p_2.flac
Saved chunk 2 as p_2.flac
Saved chunk 3 as p_3.flac
Saved chunk 4 as p_4.flac
Saved chunk 4 as p_4.flac
Saved chunk 5 as p_5.flac
Saved chunk 5 as p_5.flac
Saved chunk 6 as p_6.flac
Saved chunk 6 as p_6.flac
Saved chunk 7 as p_7.flac
Saved chunk 7 as p_7.flac
Saved chunk 8 as p_8.flac
Saved chunk 8 as p_8.flac
Saved chunk 9 as p_9.flac
Saved chunk 10 as p_10.flac
Saved chunk 10 as p_10.flac
Saved chunk 11 as p_11.flac
Saved chunk 11 as p_11.flac
Saved chunk 12 as p_12.flac
Saved chunk 12 as p_12.flac
Saved chunk 13 as p_13.flac
Saved chunk 13 as p_13.flac
Saved chunk 14 as p_14.flac
Saved chunk 14 as p_14.flac
Saved chunk 15 as p_15.flac
Saved chunk 15 as p_15.flac
Saved chunk 16 as p_16.flac


In [None]:
!pip install webrtcvad
# The webrtcvad module needs to be installed.

Collecting webrtcvad
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m761.6 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: webrtcvad
  Building wheel for webrtcvad (setup.py) ... [?25l[?25hdone
  Created wheel for webrtcvad: filename=webrtcvad-2.0.10-cp310-cp310-linux_x86_64.whl size=73461 sha256=6877c8c6eb3450078a0230ad26753b084a06863d85719201244d34c5e3473515
  Stored in directory: /root/.cache/pip/wheels/2a/2b/84/ac7bacfe8c68a87c1ee3dd3c66818a54c71599abf308e8eb35
Successfully built webrtcvad
Installing collected packages: webrtcvad
Successfully installed webrtcvad-2.0.10
