In [None]:
%%capture
!pip install speechbrain

In [None]:
import math
import numpy
import pandas
import librosa
import soundfile
from speechbrain.pretrained import VAD
import matplotlib
import matplotlib.pyplot as plt
from pathlib import Path
import torch
from tqdm import tqdm

In [None]:
def detect_voice(
    path,
    activation_threshold = 0.70,
    deactivation_threshold = 0.25,
    min_pause = 0.200,
    min_activation = 0.100,
    save_dir = 'model_dir',
    segment_pre = 0.0,
    segment_post = 0.0,
    double_check_threshold = None,
    parallel_chunks = 4,
    chunk_size = 1.0,
    overlap_chunks = True,
    vad = None,
    ):
    """Run the SpeechBrain CRDNN voice-activity detector on one audio file.

    Pipeline: frame-level speech probabilities -> hysteresis thresholding ->
    segment boundaries -> energy-based refinement -> optional clean-up
    (merge close segments, drop short ones, neural double check).

    Parameters
    ----------
    path : str
        Audio file handed to the SpeechBrain VAD methods.
    activation_threshold, deactivation_threshold : float
        Thresholds passed to ``apply_threshold`` and, unchanged, to
        ``energy_VAD``.
    min_pause : float or None
        ``close_th`` for ``merge_close_segments``; ``None`` skips merging.
    min_activation : float or None
        ``len_th`` for ``remove_short_segments``; ``None`` skips the filter.
    save_dir : str
        Directory where the pretrained model files are stored (``savedir``).
    segment_pre, segment_post : float
        Accepted for interface compatibility; this function does not read them.
    double_check_threshold : float or None
        When truthy, ``speech_th`` for ``double_check_speech_segments``.
    parallel_chunks : int
        Multiplier giving ``large_chunk_size = chunk_size * parallel_chunks``.
    chunk_size : float
        ``small_chunk_size`` for ``get_speech_prob_file``.
    overlap_chunks : bool
        ``overlap_small_chunk`` for ``get_speech_prob_file``.
    vad : optional
        Already-loaded VAD model to use. When ``None`` the pretrained model
        is loaded once per ``save_dir`` and reused on later calls.

    Returns
    -------
    (p, events) : tuple of pandas.DataFrame
        ``p`` has one ``speech`` column of probabilities indexed by ``time``
        (frame index * ``vad.time_resolution``). ``events`` has one row per
        detected segment with columns ``start``, ``end`` (seconds, per
        SpeechBrain's default boundary unit) and ``class`` (always
        ``'speech'``).
    """
    if vad is None:
        vad = _load_vad_model(save_dir)

    # do initial, coarse-detection
    probabilities = vad.get_speech_prob_file(path,
        large_chunk_size=chunk_size*parallel_chunks,
        small_chunk_size=chunk_size,
        overlap_small_chunk=overlap_chunks)

    thresholded = vad.apply_threshold(probabilities,
        activation_th=activation_threshold,
        deactivation_th=deactivation_threshold).float()

    boundaries = vad.get_boundaries(thresholded)

    # refine boundaries using energy-based VAD
    # NOTE(review): the neural-posterior thresholds are reused as the energy
    # thresholds here -- confirm that this is intended.
    boundaries = vad.energy_VAD(path, boundaries,
            activation_th=activation_threshold,
            deactivation_th=deactivation_threshold)

    # post-process to clean up
    if min_pause is not None:
        boundaries = vad.merge_close_segments(boundaries, close_th=min_pause)

    if min_activation is not None:
        boundaries = vad.remove_short_segments(boundaries, len_th=min_activation)

    if double_check_threshold:
        # The double check re-scores each segment, so it needs the audio file
        # as its second positional argument; omitting it raises TypeError.
        boundaries = vad.double_check_speech_segments(
            boundaries, path, speech_th=double_check_threshold)

    # convert to friendly pandas DataFrames with time info
    events = pandas.DataFrame(boundaries, columns=['start', 'end'])
    events['class'] = 'speech'

    p = numpy.squeeze(probabilities)
    times = pandas.Series(numpy.arange(0, len(p)) * vad.time_resolution, name='time')
    p = pandas.DataFrame(p, columns=['speech'], index=times)

    return p, events


# Loaded VAD models keyed by save_dir, so a loop over many files does not
# rebuild the model on every detect_voice() call.
_VAD_MODEL_CACHE = {}


def _load_vad_model(save_dir):
    """Return the pretrained CRDNN VAD for ``save_dir``, loading it at most once."""
    if save_dir not in _VAD_MODEL_CACHE:
        _VAD_MODEL_CACHE[save_dir] = VAD.from_hparams(
            source="speechbrain/vad-crdnn-libriparty", savedir=save_dir)
    return _VAD_MODEL_CACHE[save_dir]

In [None]:
# Notebook-level instance of the pretrained CRDNN VAD model.
# detect_voice() works on its own local `vad` and does not read this global.
vad = VAD.from_hparams(
    source="speechbrain/vad-crdnn-libriparty",
    savedir='model_dir',
)

In [None]:
def load_split_audio(audio_path: Path, dest_path: Path):
    """Resample one recording to 16 kHz and export each speech segment.

    The recording is loaded at its native rate, resampled to 16 kHz and
    written to ``./cache/`` so the file-based VAD can read it. ``detect_voice``
    is then run on that copy with ``min_pause=2`` and ``min_activation=2``,
    and every returned segment is sliced out of the resampled signal and
    written to ``dest_path`` as ``<stem>_<row index><suffix>``.

    Parameters
    ----------
    audio_path : Path
        Source recording.
    dest_path : Path
        Output directory; created (with parents) if it does not exist.

    Notes
    -----
    Output names are derived from the source file name only, so two inputs
    with the same name in different folders overwrite each other's segments.
    """
    sample_rate = 16000  # rate of the cached copy and of every exported segment

    audio, sr = librosa.load(str(audio_path), sr=None)
    resampled = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)

    cache_folder = Path('./cache/')
    # parents/exist_ok: nested destinations work and re-runs do not fail.
    cache_folder.mkdir(parents=True, exist_ok=True)
    dest_path.mkdir(parents=True, exist_ok=True)

    cache_file = cache_folder.joinpath(audio_path.name)
    try:
        soundfile.write(str(cache_file), data=resampled, samplerate=sample_rate)
        _, events = detect_voice(str(cache_file), min_pause=2, min_activation=2)

        stem = audio_path.stem
        dest_audio = dest_path.joinpath(audio_path.name)
        for row in events.itertuples():
            name = dest_audio.with_stem(f'{stem}_{row.Index}')
            # Segment times are converted to sample offsets in the resampled signal.
            start, end = round(row.start * sample_rate), round(row.end * sample_rate)
            soundfile.write(str(name), data=resampled[start:end], samplerate=sample_rate)
    finally:
        # Always drop the temporary copy, even when VAD or writing fails.
        cache_file.unlink(missing_ok=True)

In [None]:
# Every entry of both raw-audio folders, first folder first.
total_data = [
    entry
    for folder in ('./data/audio', './data/text+audio/audio')
    for entry in Path(folder).iterdir()
]
# Directory that receives the per-segment files.
output_path = Path('./resample_split/')

In [None]:
# Split every collected recording into speech segments, with a progress bar.
for filepath in tqdm(total_data):
    load_split_audio(audio_path=filepath, dest_path=output_path)

100%|████████████████████████████████████████████████████████████████████████████████| 193/193 [19:02<00:00,  5.92s/it]


In [None]:
from pathlib import Path
import librosa
from tqdm import tqdm

def resample(audio_path: Path, target_sr: int = 16000):
    """Resample, in place, every file in ``audio_path`` not already at ``target_sr``.

    Each file is loaded at its native sampling rate. Files whose rate differs
    from ``target_sr`` are resampled and written back to the SAME path, so the
    original audio is overwritten. Files already at ``target_sr`` are left
    untouched.

    Parameters
    ----------
    audio_path : Path
        Directory whose direct entries are processed (no recursion).
    target_sr : int
        Desired sampling rate in Hz; defaults to 16000.
    """
    for filepath in tqdm(list(audio_path.iterdir())):
        if not filepath.is_file():
            # iterdir() also yields sub-directories, which cannot be decoded.
            continue
        audio, sr = librosa.load(str(filepath), sr=None)
        if sr != target_sr:
            resampled = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
            soundfile.write(str(filepath), data=resampled, samplerate=target_sr)

In [None]:
# Resample the dataset folder; files are rewritten at their existing paths.
# NOTE(review): absolute path, presumably a Colab Google Drive mount -- adjust per environment.
resample(audio_path=Path('/content/drive/MyDrive/dataset/audio'))

100%|██████████| 19794/19794 [05:46<00:00, 57.18it/s] 
