In [1]:
from pyannote.audio import Pipeline
import torchaudio
import matplotlib.pyplot as plt
import numpy as np
import os
import shutil
import soundfile as sf
from IPython import display as disp

from bokeh.plotting import figure
from bokeh.io import output_notebook, show
# Configure bokeh so that show() renders figures inline in the notebook.
output_notebook()

# Use a larger default font size for every matplotlib figure in this notebook.
plt.rcParams.update({'font.size': 13})

# Load the Common Voice dataset

In [2]:
from datasets import Audio
from datasets import load_dataset, DatasetDict
from datasets import load_from_disk
from datasets import Dataset

# Dataset selection. NOTE: despite its name, `train_split` currently points at
# the first 1000 rows of the *test* split; the result is stored under the
# "train" key of `common_voice`.
# train_split = "train[0:1000]"
train_split = "test[0:1000]"
name = "mozilla-foundation/common_voice_11_0"
language = "ru"
# Metadata columns that are dropped; only "audio" and "sentence" are kept.
removavle_cols = ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]
# Target sampling rate (Hz) the audio column is cast to.
sampling_rate = 16000

# Load the split, drop the metadata columns and cast the audio column to the
# target sampling rate in one chained expression.
common_voice = (
    DatasetDict({"train": load_dataset(name, language, split=train_split)})
    .remove_columns(removavle_cols)
    .cast_column("audio", Audio(sampling_rate=sampling_rate))
)

Found cached dataset common_voice_11_0 (/home/docker_current/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ru/11.0.0/3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631)


In [3]:
common_voice["train"]

Dataset({
    features: ['audio', 'sentence'],
    num_rows: 1000
})

# Prepare the VAD model

In [4]:
# SECURITY: never commit an access token to a notebook. The token is read from
# the HF_TOKEN environment variable instead; any token that was previously
# hard-coded here must be considered leaked and should be revoked.
# NOTE(review): when HF_TOKEN is unset this passes None, which presumably falls
# back to credentials cached by `huggingface-cli login` -- confirm.
hf_token = os.environ.get("HF_TOKEN")
pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection",
                                    use_auth_token=hf_token)

# Hyper-parameters handed to the VAD pipeline via instantiate().
initial_params = {"onset": 0.3, "offset": 0.1,
                  "min_duration_on": 0.05, "min_duration_off": 0.0}

# Alternative, more permissive setting kept for reference:
# initial_params = {"onset": 0.01, "offset": 0.01,
#                   "min_duration_on": 0.01, "min_duration_off": 0.0}

pipeline.instantiate(initial_params)

<pyannote.audio.pipelines.voice_activity_detection.VoiceActivityDetection at 0x7f4d5aa47280>

In [5]:
import torch

def process_output_vad(all_speechs, sr, merge_thre=0.4):
    """Merge neighbouring speech segments separated by a short pause.

    Args:
        all_speechs: list of ``[start, end]`` pairs expressed in samples,
            expected in chronological order.
        sr: sampling rate used to convert sample indices to seconds.
        merge_thre: two consecutive segments are merged when the pause
            between them is strictly shorter than this many seconds.

    Returns:
        A list of ``[start, end]`` pairs (in samples). Runs of segments whose
        pauses are below ``merge_thre`` are collapsed into one segment;
        segments separated by a longer pause are kept apart. An empty input
        yields ``[]`` and a single-segment input is returned unchanged.
    """
    if len(all_speechs) == 1:
        return all_speechs

    if len(all_speechs) == 0:
        return []

    # TODO: dropping segments shorter than a duration threshold is not
    # implemented yet.

    # Walk the segments once, growing the current merged segment while the
    # pause to the next one stays below the threshold. The previous version
    # always returned a single segment whose end could jump across long
    # pauses whenever *any* later pair of segments was close together.
    merged = [list(all_speechs[0])]
    for seg_start, seg_end in all_speechs[1:]:
        pause = seg_start / sr - merged[-1][1] / sr
        if pause < merge_thre:
            # max() keeps the merged end monotonic even if segments overlap.
            merged[-1][1] = max(merged[-1][1], seg_end)
        else:
            merged.append([seg_start, seg_end])

    return merged

def new_inference_vad(wav, sr, vis=False, disp_audio=False, merge_thre=0.4, dt_reserve=None):
    """Run the module-level VAD ``pipeline`` on one waveform.

    Args:
        wav: audio samples; converted to a tensor and given a leading
            dimension before being passed to the pipeline
            (assumes a mono 1-D array -- TODO confirm against callers).
        sr: sampling rate of ``wav`` in Hz.
        vis: when true, plot the waveform and detected segments.
        disp_audio: when true, show an audio player for the waveform.
        merge_thre: pause threshold in seconds forwarded to
            ``process_output_vad``.
        dt_reserve: optional padding in seconds added on both sides of every
            detected segment, clamped to the bounds of the clip.

    Returns:
        Tuple ``(all_speechs, wav, sr)`` where ``all_speechs`` is the list of
        ``[start, end]`` sample indices returned by ``process_output_vad`` and
        ``wav`` is the tensor with the added leading dimension.
    """
    # wav, sr = torchaudio.load(path)
    wav = torch.Tensor(wav).unsqueeze(0)

    output = pipeline({"waveform": wav, "sample_rate": sr})

    if disp_audio:
        disp.display(disp.Audio(wav.numpy(), rate=sr))

    # Clip length in seconds. Segment times are in seconds, so the upper clamp
    # must be too (it used to be the sample count, which never clamped).
    duration = wav.shape[-1] / sr

    all_speechs = []
    for speech in output.get_timeline().support():
        t_start, t_end = speech.start, speech.end
        if dt_reserve:
            t_start = max(0, t_start - dt_reserve)
            t_end = min(duration, t_end + dt_reserve)

        all_speechs.append([int(t_start * sr), int(t_end * sr)])

    all_speechs = process_output_vad(all_speechs, sr, merge_thre=merge_thre)

    if vis:
        new_plot_vad(all_speechs, wav, sr, dt_reserve)

    return all_speechs, wav, sr

def new_plot_vad(all_speechs, wav, sr, dt_reserve):
    """Plot a waveform with its speech segments shaded.

    Args:
        all_speechs: iterable of ``[start, end]`` pairs in samples.
        wav: tensor whose first row (``wav[0]``) is the signal to plot.
        sr: sampling rate in Hz, used to convert samples to seconds.
        dt_reserve: padding in seconds assumed to be included on both sides of
            every segment. When truthy, the segment shrunk by this padding is
            shaded red and the two padding bands are shaded blue; otherwise
            the whole segment is shaded red.
    """
    data = wav[0].numpy()
    timestamps = np.arange(data.shape[0]) / sr
    # Use the ndarray reductions: builtin min()/max() iterate over the array
    # one element at a time in Python, which is slow for long clips.
    y_min, y_max = data.min(), data.max()
    y_corners = [y_min, y_min, y_max]

    plt.figure(figsize=(20, 4))
    plt.plot(timestamps, data)

    for seg_start, seg_end in all_speechs:
        # Convert sample indices to seconds.
        t_start = seg_start / sr
        t_end = seg_end / sr
        if dt_reserve:
            # Strip the padding so the red band covers the core segment only.
            t_start += dt_reserve
            t_end -= dt_reserve
        plt.fill_betweenx(y_corners, t_start, [t_start, t_end, t_end], alpha=0.2, color='r')
        if dt_reserve:
            # Blue bands mark the padding before and after the core segment.
            plt.fill_betweenx(y_corners, t_start-dt_reserve, [t_start-dt_reserve, t_start, t_start], alpha=0.2, color='b')
            plt.fill_betweenx(y_corners, t_end, [t_end, t_end+dt_reserve, t_end+dt_reserve], alpha=0.2, color='b')
            
def bokeh_vis(path):
    """Load an audio file, show a player for it and plot it with bokeh.

    Only the first channel of the loaded file is used. The x axis of the plot
    is the sample index, not time.

    Args:
        path: location of the audio file handed to ``torchaudio.load``.
    """
    waveform, sample_rate = torchaudio.load(path)
    first_channel = waveform.numpy()[0]

    disp.display(disp.Audio(first_channel, rate=sample_rate))

    sample_index = np.arange(first_channel.shape[-1])
    plot = figure(width=1400, height=400, title="hello world")
    plot.line(sample_index, first_channel, line_width=2)
    show(plot)

In [6]:
from tqdm import tqdm
import os
import shutil

# Padding values (seconds) to export; "orig" is the un-trimmed baseline.
dt_reserves = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, "orig"]
# Root directory that receives one "vad_<dt_reserve>" dataset per setting.
basepath = "/home/docker_current/src/common_voice"

# WARNING: destructive -- any existing output directory is wiped so that each
# run starts from an empty folder.
for dt_reserve in dt_reserves:
    cur_path = os.path.join(basepath, f"vad_{dt_reserve}")
    if os.path.isdir(cur_path):
        shutil.rmtree(cur_path)
    # makedirs (not mkdir) so a missing `basepath` is created as well instead
    # of raising FileNotFoundError on a fresh machine.
    os.makedirs(cur_path)

In [7]:
# for dt_reserve in dt_reserves:

#     vaded_common_voice_dict = {"audio" : [],
#                                "sentence" : []} 

#     for i in tqdm(range(len(common_voice["train"]))):
#         wav = common_voice["train"][i]["audio"]["array"]
#         all_speechs, wav, sr = new_inference_vad(wav, sampling_rate, vis=False, disp_audio=False, dt_reserve=dt_reserve)

#         if len(all_speechs) != 0:
#             start, end = all_speechs[0]
            
#             audio = {"array" : wav[0][start:end].numpy(), 
#                     'sampling_rate': 16000}
#             sentence = common_voice["train"][i]["sentence"]
            
#             vaded_common_voice_dict["audio"].append(audio)
#             vaded_common_voice_dict["sentence"].append(sentence)
            
#         cur_path = os.path.join(basepath, f"vad_{dt_reserve}")
#         ds = Dataset.from_dict(vaded_common_voice_dict)
#         ds.save_to_disk(cur_path)



In [8]:
# Export the dataset trimmed by VAD with padding dt_reserves[0].
dt_reserve = dt_reserves[0]

vaded_common_voice_dict = {"audio": [], "sentence": []}

for i in tqdm(range(len(common_voice["train"]))):
    # Fetch the row once: each common_voice["train"][i] lookup decodes the
    # audio again, so indexing twice per iteration doubles that cost.
    sample = common_voice["train"][i]
    all_speechs, wav, sr = new_inference_vad(
        sample["audio"]["array"], sampling_rate,
        vis=False, disp_audio=False, dt_reserve=dt_reserve,
    )

    # Clips without any detected speech are left out of the exported dataset.
    if all_speechs:
        # Only the first returned segment is exported.
        start, end = all_speechs[0]
        vaded_common_voice_dict["audio"].append(
            # Use the rate returned by the VAD call instead of a literal 16000.
            {"array": wav[0][start:end].numpy(), "sampling_rate": sr}
        )
        vaded_common_voice_dict["sentence"].append(sample["sentence"])

cur_path = os.path.join(basepath, f"vad_{dt_reserve}")
ds = Dataset.from_dict(vaded_common_voice_dict)
ds.save_to_disk(cur_path)

100%|██████████| 1000/1000 [01:11<00:00, 14.00it/s]


In [9]:
# Export the dataset trimmed by VAD with padding dt_reserves[1].
dt_reserve = dt_reserves[1]

vaded_common_voice_dict = {"audio": [], "sentence": []}

for i in tqdm(range(len(common_voice["train"]))):
    # Fetch the row once: each common_voice["train"][i] lookup decodes the
    # audio again, so indexing twice per iteration doubles that cost.
    sample = common_voice["train"][i]
    all_speechs, wav, sr = new_inference_vad(
        sample["audio"]["array"], sampling_rate,
        vis=False, disp_audio=False, dt_reserve=dt_reserve,
    )

    # Clips without any detected speech are left out of the exported dataset.
    if all_speechs:
        # Only the first returned segment is exported.
        start, end = all_speechs[0]
        vaded_common_voice_dict["audio"].append(
            # Use the rate returned by the VAD call instead of a literal 16000.
            {"array": wav[0][start:end].numpy(), "sampling_rate": sr}
        )
        vaded_common_voice_dict["sentence"].append(sample["sentence"])

cur_path = os.path.join(basepath, f"vad_{dt_reserve}")
ds = Dataset.from_dict(vaded_common_voice_dict)
ds.save_to_disk(cur_path)

100%|██████████| 1000/1000 [00:47<00:00, 20.97it/s]


In [10]:
# Export the dataset trimmed by VAD with padding dt_reserves[2].
dt_reserve = dt_reserves[2]

vaded_common_voice_dict = {"audio": [], "sentence": []}

for i in tqdm(range(len(common_voice["train"]))):
    # Fetch the row once: each common_voice["train"][i] lookup decodes the
    # audio again, so indexing twice per iteration doubles that cost.
    sample = common_voice["train"][i]
    all_speechs, wav, sr = new_inference_vad(
        sample["audio"]["array"], sampling_rate,
        vis=False, disp_audio=False, dt_reserve=dt_reserve,
    )

    # Clips without any detected speech are left out of the exported dataset.
    if all_speechs:
        # Only the first returned segment is exported.
        start, end = all_speechs[0]
        vaded_common_voice_dict["audio"].append(
            # Use the rate returned by the VAD call instead of a literal 16000.
            {"array": wav[0][start:end].numpy(), "sampling_rate": sr}
        )
        vaded_common_voice_dict["sentence"].append(sample["sentence"])

cur_path = os.path.join(basepath, f"vad_{dt_reserve}")
ds = Dataset.from_dict(vaded_common_voice_dict)
ds.save_to_disk(cur_path)

100%|██████████| 1000/1000 [00:48<00:00, 20.78it/s]


In [11]:
# Export the dataset trimmed by VAD with padding dt_reserves[3].
dt_reserve = dt_reserves[3]

vaded_common_voice_dict = {"audio": [], "sentence": []}

for i in tqdm(range(len(common_voice["train"]))):
    # Fetch the row once: each common_voice["train"][i] lookup decodes the
    # audio again, so indexing twice per iteration doubles that cost.
    sample = common_voice["train"][i]
    all_speechs, wav, sr = new_inference_vad(
        sample["audio"]["array"], sampling_rate,
        vis=False, disp_audio=False, dt_reserve=dt_reserve,
    )

    # Clips without any detected speech are left out of the exported dataset.
    if all_speechs:
        # Only the first returned segment is exported.
        start, end = all_speechs[0]
        vaded_common_voice_dict["audio"].append(
            # Use the rate returned by the VAD call instead of a literal 16000.
            {"array": wav[0][start:end].numpy(), "sampling_rate": sr}
        )
        vaded_common_voice_dict["sentence"].append(sample["sentence"])

cur_path = os.path.join(basepath, f"vad_{dt_reserve}")
ds = Dataset.from_dict(vaded_common_voice_dict)
ds.save_to_disk(cur_path)

100%|██████████| 1000/1000 [00:48<00:00, 20.72it/s]


In [12]:
# Export the dataset trimmed by VAD with padding dt_reserves[4].
dt_reserve = dt_reserves[4]

vaded_common_voice_dict = {"audio": [], "sentence": []}

for i in tqdm(range(len(common_voice["train"]))):
    # Fetch the row once: each common_voice["train"][i] lookup decodes the
    # audio again, so indexing twice per iteration doubles that cost.
    sample = common_voice["train"][i]
    all_speechs, wav, sr = new_inference_vad(
        sample["audio"]["array"], sampling_rate,
        vis=False, disp_audio=False, dt_reserve=dt_reserve,
    )

    # Clips without any detected speech are left out of the exported dataset.
    if all_speechs:
        # Only the first returned segment is exported.
        start, end = all_speechs[0]
        vaded_common_voice_dict["audio"].append(
            # Use the rate returned by the VAD call instead of a literal 16000.
            {"array": wav[0][start:end].numpy(), "sampling_rate": sr}
        )
        vaded_common_voice_dict["sentence"].append(sample["sentence"])

cur_path = os.path.join(basepath, f"vad_{dt_reserve}")
ds = Dataset.from_dict(vaded_common_voice_dict)
ds.save_to_disk(cur_path)

100%|██████████| 1000/1000 [00:48<00:00, 20.58it/s]


In [15]:
# Export the dataset trimmed by VAD with padding dt_reserves[5].
dt_reserve = dt_reserves[5]

vaded_common_voice_dict = {"audio": [], "sentence": []}

for i in tqdm(range(len(common_voice["train"]))):
    # Fetch the row once: each common_voice["train"][i] lookup decodes the
    # audio again, so indexing twice per iteration doubles that cost.
    sample = common_voice["train"][i]
    all_speechs, wav, sr = new_inference_vad(
        sample["audio"]["array"], sampling_rate,
        vis=False, disp_audio=False, dt_reserve=dt_reserve,
    )

    # Clips without any detected speech are left out of the exported dataset.
    if all_speechs:
        # Only the first returned segment is exported.
        start, end = all_speechs[0]
        vaded_common_voice_dict["audio"].append(
            # Use the rate returned by the VAD call instead of a literal 16000.
            {"array": wav[0][start:end].numpy(), "sampling_rate": sr}
        )
        vaded_common_voice_dict["sentence"].append(sample["sentence"])

cur_path = os.path.join(basepath, f"vad_{dt_reserve}")
ds = Dataset.from_dict(vaded_common_voice_dict)
ds.save_to_disk(cur_path)

100%|██████████| 1000/1000 [01:30<00:00, 11.10it/s]


In [26]:
# Export the untouched audio (no VAD trimming) as the "orig" baseline dataset.
dt_reserve = "orig"

vaded_common_voice_dict = {"audio": [], "sentence": []}

for i in tqdm(range(len(common_voice["train"]))):
    # Fetch the row once: each common_voice["train"][i] lookup decodes the
    # audio again, so indexing twice per iteration doubles that cost.
    sample = common_voice["train"][i]

    vaded_common_voice_dict["audio"].append(
        # The audio column was cast to `sampling_rate`, so reuse that constant
        # instead of repeating the literal 16000.
        {"array": sample["audio"]["array"], "sampling_rate": sampling_rate}
    )
    vaded_common_voice_dict["sentence"].append(sample["sentence"])

cur_path = os.path.join(basepath, f"vad_{dt_reserve}")
ds = Dataset.from_dict(vaded_common_voice_dict)
ds.save_to_disk(cur_path)

100%|██████████| 1000/1000 [00:15<00:00, 63.60it/s]
