In [1]:
import glob
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from IPython.display import clear_output
from pyannote.audio import Inference
from pyannote.audio.utils.signal import Binarize, Peak
from pyannote.core import Segment, notebook, SlidingWindowFeature, timeline, Timeline
from skimage.measure import block_reduce



In [7]:
def pyannote_extract_embs(one_file, one_diar, window_type="sliding"):
    """Extract "pyannote/embedding" speaker embeddings from one audio file.

    Adapted from the pyannote-audio pretrained-model tutorial:
    https://github.com/pyannote/pyannote-audio/tree/master/tutorials/pretrained/model
    Main change from the tutorial: the embeddings falling inside a speech turn
    are NOT averaged; every embedding is kept as its own row.

    Parameters
    ----------
    one_file
        .wav file in the format the model expects.
    one_diar
        Iterable of segments (e.g. a pyannote Timeline holding only the speech
        turns of the desired participant), or None to keep every embedding
        computed for the file.
    window_type
        "sliding" -> one embedding per 3.0 s window, advanced in 1.0 s steps.
        "whole"   -> the model is applied with window="whole".

    Returns
    -------
    numpy.ndarray
        2-D array laid out as (number of embeddings, embedding values).

    Raises
    ------
    ValueError
        If `window_type` is not "sliding" or "whole", or if `one_diar` is not
        None but yields no segments.

    Notes
    -----
    Segment selection relies on the `.crop` method of the object returned by
    the model.
    NOTE(review): combining window_type="whole" with a non-None `one_diar`
    presumably fails because the whole-file output has no `.crop` -- confirm.
    """
    # Validate first so a typo fails immediately instead of after loading a model
    # (previously `emb` was simply left undefined for unknown values).
    if window_type not in ("sliding", "whole"):
        raise ValueError(
            f"window_type must be 'sliding' or 'whole', got {window_type!r}"
        )

    if window_type == "sliding":
        emb = Inference("pyannote/embedding",
                        window="sliding",
                        duration=3.0, step=1.0)
    else:
        emb = Inference("pyannote/embedding", window="whole")

    # Raw embeddings for the whole file.
    embeddings = emb(one_file)

    # `is None` rather than `== None`: an equality test would be dispatched to
    # the diarization object's own __eq__, which need not accept None.
    if one_diar is None:
        emb_from_sample = [embeddings]
    else:
        # 'strict' cropping, exactly as in the tutorial; one chunk per segment.
        emb_from_sample = [embeddings.crop(segment, 'strict') for segment in one_diar]
        if not emb_from_sample:
            raise ValueError("one_diar contains no segments; nothing to extract")

    # vstack turns the list of chunks into a single
    # (number of embeddings) x (embedding values) array.
    return np.vstack(emb_from_sample)

In [3]:
## Locate the preprocessed recordings and pick one "script" reading to try the extractor on.
# NOTE: root_dir is a machine-specific absolute path; point it at your local copy of the data.
root_dir = "/Users/rahulbrito/Documents/projects/infantvoice/data/Full_Readings/gasser_readings"
audio_files = os.path.join(root_dir, "preprocessed_audios_dur3sec")

# Same folder as audio_files; the name is kept so existing references keep working.
down_sample_dir = audio_files

# Files sit one level down: <down_sample_dir>/<subfolder>/<file>.
# The pattern contains no "**", so glob's `recursive` flag would have no effect and is omitted.
all_files = glob.glob(os.path.join(down_sample_dir, "*", "*"))

# Keep only the "script" readings; match on the file name so that a folder
# name containing "script" cannot pull in unrelated files.
all_files = [file for file in all_files if "script" in os.path.basename(file)]

# Fail with an explicit message instead of a bare IndexError on all_files[0].
if not all_files:
    raise FileNotFoundError(f"No 'script' audio files found under {down_sample_dir!r}")

one_file = all_files[0]

In [4]:
# Show the path of the recording selected above, as a sanity check.
one_file

'/Users/rahulbrito/Documents/projects/infantvoice/data/Full_Readings/gasser_readings/preprocessed_audios_dur3sec/132/132_M_LPP_script_019.wav'

In [28]:
# Embed the selected file with no diarization filter (one_diar=None), using the "whole" window mode.
test = pyannote_extract_embs(one_file, one_diar=None, window_type="whole")

In [35]:
# Inspect the result layout: (number of embeddings, embedding values).
test.shape

(1, 512)