In [29]:
# All imports
import librosa
import torch
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from utils_vad import (
    get_speech_timestamps,
)  # Assuming this file defines the get_speech_timestamps function
from preprocess import Wav2Mel
import torchaudio

import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.stats import mode
import os

from collections import Counter
from tqdm import tqdm
import warnings

# Silence the UserWarning PyTorch emits about non-contiguous RNN weights.
# `message` is treated by the warnings module as a regular expression matched
# against the start of the warning text; the adjacent literals below
# concatenate to exactly the same pattern string as before.
warnings.filterwarnings(
    action="ignore",
    message=(
        "RNN module weights are not part of single contiguous chunk of memory. "
        "This means they need to be compacted at every call, possibly greatly "
        "increasing memory usage. "
        "To compact weights again call flatten_parameters()."
    ),
    category=UserWarning,
)


# Global variables used for preprocessing
SAMPLE_RATE = 16000  # passed to Wav2Mel as sample_rate (presumably Hz — confirm in preprocess.py)
NORM_DB = -3  # passed to Wav2Mel as norm_db
FFT_WINDOW_MS = 25  # passed to Wav2Mel as fft_window_ms
FFT_HOP_MS = 10  # passed to Wav2Mel as fft_hop_ms
FRAME_SIZE = 40  # Adjust frame size if needed. NOTE(review): not referenced in the visible code; Wav2Mel is given n_mels=40 directly
BLOCK_SIZE = 50  # Number of consecutive mel frames grouped into one block for embedding

# Directory scanned for input .wav files, and directory the RTTM results are written to
TEST_PATH = "../Dataset/Audio/Test/"
OUTPUT_PATH = "../Results/Custom Pipeline/"

# Load the models
# Use the GPU when one is available; both TorchScript models live on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# map_location lets a checkpoint that was saved on a GPU be loaded on a
# CPU-only machine (and places the weights on `device` directly).
dvector_model = torch.jit.load(
    "Pretrained Modules/dvector-step250000.pt", map_location=device
)
dvector_model = dvector_model.to(device)
dvector_model.eval()  # inference only: disable training-time behaviour such as dropout

vad_model = torch.jit.load("Pretrained Modules/silero_vad.jit", map_location=device)
vad_model = vad_model.to(device)
vad_model.eval()

# Waveform -> mel feature transform, configured from the preprocessing constants.
wave2mel = Wav2Mel(
    sample_rate=SAMPLE_RATE,
    norm_db=NORM_DB,
    fft_window_ms=FFT_WINDOW_MS,
    fft_hop_ms=FFT_HOP_MS,
    n_mels=40,
)


def convert_samples_to_time(segments, sampling_rate):
    """Convert segment boundaries from sample offsets to seconds.

    Args:
        segments: iterable of dicts with "start" and "end" sample offsets.
        sampling_rate: number of samples per second.

    Returns:
        A new list of dicts holding only "start" and "end", both divided by
        ``sampling_rate``. The input dicts are not modified.
    """
    return [
        {
            "start": bounds["start"] / sampling_rate,
            "end": bounds["end"] / sampling_rate,
        }
        for bounds in segments
    ]


def get_frames(mel_tensor, block_size):
    """Cut ``mel_tensor`` into consecutive, non-overlapping blocks along dim 0.

    Trailing rows that do not fill a complete block are dropped.

    Args:
        mel_tensor: tensor whose first dimension is split into blocks.
        block_size: number of dim-0 rows per block (also used as the step).

    Returns:
        A view with the block index first; for a 2-D ``(rows, features)`` input
        the shape is ``(rows // block_size, block_size, features)``.
    """
    # unfold appends the within-block axis as the last dimension ...
    blocks = mel_tensor.unfold(0, block_size, block_size)
    # ... so swap the last two axes to put the rows of a block before the features.
    return blocks.transpose(-2, -1)


def get_frame_embeddings(mel_frames):
    """Embed every block of mel frames with the d-vector model.

    Uses the module-level ``dvector_model`` and ``device``.

    Args:
        mel_frames: tensor whose first dimension indexes the blocks; each
            ``mel_frames[i]`` is passed to ``dvector_model.embed_utterance``.

    Returns:
        NumPy array of shape ``(mel_frames.shape[0], 256)`` with one embedding
        per block.
    """
    num_frames = mel_frames.shape[0]
    embeddings = torch.empty(num_frames, 256, device=device)

    # Inference only: without no_grad every assignment below would chain
    # autograd history onto `embeddings`, keeping the graph of every block
    # alive until the function returns.
    with torch.no_grad():
        for frame_idx in range(num_frames):
            frame = mel_frames[frame_idx].to(device)  # Move the frame to the model's device
            embeddings[frame_idx, :] = dvector_model.embed_utterance(frame)

    return embeddings.detach().cpu().numpy()


def normalized_laplacian_matrix(A):
    """Return the symmetrically normalized affinity matrix D^-1/2 · A · D^-1/2.

    ``D`` is the diagonal degree matrix built from the row sums of ``A``.
    Note that the result is the normalized affinity itself (it is not
    subtracted from the identity).

    Args:
        A: square adjacency / affinity matrix.

    Returns:
        Array with the same shape as ``A``. Rows and columns of nodes whose
        degree is not positive are scaled by 0, so they come out as zeros
        instead of NaN/inf.
    """
    A = np.asarray(A)
    degree = np.sum(A, axis=1)

    # 0 ** -0.5 is inf (and inf * 0 is NaN), so isolated nodes need an explicit
    # guard; silence the warnings for the entries that get replaced anyway.
    with np.errstate(divide="ignore", invalid="ignore"):
        inv_sqrt_degree = np.power(degree, -0.5)
    inv_sqrt_degree = np.where(degree > 0, inv_sqrt_degree, 0.0)

    # Scale row i by d_i^-1/2 and column j by d_j^-1/2.
    return inv_sqrt_degree.reshape(-1, 1) * A * inv_sqrt_degree.reshape(1, -1)


# Function to generate RTTM entries
def generate_rttm(
    speech_segments, frame_predictions, block_size, sampling_rate, output_file
):
    """Write one RTTM ``SPEAKER`` line per predicted frame.

    Predictions are consumed in order: each speech segment takes as many
    consecutive entries of ``frame_predictions`` as frames fit into it, and the
    next segment continues where the previous one stopped.

    Args:
        speech_segments: iterable of dicts with "start" and "end" in seconds.
        frame_predictions: sequence of cluster ids, one per frame.
        block_size: length of one frame in samples; the frame duration is
            ``block_size / sampling_rate`` seconds.
        sampling_rate: samples per second.
        output_file: path of the RTTM file to (over)write.

    Side effects:
        Writes ``output_file`` and prints its path.
    """
    frame_duration = block_size / sampling_rate
    rttm_lines = []
    frame_index = 0  # position in frame_predictions, shared by all segments

    # Loop over the speech segments
    for segment in speech_segments:
        start_time = segment["start"]
        end_time = segment["end"]
        segment_duration = end_time - start_time

        # Calculate number of frames that fit into this segment
        num_frames_in_segment = int(
            np.ceil(segment_duration * sampling_rate / block_size)
        )

        for local_index in range(num_frames_in_segment):
            if frame_index >= len(frame_predictions):
                break

            # The time offset is relative to the start of *this* segment.
            # Using the running prediction index here would push the frames of
            # every later segment past the segment end.
            frame_start_time = start_time + local_index * frame_duration
            # The last frame of a segment may be cut short by the segment end.
            frame_end_time = min(frame_start_time + frame_duration, end_time)

            duration = frame_end_time - frame_start_time
            cluster_id = frame_predictions[frame_index]

            rttm_line = f"SPEAKER unknown 1 {frame_start_time:.3f} {duration:.3f} <NA> <NA> speaker_{cluster_id} <NA> <NA>"
            rttm_lines.append(rttm_line)

            frame_index += 1

    with open(output_file, "w") as file:
        for line in rttm_lines:
            file.write(line + "\n")

    print(f"RTTM file generated at: {output_file}")

# Preview pass: print the name of every entry found in the test directory
# (nothing is loaded or processed in this loop).
for file in os.listdir(TEST_PATH):
    print("Processing file: " + str(file))

Processing file: aepyx.wav
Processing file: aggyz.wav
Processing file: aiqwk.wav
Processing file: aorju.wav
Processing file: auzru.wav
Processing file: bgvvt.wav
Processing file: bidnq.wav
Processing file: bjruf.wav
Processing file: bmsyn.wav
Processing file: bpzsc.wav
Processing file: bvqnu.wav
Processing file: bvyvm.wav
Processing file: bxcfq.wav
Processing file: byapz.wav
Processing file: cadba.wav
Processing file: cawnd.wav
Processing file: clfcg.wav
Processing file: cpebh.wav
Processing file: cqfmj.wav
Processing file: crorm.wav
Processing file: crylr.wav
Processing file: cvofp.wav
Processing file: cwbvu.wav
Processing file: dgvwu.wav
Processing file: diysk.wav
Processing file: dkabn.wav
Processing file: dlast.wav
Processing file: dohag.wav
Processing file: duvox.wav
Processing file: dxbbt.wav
Processing file: dxokr.wav
Processing file: dzsef.wav
Processing file: dzxut.wav
Processing file: eauve.wav
Processing file: eazeq.wav
Processing file: eddje.wav
Processing file: eguui.wav
P

In [30]:
# Running it on all the files in the test folder
# Per file: VAD -> mel blocks -> d-vector embeddings -> spectral clustering -> RTTM.

# Make sure the results directory exists before the first RTTM file is written.
os.makedirs(OUTPUT_PATH, exist_ok=True)

for file in tqdm(os.listdir(TEST_PATH)):
    audio_path = os.path.join(TEST_PATH, file)

    # Skip non-audio files
    if not file.endswith(".wav"):
        continue

    # If the file is empty, skip it
    if os.path.getsize(audio_path) == 0:
        continue

    print(f"Processing file: {file}")

    # Load the recording once: a device copy feeds the VAD, the original
    # (CPU) tensor feeds the mel front-end.
    wave_tensor_torch, wave_sampling_rate = torchaudio.load(audio_path)

    # NOTE(review): get_speech_timestamps is called without a sampling rate,
    # while its sample offsets are converted with the file's own rate below —
    # confirm both agree for recordings that are not at the VAD's default rate.
    speech_segments = get_speech_timestamps(
        wave_tensor_torch.to(device), model=vad_model
    )
    speech_segments_time = convert_samples_to_time(speech_segments, wave_sampling_rate)
    wave_tensor, mel_tensor = wave2mel(wave_tensor_torch, wave_sampling_rate)

    # Clustering needs at least two complete blocks (get_frames splits dim 0
    # into blocks of BLOCK_SIZE rows and cannot unfold a shorter tensor).
    if mel_tensor.shape[0] < 2 * BLOCK_SIZE:
        print(f"Skipping file (too short to cluster): {file}")
        continue

    # Get the features
    mel_frames = get_frames(mel_tensor, BLOCK_SIZE)
    embeddings = get_frame_embeddings(mel_frames)

    # Spectral clustering part
    distance = pdist(embeddings, metric="euclidean")

    # Turn distances into affinities in (0, 1]; squareform already leaves the
    # diagonal at zero, so no self-affinity has to be removed.
    A = 1 / (1 + distance)
    A = squareform(A)
    L = normalized_laplacian_matrix(A)

    # L is symmetric, so eigh applies: it returns real eigenpairs, which are
    # then ordered from largest to smallest eigenvalue. (np.linalg.eig gives no
    # ordering guarantee, so "the first columns" would be arbitrary ones.)
    w, v = np.linalg.eigh(L)
    order = np.argsort(w)[::-1]
    eigenvalues = w[order]

    # Fixed number of clusters (3), capped by the number of blocks available.
    # An eigengap estimate such as np.argmin(np.diff(eigenvalues)) + 1 could
    # replace the constant if the speaker count should be inferred.
    num_vectors = min(3, embeddings.shape[0])
    eigenvectors = v[:, order[:num_vectors]]

    # Project every row onto the unit sphere; guard against all-zero rows.
    row_norms = np.linalg.norm(eigenvectors, axis=1, keepdims=True)
    norm_eigenvectors = eigenvectors / np.maximum(row_norms, np.finfo(float).eps)

    # random_state makes the cluster assignment reproducible between runs.
    frame_predictions = KMeans(
        n_clusters=norm_eigenvectors.shape[1], n_init=5, random_state=0
    ).fit_predict(norm_eigenvectors)

    output_file = os.path.join(OUTPUT_PATH, f"{file}.rttm")

    # generate_rttm derives the frame duration as block_size / sampling_rate,
    # so the block length must be given in samples: BLOCK_SIZE mel frames,
    # FFT_HOP_MS milliseconds apart.
    block_size_samples = BLOCK_SIZE * FFT_HOP_MS * wave_sampling_rate / 1000

    # NOTE(review): frame_predictions cover the whole recording (mel_tensor is
    # computed from the full waveform), while generate_rttm hands predictions
    # out sequentially to the speech segments — confirm the intended alignment.

    # Generate RTTM entries
    generate_rttm(
        speech_segments_time,
        frame_predictions,
        block_size_samples,
        wave_sampling_rate,
        output_file,
    )

  0%|          | 0/232 [00:07<?, ?it/s]


RuntimeError: sox extension is not supported on Windows