# NVIDIA NeMo
Neural Diarizer: Multiscale Diarization Decoder with Oracle VAD


In [1]:
import IPython
import matplotlib.pyplot as plt
import numpy as np
import librosa
from nemo.collections.asr.parts.utils.speaker_utils import (
    rttm_to_labels,
    labels_to_pyannote_object,
)
import os
import IPython
import matplotlib.pyplot as plt
import numpy as np
import librosa
from tqdm import tqdm

from omegaconf import OmegaConf
from nemo.collections.asr.models import ClusteringDiarizer
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
import logging
import torch


import wget
import json
import nemo.utils
import sys


# Keep a handle on NeMo's logging object and raise its threshold to ERROR so
# that lower-severity messages do not flood the notebook output.
nemo_logger = nemo.utils.logging
nemo_logger.setLevel(logging.ERROR)

    


In [2]:
import os
import json
import wget
import torch
import sys
from tqdm import tqdm
from omegaconf import OmegaConf
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
import librosa

# Paths and audio settings used by the rest of this cell.
# Audio files and their reference RTTMs are paired by base name
# ("<name>.wav" <-> "<name>.rttm").
data_directory = "../Dataset/Audio/Dev" # Change this to run on the test set
rttm_directory = "../Dataset/RTTMs/Dev" # Change this to run on the test set
sr = 16000  # target sampling rate handed to librosa.load in the loop below
results_directory = "../Results/Oracle_Decoder"  # assigned to the diarizer's out_dir
model_config = "../Config/"  # directory that holds (or receives) the inference YAML
Metadata_test = "../Metadata_test/"  # directory where per-file manifest JSONs are written
# Disabled alternative for quieting NeMo logs via the stdlib logger:
# logging.getLogger("nemo").setLevel(logging.ERROR)

# The model below is configured for and moved to CUDA, so pick the GPU when
# one is visible and stop right away when it is not.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using device: {device}")
else:
    print("CUDA is not available. Exiting.")
    sys.exit(1)

# Load the telephonic inference config, downloading it from the NeMo
# repository when no local copy exists yet.
MODEL_CONFIG = os.path.join(model_config, "diar_infer_telephonic.yaml")
if not os.path.exists(MODEL_CONFIG):
    print("Did not find the model config file, downloading it now")
    config_url = "https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml"
    # The download targets a directory; create it first so a fresh checkout
    # (where ../Config/ does not exist yet) does not fail here.
    os.makedirs(model_config, exist_ok=True)
    MODEL_CONFIG = wget.download(config_url, model_config)

config = OmegaConf.load(MODEL_CONFIG)

# Speaker-embedding extractor and its five analysis scales.
pretrained_speaker_model = "titanet_large"
config.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
config.diarizer.speaker_embeddings.parameters.window_length_in_sec = [
    1.5,
    1.25,
    1.0,
    0.75,
    0.5,
]
# NOTE(review): the first four shifts are exactly half of their window, but the
# last one is 0.1 rather than 0.25 - confirm this is intentional.
config.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [
    0.75,
    0.625,
    0.5,
    0.375,
    0.1,
]
config.diarizer.speaker_embeddings.parameters.multiscale_weights = [1, 1, 1, 1, 1]

# Oracle VAD is enabled; the oracle speaker count is not.
config.diarizer.oracle_vad = True  # ----> ORACLE VAD
config.diarizer.clustering.parameters.oracle_num_speakers = False

# NOTE(review): both `out_dir` and `output_dir` are set to the same directory.
# Predictions are later read from <results_directory>/pred_rttms; confirm which
# key NeMo actually consumes and drop the other.
config.diarizer.out_dir = results_directory  # Set out_dir correctly
config.diarizer.output_dir = results_directory

config.diarizer.msdd_model.model_path = (
    "diar_msdd_telephonic"  # Telephonic speaker diarization model
)
config.diarizer.msdd_model.parameters.sigmoid_threshold = [
    0.7,
    1.0,
]  # Evaluate with T=0.7 and T=1.0

config.device = "cuda"

# Diarize every .wav file in data_directory. Each file gets its own one-line
# manifest and its own NeuralDiarizer instance.

# The manifest directory may not exist on a fresh checkout; without it the
# open() call in the loop fails.
os.makedirs(Metadata_test, exist_ok=True)

# os.listdir returns entries in arbitrary order: sort for a reproducible run,
# and filter up front so the progress bar counts audio files only.
wav_filenames = sorted(
    name for name in os.listdir(data_directory) if name.endswith(".wav")
)

for filename in tqdm(wav_filenames):
    # Strip only the trailing extension; str.replace(".wav", ...) would also
    # rewrite any earlier ".wav" occurring inside the file name.
    stem = filename[: -len(".wav")]

    # Audio file and its reference RTTM (same base name, in rttm_directory).
    audio_path = os.path.join(data_directory, filename)
    rttm_path = os.path.join(rttm_directory, stem + ".rttm")

    # NOTE(review): `signal` is not read anywhere in this cell; the load is kept
    # in case a later cell inspects the last file's waveform - drop it if not.
    # The former GPU copy of the waveform was removed: nothing consumed it and
    # it only occupied device memory while the model was running.
    signal, sr = librosa.load(audio_path, sr=sr)

    # Reference labels from the RTTM file. NOTE(review): `labels` / `reference`
    # are not consumed in this cell either - confirm a later cell needs them.
    labels = rttm_to_labels(rttm_path)
    reference = labels_to_pyannote_object(labels)

    # Manifest entry describing this single recording.
    meta = {
        "audio_filepath": audio_path,
        "offset": 0,
        "duration": None,
        "label": "infer",
        "text": "-",
        "num_speakers": None,
        "rttm_filepath": rttm_path,
        "uem_filepath": None,
    }

    # Write the manifest as one JSON object followed by a newline.
    manifest_path = os.path.join(Metadata_test, stem + ".json")
    with open(manifest_path, "w") as fp:
        json.dump(meta, fp)
        fp.write("\n")

    config.diarizer.manifest_filepath = manifest_path

    # Release the previous iteration's model before building the next one.
    # While the old object is still bound its GPU memory stays allocated and
    # empty_cache() cannot hand it back, so two models would otherwise be
    # alive at the same time.
    oracle_vad_msdd_model = None
    torch.cuda.empty_cache()

    # The model is created only after manifest_filepath has been set on the
    # config (presumably it picks the manifest up at construction - confirm).
    oracle_vad_msdd_model = NeuralDiarizer(cfg=config)
    oracle_vad_msdd_model.to(device)

    # Diarize
    oracle_vad_msdd_model.diarize()

    # Read back the predicted RTTM written under <results_directory>/pred_rttms.
    rttm_pred_path = os.path.join(results_directory, "pred_rttms", stem + ".rttm")
    pred_labels_neural = rttm_to_labels(rttm_pred_path)

    # Return cached, no-longer-used blocks from this run to the device.
    torch.cuda.empty_cache()

Using device: cuda


100%|██████████| 1/1 [00:00<00:00, 14.04it/s]
100%|██████████| 1/1 [00:00<00:00,  5.59it/s]t]
100%|██████████| 1/1 [00:01<00:00,  1.30s/it]t]
100%|██████████| 1/1 [00:00<00:00,  8.39it/s]t]
100%|██████████| 1/1 [00:00<00:00, 33.17it/s]t]
100%|██████████| 1/1 [00:00<00:00, 30.26it/s]t]
100%|██████████| 1/1 [00:00<00:00,  6.66it/s]t]
100%|██████████| 1/1 [00:00<00:00, 33.95it/s]t]
100%|██████████| 1/1 [00:00<00:00, 10.26it/s]t]
100%|██████████| 1/1 [00:00<00:00,  9.16it/s]t]
100%|██████████| 1/1 [00:00<00:00,  1.73it/s]it]
100%|██████████| 1/1 [00:04<00:00,  4.20s/it]it]
100%|██████████| 1/1 [00:00<00:00, 47.62it/s]it]
100%|██████████| 1/1 [00:00<00:00, 17.54it/s]it]
100%|██████████| 1/1 [00:00<00:00,  4.26it/s]it]
100%|██████████| 1/1 [00:00<00:00,  4.87it/s]it]
100%|██████████| 1/1 [00:00<00:00, 24.39it/s]it]
100%|██████████| 1/1 [00:00<00:00,  3.88it/s]it]
100%|██████████| 1/1 [00:00<00:00,  7.09it/s]it]
100%|██████████| 1/1 [00:00<00:00,  5.06it/s]it]
100%|██████████| 1/1 [00:01<00:0