# Custom clustering
In this notebook we attempt custom clustering of the audio data. We will use the following steps:
- Preprocess the data
    - Compute the mel spectrogram of the audio, with 80 features per mel frame.



In [18]:
import librosa
import torch
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from utils_vad import (
    get_speech_timestamps,
)  # Assuming this file defines the get_speech_timestamps function
from preprocess import Wav2Mel
import torchaudio

In [19]:
# Preprocessing configuration shared by the cells below
SAMPLE_RATE = 16_000  # target sample rate in Hz
NORM_DB = -3  # normalisation level in dB handed to Wav2Mel
FFT_WINDOW_MS, FFT_HOP_MS = 25, 10  # STFT window / hop length in milliseconds

# Number of consecutive mel frames grouped into one block before embedding
BLOCK_SIZE = 50

In [20]:
# Load the pretrained models (adjust the paths to match your setup)

# Speaker-embedding (d-vector) network, stored as TorchScript
dvector_model = torch.jit.load("Pretrained Modules/dvector-step250000.pt")

# Voice-activity-detection (VAD) network, stored as TorchScript
vad_model = torch.jit.load("Pretrained Modules/silero_vad.jit")

# Waveform -> mel front-end from the local `preprocess` module, configured
# from the preprocessing constants defined earlier in the notebook.
wave2mel = Wav2Mel(
    n_mels=40,
    fft_hop_ms=FFT_HOP_MS,
    fft_window_ms=FFT_WINDOW_MS,
    norm_db=NORM_DB,
    sample_rate=SAMPLE_RATE,
)


# Uncomment if you wish to inspect the architecture of the loaded models
#! dvector_model.eval()
#! vad_model.eval()

In [21]:
# Function to preprocess audio (resample and peak-normalize)
def preprocess_audio(audio_path, sample_rate=16000, norm_db=0.0):
    """
    Loads an audio file with Librosa, resamples it and peak-normalizes it.

    Args:
        audio_path: Path to the audio file.
        sample_rate: Target sample rate in Hz (default 16 kHz).
        norm_db: Peak level in dB relative to full scale applied after
            normalization. The default 0.0 keeps the previous behaviour
            (peak magnitude of 1.0); pass e.g. -3 to scale the peak down by 3 dB.

    Returns:
        audio_tensor: Resampled and normalized audio as a torch tensor with a
            leading channel dimension added via ``unsqueeze(0)``.
    """
    # sr=None keeps the file's native rate so that resampling is explicit below.
    y, sr = librosa.load(audio_path, sr=None)

    # Resample if necessary. `librosa.effects.resample` does not exist; the
    # function lives at the package top level and takes the rates by keyword.
    if sr != sample_rate:
        y = librosa.resample(y, orig_sr=sr, target_sr=sample_rate)

    # Scale so that the largest absolute sample value is 1.0.
    y = librosa.util.normalize(y)

    # Optionally attenuate/boost the peak to the requested dB level.
    if norm_db:
        y = y * (10.0 ** (norm_db / 20.0))

    # Convert to torch tensor with channel dimension
    audio_tensor = torch.from_numpy(y).unsqueeze(0)

    return audio_tensor


# Function to compute Mel spectrogram using Librosa
def compute_mel_spectrogram(audio_tensor, sample_rate, n_mels=40, n_fft=2048, hop_length=512):
    """
    Computes a time-major Mel spectrogram from a preprocessed audio tensor using Librosa.

    Librosa returns the spectrogram as (n_mels, frames). The result is
    transposed to (frames, n_mels) because `get_frames` slices blocks along
    dim 0, i.e. it expects time to be the first dimension.

    Args:
        audio_tensor: Preprocessed audio tensor (with channel dimension).
        sample_rate: Sample rate of the audio.
        n_mels: Number of mel bands. Defaults to 40, the value the Wav2Mel
            front-end in this notebook is configured with.
        n_fft: FFT size in samples (Librosa's default 2048).
        hop_length: Hop between frames in samples (Librosa's default 512).
            For a window/hop given in milliseconds use
            ``int(sample_rate * ms / 1000)``.

    Returns:
        mel_tensor: Float tensor of shape (frames, n_mels).
    """
    audio = audio_tensor.squeeze(0).numpy()  # Remove channel dimension
    melspectrogram = librosa.feature.melspectrogram(
        y=audio,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
    )
    # NOTE(review): no log compression is applied here; if the embedding model
    # was trained on log-mel features, apply it before embedding — confirm
    # against the Wav2Mel implementation in `preprocess`.
    mel_tensor = torch.from_numpy(melspectrogram).float()  # (n_mels, frames)
    return mel_tensor.T.contiguous()  # (frames, n_mels)


def get_frames(mel_tensor, block_size):
    """
    Splits a tensor into consecutive, non-overlapping blocks along dim 0.

    Args:
        mel_tensor: Tensor whose first dimension is cut into blocks.
        block_size: Number of dim-0 entries per block (also used as the step,
            so blocks do not overlap and a trailing remainder is dropped).

    Returns:
        Tensor with one block per leading index; the last two dimensions of
        the unfolded result are swapped so the block axis precedes the
        remaining axis.
    """
    # unfold appends the window as a new trailing dimension
    windows = mel_tensor.unfold(0, block_size, block_size)
    # move the window axis in front of the last original axis
    return windows.transpose(-2, -1)

def get_frame_embeddings(mel_frames, model=None, embedding_dim=256):
    """
    Embeds every block of mel frames with the speaker-embedding model.

    Args:
        mel_frames: Tensor whose first dimension indexes the blocks; each
            ``mel_frames[i]`` is passed to ``model.embed_utterance``.
        model: Object exposing ``embed_utterance``. Defaults to the
            notebook-level ``dvector_model``.
        embedding_dim: Length of the vector returned by ``embed_utterance``
            (default 256).

    Returns:
        NumPy array of shape (number of blocks, embedding_dim).
    """
    if model is None:
        model = dvector_model

    embeddings = torch.empty(mel_frames.shape[0], embedding_dim)
    # Inference only: skip autograd bookkeeping while running the model.
    with torch.no_grad():
        for block_idx, block in enumerate(mel_frames):
            embeddings[block_idx, :] = model.embed_utterance(block)
    return embeddings.detach().numpy()


# Testing for one audio file
audio_path = "../Dataset/Audio/Dev/afjiv.wav"
# Pass the configured rate explicitly instead of relying on the function default.
audio_tensor = preprocess_audio(audio_path, sample_rate=SAMPLE_RATE)
audio_mel_tensor = compute_mel_spectrogram(audio_tensor, SAMPLE_RATE)

# Run voice-activity detection on the waveform
speech_timestamps = get_speech_timestamps(
    audio_tensor, vad_model, sampling_rate=SAMPLE_RATE, return_seconds=True
)

# Cut the mel spectrogram into blocks of BLOCK_SIZE and embed each block
mel_frames = get_frames(audio_mel_tensor, BLOCK_SIZE)
print("Mel_frame dims", mel_frames.shape)
frame_embeddings = get_frame_embeddings(mel_frames)

# Cluster the embeddings (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=2, random_state=0).fit(frame_embeddings)

# Labels are one per block, whereas speech_timestamps holds one entry per
# detected speech segment, so the two lengths are not expected to match.
print(speech_timestamps)
print(kmeans.labels_)
print(len(kmeans.labels_))
print(len(speech_timestamps))

fig, ax = plt.subplots()
ax.plot(kmeans.labels_)
ax.set(
    title="K-means cluster label per block",
    xlabel="Block index",
    ylabel="Cluster label",
)
plt.show()

RuntimeError: sox extension is not supported on Windows