In [6]:
import librosa
import torch
from transformers import ClapAudioModelWithProjection, ClapProcessor

In [14]:
def get_embedding(file_path, processor, model, sample_rate=48000):
    """Compute the audio embedding of a single file with a CLAP-style model.

    The file is decoded and resampled by ``librosa.load``, converted into
    model inputs by ``processor``, and run through ``model`` with gradient
    tracking disabled.

    Args:
        file_path: Path of the audio file to embed.
        processor: Callable accepting ``audios``, ``sampling_rate`` and
            ``return_tensors`` and returning a mapping of model inputs
            (e.g. a ``ClapProcessor``).
        model: Module invoked as ``model(**inputs)`` whose output exposes an
            ``audio_embeds`` attribute (e.g. ``ClapAudioModelWithProjection``).
        sample_rate: Sampling rate, in Hz, that the audio is resampled to and
            that is reported to the processor. Defaults to 48000.

    Returns:
        The ``audio_embeds`` tensor produced by the model, left on the
        model's device. Presumably one row per input clip -- confirm against
        the model documentation before relying on the shape.
    """
    audio, _ = librosa.load(file_path, sr=sample_rate)
    inputs = processor(
        audios=audio,
        sampling_rate=sample_rate,
        return_tensors="pt"
    )

    # The processor builds its tensors on the CPU. Move them to wherever the
    # model's weights live so the forward pass does not fail with a device
    # mismatch when the caller has put the model on a GPU. On a CPU model this
    # is a no-op.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {
            key: (value.to(first_param.device) if torch.is_tensor(value) else value)
            for key, value in inputs.items()
        }

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.audio_embeds

# Audio clips to compare: one stored under ground_truth/ and one under
# tmp/epoch_65/.
wav_file_path1 = '/data/tilak/projects/mustango/ground_truth/DnrBxSlKd68_70_80.wav'
wav_file_path2 = '/data/tilak/projects/mustango/tmp/epoch_65/DnrBxSlKd68_0_30.wav'  # Replace with your second file's path

# Both the model and its processor are loaded from the same pretrained
# checkpoint, so the identifier is written once.
CLAP_CHECKPOINT = "laion/clap-htsat-fused"
model = ClapAudioModelWithProjection.from_pretrained(CLAP_CHECKPOINT)
processor = ClapProcessor.from_pretrained(CLAP_CHECKPOINT)

# Embed each clip with the shared model/processor pair.
embedding1, embedding2 = (
    get_embedding(clip_path, processor, model)
    for clip_path in (wav_file_path1, wav_file_path2)
)

# Cosine similarity along the feature axis (dim=1) of the two embeddings.
cosine_similarity = torch.nn.functional.cosine_similarity(
    embedding1,
    embedding2,
    dim=1,
)

# Report the score as a plain Python float.
print(f"Similarity score: {cosine_similarity.item()}")


Similarity score: 0.5138895511627197
