In [None]:
# Raise the TensorFlow v1-compat logger threshold to ERROR so that lower-severity
# messages are not emitted (presumably to keep the notebook output uncluttered).
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [None]:
# Install necessary libraries
!pip install transformers datasets librosa torch

import torch
import librosa
from transformers import ClapProcessor, ClapModel

# Load the pretrained CLAP processor and model from the same checkpoint
processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")
model = ClapModel.from_pretrained("laion/clap-htsat-fused")

# Load at most the first 30 seconds of the clip, resampled by librosa to 48 kHz
audio_path = "/content/Data/drums.wav"  # Replace with your file path
audio, sr = librosa.load(audio_path, sr=48000, duration=30)

# Preprocess audio for CLAP.
# Pass the sampling rate explicitly: without it the feature extractor cannot
# verify that the waveform matches the rate it expects and only emits a
# warning, so a mismatched rate would silently produce wrong features.
audio_inputs = processor(audios=audio, sampling_rate=sr, return_tensors="pt")

# Extract audio embeddings (inference only, so gradient tracking is disabled)
with torch.no_grad():
    audio_embeddings = model.get_audio_features(**audio_inputs)

# The 10 candidate instrument labels, in the order they are scored
instruments = [
    # Guitars: generic plus two specific variants
    "guitar", "electric guitar", "acoustic guitar",
    # Pianos: generic plus grand
    "piano", "grand piano",
    # Bowed strings and winds
    "violin", "flute", "saxophone",
    # Drums (general) and keyboard
    "drums", "keyboard",
]

# Build a label -> text-embedding lookup using the CLAP processor and model
def get_text_embeddings(labels):
    """Return a dict mapping each label to its CLAP text features.

    Each label is tokenized on its own with the module-level ``processor`` and
    passed through ``model.get_text_features``; gradients are not tracked.
    An empty ``labels`` iterable yields an empty dict.
    """
    embeddings = {}
    with torch.no_grad():
        for label in labels:
            tokenized = processor(text=label, return_tensors="pt")
            embeddings[label] = model.get_text_features(**tokenized)
    return embeddings

# Generate one text embedding per instrument label (keyed by the label string)
instrument_embeddings = get_text_embeddings(instruments)

# Score one audio embedding against every label embedding
def compute_similarity(audio_embeds, text_embeds):
    """Return a dict mapping each label to its cosine similarity with the audio.

    ``text_embeds`` maps label -> embedding tensor. Every score is converted to
    a Python float with ``.item()``, so each audio/text comparison must produce
    exactly one value. Labels keep the iteration order of ``text_embeds``.
    """
    cosine = torch.nn.functional.cosine_similarity
    similarities = {}
    for label, text_embed in text_embeds.items():
        similarities[label] = cosine(audio_embeds, text_embed).item()
    return similarities

# Score every instrument label against the audio clip
instrument_results = compute_similarity(audio_embeddings, instrument_embeddings)

# Rank (label, score) pairs from highest to lowest similarity
sorted_instruments = sorted(
    instrument_results.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# With fewer than three scored labels only the single best match is reported;
# otherwise the top three are listed.
if len(sorted_instruments) < 3:
    predicted_instrument, predicted_score = sorted_instruments[0]
    print("\nPredicted Instrument:")
    print(f"Predicted Instrument: {predicted_instrument} with similarity score: {predicted_score:.2f}")
else:
    top_predictions = sorted_instruments[:3]
    print("\nTop 3 Predicted Instruments:")
    for instrument, score in top_predictions:
        print(f"{instrument}: {score:.2f}")




It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.



Top 3 Predicted Instruments:
drums: 0.27
acoustic guitar: 0.00
guitar: -0.01
