In [None]:
import json
import os

import librosa
import torch
from transformers import ClapModel, ClapProcessor

# Load the CLAP model and its matching processor from a single checkpoint id,
# so the two cannot accidentally be pointed at different checkpoints.
CLAP_CHECKPOINT = "laion/clap-htsat-fused"
model = ClapModel.from_pretrained(CLAP_CHECKPOINT)
processor = ClapProcessor.from_pretrained(CLAP_CHECKPOINT)

# Candidate mood prompts. Each entry is a full sentence that is embedded as
# text and compared against the audio embedding; the order matters because
# results are reported by index into this list.
moods = [
    "This audio sounds happy and cheerful.",
    "This audio sounds sad and melancholic.",
    "This audio is calm and relaxing.",
    "This audio is energetic and upbeat.",
    "This audio sounds angry and intense.",
    "This audio sounds fearful and suspenseful.",
    "This audio is joyful and uplifting.",
    "This audio sounds dark and ominous.",
    "This audio feels romantic and emotional.",
    "This audio is dreamy and atmospheric.",
    "This audio sounds tense and dramatic.",
    "This audio feels lonely and isolated.",
]

# Leading phrases used by the mood prompts; stripped to turn a prompt
# sentence into a short label.
_PROMPT_PREFIXES = ("This audio sounds", "This audio is", "This audio feels")


def _mood_label(prompt):
    """Return *prompt* without its leading "This audio ..." phrase."""
    for prefix in _PROMPT_PREFIXES:
        if prompt.startswith(prefix):
            return prompt[len(prefix):].strip()
    # Unknown phrasing: fall back to the whole prompt rather than guessing.
    return prompt.strip()


def extract_metadata(audio_path):
    """Score one audio file against the mood prompts and keep the best three.

    The file is loaded as 48 kHz mono, embedded with the module-level CLAP
    ``model``/``processor``, and compared by cosine similarity against the
    text embedding of every prompt in the module-level ``moods`` list.

    Args:
        audio_path: Path to the audio file (``str`` or path-like).

    Returns:
        A JSON-serializable dict of the form::

            {
                "audio_file": "<file name without directories>",
                "top_moods": [{"mood": "<label>", "score": <float>}, ...],
            }

        ``top_moods`` holds at most three entries, best match first, with
        scores rounded to four decimals.
    """
    sample_rate = 48000

    # Step 1: Load the audio, resampled to 48 kHz and mixed down to mono.
    waveform, _ = librosa.load(audio_path, sr=sample_rate, mono=True)

    # Inference only: skip autograd bookkeeping for both encoders.
    with torch.no_grad():
        # Step 2: Audio embedding.
        audio_inputs = processor(
            audios=waveform,
            sampling_rate=sample_rate,
            return_tensors="pt",
            padding=True,
        )
        audio_embedding = model.get_audio_features(**audio_inputs)

        # Step 3: Text embeddings for all prompts in a single batch.
        text_inputs = processor(text=moods, return_tensors="pt", padding=True)
        text_embeddings = model.get_text_features(**text_inputs)

    # Step 4: Cosine similarity of the audio embedding with each prompt
    # embedding; the single audio row broadcasts against the prompt rows.
    cosine_similarities = torch.nn.functional.cosine_similarity(
        audio_embedding, text_embeddings, dim=1
    ).tolist()

    # Step 5: Three best prompts. sorted() is stable, so tied scores keep
    # their order from ``moods``.
    ranked = sorted(
        enumerate(cosine_similarities),
        key=lambda pair: pair[1],
        reverse=True,
    )[:3]

    # Step 6: Prepare metadata as JSON-compatible data.
    return {
        "audio_file": os.path.basename(audio_path),  # File name only
        "top_moods": [
            {"mood": _mood_label(moods[index]), "score": round(score, 4)}
            for index, score in ranked
        ],
    }

# Example usage
audio_path = "/content/drive/MyDrive/dataset/processed/Beat Won_t Stop.wav"
metadata = extract_metadata(audio_path)

# Save metadata to a JSON file in the current working directory. The name is
# derived from the audio file so the two cannot drift apart when audio_path
# is changed.
audio_stem = os.path.splitext(os.path.basename(audio_path))[0]
output_file = f"{audio_stem}_metadata.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=4)

print(f"Metadata saved to {output_file}.")
print(json.dumps(metadata, indent=4))


Metadata saved to Beat Won_t Stop_metadata.json.
{
    "audio_file": "Beat Won_t Stop.wav",
    "top_moods": [
        {
            "mood": "dark and ominous.",
            "score": 0.4603
        },
        {
            "mood": "tense and dramatic.",
            "score": 0.3349
        },
        {
            "mood": "fearful and suspenseful.",
            "score": 0.3309
        }
    ]
}
