Audio and Text Embedding

In [None]:
!pip install laion-clap sentence-transformers torchaudio unidecode



In [None]:
import os
import json
import torch
import torchaudio
import numpy as np
import unicodedata
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

In [None]:
# Inputs: the folder that is scanned for .wav clips and the tab-separated
# transcript file (clip id, then text).
AUDIO_FOLDER = "/content/drive/MyDrive/MIR Project/Tamil/Tamil_fem_mono/Tamil_fem_audio"
TEXT_FILE = "/content/drive/MyDrive/MIR Project/Tamil/Tamil_fem_mono/Tamil_fem_mono.txt"

# Outputs: where the audio/text embedding matrices and the per-row metadata
# are written once every clip has been embedded.
OUTPUT_AUDIO_EMB = "/content/drive/MyDrive/MIR Project/Tamil/audio_embeddings.npy"
OUTPUT_TEXT_EMB = "/content/drive/MyDrive/MIR Project/Tamil/text_embeddings.npy"
OUTPUT_METADATA = "/content/drive/MyDrive/MIR Project/Tamil/metadata.json"


In [None]:
from transformers import ClapModel, ClapProcessor

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load CLAP (Hugging Face transformers implementation): the processor prepares
# model inputs, the model produces the embeddings.
clap_processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")
clap_model = ClapModel.from_pretrained("laion/clap-htsat-unfused").to(device)
clap_model.eval()  # inference mode

# Load LaBSE text encoder
labse = SentenceTransformer("sentence-transformers/LaBSE").to(device)
labse.eval()  # last expression of the cell, so its repr is displayed


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/615M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/614M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 768, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
  (3): Normalize()
)

In [None]:
def preprocess_audio(path, target_sr=48000):
    """Load an audio file as a mono, peak-normalised 1-D waveform.

    Args:
        path: Audio file location, passed straight to ``torchaudio.load``.
        target_sr: Sampling rate in Hz that the waveform is resampled to when
            the file's own rate differs.

    Returns:
        1-D tensor of samples at ``target_sr``. Non-silent audio is scaled so
        that its largest absolute sample is 1; silent or empty audio is
        returned unscaled.
    """
    waveform, sr = torchaudio.load(path)

    # Down-mix multi-channel audio by averaging the channels.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sr != target_sr:
        waveform = torchaudio.functional.resample(waveform, sr, target_sr)

    # Guard the peak normalisation: an all-zero clip has peak 0 and dividing
    # by it would turn the whole waveform into NaNs, and an empty clip has no
    # maximum at all.
    if waveform.numel() > 0:
        peak = waveform.abs().max()
        if peak > 0:
            waveform = waveform / peak

    return waveform.squeeze(0)


def clean_tamil_text(text):
    """Normalise a transcript string.

    The text is converted to Unicode NFC form, every run of whitespace is
    collapsed to a single space, and spaces plus common punctuation and quote
    characters are trimmed from both ends. Interior punctuation is kept.
    """
    edge_chars = " .,!?:;“”\"'"
    composed = unicodedata.normalize("NFC", text)
    single_spaced = " ".join(composed.split())
    return single_spaced.strip(edge_chars)


In [None]:
transcript_dict = {}

# Each useful line is "<clip id><TAB><transcript>". Blank lines and lines
# without a tab are ignored; columns after the second one are dropped.
with open(TEXT_FILE, "r", encoding="utf-8") as f:
    for raw_line in f:
        fields = raw_line.strip().split("\t")
        if len(fields) >= 2:
            clip_id = fields[0].strip()
            transcript_dict[clip_id] = clean_tamil_text(fields[1].strip())

print("Loaded transcripts:", len(transcript_dict))


Loaded transcripts: 5396


In [None]:
# Sorted list of the .wav file names directly inside AUDIO_FOLDER; this order
# becomes the row order of everything computed from the clips.
audio_files = sorted(name for name in os.listdir(AUDIO_FOLDER) if name.endswith(".wav"))
print("Total audio found:", len(audio_files))

Total audio found: 5396


In [None]:
# Embed every clip with CLAP (audio) and its transcript with LaBSE (text).
# The three lists grow together, so entry i of each one describes the same clip.
audio_embeddings = []
text_embeddings = []
metadata = []

for fname in tqdm(audio_files):

    # Drop only the trailing extension; str.replace would also delete a
    # ".wav" that happens to occur in the middle of a file name.
    file_id = os.path.splitext(fname)[0]

    # Clips without a transcript are reported and left out of all outputs.
    if file_id not in transcript_dict:
        print("Warning: No text for", file_id)
        continue

    text = transcript_dict[file_id]
    audio_path = os.path.join(AUDIO_FOLDER, fname)

    # --- Audio preprocessing ---
    waveform = preprocess_audio(audio_path)
    waveform = waveform.numpy()

    # --- CLAP Audio Embedding ---
    # 48000 matches the default target rate of preprocess_audio.
    inputs = clap_processor(audios=waveform, sampling_rate=48000, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        audio_emb = clap_model.get_audio_features(**inputs)
        # Drop the batch axis so one vector is stored per clip.
        audio_emb = audio_emb.cpu().numpy().squeeze()

    # --- LaBSE text embedding ---
    with torch.no_grad():
        text_emb = labse.encode([text], convert_to_tensor=True).cpu().numpy().squeeze()

    audio_embeddings.append(audio_emb)
    text_embeddings.append(text_emb)

    metadata.append({
        "id": file_id,
        "audio_file": fname,
        "text": text
    })


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  inputs = clap_processor(audios=waveform, sampling_rate=48000, return_tensors="pt")
100%|██████████| 5396/5396 [49:35<00:00,  1.81it/s]


In [None]:
# Stack the per-clip vectors into float32 matrices (one row per clip).
audio_embeddings = np.array(audio_embeddings, dtype=np.float32)
text_embeddings = np.array(text_embeddings, dtype=np.float32)

for out_path, matrix in ((OUTPUT_AUDIO_EMB, audio_embeddings), (OUTPUT_TEXT_EMB, text_embeddings)):
    np.save(out_path, matrix)

# ensure_ascii=False keeps the Tamil text readable in the JSON file.
with open(OUTPUT_METADATA, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

print("Saved:", OUTPUT_AUDIO_EMB, OUTPUT_TEXT_EMB, OUTPUT_METADATA, sep="\n")

Saved:
/content/drive/MyDrive/MIR Project/Tamil/audio_embeddings.npy
/content/drive/MyDrive/MIR Project/Tamil/text_embeddings.npy
/content/drive/MyDrive/MIR Project/Tamil/metadata.json


english dataset loading

In [None]:
import json
import numpy as np
import os

# -------------------------
# English dataset paths: precomputed audio embeddings, text embeddings and the
# metadata JSON that the analysis code in the next cell loads.
# -------------------------
EN_AUDIO_EMB = "/content/drive/MyDrive/MIR Project/English/clap_embeddings.npy"
EN_TEXT_EMB = "/content/drive/MyDrive/MIR Project/English/labse_embeddings.npy"
EN_METADATA = "/content/drive/MyDrive/MIR Project/English/dataset_metadata.json"


In [None]:
# -------------------------
# Load metadata
# -------------------------
def load_metadata(path):
    """Read a JSON metadata file and print a short summary of it.

    Args:
        path: Location of a UTF-8 JSON file.

    Returns:
        The parsed JSON content. The record count is printed, followed by the
        first record when there is one.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    print(f"Loaded metadata records: {len(data)}")
    # An empty file has no first record to preview; indexing it would raise.
    if data:
        print("Sample metadata entry:\n", json.dumps(data[0], indent=2))
    else:
        print("Sample metadata entry: <none - metadata is empty>")
    return data


# -------------------------
# Load embeddings
# -------------------------
def load_npy(path):
    """Load a ``.npy`` array and print a summary of its contents.

    Args:
        path: Location of the ``.npy`` file.

    Returns:
        The loaded array. Shape and dtype are always printed; min, max, mean
        and the NaN count are printed only for non-empty arrays.
    """
    arr = np.load(path)
    print(f"\nLoaded: {path}")
    print(f"Shape   : {arr.shape}")
    print(f"Dtype   : {arr.dtype}")
    # min()/max() raise on a zero-size array, so value statistics are only
    # reported when there is at least one element.
    if arr.size:
        print(f"Min     : {arr.min():.4f}")
        print(f"Max     : {arr.max():.4f}")
        print(f"Mean    : {arr.mean():.4f}")
        print(f"NaN?    : {np.isnan(arr).sum()} values")
    else:
        print("Min/Max/Mean: n/a (array is empty)")
    return arr


# -------------------------
# Check alignment
# -------------------------
def check_alignment(meta, audio_emb, text_emb):
    """Report whether metadata and the two embedding matrices have equal length.

    Args:
        meta: Sequence of metadata dicts; ``file_id`` and ``transcript`` are
            shown in the preview.
        audio_emb: Array whose first axis indexes samples.
        text_emb: Array whose first axis indexes samples.

    Only counts are compared. The preview of the first rows is printed so the
    pairing can be checked by eye.
    """

    print("\n=== Alignment Check ===")
    print(f"Metadata count     : {len(meta)}")
    print(f"Audio embeddings   : {audio_emb.shape[0]}")
    print(f"Text embeddings    : {text_emb.shape[0]}")

    if len(meta) != audio_emb.shape[0] or len(meta) != text_emb.shape[0]:
        print("❌ MISMATCH: Metadata and embeddings not aligned!")
    else:
        print("✅ OK: Metadata & embeddings are aligned index-to-index")

    # Preview at most three rows, limited to rows that exist in all three
    # inputs, so short or mismatched inputs do not raise IndexError.
    n_preview = min(3, len(meta), audio_emb.shape[0], text_emb.shape[0])

    # Example entries
    print("\nExample alignment preview:")
    for i in range(n_preview):
        print(f"\nIndex {i}:")
        print("File ID:", meta[i].get("file_id"))
        print("Transcript:", meta[i].get("transcript"))
        print("Audio Embedding snippet:", audio_emb[i][:5])
        print("Text Embedding snippet:", text_emb[i][:5])


# -------------------------
# Run analysis
# -------------------------

# Load the English metadata and both embedding matrices (each loader prints
# its own summary), then print the index-alignment report.
metadata = load_metadata(EN_METADATA)
audio_emb = load_npy(EN_AUDIO_EMB)
text_emb = load_npy(EN_TEXT_EMB)

check_alignment(metadata, audio_emb, text_emb)


Loaded metadata records: 2703
Sample metadata entry:
 {
  "file_id": "5694-64038-0008",
  "path": "dev-clean/LibriSpeech/dev-clean/5694/64038/5694-64038-0008.flac",
  "speaker_id": "5694",
  "chapter_id": "64038",
  "transcript": "A MAN IN THE WELL"
}

Loaded: /content/drive/MyDrive/MIR Project/English/clap_embeddings.npy
Shape   : (2703, 512)
Dtype   : float32
Min     : -0.1874
Max     : 0.1927
Mean    : -0.0013
NaN?    : 0 values

Loaded: /content/drive/MyDrive/MIR Project/English/labse_embeddings.npy
Shape   : (2703, 768)
Dtype   : float32
Min     : -0.4183
Max     : 0.4325
Mean    : 0.0060
NaN?    : 0 values

=== Alignment Check ===
Metadata count     : 2703
Audio embeddings   : 2703
Text embeddings    : 2703
✅ OK: Metadata & embeddings are aligned index-to-index

Example alignment preview:

Index 0:
File ID: 5694-64038-0008
Transcript: A MAN IN THE WELL
Audio Embedding snippet: [-0.05773152  0.01041359  0.01422     0.0091116  -0.0241992 ]
Text Embedding snippet: [ 0.00052354 -0.00

Tamil

In [None]:
import json
import numpy as np

# ----------------------------------------------------
# Tamil dataset paths: the audio embeddings, text embeddings and metadata JSON
# loaded by the analysis code at the end of this cell.
# ----------------------------------------------------
TA_AUDIO_EMB = "/content/drive/MyDrive/MIR Project/Tamil/audio_embeddings.npy"
TA_TEXT_EMB  = "/content/drive/MyDrive/MIR Project/Tamil/text_embeddings.npy"
TA_METADATA  = "/content/drive/MyDrive/MIR Project/Tamil/metadata.json"


# ----------------------------------------------------
# Load metadata
# ----------------------------------------------------
def load_metadata(path):
    """Read a JSON metadata file and print a short summary of it.

    Args:
        path: Location of a UTF-8 JSON file.

    Returns:
        The parsed JSON content. The record count is printed, followed by the
        first record (non-ASCII text shown as-is) when there is one.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    print(f"Loaded metadata records: {len(data)}")
    # An empty file has no first record to preview; indexing it would raise.
    if data:
        print("\nSample metadata entry:\n", json.dumps(data[0], indent=2, ensure_ascii=False))
    else:
        print("\nSample metadata entry: <none - metadata is empty>")
    return data


# ----------------------------------------------------
# Load embeddings
# ----------------------------------------------------
def load_npy(path):
    """Load a ``.npy`` array and print a summary of its contents.

    Args:
        path: Location of the ``.npy`` file.

    Returns:
        The loaded array. Shape and dtype are always printed; min, max, mean
        and the NaN count are printed only for non-empty arrays.
    """
    arr = np.load(path)
    print(f"\nLoaded: {path}")
    print(f"Shape   : {arr.shape}")
    print(f"Dtype   : {arr.dtype}")
    # min()/max() raise on a zero-size array, so value statistics are only
    # reported when there is at least one element.
    if arr.size:
        print(f"Min     : {arr.min():.4f}")
        print(f"Max     : {arr.max():.4f}")
        print(f"Mean    : {arr.mean():.4f}")
        print(f"NaN?    : {np.isnan(arr).sum()} values")
    else:
        print("Min/Max/Mean: n/a (array is empty)")
    return arr


# ----------------------------------------------------
# Check alignment between metadata & embeddings
# ----------------------------------------------------
def check_alignment(meta, audio_emb, text_emb):
    """Report whether metadata and the two embedding matrices have equal length.

    Args:
        meta: Sequence of metadata dicts; ``id``, ``audio_file`` and ``text``
            are shown in the preview.
        audio_emb: Array whose first axis indexes samples.
        text_emb: Array whose first axis indexes samples.

    Only counts are compared. The preview of the first rows is printed so the
    pairing can be checked by eye.
    """

    print("\n=== Alignment Check ===")
    print(f"Metadata count     : {len(meta)}")
    print(f"Audio embeddings   : {audio_emb.shape[0]}")
    print(f"Text embeddings    : {text_emb.shape[0]}")

    if len(meta) != audio_emb.shape[0] or len(meta) != text_emb.shape[0]:
        print(" MISMATCH: Metadata and embeddings not aligned!")
    else:
        print(" OK: Metadata & embeddings are aligned index-to-index")

    # Preview at most three rows, limited to rows that exist in all three
    # inputs, so short or mismatched inputs do not raise IndexError.
    n_preview = min(3, len(meta), audio_emb.shape[0], text_emb.shape[0])

    # Show a few records
    print("\nExample alignment preview:")
    for i in range(n_preview):
        print(f"\nIndex {i}:")
        print("ID:", meta[i].get("id"))
        print("Audio file:", meta[i].get("audio_file"))
        print("Transcript:", meta[i].get("text"))
        print("Audio Embedding snippet:", audio_emb[i][:5])
        print("Text Embedding snippet:", text_emb[i][:5])


# ----------------------------------------------------
# Run analysis
# ----------------------------------------------------
# Load the Tamil metadata and both embedding matrices (each loader prints its
# own summary), then print the index-alignment report. These assignments
# rebind the metadata / audio_emb / text_emb names to the Tamil data.
metadata = load_metadata(TA_METADATA)
audio_emb = load_npy(TA_AUDIO_EMB)
text_emb = load_npy(TA_TEXT_EMB)

check_alignment(metadata, audio_emb, text_emb)


Loaded metadata records: 5396

Sample metadata entry:
 {
  "id": "train_tamilfem_00001",
  "audio_file": "train_tamilfem_00001.wav",
  "text": "உனக்கு இன்னும் என்ன வேண்டும் ? பீட்டர்ஸ் பர்க்கில் சுதந்திர வாழ்க்கையை நீ கனவு கண்டாய்"
}

Loaded: /content/drive/MyDrive/MIR Project/Tamil/audio_embeddings.npy
Shape   : (5396, 512)
Dtype   : float32
Min     : -0.1836
Max     : 0.1733
Mean    : 0.0001
NaN?    : 0 values

Loaded: /content/drive/MyDrive/MIR Project/Tamil/text_embeddings.npy
Shape   : (5396, 768)
Dtype   : float32
Min     : -0.1981
Max     : 0.1210
Mean    : -0.0103
NaN?    : 0 values

=== Alignment Check ===
Metadata count     : 5396
Audio embeddings   : 5396
Text embeddings    : 5396
 OK: Metadata & embeddings are aligned index-to-index

Example alignment preview:

Index 0:
ID: train_tamilfem_00001
Audio file: train_tamilfem_00001.wav
Transcript: உனக்கு இன்னும் என்ன வேண்டும் ? பீட்டர்ஸ் பர்க்கில் சுதந்திர வாழ்க்கையை நீ கனவு கண்டாய்
Audio Embedding snippet: [0.01597428 0.0463142

MLP projection for tamil and english

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np


In [None]:
# Tamil pair (audio matrix, text matrix)
ta_audio, ta_text = (
    np.load("/content/drive/MyDrive/MIR Project/Tamil/audio_embeddings.npy"),
    np.load("/content/drive/MyDrive/MIR Project/Tamil/text_embeddings.npy"),
)

# English pair (audio matrix, text matrix)
en_audio, en_text = (
    np.load("/content/drive/MyDrive/MIR Project/English/clap_embeddings.npy"),
    np.load("/content/drive/MyDrive/MIR Project/English/labse_embeddings.npy"),
)

# One multilingual dataset: Tamil rows first, English rows after, in the same
# order for audio and text so that a row index addresses the same sample.
audio_emb = np.concatenate((ta_audio, en_audio))
text_emb = np.concatenate((ta_text, en_text))

print("Total combined samples:", len(audio_emb))


Total combined samples: 8099


In [None]:
#dataset class
class AudioTextDataset(Dataset):
    """Paired text/audio embeddings exposed as ``(text, audio)`` samples.

    Both inputs are copied into float32 tensors on construction. The dataset
    length is the number of audio rows.
    """

    def __init__(self, audio, text):
        as_float = torch.float32
        self.text = torch.tensor(text, dtype=as_float)
        self.audio = torch.tensor(audio, dtype=as_float)

    def __len__(self):
        n_audio_rows = len(self.audio)
        return n_audio_rows

    def __getitem__(self, idx):
        # Text comes first in the returned pair, audio second.
        text_vec = self.text[idx]
        audio_vec = self.audio[idx]
        return text_vec, audio_vec


In [None]:
#MLP Projection Model
class ProjectionMLP(nn.Module):
    """Two-layer MLP that maps ``in_dim`` vectors to unit-length ``out_dim`` vectors.

    Layout: Linear(in_dim, 1024) -> ReLU -> Linear(1024, out_dim), followed by
    L2 normalisation of the last axis. The layers live in ``self.net`` so the
    state-dict keys are ``net.0.*`` and ``net.2.*``.
    """

    def __init__(self, in_dim=768, out_dim=512):
        super().__init__()
        hidden_dim = 1024
        layers = [
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        projected = self.net(x)
        # Scale every output vector to unit L2 norm.
        return nn.functional.normalize(projected, p=2, dim=-1)


In [None]:
#contrastive Loss
def contrastive_loss(text_proj, audio_emb, temperature=0.07):
    """Symmetric cross-entropy (InfoNCE-style) loss over a batch of pairs.

    Args:
        text_proj: (B, D) projected text embeddings.
        audio_emb: (B, D) audio embeddings; row ``i`` is the positive match
            for ``text_proj[i]`` and every other row is a negative.
        temperature: Divisor applied to the similarity matrix before the
            softmax. With unit-length inputs the raw similarities lie in
            [-1, 1], which is too narrow for the softmax to separate the
            positive from the negatives; dividing by a small temperature
            widens that range. Pass ``1.0`` for unscaled similarities.

    Returns:
        Scalar tensor: the mean of the text-to-audio and audio-to-text
        cross-entropy losses.
    """
    logits = (text_proj @ audio_emb.T) / temperature   # (B, B)
    # The matching pair for row i sits on the diagonal.
    labels = torch.arange(logits.shape[0], device=logits.device)

    loss_t = nn.functional.cross_entropy(logits, labels)
    loss_a = nn.functional.cross_entropy(logits.T, labels)

    return (loss_t + loss_a) / 2


In [None]:
#training
from sklearn.model_selection import train_test_split

device = "cuda" if torch.cuda.is_available() else "cpu"

# Fit the projector on the training rows only. The indices are produced by the
# same two seeded train_test_split calls as the "split train val test" cell,
# so the validation and test rows used for evaluation are never trained on.
fit_idx = np.arange(len(audio_emb))
fit_idx = train_test_split(fit_idx, test_size=0.10, random_state=42)[0]  # remove test rows
fit_idx = train_test_split(fit_idx, test_size=0.10, random_state=42)[0]  # remove validation rows

# Seed torch so that weight initialisation and batch shuffling are repeatable.
torch.manual_seed(42)

dataset = AudioTextDataset(audio_emb[fit_idx], text_emb[fit_idx])
loader = DataLoader(dataset, batch_size=64, shuffle=True)

model = ProjectionMLP().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

epochs = 10

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for text_batch, audio_batch in loader:
        text_batch = text_batch.to(device)
        audio_batch = audio_batch.to(device)

        # Only the text side is learned: text goes through the projector,
        # audio embeddings are just L2-normalised.
        text_proj = model(text_batch)
        audio_proj = nn.functional.normalize(audio_batch, p=2, dim=-1)

        loss = contrastive_loss(text_proj, audio_proj)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean batch loss for the epoch.
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(loader):.4f}")

# Save projector
torch.save(model.state_dict(), "/content/drive/MyDrive/MIR Project/projection_model.pt")
print("Saved projection model as projection_model.pt")


Epoch 1/10 - Loss: 3.8464
Epoch 2/10 - Loss: 3.7847
Epoch 3/10 - Loss: 3.7712
Epoch 4/10 - Loss: 3.7648
Epoch 5/10 - Loss: 3.7604
Epoch 6/10 - Loss: 3.7553
Epoch 7/10 - Loss: 3.7527
Epoch 8/10 - Loss: 3.7482
Epoch 9/10 - Loss: 3.7442
Epoch 10/10 - Loss: 3.7390
Saved projection model as projection_model.pt


split train val test

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

N = len(audio_emb)
indices = np.arange(N)

# Two seeded splits: hold out 10% of all rows for test, then 10% of what
# remains for validation.
train_idx, test_idx = train_test_split(indices, test_size=0.10, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=0.10, random_state=42)

for split_label, split_idx in (("Train:", train_idx), ("Val  :", val_idx), ("Test :", test_idx)):
    print(split_label, len(split_idx))

# Create splits: audio and text are indexed with the same rows so pairs stay together.
audio_train, text_train = audio_emb[train_idx], text_emb[train_idx]
audio_val, text_val = audio_emb[val_idx], text_emb[val_idx]
audio_test, text_test = audio_emb[test_idx], text_emb[test_idx]


Train: 6560
Val  : 729
Test : 810


In [None]:
#evaluation metrics
import numpy as np

def recall_at_k(ranks, k):
    """Fraction of queries whose 0-based rank is strictly below ``k``."""
    hits = np.asarray(ranks) < k
    return np.mean(hits)

def mean_reciprocal_rank(ranks):
    """Mean of ``1 / (rank + 1)`` over all queries; ranks are 0-based."""
    reciprocals = []
    for zero_based_rank in ranks:
        reciprocals.append(1.0 / (zero_based_rank + 1))
    return np.mean(reciprocals)

def mean_average_precision(ranks):
    """Mean average precision when each query has exactly one relevant item.

    With a single relevant item the average precision of a query is
    ``1 / (rank + 1)`` for its 0-based rank, and this is averaged over queries.
    """
    precisions = []
    for zero_based_rank in ranks:
        precisions.append(1 / (zero_based_rank + 1))
    return np.mean(precisions)


In [None]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load projector model
model = ProjectionMLP().to(device)
# map_location lets a checkpoint that was saved from a GPU session be loaded
# on a CPU-only runtime (and the other way round).
model.load_state_dict(
    torch.load("/content/drive/MyDrive/MIR Project/projection_model.pt", map_location=device)
)
model.eval()

# Normalize audio embeddings once
audio_test_norm = audio_test / np.linalg.norm(audio_test, axis=1, keepdims=True)

ranks = []  # 0-based rank of the correct audio for each test query

for i in range(len(text_test)):

    t = torch.tensor(text_test[i], dtype=torch.float32).to(device)

    with torch.no_grad():
        t_proj = model(t.unsqueeze(0)).cpu().numpy()[0]

    t_proj = t_proj / np.linalg.norm(t_proj)

    # Compute similarity against all test audio (dot product of unit vectors)
    sims = audio_test_norm @ t_proj

    # Sort in descending order
    sorted_idx = np.argsort(-sims)

    # Find rank of the correct audio: text row i is paired with audio row i
    rank = np.where(sorted_idx == i)[0][0]
    ranks.append(rank)

# Compute metrics
print("Recall@1  :", recall_at_k(ranks, 1))
print("Recall@5  :", recall_at_k(ranks, 5))
print("Recall@10 :", recall_at_k(ranks, 10))
print("MRR       :", mean_reciprocal_rank(ranks))
print("mAP       :", mean_average_precision(ranks))


Recall@1  : 0.07901234567901234
Recall@5  : 0.24814814814814815
Recall@10 : 0.362962962962963
MRR       : 0.16735700707567136
mAP       : 0.16735700707567136


eval save to csv

In [None]:
# Tamil metadata records, as parsed from the JSON file.
ta_meta_path = "/content/drive/MyDrive/MIR Project/Tamil/metadata.json"
with open(ta_meta_path, "r", encoding="utf-8") as meta_file:
    ta_meta = json.loads(meta_file.read())


In [None]:
# English metadata records, as parsed from the JSON file.
en_meta_path = "/content/drive/MyDrive/MIR Project/English/dataset_metadata.json"
with open(en_meta_path, "r", encoding="utf-8") as meta_file:
    en_meta = json.loads(meta_file.read())


In [None]:
import numpy as np
import torch
import pandas as pd

# ---------------------------
# Load Projection Model
# ---------------------------
model = ProjectionMLP().to(device)
# map_location lets a checkpoint that was saved from a GPU session be loaded
# on a CPU-only runtime (and the other way round).
model.load_state_dict(
    torch.load("/content/drive/MyDrive/MIR Project/projection_model.pt", map_location=device)
)
model.eval()

# ---------------------------
# Normalize audio embeddings (each row scaled to unit L2 norm)
# ---------------------------
audio_test_norm = audio_test / np.linalg.norm(audio_test, axis=1, keepdims=True)

# ---------------------------
# Utility: Cosine similarity
# ---------------------------
def cos_sim(a, b):
    """Cosine similarity of two 1-D vectors.

    The dot product is divided by the product of the vector norms, so the
    inputs do not have to be unit length (for unit vectors the result equals
    the plain dot product). Returns 0.0 when either vector has zero norm.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

# ---------------------------
# Prepare CSV rows
# ---------------------------

# Unified metadata access
# Get text from Tamil ("text") or English ("transcript")
def get_text(meta):
    """Return the transcript stored in a metadata record.

    Tamil records keep it under "text" and English records under
    "transcript"; "text" wins when both are present. Records with neither key
    give an empty string.
    """
    for key in ("text", "transcript"):
        if key in meta:
            return meta[key]
    return ""

# Get audio filename from Tamil ("audio_file") or English ("path")
def get_audio_file(meta):
    """Return the audio file reference stored in a metadata record.

    Tamil records keep it under "audio_file" and English records under
    "path"; "audio_file" wins when both are present. Records with neither key
    give an empty string.
    """
    for key in ("audio_file", "path"):
        if key in meta:
            return meta[key]
    return ""

# The combined embedding matrices hold the Tamil rows first and the English
# rows after them, so the metadata is joined in that order. test_idx indexes
# the combined matrices and can therefore index this list as well.
metadata_all = ta_meta + en_meta
assert len(metadata_all) == len(audio_emb), "metadata and embeddings are not index-aligned"

rows = []

for i in range(len(text_test)):
    # Project text -> CLAP space
    t = torch.tensor(text_test[i], dtype=torch.float32).to(device)
    with torch.no_grad():
        t_proj = model(t.unsqueeze(0)).cpu().numpy()[0]

    t_proj = t_proj / np.linalg.norm(t_proj)

    # Compute similarity with all test audio
    sims = audio_test_norm @ t_proj

    # Sort all scores (best match first)
    sorted_idx = np.argsort(-sims)

    # Ground-truth rank (0-based): text row i is paired with audio row i
    rank = int(np.where(sorted_idx == i)[0][0])

    # Top-5 retrieval
    top5 = sorted_idx[:5]
    top5_scores = sims[top5]

    # Retrieve metadata: positions inside the test split are mapped back to
    # rows of the combined dataset through test_idx.
    global_idx = test_idx[i]
    ref_meta = metadata_all[global_idx]

    top1_global = test_idx[top5[0]]
    top1_meta = metadata_all[top1_global]

    # Save a row
    rows.append({
        "query_text": get_text(ref_meta),
        "reference_audio": get_audio_file(ref_meta),

        "top1_audio": get_audio_file(top1_meta),
        "top1_score": float(top5_scores[0]),

        "rank_of_correct_audio": rank,

        "top5_audio_list": [get_audio_file(metadata_all[test_idx[x]]) for x in top5],
        "top5_scores_list": [float(s) for s in top5_scores]
    })


# ---------------------------
# Save to CSV
# ---------------------------
# One row per test query; the "utf-8-sig" encoding writes a byte-order mark at
# the start of the file.
df = pd.DataFrame(rows)
df.to_csv("/content/drive/MyDrive/MIR Project/retrieval_results.csv", index=False, encoding="utf-8-sig")

print("Saved retrieval_results.csv")


Saved retrieval_results.csv


CLAP Embedding

In [None]:
from transformers import ClapProcessor, ClapModel
import torch
import numpy as np
import json

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# CLAP processor and model from the same checkpoint; the model is moved to the
# selected device and put in inference mode.
clap_processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")
clap_model = ClapModel.from_pretrained("laion/clap-htsat-unfused").to(device)
clap_model.eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/615M [00:00<?, ?B/s]

ClapModel(
  (text_model): ClapTextModel(
    (embeddings): ClapTextEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ClapTextEncoder(
      (layer): ModuleList(
        (0-11): 12 x ClapTextLayer(
          (attention): ClapTextAttention(
            (self): ClapTextSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ClapTextSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm):

In [None]:
TEXT_FILE = "/content/drive/MyDrive/MIR Project/Tamil/Tamil_fem_mono/Tamil_fem_mono.txt"

# NOTE(review): entries keep the line order and raw text of the transcript
# file. The embeddings built from them are later paired row-by-row with
# audio_embeddings.npy, which was written in sorted-file-name order - confirm
# that the transcript file is sorted the same way.
tamil_meta = []
skipped_lines = 0

with open(TEXT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue

        # A line without a tab has no transcript column. Skip and count it
        # (the transcript loader used for the audio embeddings also skips
        # such lines) instead of failing on tuple unpacking.
        key, sep, text = line.partition("\t")
        if not sep:
            skipped_lines += 1
            continue

        tamil_meta.append({"id": key, "text": text})

if skipped_lines:
    print("Warning: skipped", skipped_lines, "line(s) without a tab-separated transcript")


In [None]:
batch_size = 16
all_texts = [entry["text"] for entry in tamil_meta]

# Embed the transcripts with the CLAP text encoder, batch_size at a time, and
# collect one NumPy block per batch.
batch_blocks = []
for start in range(0, len(all_texts), batch_size):
    chunk = all_texts[start:start + batch_size]
    encoded = clap_processor(text=chunk, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
        features = clap_model.get_text_features(**encoded)
    batch_blocks.append(features.cpu().numpy())

# Final embedding matrix: the batch blocks stacked in input order.
text_embeds = np.vstack(batch_blocks)
print("CLAP Tamil text embeddings shape:", text_embeds.shape)


CLAP Tamil text embeddings shape: (5396, 512)


In [None]:
# Persist the CLAP text embedding matrix computed in the previous cell.
OUTPUT_PATH = "/content/drive/MyDrive/MIR Project/Tamil/clap_text_embeddings.npy"
np.save(OUTPUT_PATH, text_embeds)

print("Saved:", OUTPUT_PATH)

Saved: /content/drive/MyDrive/MIR Project/Tamil/clap_text_embeddings.npy


English text CLAP embedding

In [None]:
from transformers import ClapProcessor, ClapModel
import torch
import numpy as np
import json

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# CLAP processor and model from the same checkpoint; the model is moved to the
# selected device and put in inference mode.
clap_processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")
clap_model = ClapModel.from_pretrained("laion/clap-htsat-unfused").to(device)
clap_model.eval()


ClapModel(
  (text_model): ClapTextModel(
    (embeddings): ClapTextEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ClapTextEncoder(
      (layer): ModuleList(
        (0-11): 12 x ClapTextLayer(
          (attention): ClapTextAttention(
            (self): ClapTextSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ClapTextSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm):

In [None]:
EN_METADATA = "/content/drive/MyDrive/MIR Project/English/dataset_metadata.json"

# Parse the English metadata JSON and report how many records it holds.
with open(EN_METADATA, "r", encoding="utf-8") as meta_file:
    en_meta = json.loads(meta_file.read())

print("Loaded English metadata:", len(en_meta))


Loaded English metadata: 2703


In [None]:
# Transcripts in metadata order; the first one is printed as a spot check.
english_texts = []
for record in en_meta:
    english_texts.append(record["transcript"])
print("Example:", english_texts[0])

Example: A MAN IN THE WELL


In [None]:
batch_size = 16
processor_kwargs = dict(return_tensors="pt", padding=True, truncation=True)

# Embed the English transcripts with the CLAP text encoder, batch_size at a
# time, and collect one NumPy block per batch.
batch_blocks = []
for offset in range(0, len(english_texts), batch_size):
    # Tokenize English text
    encoded = clap_processor(text=english_texts[offset:offset + batch_size], **processor_kwargs).to(device)

    # CLAP text features
    with torch.no_grad():
        features = clap_model.get_text_features(**encoded)
    batch_blocks.append(features.cpu().numpy())

# Final matrix: the batch blocks stacked in input order.
text_embeddings = np.vstack(batch_blocks)
print("English CLAP text embeddings shape:", text_embeddings.shape)


English CLAP text embeddings shape: (2703, 512)


In [None]:
# Persist the English CLAP text embedding matrix and report where it went.
OUTPUT_EN_TEXT = "/content/drive/MyDrive/MIR Project/English/clap_text_embeddings.npy"
np.save(OUTPUT_EN_TEXT, text_embeddings)

print("Saved English CLAP text embeddings to:", OUTPUT_EN_TEXT, sep="\n")


Saved English CLAP text embeddings to:
/content/drive/MyDrive/MIR Project/English/clap_text_embeddings.npy


Faiss indices

In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.0


In [None]:
# Step 1: Build per-language FAISS indices
import os
import json
import numpy as np
import faiss

# ====== Adjust paths below if needed ======
# Per-language inputs: audio embeddings, CLAP text embeddings and metadata.
TAMIL_AUDIO_EMB = "/content/drive/MyDrive/MIR Project/Tamil/audio_embeddings.npy"        # CLAP audio (512)
TAMIL_TEXT_EMB  = "/content/drive/MyDrive/MIR Project/Tamil/clap_text_embeddings.npy"   # CLAP text (512)
TAMIL_META      = "/content/drive/MyDrive/MIR Project/Tamil/metadata.json"

EN_AUDIO_EMB    = "/content/drive/MyDrive/MIR Project/English/clap_embeddings.npy"     # CLAP audio (512)
EN_TEXT_EMB     = "/content/drive/MyDrive/MIR Project/English/clap_text_embeddings.npy" # CLAP text (512)
EN_META         = "/content/drive/MyDrive/MIR Project/English/dataset_metadata.json"

# Folder that receives the normalized embeddings and the FAISS index files;
# it is created when missing.
OUT_DIR = "/content/drive/MyDrive/MIR Project/indices"
os.makedirs(OUT_DIR, exist_ok=True)

# ---- helper to load and validate ----
def load_npy(path, name):
    """Load a ``.npy`` array, print its shape and dtype, and return it as float32.

    ``name`` is only used as a label in the printed line, which reports the
    dtype as stored on disk (before any conversion).
    """
    loaded = np.load(path)
    print(f"Loaded {name}: {path} -> shape {loaded.shape} dtype {loaded.dtype}")
    # copy=False hands back the very same array when it is already float32.
    return loaded.astype(np.float32, copy=False)

def load_meta(path, name):
    """Parse the JSON metadata at ``path``, print its entry count, and return it.

    ``name`` is only used as a label in the printed line.
    """
    with open(path, "r", encoding="utf-8") as handle:
        entries = json.loads(handle.read())
    print(f"Loaded {name} metadata: {len(entries)} entries")
    return entries

# ---- Load files ----
t_audio = load_npy(TAMIL_AUDIO_EMB, "Tamil audio")
t_text  = load_npy(TAMIL_TEXT_EMB,  "Tamil text")
t_meta  = load_meta(TAMIL_META, "Tamil")

e_audio = load_npy(EN_AUDIO_EMB, "English audio")
e_text  = load_npy(EN_TEXT_EMB,  "English text")
e_meta  = load_meta(EN_META, "English")

# Basic sanity checks: every embedding matrix must be 2-D with 512 columns.
for checked_arr, failure_msg in (
    (t_audio, "Tamil audio emb shape unexpected"),
    (t_text, "Tamil text emb shape unexpected"),
    (e_audio, "English audio emb shape unexpected"),
    (e_text, "English text emb shape unexpected"),
):
    assert checked_arr.ndim == 2 and checked_arr.shape[1] == 512, failure_msg

# Ensure metadata counts match embeddings. A mismatch only prints a warning;
# execution continues.
for lang_label, lang_meta, lang_audio, lang_text in (
    ("Tamil", t_meta, t_audio, t_text),
    ("English", e_meta, e_audio, e_text),
):
    if not (len(lang_meta) == lang_audio.shape[0] == lang_text.shape[0]):
        print(f"WARNING: {lang_label} counts mismatch (meta vs embeddings). Check ordering.")

# ---- Normalize embeddings for cosine (L2) ----
def l2_normalize(a):
    """Scale every row of a 2-D array to unit L2 norm.

    Rows whose norm is exactly zero are divided by 1 instead, so they stay
    all-zero rather than becoming NaN.
    """
    row_norms = np.linalg.norm(a, axis=1, keepdims=True)
    # Adding the boolean zero-mask turns exactly the zero norms into 1 and
    # leaves every other norm unchanged.
    return a / (row_norms + (row_norms == 0))

# Row-normalize all four embedding matrices.
t_audio_norm = l2_normalize(t_audio.astype(np.float32))
t_text_norm  = l2_normalize(t_text.astype(np.float32))
e_audio_norm = l2_normalize(e_audio.astype(np.float32))
e_text_norm  = l2_normalize(e_text.astype(np.float32))

# Save the normalized audio and text embeddings (useful for later diagnostics)
np.save(os.path.join(OUT_DIR, "tamil_audio_norm.npy"), t_audio_norm)
np.save(os.path.join(OUT_DIR, "tamil_text_norm.npy"),  t_text_norm)
np.save(os.path.join(OUT_DIR, "english_audio_norm.npy"), e_audio_norm)
np.save(os.path.join(OUT_DIR, "english_text_norm.npy"),  e_text_norm)
print("Saved normalized embeddings to", OUT_DIR)

# ---- Build FAISS IndexFlatIP (cosine via normalized inner product) ----
# Only the audio embeddings are added to the indices; the text embeddings are
# used as queries below.
d = 512
t_index = faiss.IndexFlatIP(d)
t_index.add(t_audio_norm.astype(np.float32))
print("Tamil FAISS index built: ntotal =", t_index.ntotal)

e_index = faiss.IndexFlatIP(d)
e_index.add(e_audio_norm.astype(np.float32))
print("English FAISS index built: ntotal =", e_index.ntotal)

# ---- Save FAISS indices to disk ----
faiss.write_index(t_index, os.path.join(OUT_DIR, "tamil.index"))
faiss.write_index(e_index, os.path.join(OUT_DIR, "english.index"))
print("Saved FAISS indices to", OUT_DIR)

# ---- Quick test: search first text against its language audio (sanity) ----
# D receives the scores and I the row indices of the k best audio matches.
k = 5
# Tamil sample 0
D, I = t_index.search(t_text_norm[0:1].astype(np.float32), k)
print("Tamil sample0 topk indices:", I[0], "scores:", D[0])
# English sample 0
D, I = e_index.search(e_text_norm[0:1].astype(np.float32), k)
print("English sample0 topk indices:", I[0], "scores:", D[0])

print("STEP 1 complete — FAISS indices built and saved.")


Loaded Tamil audio: /content/drive/MyDrive/MIR Project/Tamil/audio_embeddings.npy -> shape (5396, 512) dtype float32
Loaded Tamil text: /content/drive/MyDrive/MIR Project/Tamil/clap_text_embeddings.npy -> shape (5396, 512) dtype float32
Loaded Tamil metadata: 5396 entries
Loaded English audio: /content/drive/MyDrive/MIR Project/English/clap_embeddings.npy -> shape (2703, 512) dtype float32
Loaded English text: /content/drive/MyDrive/MIR Project/English/clap_text_embeddings.npy -> shape (2703, 512) dtype float32
Loaded English metadata: 2703 entries
Saved normalized embeddings to /content/drive/MyDrive/MIR Project/indices
Tamil FAISS index built: ntotal = 5396
English FAISS index built: ntotal = 2703
Saved FAISS indices to /content/drive/MyDrive/MIR Project/indices
Tamil sample0 topk indices: [ 461 5246 4439 5135 3290] scores: [0.15353173 0.15130688 0.15088314 0.14984086 0.14872383]
English sample0 topk indices: [2555 2554  980 2582  970] scores: [0.07589738 0.06528402 0.05716542 0.0473

In [None]:
import numpy as np
import faiss
import json
import os

# Load normalized embeddings saved earlier
t_audio = np.load("/content/drive/MyDrive/MIR Project/indices/tamil_audio_norm.npy")
e_audio = np.load("/content/drive/MyDrive/MIR Project/indices/english_audio_norm.npy")

# Both languages must share one embedding width to live in a single index;
# fail here with a readable message rather than inside np.vstack / FAISS.
assert t_audio.ndim == 2 and e_audio.ndim == 2, "expected 2-D embedding matrices"
assert t_audio.shape[1] == e_audio.shape[1], (
    f"dimension mismatch: Tamil {t_audio.shape[1]} vs English {e_audio.shape[1]}"
)

# Combine into ONE space (Tamil rows first, then English rows)
all_audio = np.vstack([t_audio, e_audio]).astype(np.float32)

print("Unified audio shape:", all_audio.shape)

# Build FAISS index in the SAME CLAP space.
# The width comes from the stacked matrix instead of a hard-coded 512.
d = all_audio.shape[1]
index_all = faiss.IndexFlatIP(d)
index_all.add(all_audio)

print("Unified FAISS index entries:", index_all.ntotal)

# Save it
OUT = "/content/drive/MyDrive/MIR Project/indices/unified_audio.index"
faiss.write_index(index_all, OUT)
print("Saved unified FAISS index:", OUT)


Unified audio shape: (8099, 512)
Unified FAISS index entries: 8099
Saved unified FAISS index: /content/drive/MyDrive/MIR Project/indices/unified_audio.index


In [None]:
#unified metadata
import json
import numpy as np
import os

# Paths (adjust if needed)
T_AUDIO = "/content/drive/MyDrive/MIR Project/indices/tamil_audio_norm.npy"
E_AUDIO = "/content/drive/MyDrive/MIR Project/indices/english_audio_norm.npy"

T_META  = "/content/drive/MyDrive/MIR Project/Tamil/metadata.json"
E_META  = "/content/drive/MyDrive/MIR Project/English/dataset_metadata.json"

OUT_DIR = "/content/drive/MyDrive/MIR Project/unified"
os.makedirs(OUT_DIR, exist_ok=True)

# ---- Load everything ----
t_audio = np.load(T_AUDIO)
e_audio = np.load(E_AUDIO)

with open(T_META, "r", encoding="utf-8") as f:
    t_meta = json.load(f)

with open(E_META, "r", encoding="utf-8") as f:
    e_meta = json.load(f)

print("Tamil audio:", t_audio.shape)
print("English audio:", e_audio.shape)
print("Tamil metadata:", len(t_meta))
print("English metadata:", len(e_meta))

# ---- Guard the row <-> metadata alignment ----
# The unified metadata list and the unified matrix are built by concatenating
# in the same order (Tamil, then English), so entry i describes row i only if
# every language has exactly one metadata entry per embedding row.
assert t_audio.shape[0] == len(t_meta), (
    f"Tamil: {t_audio.shape[0]} embeddings vs {len(t_meta)} metadata entries"
)
assert e_audio.shape[0] == len(e_meta), (
    f"English: {e_audio.shape[0]} embeddings vs {len(e_meta)} metadata entries"
)

# ---- Add 'language' flag for each entry ----
# Entries are updated in place; each one must support item assignment.
for m in t_meta:
    m["language"] = "tamil"

for m in e_meta:
    m["language"] = "english"

# ---- Create unified metadata list ----
unified_meta = t_meta + e_meta
print("Unified metadata count:", len(unified_meta))

# ---- Create unified audio embedding matrix ----
unified_audio = np.vstack([t_audio, e_audio]).astype(np.float32)
print("Unified audio shape:", unified_audio.shape)

# Final check before anything is written to disk.
assert unified_audio.shape[0] == len(unified_meta), (
    "unified embeddings and unified metadata are not the same length"
)

# ---- Save them ----
UNIFIED_AUDIO_PATH = os.path.join(OUT_DIR, "unified_audio_embeddings.npy")
UNIFIED_META_PATH  = os.path.join(OUT_DIR, "unified_metadata.json")

np.save(UNIFIED_AUDIO_PATH, unified_audio)

# ensure_ascii=False writes non-ASCII text to the file as-is instead of \u escapes.
with open(UNIFIED_META_PATH, "w", encoding="utf-8") as f:
    json.dump(unified_meta, f, indent=2, ensure_ascii=False)

print("Saved unified_audio_embeddings.npy")
print("Saved unified_metadata.json")


Tamil audio: (5396, 512)
English audio: (2703, 512)
Tamil metadata: 5396
English metadata: 2703
Unified metadata count: 8099
Unified audio shape: (8099, 512)
Saved unified_audio_embeddings.npy
Saved unified_metadata.json


In [None]:
import numpy as np
import faiss
import json
import os

# ----- Input / output locations -----
UNIFIED_EMB = "/content/drive/MyDrive/MIR Project/unified/unified_audio_embeddings.npy"
UNIFIED_META = "/content/drive/MyDrive/MIR Project/unified/unified_metadata.json"
OUT_INDEX = "/content/drive/MyDrive/MIR Project/unified/unified_audio.index"

# ----- Read the stacked embedding matrix as float32 -----
audio_emb = np.load(UNIFIED_EMB).astype(np.float32)
print("Unified audio embeddings:", audio_emb.shape)

# ----- Scale every row to unit L2 length -----
norm = np.linalg.norm(audio_emb, axis=1, keepdims=True)
# Rows whose norm is exactly zero get a divisor of 1 so they stay all-zero
# instead of turning into NaN.
np.putmask(norm, norm == 0, 1.0)
audio_emb_norm = np.divide(audio_emb, norm)

# ----- Create the index and fill it -----
# Inner product on unit-length rows is the cosine similarity.
d = audio_emb_norm.shape[-1]  # embedding width, read from the data
index = faiss.IndexFlatIP(d)
index.add(audio_emb_norm)

print("FAISS index created.")
print("Index size:", index.ntotal)

# ----- Persist the index -----
faiss.write_index(index, OUT_INDEX)
print("Saved unified FAISS index to:", OUT_INDEX)

# ----- Optional smoke test: query with the first stored vector -----
print("\nTesting retrieval for unified index (first audio as query):")
D, I = index.search(audio_emb_norm[:1], 5)
print("Top-5 indices:", I[0])
print("Scores:", D[0])


Unified audio embeddings: (8099, 512)
FAISS index created.
Index size: 8099
Saved unified FAISS index to: /content/drive/MyDrive/MIR Project/unified/unified_audio.index

Testing retrieval for unified index (first audio as query):
Top-5 indices: [   0 5170 3511 3348 2127]
Scores: [1.0000001  0.9661641  0.9656447  0.9656324  0.96427983]


In [None]:
import numpy as np
import json
import faiss
from transformers import ClapProcessor, ClapModel
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# ---- Paths ----
UNIFIED_INDEX = "/content/drive/MyDrive/MIR Project/unified/unified_audio.index"
UNIFIED_EMB = "/content/drive/MyDrive/MIR Project/unified/unified_audio_embeddings.npy"
UNIFIED_META = "/content/drive/MyDrive/MIR Project/unified/unified_metadata.json"

# ---- Load FAISS index ----
index = faiss.read_index(UNIFIED_INDEX)

# ---- Load audio embeddings ----
# NOTE(review): this cell does not re-normalize the matrix; it is unit-length
# only if the .npy file was saved from already-normalized arrays — confirm.
audio_emb = np.load(UNIFIED_EMB).astype(np.float32)

# ---- Load metadata ----
with open(UNIFIED_META, "r", encoding="utf-8") as f:
    metadata = json.load(f)

# ---- Check the three artifacts describe the same rows ----
# The index, the matrix and the metadata are three separate files; an id
# returned by index.search() is only meaningful as a position in `metadata`
# and `audio_emb` if all three have the same number of entries.
assert index.ntotal == audio_emb.shape[0] == len(metadata), (
    f"misaligned artifacts: index={index.ntotal}, "
    f"embeddings={audio_emb.shape[0]}, metadata={len(metadata)}"
)
assert index.d == audio_emb.shape[1], (
    f"index dimension {index.d} != embedding dimension {audio_emb.shape[1]}"
)

print("Unified index entries:", index.ntotal)


Unified index entries: 8099
