# EchoFind – Self-Supervised Audio Representation Learning
Impulse 2026 Submission (Cleaned Pipeline)


## Phase 1: Input Pipeline & Preprocessing (Steps 1–4)


In [3]:
# Sanity check: print the path of the Python interpreter backing this kernel.
import sys
print(sys.executable)


c:\Users\Sandeep kumar\impulse_env\Scripts\python.exe


In [5]:
!python -m pip install librosa numpy pandas scikit-learn matplotlib tqdm torch




In [8]:
# Core third-party and standard-library modules used throughout the notebook.
import librosa
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import os


# Confirmation message; reaching this line means every import above succeeded.
print("Libraries imported successfully")

Libraries imported successfully


In [9]:
# Dataset locations (absolute Windows-style paths on the H: drive).
SAFE_BASE = r"H:\My Drive\Impulse2026_SSL\data"
DATA_DIR = SAFE_BASE  # alias for the same base directory
# Audio root: the base path above with fma_small\fma_small appended, written out in full.
SAFE_AUDIO_PATH = r"H:\My Drive\Impulse2026_SSL\data\fma_small\fma_small"

# Quick check that the audio root is reachable, plus a peek at its first five entries.
print("Exists:", os.path.exists(SAFE_AUDIO_PATH))
print("Sample folders:", sorted(os.listdir(SAFE_AUDIO_PATH))[:5])


Exists: True
Sample folders: ['000', '001', '002', '003', '004']


In [10]:
# Second name for the audio root directory; it simply points at SAFE_AUDIO_PATH.
audio_root = SAFE_AUDIO_PATH
print("LOCKED audio root:", audio_root)


LOCKED audio root: H:\My Drive\Impulse2026_SSL\data\fma_small\fma_small


## Phase 2: Self-Supervised Representation Learning (Steps 5–12)


# **STEP 5 — MFCC feature extraction**

MFCCs (Mel-frequency cepstral coefficients) are extracted per audio clip using librosa. In this notebook they are computed inside the batch embedding pipeline (see `extract_embedding` in Step 7) rather than in a separate single-sample cell; this section and Step 6 describe the concept.

# **STEP 6 — Convert variable-length MFCC → fixed-size embedding**

MFCC features are computed as a time–frequency representation, where the number of time frames depends on the duration of the audio signal.
As a result, raw MFCC matrices have variable temporal length across different audio clips, which makes them unsuitable for direct use in neural networks that expect fixed-size inputs.

To address this, a statistical pooling strategy is applied:

- **Mean pooling** across the time axis captures the average spectral characteristics of the audio.
- **Standard-deviation pooling** across the time axis captures temporal variability and dynamics.

By concatenating the mean and standard-deviation vectors, each audio clip is transformed into a fixed-dimensional embedding independent of its duration (with 40 MFCC coefficients, this gives 2 × 40 = 80 values per clip).

# **STEP 7 — Apply this embedding extraction to MULTIPLE audio files**

In [11]:

import librosa
import numpy as np

def extract_embedding(audio_path, sr=22050, n_mfcc=40):
    """Turn one audio file into a fixed-length vector of MFCC statistics.

    The file is loaded with librosa at sample rate ``sr`` and ``n_mfcc``
    MFCC coefficients are requested. The resulting matrix is reduced along
    axis 1 twice: once with the mean and once with the standard deviation.

    Returns:
        1-D array holding the per-row means followed by the per-row
        standard deviations of the MFCC matrix.
    """
    waveform, _ = librosa.load(audio_path, sr=sr)
    coeffs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=n_mfcc)

    # Means first, standard deviations second; the order is part of the feature layout.
    pooled = (coeffs.mean(axis=1), coeffs.std(axis=1))
    return np.concatenate(pooled)

In [12]:
# Walk the two-level layout <root>/<folder>/<track>.mp3 and collect every track path.
# Both directory listings are sorted: os.listdir returns entries in arbitrary order,
# and later cells take positional slices of `audio_files`, so the order must be
# deterministic for those slices to select the same tracks on every run.
audio_files = []

for folder in sorted(os.listdir(SAFE_AUDIO_PATH)):
    folder_path = os.path.join(SAFE_AUDIO_PATH, folder)

    # Skip stray files that sit next to the numbered folders.
    if not os.path.isdir(folder_path):
        continue

    for file in sorted(os.listdir(folder_path)):
        # Case-insensitive match so ".MP3" files are not silently ignored.
        if file.lower().endswith(".mp3"):
            audio_files.append(os.path.join(folder_path, file))

print("Total audio files found:", len(audio_files))
# Guard the preview so an empty result reports None instead of raising IndexError.
print("Sample file:", audio_files[0] if audio_files else None)

Total audio files found: 8000
Sample file: H:\My Drive\Impulse2026_SSL\data\fma_small\fma_small\000\000002.mp3


In [17]:
# SUBSAMPLING FOR SPEED: keep only the first 799 collected paths so the
# feature-extraction cell below processes a subset rather than every file.

sample_files = audio_files[:799]

In [18]:
from tqdm import tqdm


In [19]:
# Run the MFCC extractor over every sampled file (with a progress bar) and
# stack the per-file vectors into one matrix, one row per file.
embeddings = [
    extract_embedding(path)
    for path in tqdm(sample_files, desc="Extracting embeddings", unit="file")
]

X = np.array(embeddings)

# Shape plus global mean/std of the raw (un-normalised) features.
print("Embedding matrix shape:", X.shape)
print("Mean:", X.mean())
print("Std:", X.std())


Extracting embeddings: 100%|██████████| 799/799 [07:18<00:00,  1.82file/s]

Embedding matrix shape: (799, 80)
Mean: 4.0233145
Std: 25.741259





# **STEP 8 — Normalize the embedding matrix**

In [20]:
from sklearn.preprocessing import StandardScaler
import numpy as np

embedding_matrix = X   # LOCK THIS

# Fit the per-column standardiser on the raw features, then apply it to the
# same matrix. The fitted `scaler` is kept because later cells reuse it.
scaler = StandardScaler().fit(embedding_matrix)
X_norm = scaler.transform(embedding_matrix)

print("Shape:", X_norm.shape)
print("Mean:", X_norm.mean())
print("Std:", X_norm.std())

Shape: (799, 80)
Mean: 9.548679e-09
Std: 1.0


# **STEP 9 — SSL AUGMENTATION**

In [21]:
import numpy as np

def augment_embedding(x, noise_std=0.1, drop_prob=0.07, rng=None):
    """Create a randomly perturbed copy of a feature array for contrastive training.

    Two corruptions are applied in order: additive zero-mean Gaussian noise,
    then random masking in which each element is independently set to zero
    with probability ``drop_prob``. The input is never modified.

    Args:
        x: Array-like of numbers. Floating-point (and complex) arrays keep
            their dtype; any other input, such as an integer array, is
            converted to float64 first, because adding float noise in place
            to an integer array raises a casting error.
        noise_std: Standard deviation of the Gaussian noise.
        drop_prob: Probability that an element is zeroed.
        rng: Optional ``numpy.random.Generator`` for reproducible draws. When
            omitted, the global ``np.random`` state is used, with the same
            draw order (noise first, mask second) as before.

    Returns:
        A new ndarray with the same shape as ``x``.
    """
    x_aug = np.array(x, copy=True)
    if not np.issubdtype(x_aug.dtype, np.inexact):
        x_aug = x_aug.astype(np.float64)

    if rng is None:
        noise = np.random.normal(0, noise_std, size=x_aug.shape)
        keep = np.random.rand(*x_aug.shape) > drop_prob
    else:
        noise = rng.normal(0, noise_std, size=x_aug.shape)
        keep = rng.random(x_aug.shape) > drop_prob

    x_aug += noise
    # Multiplying by the boolean mask zeroes the dropped positions.
    x_aug *= keep
    return x_aug


# **STEP 10 — Define the Neural Encoder (MLP)**

In [22]:
# Re-check which Python interpreter the kernel is using before importing torch.
import sys
print(sys.executable)


c:\Users\Sandeep kumar\impulse_env\Scripts\python.exe


In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F


In [26]:
class AudioEncoder(nn.Module):
    """Three-layer MLP whose outputs are L2-normalised along dim 1.

    Layout: Linear(input_dim, hidden_dim) -> ReLU ->
    Linear(hidden_dim, hidden_dim) -> ReLU -> Linear(hidden_dim, output_dim).
    """

    def __init__(self, input_dim=80, hidden_dim=128, output_dim=32):
        super().__init__()
        # The stack is stored under the attribute name `net`, in forward order,
        # so the state-dict keys are net.0.*, net.2.* and net.4.*.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch of shape (batch, input_dim) to unit-norm rows of size output_dim."""
        # Unit-length rows make a plain dot product equal to cosine similarity,
        # which the contrastive objective relies on.
        return F.normalize(self.net(x), dim=1)

In [27]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Encoder built with its default dimensions and moved onto the chosen device.
model = AudioEncoder().to(device)
print(model)

AudioEncoder(
  (net): Sequential(
    (0): Linear(in_features=80, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=32, bias=True)
  )
)


# **STEP 11 — Define the Contrastive Loss (SSL Learning Signal)**

In [28]:
def contrastive_loss(z1, z2, temperature=0.1):
    """Cross-entropy contrastive loss between two batches of embeddings.

    Every row of ``z1`` is scored against every row of ``z2`` by dot product,
    the scores are divided by ``temperature``, and cross-entropy is taken
    with row ``i`` of ``z2`` as the correct match for row ``i`` of ``z1``.

    z1, z2: (batch_size, embedding_dim)
    """
    # Entry (i, j) scores z1[i] against z2[j]; matching pairs lie on the diagonal.
    logits = (z1 @ z2.T) / temperature
    targets = torch.arange(z1.size(0), device=z1.device)
    return F.cross_entropy(logits, targets)

# **STEP 12 — Full SSL Training Loop**

In [29]:
from torch.utils.data import Dataset, DataLoader

class SSLAudioDataset(Dataset):
    """Dataset that yields two independently augmented views of each row of ``X``."""

    def __init__(self, X):
        # Kept as given; rows are fetched by index in __getitem__.
        self.X = X

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return a pair of float32 tensors, each a separately augmented copy of row ``idx``."""
        sample = self.X[idx]
        # Two separate calls give each view its own random noise and mask.
        views = [augment_embedding(sample) for _ in range(2)]
        first, second = (torch.tensor(view, dtype=torch.float32) for view in views)
        return (first, second)

In [30]:

# Wrap the normalised feature matrix so that each item yields two augmented views.
dataset = SSLAudioDataset(X_norm)
# Shuffled mini-batches of 32; drop_last=True discards a final partial batch,
# so every batch seen by the loss has exactly 32 rows.
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    drop_last=True
)

print("Batches per epoch:", len(dataloader))

Batches per epoch: 24


In [31]:
# Adam optimiser over all encoder parameters, learning rate 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [32]:
epochs = 15
model.train()  # training mode for the whole run

for epoch in range(epochs):
    batch_losses = []

    for view_a, view_b in dataloader:
        # Encode both augmented views with the same network.
        emb_a = model(view_a.to(device))
        emb_b = model(view_b.to(device))

        loss = contrastive_loss(emb_a, emb_b)

        # Standard step: clear old gradients, backpropagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Average over the number of batches in the loader.
    avg_loss = sum(batch_losses) / len(dataloader)
    print(f"Epoch [{epoch+1}/{epochs}] - Avg Loss: {avg_loss:.4f}")

Epoch [1/15] - Avg Loss: 0.2479
Epoch [2/15] - Avg Loss: 0.0865
Epoch [3/15] - Avg Loss: 0.0742
Epoch [4/15] - Avg Loss: 0.0592
Epoch [5/15] - Avg Loss: 0.0558
Epoch [6/15] - Avg Loss: 0.0629
Epoch [7/15] - Avg Loss: 0.0479
Epoch [8/15] - Avg Loss: 0.0452
Epoch [9/15] - Avg Loss: 0.0483
Epoch [10/15] - Avg Loss: 0.0420
Epoch [11/15] - Avg Loss: 0.0416
Epoch [12/15] - Avg Loss: 0.0383
Epoch [13/15] - Avg Loss: 0.0434
Epoch [14/15] - Avg Loss: 0.0364
Epoch [15/15] - Avg Loss: 0.0330


In [33]:
import os
# CHECKPOINT SAVE — trained encoder (path is environment-specific)
# CHECKPOINT 1 — end of Phase 2
# NOTE(review): this is a POSIX-style absolute path, whereas the data paths
# earlier in the notebook live on H:\ — confirm this is the intended location.
# The cell that reloads the encoder reads from this same path.
output_dir = "/content/drive/MyDrive/Impulse2026/weights"
os.makedirs(output_dir, exist_ok=True)  # create the folder (and parents) if missing

# Only the parameter tensors (state_dict) are written, not the module object.
torch.save(
    model.state_dict(),
    os.path.join(output_dir, "encoder_final.pth")
)



---



## Phase 3: Retrieval / Shazam Test (Steps 12.5–14)


# **STEP 12.5 — Phase 3: Retrieval (Shazam Test)**

In [34]:
# Load the trained encoder weights back into `model`.
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a GPU machine still loads on a CPU-only machine (and vice versa).
model.load_state_dict(torch.load(
    "/content/drive/MyDrive/Impulse2026/weights/encoder_final.pth",
    map_location=device
))
model.eval()

def get_audio_model_embedding(audio_path, model, scaler_obj, device_obj):
    """Compute the encoder embedding of a single audio file.

    Pipeline: MFCC statistics via ``extract_embedding`` -> standardisation
    with ``scaler_obj.transform`` -> forward pass through ``model`` on
    ``device_obj`` with gradients disabled.

    Side effect: ``model`` is switched to evaluation mode.

    Returns:
        Flattened 1-D NumPy array with the model output for this file.
    """
    # The scaler expects a 2-D input, so the feature vector becomes one row.
    features = extract_embedding(audio_path).reshape(1, -1)
    scaled = scaler_obj.transform(features)

    model.eval()
    with torch.no_grad():
        batch = torch.tensor(scaled, dtype=torch.float32).to(device_obj)
        encoded = model(batch).cpu().numpy().flatten()
    return encoded

In [38]:
# build database
# Maps each track's file name to the encoder embedding of that track.
model.eval()

audio_files_phase3 = audio_files[:700]

database_model_embeddings = {
    os.path.basename(path): get_audio_model_embedding(path, model, scaler, device)
    for path in tqdm(audio_files_phase3, desc="Building model embedding database")
}

database = database_model_embeddings
import os

# Persist the whole dictionary next to the other saved artefacts.
save_dir = r"H:/My Drive/Impulse2026_SSL/weights"
os.makedirs(save_dir, exist_ok=True)

torch.save(database, os.path.join(save_dir, "database_embeddings.pt"))



Building model embedding database: 100%|██████████| 700/700 [07:26<00:00,  1.57it/s]


In [39]:
# Retrieval function
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np

def predict_track(query_audio_path, model, database_embeddings_dict,
                  scaler_obj=None, device_obj=None):
    """Return the database track whose embedding is most similar to the query.

    The query file is embedded with ``get_audio_model_embedding`` and compared
    by cosine similarity against every stored embedding in one vectorised
    call, instead of one scikit-learn call per track.

    Args:
        query_audio_path: Path of the audio file to identify.
        model: Trained encoder passed through to ``get_audio_model_embedding``.
        database_embeddings_dict: Mapping of track id -> 1-D embedding array.
        scaler_obj: Optional fitted scaler; defaults to the notebook-level ``scaler``.
        device_obj: Optional torch device; defaults to the notebook-level ``device``.

    Returns:
        ``(best_id, best_score)``. For an empty database this is
        ``(None, -1.0)``. On ties the first track in the mapping's iteration
        order wins.
    """
    if not database_embeddings_dict:
        return None, -1.0

    # Fall back to the notebook globals so existing three-argument calls still work.
    scaler_obj = scaler if scaler_obj is None else scaler_obj
    device_obj = device if device_obj is None else device_obj

    query = get_audio_model_embedding(query_audio_path, model, scaler_obj, device_obj)

    track_ids = list(database_embeddings_dict)
    db_matrix = np.vstack([
        np.asarray(database_embeddings_dict[track_id]).reshape(1, -1)
        for track_id in track_ids
    ])

    scores = cosine_similarity(query.reshape(1, -1), db_matrix)[0]
    # argmax always selects a track, even when the best score is exactly -1.0;
    # a "-1.0 sentinel with strict >" comparison would return None in that case.
    best = int(np.argmax(scores))
    return track_ids[best], scores[best]

# **STEP 13 — Generate FINAL Learned Embeddings**

In [40]:
# Switch to evaluation mode and freeze every parameter, so no gradients are
# tracked for the encoder from here on.
model.eval()
for p in model.parameters():
    p.requires_grad = False


In [41]:
import numpy as np

# Encode the normalised feature matrix in chunks of 32 rows with gradients
# disabled, then stack the per-chunk outputs back into one matrix.
encoded_chunks = []

with torch.no_grad():
    for start in range(0, len(X_norm), 32):
        rows = torch.tensor(X_norm[start:start + 32], dtype=torch.float32).to(device)
        encoded_chunks.append(model(rows).cpu().numpy())

final_embeddings = np.vstack(encoded_chunks)

print("Final embedding matrix shape:", final_embeddings.shape)

Final embedding matrix shape: (799, 32)


# **STEP 14 — Output CSV (download)**

In [55]:
# File names of the tracks selected for the Phase 3 database.
audio_ids = [os.path.basename(p) for p in audio_files_phase3]
# Trim the embedding matrix to the same number of rows as audio_ids.
# NOTE(review): row i is paired with audio_ids[i] purely by position — this relies on
# final_embeddings having been computed from the same leading files in the same order; confirm.
final_embeddings = final_embeddings[:len(audio_ids)]


In [57]:

import pandas as pd
import os

# One row per track: the embedding values as columns, with the file name
# inserted as the first column.
df = pd.DataFrame(final_embeddings)
df.insert(0, "audio_id", audio_ids)
# index=False keeps the pandas row index out of the file.
df.to_csv("outputs.csv", index=False)
print("outputs.csv saved with shape:", df.shape)


outputs.csv saved with shape: (700, 33)


In [58]:
# DEBUG / PREVIEW — inspect saved CSV
# Re-reads the file from disk and displays its first five rows.

pd.read_csv("outputs.csv").head()

Unnamed: 0,audio_id,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
0,000002.mp3,-0.249946,0.132043,0.018047,-0.077988,0.193615,0.088332,-0.407737,-0.087728,0.091752,...,0.097703,0.07156,0.114588,-0.055781,-0.26922,0.191771,-0.094684,0.178524,-0.13639,-0.050486
1,000005.mp3,0.330494,0.138537,-0.091513,0.015584,0.15951,-0.074857,-0.215261,-0.326665,0.258441,...,0.155142,-0.102439,-0.397522,0.243152,-0.167941,-0.025723,0.069258,-0.024735,0.018322,-0.180069
2,000010.mp3,-0.032046,0.078719,-0.171027,0.016308,-0.055242,0.148234,-0.271889,0.058019,-0.155045,...,0.131858,0.011409,-0.455784,0.259496,-0.146869,0.145466,0.276061,0.004051,-0.088571,-0.169602
3,000140.mp3,-0.133465,0.104155,0.201438,-0.010338,0.147765,0.120849,0.115359,-0.056955,0.055758,...,0.202641,-0.280212,-0.0134,-0.075353,-0.076547,0.002127,0.124289,0.016724,0.285925,0.128968
4,000141.mp3,-0.071291,-0.151354,-0.089084,-0.290726,0.107025,-0.064288,-0.259864,0.326355,0.022939,...,0.216391,0.242005,-0.092364,0.298605,-0.110453,0.108763,0.228472,-0.106571,0.071141,-0.012153




---



## Phase 4: Semantic Evaluation (Linear Probe & Visualization)


# **Load embeddings**

In [59]:
import pandas as pd
import numpy as np

# Reload the exported table from disk.
df = pd.read_csv("outputs.csv")

# The first column holds the identifiers; all remaining columns hold the embedding values.
audio_ids = df.iloc[:, 0].values
embeddings = df.iloc[:, 1:].values

print("Total samples:", embeddings.shape[0])
print("Embedding dimension:", embeddings.shape[1])

Total samples: 700
Embedding dimension: 32


# **Cosine similarity retrieval**

In [60]:
# EXPLORATORY / DEMO — cosine similarity check (not formal evaluation)

from sklearn.metrics.pairwise import cosine_similarity

# pick a query
query_idx = 0
query_embedding = embeddings[query_idx].reshape(1, -1)

# compute similarity of the query against every stored embedding (itself included)
similarities = cosine_similarity(query_embedding, embeddings)[0]

# top-5 most similar (excluding itself)
# The query is removed by index rather than by assuming it is ranked first:
# a duplicate or tied embedding can push the query out of position 0, in which
# case slicing [1:6] would drop a real neighbour and keep the query.
ranking = similarities.argsort()[::-1]
top_k = ranking[ranking != query_idx][:5]

print("Query audio:", audio_ids[query_idx])
print("\nTop 5 similar audios:")
for idx in top_k:
    print(audio_ids[idx], "-> similarity:", round(similarities[idx], 3))

Query audio: 000002.mp3

Top 5 similar audios:
006448.mp3 -> similarity: 0.646
006802.mp3 -> similarity: 0.59
007376.mp3 -> similarity: 0.587
006440.mp3 -> similarity: 0.536
006609.mp3 -> similarity: 0.512




---



## Phase 5: Bonus / Extensions


# **GRANDMASTER EXTENSIONS**

## Qualitative Retrieval Demonstration

**Expected behavior**:
*   Similar audio tracks cluster closer
*   Noise-augmented versions remain nearby




## Stability & Robustness

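The model was trained using contrastive self-supervised learning
with additive Gaussian noise and random feature masking, both applied to the normalized MFCC-statistics embeddings (not to the raw waveforms).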

**This encourages invariance to:**

*   Background noise
*   Minor spectral corruption
*   Recording variations

