In [16]:
import kagglehub
import librosa
import numpy as np
import pandas as pd
import os
from numpy.linalg import norm

In [2]:
# This is 30gb, its slow
path = kagglehub.dataset_download("imsparsh/fma-free-music-archive-small-medium")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/imsparsh/fma-free-music-archive-small-medium?dataset_version_number=1...


100%|██████████| 29.8G/29.8G [05:40<00:00, 93.9MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/imsparsh/fma-free-music-archive-small-medium/versions/1


In [8]:
AUDIO_DIR = os.path.join(path, "fma_small/fma_small")
SAVE_PATH = "audio_embeddings.pkl"
SR = 48000 # HZ
DURATION = 10 # seconds

In [9]:
# Count total files and subdirs
n_files = 0
n_dirs = 0
for root, dirs, files in os.walk(AUDIO_DIR):
    n_files += len([f for f in files if f.endswith(".mp3")])
    n_dirs += len(dirs)
print(f"Total MP3 files: {n_files}")
print(f"Total subdirs: {n_dirs}")

Total MP3 files: 8000
Total subdirs: 156


In [14]:
def load_and_preprocess_audio(filepath, sr=48000, duration=10):
    y, _ = librosa.load(filepath, sr=sr, mono=True)
    target_len = sr * duration
    if len(y) < target_len:
        # Pad with zeros if too short
        y = np.pad(y, (0, target_len - len(y)))
    elif len(y) > target_len:
        # Trim if too long
        y = y[:target_len]
    return y

In [15]:
def get_fake_embedding(audio):
    # Simulate real encoder work
    return np.random.rand(512)

In [None]:
# Untested

embeddings = []
track_ids = []

for root, _, files in os.walk(AUDIO_DIR):
    for fname in files:
        if fname.endswith(".mp3"):
            filepath = os.path.join(root, fname)
            track_id = int(os.path.splitext(fname)[0])
            audio = load_and_preprocess_audio(filepath)
            emb = get_fake_embedding(audio)
            embeddings.append(emb)
            track_ids.append(track_id)

In [23]:
# Untested
# Save as DataFrame
embeddings_df = pd.DataFrame(embeddings, index=track_ids)
embeddings_df.to_pickle(SAVE_PATH)
print(f"Saved {len(embeddings_df)} embeddings to {SAVE_PATH}")

Saved 10 embeddings to audio_embeddings.pkl


In [None]:
from google.colab import files
files.download(SAVE_PATH)

In [17]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (norm(a) * norm(b))

def find_best_match(query_emb, embeddings_df, top_k=5):
    scores = embeddings_df.apply(lambda row: cosine_similarity(query_emb, row.values), axis=1)
    return scores.nlargest(top_k)

In [20]:
######## TEST RUN
# Create 10 random embeddings
embeddings = [np.random.rand(512) for _ in range(10)]
track_ids = [f"track_{i}" for i in range(10)]
embeddings_df = pd.DataFrame(embeddings, index=track_ids)

query_embedding = embeddings_df.iloc[3].values

top_matches = find_best_match(query_embedding, embeddings_df, top_k=5)

print(top_matches)

track_3    1.000000
track_6    0.769301
track_5    0.764414
track_7    0.763734
track_9    0.756734
dtype: float64


In [22]:
# top 3 matches
print("Query embedding, first 10 values:\n", query_embedding[:10], "\n")

for i, idx in enumerate(top_matches.index[:3]):
    emb = embeddings_df.loc[idx].values
    print(f"Top {i+1} match (track_id={idx}), first 10 values:\n{emb[:10]}\n")


Query embedding, first 10 values:
 [0.05517154 0.14344381 0.92098478 0.90524225 0.14059859 0.02802065
 0.33153839 0.50003882 0.23035156 0.40961226] 

Top 1 match (track_id=track_3), first 10 values:
[0.05517154 0.14344381 0.92098478 0.90524225 0.14059859 0.02802065
 0.33153839 0.50003882 0.23035156 0.40961226]

Top 2 match (track_id=track_6), first 10 values:
[0.96758908 0.44620667 0.26217524 0.76981148 0.39393867 0.40281111
 0.19902281 0.72525252 0.29093822 0.4834461 ]

Top 3 match (track_id=track_5), first 10 values:
[0.54867458 0.6390797  0.37262114 0.89687935 0.58662965 0.13242765
 0.58461819 0.11318776 0.45150855 0.82464045]

