In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import autorootcwd
import os
import librosa
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [8]:
def get_mfcc_embedding(path, duration=30, sr=22050, n_mfcc=20):
    """
    Load an audio file and return a fixed-size MFCC embedding.

    The file is decoded as mono at sample rate ``sr``, limited to the first
    ``duration`` seconds, and its MFCC matrix is averaged over the time axis
    so that clips of any length map to a vector of the same size.

    Args:
        path (str): Path to the audio file.
        duration (float): Seconds of audio to read; passed to ``librosa.load``.
        sr (int): Target sample rate passed to ``librosa.load``.
        n_mfcc (int): Number of MFCC coefficients to compute, i.e. the length
            of the returned vector.

    Returns:
        np.ndarray: 1-D array with one time-averaged value per coefficient.

    Raises:
        ValueError: If no audio samples could be decoded from ``path``.
    """
    y, sr = librosa.load(path, sr=sr, mono=True, duration=duration)
    if y.size == 0:
        # An empty signal has no meaningful spectrum; fail loudly instead of
        # returning a vector that could spuriously match other files.
        raise ValueError(f"No audio samples decoded from {path}")
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    # NOTE(review): the mean keeps the 0th coefficient, which presumably tracks
    # overall level and may dominate cosine similarity -- confirm before
    # relying on a fixed similarity threshold.
    return np.mean(mfcc, axis=1)  # Average over time

def get_embeddings(folder, duration=30):
    """
    Compute MFCC embeddings for every MP3 file directly inside a folder.

    Files are processed in sorted filename order so the resulting dict (and
    any pair ordering derived from it) is the same on every run and machine.

    Args:
        folder (str): Directory to scan; only names ending in ".mp3"
            (case-insensitive) are considered.
        duration (float): Seconds of audio to analyse per file.

    Returns:
        dict: Maps filename to its embedding. Files that fail to process are
        reported on stdout and left out of the dict.
    """
    embeddings = {}
    # os.listdir returns entries in arbitrary order, so sort for reproducibility.
    files = sorted(f for f in os.listdir(folder) if f.lower().endswith(".mp3"))

    # Extract MFCC embeddings
    for f in tqdm(files, desc="Extracting embeddings"):
        path = os.path.join(folder, f)
        try:
            embeddings[f] = get_mfcc_embedding(path, duration)
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort a
            # multi-minute batch, so report it and move on.
            print(f"Error processing {f}: {e}")

    return embeddings

def find_similar_audio_files(embeddings, threshold=0.95, standardize=False):
    """
    Find pairs of files whose embeddings have high cosine similarity.

    All pairwise similarities are computed with a single normalised matrix
    product instead of one library call per pair.

    Args:
        embeddings (dict): Maps filename to a 1-D embedding vector. All
            vectors must have the same length.
        threshold (float): Cosine similarity at or above which a pair is
            reported.
        standardize (bool): If True, z-score every embedding dimension across
            all files before comparing, so a dimension that is large for
            every file cannot push all similarities towards 1. Defaults to
            False, which compares the raw vectors.

    Returns:
        List of (file1, file2, similarity) tuples, ordered by the position of
        file1 and then file2 in ``embeddings``. Each unordered pair appears at
        most once. Empty when fewer than two embeddings are given.
    """
    keys = list(embeddings)
    if len(keys) < 2:
        # No pairs to compare.
        return []

    matrix = np.asarray([embeddings[k] for k in keys])
    if not np.issubdtype(matrix.dtype, np.floating):
        # Integer input would otherwise break the in-place normalisation below.
        matrix = matrix.astype(np.float64)

    if standardize:
        spread = matrix.std(axis=0, keepdims=True)
        spread[spread == 0] = 1.0  # constant dimension: centre only
        matrix = (matrix - matrix.mean(axis=0, keepdims=True)) / spread

    # Scale each row to unit length; all-zero rows are left as zeros so their
    # similarity to anything is 0 rather than NaN.
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit = matrix / norms
    sims = unit @ unit.T

    # Strict upper triangle in row-major order visits each pair (i < j) once,
    # in the same order as a nested loop over i, then j.
    rows, cols = np.triu_indices(len(keys), k=1)
    pair_sims = sims[rows, cols]
    keep = pair_sims >= threshold

    return [
        (keys[i], keys[j], sim)
        for i, j, sim in zip(rows[keep], cols[keep], pair_sims[keep])
    ]

In [9]:
# Build the filename -> MFCC embedding mapping for every MP3 in the audio folder.
# Each file is decoded here, which makes this the slow cell of the notebook
# (the captured run below took about three minutes for 1213 files).
folder = "data/processed/audio"
embeddings = get_embeddings(folder)

Extracting embeddings:  12%|█▏        | 141/1213 [00:19<02:02,  8.77it/s][src/libmpg123/id3.c:process_extra():684] error: No extra frame text / valid description?
Extracting embeddings:  43%|████▎     | 525/1213 [01:11<01:23,  8.24it/s][src/libmpg123/id3.c:process_extra():684] error: No extra frame text / valid description?
Extracting embeddings:  51%|█████     | 620/1213 [01:27<02:09,  4.58it/s][src/libmpg123/id3.c:process_extra():684] error: No extra frame text / valid description?
Extracting embeddings: 100%|██████████| 1213/1213 [03:12<00:00,  6.29it/s]


In [13]:
# Collect every pair of files whose embedding cosine similarity is at least 0.99.
matches = find_similar_audio_files(embeddings, threshold=0.99)

In [14]:
# Order the pairs from most to least similar (this sorts `matches` in place)
# and display the full list.
# NOTE(review): the top pairs in the captured output are different artists and
# titles scoring above 0.998, so this score does not appear to isolate true
# duplicates -- verify before acting on the list.
matches.sort(key=lambda x: x[2], reverse=True)
matches

[('neildiamond_songsungblue.mp3',
  'barbaralewis_hellostranger.mp3',
  0.99918085),
 ('weezer_elscorcho.mp3', 'thetrammps_discoinferno.mp3', 0.9990221),
 ('harrychapin_sundaymorningsunshine.mp3',
  'joelbilly_shesalwaysawoman.mp3',
  0.9989279),
 ('kennyrogers_lady.mp3', 'thebeatles_29-revolution9.mp3', 0.99892765),
 ('tracychapman_babycaniholdyou.mp3',
  'billyidol_catchmyfall1999digitalremaster.mp3',
  0.9987671),
 ('talkingheads_116-burningdownthehouse.mp3',
  'thebeatles_mothernaturesson.mp3',
  0.9987591),
 ('thepolice_spiritsinthematerialworld.mp3',
  'weezer_elscorcho.mp3',
  0.99870944),
 ('nickgilder_hotchildinthecity.mp3',
  'billyidol_catchmyfall1999digitalremaster.mp3',
  0.9986686),
 ('ettajames_woulditmakeanydifferencetoyou.mp3',
  'georgebenson_breezin.mp3',
  0.9986559),
 ('queen_anotheronebitesthedust.mp3',
  'nickgilder_hotchildinthecity.mp3',
  0.9986344),
 ('thestring-a-longs_wheels.mp3', 'cheaptrick_stopthisgame.mp3', 0.99857545),
 ('neildiamond_songsungblue.mp3',