In [1]:
!pip install librosa numpy faiss-cpu torch transformers crepe scipy fastdtw tqdm nnAudio jsonlines

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m

In [6]:
import librosa
import crepe  # Consider torchcrepe for GPU acceleration if available
import numpy as np
from fastdtw import fastdtw
from sklearn.metrics.pairwise import cosine_similarity
import logging
from tqdm import tqdm
import os
import warnings
from scipy import signal

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore", category=UserWarning)

def validate_audio(audio, sr, min_duration=5.0):
    """Validate audio input for sufficient length and non-silence."""
    duration = len(audio) / sr
    if duration < min_duration or not np.any(audio):
        logger.warning(f"Invalid audio: duration={duration:.2f}s, max_amplitude={np.max(np.abs(audio)):.4f}")
        return False
    return True

def extract_contextual_embeddings(audio, sr, hop_length=512, min_confidence=0.3, context_window=10, onset_frames=None):
    """
    Extract contextual embeddings representing pitch-onset-harmony relationships.
    Optionally use provided onset_frames for consistency.

    Returns:
        contextual_sequence: Array of shape (n_onsets, embedding_dim)
        onset_times: Array of onset timestamps
    """
    try:
        if not validate_audio(audio, sr):
            return np.array([]), np.array([])

        # 1. Detect onsets or use provided ones
        if onset_frames is None:
            onset_frames = librosa.onset.onset_detect(
                y=audio, sr=sr, hop_length=hop_length,
                units='frames', backtrack=True, pre_max=0.05, post_max=0.05,
                pre_avg=0.2, post_avg=0.2, delta=0.08
            )
        onset_times = librosa.frames_to_time(onset_frames, sr=sr, hop_length=hop_length)
        logger.info(f"Onsets detected: {len(onset_frames)}")

        if len(onset_frames) < 2:
            logger.warning("Insufficient onsets detected (<2).")
            return np.array([]), np.array([])

        # 2. Extract pitch sequence using CREPE
        time, pitch, confidence, activation = crepe.predict(
            audio, sr, viterbi=True, step_size=(hop_length / sr) * 1000
        )

        # Interpolate pitch and confidence to match chroma frames
        frame_times = librosa.frames_to_time(np.arange(0, len(audio) // hop_length + 1), sr=sr, hop_length=hop_length)
        pitch = np.interp(frame_times, time, pitch, left=0, right=0)
        confidence = np.interp(frame_times, time, confidence, left=0, right=0)

        # 3. Extract harmonic content (chroma) with L2 normalization
        chroma = librosa.feature.chroma_stft(
            y=audio, sr=sr, hop_length=hop_length, n_chroma=12, norm=2
        )

        if chroma.ndim != 2 or chroma.shape[1] == 0:
            logger.error(f"Invalid chroma shape: {chroma.shape}")
            return np.array([]), np.array([])

        # 4. Create contextual embeddings for each onset
        contextual_embeddings = []

        for i, onset_frame in enumerate(onset_frames):
            embedding = create_single_contextual_embedding(
                onset_frame, pitch, chroma, confidence, i, onset_frames, hop_length, sr, min_confidence, context_window
            )
            if embedding is not None:
                contextual_embeddings.append(embedding)

        contextual_embeddings = np.array(contextual_embeddings)
        if len(contextual_embeddings) == 0:
            logger.warning("No valid embeddings created.")
            return np.array([]), np.array([])

        # Normalize timing contexts by mean IOI for tempo invariance
        if len(onset_times) > 1:
            intervals = np.diff(onset_times)
            valid_intervals = intervals[intervals > 0]
            mean_ioi = np.mean(valid_intervals) if len(valid_intervals) > 0 else 1.0
            contextual_embeddings[:, 14:16] /= mean_ioi

        logger.info(f"Embeddings created: {len(contextual_embeddings)}")
        return contextual_embeddings, onset_times

    except Exception as e:
        logger.error(f"Error in embedding extraction: {str(e)}", exc_info=True)
        return np.array([]), np.array([])

def create_single_contextual_embedding(onset_frame, pitch, chroma, confidence,
                                      onset_index, all_onsets, hop_length, sr, min_confidence, context_window):
    """
    Create a single contextual embedding for one onset.
    Uses semitones for pitch changes, normalizes harmony.
    """
    try:
        onset_frame = min(max(int(onset_frame), 0), len(pitch) - 1)

        curr_pitch = pitch[onset_frame]
        if curr_pitch <= 0:
            return None

        pitch_conf = confidence[onset_frame]
        if pitch_conf < min_confidence:
            return None

        # 1. PITCH CHANGE COMPONENT in semitones
        pitch_change_in = 0.0
        pitch_change_out = 0.0

        if onset_index > 0:
            prev_onset = min(max(all_onsets[onset_index - 1], 0), len(pitch) - 1)
            prev_pitch = pitch[prev_onset]
            if prev_pitch > 0:
                pitch_change_in = 12 * np.log2(curr_pitch / prev_pitch)
                pitch_change_in = np.clip(pitch_change_in, -24, 24)

        if onset_index < len(all_onsets) - 1:
            next_onset = min(max(all_onsets[onset_index + 1], 0), len(pitch) - 1)
            next_pitch = pitch[next_onset]
            if next_pitch > 0:
                pitch_change_out = 12 * np.log2(next_pitch / curr_pitch)
                pitch_change_out = np.clip(pitch_change_out, -24, 24)

        pitch_change_vector = [pitch_change_in, pitch_change_out]

        # 2. HARMONIC CONTEXT COMPONENT with unit normalization
        context_start = max(0, onset_frame - context_window)
        context_end = min(chroma.shape[1], onset_frame + context_window + 1)

        slice_chroma = chroma[:, context_start:context_end]
        if slice_chroma.shape[1] == 0:
            harmonic_context = np.zeros(12, dtype=slice_chroma.dtype)
        else:
            harmonic_context = slice_chroma.mean(axis=1)

        norm = np.linalg.norm(harmonic_context)
        if norm > 0:
            harmonic_context /= norm

        # 3. ONSET TIMING CONTEXT in seconds
        timing_context = [0.0, 0.0]

        if onset_index > 0:
            prev_frame = all_onsets[onset_index - 1]
            timing_context[0] = (onset_frame - prev_frame) * hop_length / sr

        if onset_index < len(all_onsets) - 1:
            next_frame = all_onsets[onset_index + 1]
            timing_context[1] = (next_frame - onset_frame) * hop_length / sr

        # Combine into single embedding (16 dimensions)
        return np.concatenate([pitch_change_vector, harmonic_context, timing_context])

    except Exception as e:
        logger.error(f"Error creating embedding: {str(e)}", exc_info=True)
        return None

def compute_contextual_similarity(query_embeddings, ref_embeddings, return_details=False):
    """
    Compare two tracks with DTW. Optionally return component-wise details.
    """
    if len(query_embeddings) == 0 or len(ref_embeddings) == 0:
        return 0.0 if not return_details else (0.0, {})

    distance, path = fastdtw(
        query_embeddings,
        ref_embeddings,
        dist=contextual_embedding_distance
    )

    # Adjust normalization to favor identical sequences
    normalized_distance = distance / (len(query_embeddings) + len(ref_embeddings)) * 2
    similarity = 1 / (1 + normalized_distance)

    if return_details:
        pitch_dists, harmony_dists, timing_dists = [], [], []

        for q_idx, r_idx in path:
            emb1, emb2 = query_embeddings[q_idx], ref_embeddings[r_idx]
            pitch1, harmony1, timing1 = split_embedding(emb1)
            pitch2, harmony2, timing2 = split_embedding(emb2)

            pitch_dists.append(np.linalg.norm(pitch1 - pitch2))
            harmony_dists.append(1 - cosine_similarity([harmony1], [harmony2])[0][0])
            timing_dists.append(np.linalg.norm(timing1 - timing2))

        details = {
            'avg_pitch_dist': np.mean(pitch_dists) if pitch_dists else 0,
            'avg_harmony_dist': np.mean(harmony_dists) if harmony_dists else 0,
            'avg_timing_dist': np.mean(timing_dists) if timing_dists else 0,
            'dtw_path_length': len(path)
        }
        return similarity, details

    return similarity

def contextual_embedding_distance(emb1, emb2):
    pitch1, harmony1, timing1 = split_embedding(emb1)
    pitch2, harmony2, timing2 = split_embedding(emb2)

    pitch_dist = np.linalg.norm(pitch1 - pitch2)
    harmony_dist = 1 - cosine_similarity([harmony1], [harmony2])[0][0]
    timing_dist = np.linalg.norm(timing1 - timing2)

    # Reduce timing weight for identical tracks
    return 0.6 * pitch_dist + 0.35 * harmony_dist + 0.05 * timing_dist

def split_embedding(embedding):
    return embedding[0:2], embedding[2:14], embedding[14:16]

def detect_onsets(audio, sr, hop_length):
    return librosa.onset.onset_detect(
        y=audio, sr=sr, hop_length=hop_length,
        units='frames', backtrack=True, pre_max=0.05, post_max=0.05,
        pre_avg=0.2, post_avg=0.2, delta=0.08
    )

def no_aug(audio, sr):
    return audio

def pitch_up(audio, sr):
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=2)

def pitch_down(audio, sr):
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=-2)

def stretch_fast(audio, sr):
    return librosa.effects.time_stretch(audio, rate=1.1)

def stretch_slow(audio, sr):
    return librosa.effects.time_stretch(audio, rate=0.9)

def add_noise(audio, sr):
    noise = np.random.randn(len(audio)) * 0.01 * np.std(audio)
    return audio + noise

def add_reverb(audio, sr):
    # Simple reverb with exponential decay
    reverb_time = 0.5  # RT60 in seconds
    ir_length = int(sr * reverb_time * 2)  # Extend a bit
    t = np.arange(ir_length) / sr
    ir = np.exp(-6.907 * t / reverb_time)  # -60 dB decay
    ir[0] += 1  # Direct sound
    # Add early reflection
    early_delay = int(sr * 0.02)
    if early_delay < ir_length:
        ir[early_delay] += 0.5
    audio_aug = signal.convolve(audio, ir, mode='same')
    max_abs = np.max(np.abs(audio_aug))
    return audio_aug / max_abs if max_abs > 0 else audio_aug

def build_database(audio_files_dir, sr=22050, hop_length=512, min_onsets=5):
    database = {}

    audio_files = [os.path.join(audio_files_dir, f) for f in os.listdir(audio_files_dir)
                   if f.lower().endswith(('.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a'))]

    aug_functions = [no_aug, pitch_up, pitch_down, stretch_fast, stretch_slow, add_noise, add_reverb]

    for audio_file in tqdm(audio_files, desc="Processing files"):
        track_name = os.path.basename(audio_file)
        if track_name in database:
            continue
        database[track_name] = {'embeddings': [], 'onset_counts': [], 'onset_frames': []}

        try:
            original_audio, _ = librosa.load(audio_file, duration=30, sr=sr, mono=True)
            if not validate_audio(original_audio, sr):
                logger.info(f"Skipping {audio_file}: invalid audio")
                del database[track_name]
                continue

            for aug_func in aug_functions:
                try:
                    audio_aug = aug_func(original_audio, sr)
                    if not validate_audio(audio_aug, sr):
                        continue
                    onset_frames = detect_onsets(audio_aug, sr, hop_length)
                    embeddings, onset_times = extract_contextual_embeddings(audio_aug, sr, hop_length, min_confidence=0.3, onset_frames=onset_frames)
                    if len(embeddings) >= min_onsets:
                        database[track_name]['embeddings'].append(embeddings)
                        database[track_name]['onset_counts'].append(len(embeddings))
                        database[track_name]['onset_frames'].append(onset_frames)
                except Exception as aug_e:
                    logger.warning(f"Augmentation failed for {track_name}: {str(aug_e)}")

            if len(database[track_name]['embeddings']) == 0:
                logger.info(f"Skipping {track_name}: no valid augmentations")
                del database[track_name]
            else:
                logger.info(f"Processed {track_name}: {len(database[track_name]['embeddings'])} variants")
        except Exception as e:
            logger.warning(f"Failed to process {audio_file}: {str(e)}", exc_info=True)
            if track_name in database:
                del database[track_name]

    logger.info(f"Database built with {len(database)} tracks.")
    return database

def find_similar_tracks(query_audio_path, database, top_k=20, sr=22050, hop_length=512, detailed=False):
    try:
        audio, _ = librosa.load(query_audio_path, duration=30, sr=sr, mono=True)
        if not validate_audio(audio, sr):
            logger.warning("Query audio is invalid.")
            return []

        query_filename = os.path.basename(query_audio_path)
        query_embeddings, _ = extract_contextual_embeddings(audio, sr, hop_length, min_confidence=0.3)

        if len(query_embeddings) < 3:
            logger.warning(f"Query has insufficient onsets ({len(query_embeddings)}).")
            return []

        results = []

        for track, data in database.items():
            if track == query_filename:
                logger.info(f"Query matches database track {track}, assigning similarity 1.0")
                details = {'avg_pitch_dist': 0, 'avg_harmony_dist': 0, 'avg_timing_dist': 0, 'dtw_path_length': 0} if detailed else None
                results.append({
                    'track': track,
                    'similarity': 1.0,
                    'onset_count': data['onset_counts'][0] if data['onset_counts'] else 0,
                    'details': details
                })
                continue

            track_sims = []
            for i, ref_embeddings in enumerate(data['embeddings']):
                if detailed:
                    sim, details = compute_contextual_similarity(query_embeddings, ref_embeddings, True)
                else:
                    sim = compute_contextual_similarity(query_embeddings, ref_embeddings)
                    details = None
                track_sims.append((sim, details, data['onset_counts'][i]))

            if track_sims:
                best = max(track_sims, key=lambda x: x[0])
                results.append({
                    'track': track,
                    'similarity': best[0],
                    'onset_count': best[2],
                    'details': best[1]
                })

        results.sort(key=lambda x: x['similarity'], reverse=True)
        return results[:top_k]

    except Exception as e:
        logger.error(f"Error in similarity search: {str(e)}", exc_info=True)
        return []

class ContextualMusicSimilarity:
    def __init__(self, hop_length=512, sr=22050, min_confidence=0.3, context_window=10, min_onsets=5):
        self.hop_length = hop_length
        self.sr = sr
        self.min_confidence = min_confidence
        self.context_window = context_window
        self.min_onsets = min_onsets
        self.database = None

    def load_database(self, database_path):
        try:
            self.database = np.load(database_path, allow_pickle=True).item()
            logger.info(f"Loaded database with {len(self.database)} tracks.")
        except Exception as e:
            logger.error(f"Failed to load database: {str(e)}", exc_info=True)

    def build_database(self, audio_files_dir, save_path=None):
        self.database = build_database(
            audio_files_dir, self.sr, self.hop_length, self.min_onsets
        )
        if save_path:
            np.save(save_path, self.database)
            logger.info(f"Database saved to {save_path}")

    def find_similar(self, query_audio_path, top_k=20, detailed=False):
        if self.database is None:
            raise ValueError("Database not loaded. Call load_database() or build_database() first.")

        return find_similar_tracks(
            query_audio_path, self.database, top_k, self.sr, self.hop_length, detailed
        )

# Usage Example:
if __name__ == "__main__":
    similarity_engine = ContextualMusicSimilarity()

    # Debug query track
    query_audio_path = "/home/ubuntu/mahesh_YUE/input_songs1/KISS - Crazy Crazy Nights.mp3"
    audio, sr = librosa.load(query_audio_path, duration=30, sr=22050, mono=True)
    logger.info(f"Query duration: {len(audio)/sr:.2f}s, max amplitude: {np.max(np.abs(audio)):.4f}")

    # Build database
    similarity_engine.build_database("/home/ubuntu/mahesh_YUE/input_songs1", "music_database.npy")

    # Find similar tracks with details
    results = similarity_engine.find_similar(query_audio_path, top_k=20, detailed=True)

    # Print results
    for i, result in enumerate(results, 1):
        print(f"{i}. {result['track']} (similarity: {result['similarity']:.3f})")
        if result['details']:
            print(f"   Details: Pitch dist: {result['details']['avg_pitch_dist']:.3f}, "
                  f"Harmony dist: {result['details']['avg_harmony_dist']:.3f}, "
                  f"Timing dist: {result['details']['avg_timing_dist']:.3f}")

2025-08-21 11:25:07,964 - INFO - Query duration: 30.00s, max amplitude: 1.0161
2025-08-21 11:25:08,121 - INFO - Onsets detected: 12/s]


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:08,997 - INFO - Embeddings created: 6
2025-08-21 11:25:09,279 - INFO - Onsets detected: 2


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


2025-08-21 11:25:10,136 - INFO - Embeddings created: 1
2025-08-21 11:25:10,514 - INFO - Onsets detected: 2


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


2025-08-21 11:25:11,423 - INFO - Embeddings created: 1
2025-08-21 11:25:11,653 - INFO - Onsets detected: 2


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:12,791 - INFO - Onsets detected: 2


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:13,742 - INFO - Embeddings created: 1
2025-08-21 11:25:13,790 - INFO - Onsets detected: 31


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:14,669 - INFO - Embeddings created: 23
2025-08-21 11:25:14,739 - INFO - Onsets detected: 54


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:15,650 - INFO - Embeddings created: 32
2025-08-21 11:25:15,652 - INFO - Processed The Beatles - All My Loving - Remastered 2009.mp3: 3 variants
2025-08-21 11:25:15,755 - INFO - Onsets detected: 52  7.68s/it]


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


2025-08-21 11:25:16,630 - INFO - Embeddings created: 31
2025-08-21 11:25:16,912 - INFO - Onsets detected: 28


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:17,774 - INFO - Embeddings created: 16
2025-08-21 11:25:18,011 - INFO - Onsets detected: 24


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:18,919 - INFO - Embeddings created: 12
2025-08-21 11:25:19,151 - INFO - Onsets detected: 25


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:19,953 - INFO - Embeddings created: 16
2025-08-21 11:25:20,224 - INFO - Onsets detected: 28


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


2025-08-21 11:25:21,209 - INFO - Embeddings created: 17
2025-08-21 11:25:21,294 - INFO - Onsets detected: 55


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:22,200 - INFO - Embeddings created: 33
2025-08-21 11:25:22,272 - INFO - Onsets detected: 51


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:23,219 - INFO - Embeddings created: 17
2025-08-21 11:25:23,220 - INFO - Processed KISS - Crazy Crazy Nights.mp3: 7 variants
2025-08-21 11:25:23,296 - INFO - Onsets detected: 109 7.62s/it]


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:24,158 - INFO - Embeddings created: 103
2025-08-21 11:25:24,452 - INFO - Onsets detected: 65


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:25,310 - INFO - Embeddings created: 63
2025-08-21 11:25:25,558 - INFO - Onsets detected: 18


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:26,419 - INFO - Embeddings created: 14
2025-08-21 11:25:26,661 - INFO - Onsets detected: 43


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:27,451 - INFO - Embeddings created: 43
2025-08-21 11:25:27,757 - INFO - Onsets detected: 30


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:28,700 - INFO - Embeddings created: 30
2025-08-21 11:25:28,747 - INFO - Onsets detected: 108


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:29,629 - INFO - Embeddings created: 102
2025-08-21 11:25:29,697 - INFO - Onsets detected: 104


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:30,599 - INFO - Embeddings created: 96
2025-08-21 11:25:30,601 - INFO - Processed light-instrumental-melody-4-319030.wav: 7 variants
2025-08-21 11:25:30,697 - INFO - Onsets detected: 12  7.51s/it]


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:31,576 - INFO - Embeddings created: 8
2025-08-21 11:25:31,875 - INFO - Onsets detected: 1
2025-08-21 11:25:32,115 - INFO - Onsets detected: 4


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:32,989 - INFO - Embeddings created: 3
2025-08-21 11:25:33,220 - INFO - Onsets detected: 1
2025-08-21 11:25:33,497 - INFO - Onsets detected: 1
2025-08-21 11:25:33,546 - INFO - Onsets detected: 29


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:34,438 - INFO - Embeddings created: 19
2025-08-21 11:25:34,503 - INFO - Onsets detected: 67


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:35,386 - INFO - Embeddings created: 39
2025-08-21 11:25:35,387 - INFO - Processed Lita Ford - Playin_ with Fire.mp3: 3 variants
2025-08-21 11:25:35,478 - INFO - Onsets detected: 9,  6.43s/it]


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:36,348 - INFO - Embeddings created: 4
2025-08-21 11:25:36,658 - INFO - Onsets detected: 5


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:37,516 - INFO - Embeddings created: 2
2025-08-21 11:25:37,751 - INFO - Onsets detected: 10


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:38,608 - INFO - Embeddings created: 7
2025-08-21 11:25:38,837 - INFO - Onsets detected: 8


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


2025-08-21 11:25:39,623 - INFO - Embeddings created: 5
2025-08-21 11:25:39,902 - INFO - Onsets detected: 11


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:40,850 - INFO - Embeddings created: 5
2025-08-21 11:25:40,897 - INFO - Onsets detected: 34


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:41,777 - INFO - Embeddings created: 18
2025-08-21 11:25:41,831 - INFO - Onsets detected: 63


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:42,713 - INFO - Embeddings created: 42
2025-08-21 11:25:42,715 - INFO - Processed Aerosmith - Dude (Looks Like A Lady).mp3: 5 variants
Processing files: 100%|██████████| 5/5 [00:34<00:00,  6.95s/it]
2025-08-21 11:25:42,721 - INFO - Database built with 5 tracks.
2025-08-21 11:25:42,723 - INFO - Database saved to music_database.npy
2025-08-21 11:25:42,820 - INFO - Onsets detected: 52


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


2025-08-21 11:25:43,690 - INFO - Embeddings created: 31
2025-08-21 11:25:44,212 - INFO - Query matches database track KISS - Crazy Crazy Nights.mp3, assigning similarity 1.0


1. KISS - Crazy Crazy Nights.mp3 (similarity: 1.000)
   Details: Pitch dist: 0.000, Harmony dist: 0.000, Timing dist: 0.000
2. The Beatles - All My Loving - Remastered 2009.mp3 (similarity: 0.183)
   Details: Pitch dist: 6.485, Harmony dist: 0.175, Timing dist: 1.151
3. light-instrumental-melody-4-319030.wav (similarity: 0.176)
   Details: Pitch dist: 5.013, Harmony dist: 0.197, Timing dist: 0.551
4. Lita Ford - Playin_ with Fire.mp3 (similarity: 0.166)
   Details: Pitch dist: 6.456, Harmony dist: 0.140, Timing dist: 1.233
5. Aerosmith - Dude (Looks Like A Lady).mp3 (similarity: 0.121)
   Details: Pitch dist: 10.138, Harmony dist: 0.164, Timing dist: 1.041
