In [None]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns

# Make modules located in the current working directory importable
notebook_dir = os.getcwd()
if not any(entry == notebook_dir for entry in sys.path):
    sys.path.append(notebook_dir)

# Pipeline entry points exercised by the rest of this notebook.
# NOTE(review): generate_embeddings is presumably a module sitting in the
# working directory (hence the sys.path adjustment before it) — confirm.
from generate_embeddings import SongEmbeddingGenerator, EmbeddingConfig, load_embeddings, load_metadata

print("Libraries imported successfully")

In [None]:
# 1. Build a generator from a configuration constructed with no overrides
print("Initializing generator...")
config = EmbeddingConfig()
generator = SongEmbeddingGenerator(config)

# 2. Run the data-loading stage on its own and preview the result
print("\n--- Testing Data Loading ---")
df = generator.load_data()
preview_rows = df.head(3)
display(preview_rows)
print(f"Loaded {len(df)} rows")

In [None]:
# 3. Feature selection: report the selected shape and any features that were not found
print("\n--- Testing Feature Selection ---")
features_df, missing = generator.select_features(df)
print(f"Selected features shape: {features_df.shape}")
print(f"Missing features: {missing}" if missing else "All configured features found.")
display(features_df.head(3))

# 4. Missing-value handling: nothing may be null once imputation has run
print("\n--- Testing Missing Value Handling ---")
features_imputed = generator.handle_missing_values(features_df)
nulls_per_column = features_imputed.isnull().sum()
missing_count = nulls_per_column.sum()
print(f"Total missing values after imputation: {missing_count}")
assert missing_count == 0, "There should be no missing values after imputation"

# 5. Outlier handling: show only the summary rows needed to judge the value ranges
print("\n--- Testing Outlier Handling ---")
features_clean = generator.handle_outliers(features_imputed)
print("Descriptive stats after outlier handling:")
summary_rows = ['min', 'max', 'mean', 'std']
display(features_clean.describe().loc[summary_rows])

In [None]:
# 6. Test Scaling and Generation
print("\n--- Testing Scaling ---")
scaled_features = generator.scale_features(features_clean)
print(f"Scaled features shape: {scaled_features.shape}")
# Reduce over a NumPy view of the result so that .mean()/.std() always yield
# scalars; a DataFrame would return a per-column Series, which the ':.4f'
# format spec cannot render.
scaled_values = np.asarray(scaled_features)
print(f"Mean: {scaled_values.mean():.4f}, Std: {scaled_values.std():.4f}")

# 7. Test Full Pipeline
print("\n--- Testing Full Validation Pipeline ---")
# Re-instantiate to test the 'all-in-one' method
full_generator = SongEmbeddingGenerator()
embeddings = full_generator.generate_embeddings()

print(f"\nFinal Embeddings Shape: {embeddings.shape}")
# Assertion messages are shown only on failure, so they describe the failure.
assert embeddings.shape[0] == len(df), (
    f"Embedding count ({embeddings.shape[0]}) does not match song count ({len(df)})"
)
assert not np.isnan(embeddings).any(), "Final embeddings contain NaN values"
print("Pipeline verification passed ✓")

In [None]:
# 8. Test Saving and Loading
print("\n--- Testing Persistence ---")
full_generator.save_embeddings()

# Verify files exist
# NOTE(review): the paths come from `config`, while `full_generator` was built
# without an explicit config — this assumes both resolve to the same output
# paths; confirm.
required_files = [
    config.OUTPUT_EMBEDDINGS_PATH,
    config.OUTPUT_SCALER_PATH,
    config.OUTPUT_METADATA_PATH
]

print("\nChecking output files:")
missing_files = []
for f in required_files:
    if os.path.exists(f):
        print(f"  [OK] Found {f}")
    else:
        print(f"  [FAIL] Missing {f}")
        missing_files.append(f)

if not missing_files:
    # Test Loading
    loaded_data = load_embeddings()
    loaded_meta = load_metadata()
    loaded_embeddings = loaded_data['embeddings']

    print(f"\nLoaded embeddings shape: {loaded_embeddings.shape}")

    # Compare in three tiers: shape first (an element-wise diff of arrays with
    # different shapes would raise or silently broadcast), then exact equality,
    # then a tolerance check so that a real mismatch is not reported as "slight".
    if loaded_embeddings.shape != embeddings.shape:
        print(
            f"Error: Shape mismatch (generated {embeddings.shape}, "
            f"loaded {loaded_embeddings.shape})"
        )
    elif np.array_equal(embeddings, loaded_embeddings):
        print("Success: Loaded embeddings match generated embeddings perfectly.")
    else:
        diff = np.abs(embeddings - loaded_embeddings).max()
        if np.allclose(embeddings, loaded_embeddings):
            print(f"Warning: Embeddings differ slightly (max diff: {diff})")
        else:
            print(f"Error: Loaded embeddings do not match generated embeddings (max diff: {diff})")
else:
    print("Files missing, skipping load test.")

In [None]:
# 9. Simple Similarity Check (Sanity Check)
from sklearn.metrics.pairwise import cosine_similarity

print("\n--- Similarity Sanity Check ---")
NUM_MATCHES = 5          # how many neighbours to list
QUERY_SEED = 42          # fixed seed so Restart & Run All prints the same query

if len(embeddings) > NUM_MATCHES:
    # Pick a random song (from a locally seeded generator, for reproducibility)
    query_rng = np.random.default_rng(QUERY_SEED)
    idx = int(query_rng.integers(0, len(embeddings)))

    # Get song details
    song_row = full_generator.songs_metadata.iloc[idx]
    query_vec = embeddings[idx].reshape(1, -1)

    print(f"Query Song: '{song_row['title']}' by {song_row['artist_name']}")

    # Compute similarity to all songs
    sims = cosine_similarity(query_vec, embeddings)[0]

    # Rank every song from most to least similar, then drop the query by its
    # index. Slicing off the last argsort entry instead would assume the query
    # always ranks strictly first, which fails when another song has an
    # identical embedding or when floating-point ties reorder the top entries.
    ranked = np.argsort(sims)[::-1]
    top_indices = ranked[ranked != idx][:NUM_MATCHES]

    print("\nTop 5 Similar Songs:")
    for i, match_idx in enumerate(top_indices):
        match_song = full_generator.songs_metadata.iloc[match_idx]
        score = sims[match_idx]
        print(f"{i+1}. [{score:.3f}] '{match_song['title']}' by {match_song['artist_name']}")
else:
    print("Not enough songs to perform similarity check.")