This notebook exercises the classes and functions defined in `content_based/generate_embeddings.py`.

## Test 1: Generate Embeddings with Default Configuration

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Nearest-neighbour smoke test: rank every song by cosine similarity to one
# query song and list the ten closest matches.
# Requires `embeddings_data` and `metadata` to already exist in the kernel
# (they are produced by the cell that calls load_embeddings / load_metadata).

# Query song: a fixed index (not random) so repeated runs give the same output
test_idx = 42
test_song_id = embeddings_data['song_ids'][test_idx]
# Slice (rather than index) so the query keeps a leading row axis, which is
# the 2-D input shape cosine_similarity expects.
test_embedding = embeddings_data['embeddings'][test_idx:test_idx+1]

print("Finding songs similar to:")
print(f"  Song: {metadata['titles'][test_idx]}")
print(f"  Artist: {metadata['artists'][test_idx]}")
print(f"  Song ID: {test_song_id}")
print("\n" + "="*70 + "\n")

# Compute cosine similarities between the query and every song (query included)
similarities = cosine_similarity(test_embedding, embeddings_data['embeddings'])[0]

# Get top 10 most similar, excluding the query itself.
# The query is removed by index instead of by dropping rank 0: if another song
# has an identical (or tied) embedding, the query is not guaranteed to sort
# first, and skipping rank 0 would then hide a real neighbour while listing
# the query song as its own match.
ranked_indices = np.argsort(similarities)[::-1]
top_indices = ranked_indices[ranked_indices != test_idx][:10]

print("Top 10 Most Similar Songs:")
print("-"*70)
for rank, idx in enumerate(top_indices, 1):
    similarity = similarities[idx]
    print(f"{rank:2d}. [{similarity:.3f}] {metadata['titles'][idx]}")
    print(f"    by {metadata['artists'][idx]}")
print("="*70)

## Test 5: Simple Nearest Neighbor Test

Test finding similar songs using cosine similarity

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Project the embeddings onto their first two principal components so the
# space can be drawn as a flat scatter plot.
# Requires `embeddings_data` to already exist in the kernel.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings_data['embeddings'])

# Share of the total variance carried by each of the two plotted axes
pc1_share, pc2_share = pca.explained_variance_ratio_

# Draw the scatter plot through the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5, s=20)
ax.set_xlabel(f'PC1 ({pc1_share:.1%} variance)')
ax.set_ylabel(f'PC2 ({pc2_share:.1%} variance)')
ax.set_title('Song Embeddings - 2D PCA Projection')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

print(f"Total variance explained by 2 components: {pca.explained_variance_ratio_.sum():.1%}")

## Test 4: Visualize Embedding Space (2D Projection)

Use PCA to visualize the high-dimensional embedding space

In [None]:
from content_based.generate_embeddings import load_embeddings, load_metadata
import pandas as pd

# Read back the saved embeddings and their accompanying metadata
embeddings_data = load_embeddings()
metadata = load_metadata()

# Quick sanity report on what was loaded
print("Embeddings loaded:")
print(f"  Shape: {embeddings_data['embeddings'].shape}")
print(f"  Number of songs: {len(embeddings_data['song_ids'])}")

print("\nMetadata:")
print(f"  Features used: {metadata['feature_names']}")
print(f"  Dimensions: {metadata['n_dimensions']}")

# Build one table for inspection: identifier columns first, followed by one
# column per embedding feature.
identifier_columns = ['song_id', 'title', 'artist']
df_embeddings = (
    pd.DataFrame(
        embeddings_data['embeddings'],
        columns=metadata['feature_names'],
    )
    .assign(
        song_id=embeddings_data['song_ids'],
        title=metadata['titles'],
        artist=metadata['artists'],
    )
    [identifier_columns + metadata['feature_names']]
)

print("\nEmbeddings DataFrame:")
display(df_embeddings.head(10))

## Test 3: Load and Inspect Saved Embeddings

In [None]:
# Example: Only use acoustic features (disable popularity and temporal features)
# SongEmbeddingGenerator is imported here too, so this cell does not depend on
# another cell's import having run first.
from content_based.generate_embeddings import SongEmbeddingGenerator, FEATURE_CONFIG

# FEATURE_CONFIG is a module-level object shared by every cell in the kernel.
# Snapshot it first so the overrides below can be undone; otherwise the
# disabled features would silently carry over into any cell executed (or
# re-executed) afterwards.
original_feature_config = dict(FEATURE_CONFIG)

try:
    # Modify the configuration
    FEATURE_CONFIG['song_hotttnesss'] = 0      # Disable song popularity
    FEATURE_CONFIG['artist_hotttnesss'] = 0    # Disable artist popularity
    FEATURE_CONFIG['year'] = 0                 # Disable year

    print("Modified Feature Configuration:")
    print("="*50)
    for feature, active in FEATURE_CONFIG.items():
        status = "✓ ACTIVE" if active == 1 else "✗ DISABLED"
        print(f"{feature:20s}: {active} {status}")
    print("="*50)

    # Now generate embeddings with this configuration.
    # NOTE(review): presumably the generator reads FEATURE_CONFIG during
    # construction/generation - confirm in generate_embeddings.py.
    generator_custom = SongEmbeddingGenerator()
    embeddings_custom = generator_custom.generate_embeddings()
    generator_custom.print_summary()
finally:
    # Put the shared configuration back exactly as it was, even if
    # generation failed part-way through.
    FEATURE_CONFIG.clear()
    FEATURE_CONFIG.update(original_feature_config)

## Test 2: Custom Feature Configuration

Example of how to modify which features are used in the embeddings

In [None]:
# Save the embeddings held by `generator`.
# `generator` must already exist in the kernel: it is created by the cell that
# calls SongEmbeddingGenerator() and generate_embeddings().
# Where and in what format the data is written is decided inside
# save_embeddings() and is not visible from this notebook.
generator.save_embeddings()

In [None]:
# Print detailed summary of the default-configuration run.
# `generator` must already exist in the kernel (created by the cell that calls
# SongEmbeddingGenerator()); the summary contents are defined by
# print_summary() itself, outside this notebook.
generator.print_summary()

In [None]:
# Initialize and run the embedding generator.
# Needs SongEmbeddingGenerator to have been imported from
# content_based.generate_embeddings by the import cell.
# Defines two kernel-level names used by other cells:
#   generator  - the generator instance (used by the summary and save cells)
#   embeddings - whatever generate_embeddings() returns
generator = SongEmbeddingGenerator()
embeddings = generator.generate_embeddings()

In [None]:
import sys
sys.path.append('..')
from content_based.generate_embeddings import SongEmbeddingGenerator, FEATURE_CONFIG

# Report which features are currently switched on in FEATURE_CONFIG
banner = "=" * 50

print("Current Feature Configuration:")
print(banner)
for name in FEATURE_CONFIG:
    flag = FEATURE_CONFIG[name]
    # A value of exactly 1 marks the feature as enabled; anything else is off
    if flag == 1:
        label = "✓ ACTIVE"
    else:
        label = "✗ DISABLED"
    print(f"{name:20s}: {flag} {label}")
print(banner)