# Test Content-Based Recommender

This notebook demonstrates how to use the content-based recommender system with real user data.

## 1. Import Libraries

In [1]:
import os
import sys

# Make the notebook's working directory importable so the local `recommender`
# module resolves. Notebooks do not define __file__, so the directory is taken
# from the current working directory directly (the previous
# os.path.abspath('__file__') on a quoted literal resolved to the same place).
# The membership check keeps re-running this cell from stacking duplicates.
_notebook_dir = os.getcwd()
if _notebook_dir not in sys.path:
    sys.path.append(_notebook_dir)

import pandas as pd
import numpy as np
from recommender import ContentBasedRecommender

## 2. Load User Data

Load the taste profile data containing user listening history.

In [2]:
# Read the pickled listening-history table and print a quick size summary.
# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source.
print("Loading taste profile data...")
taste_profile = pd.read_pickle('../data/taste_profile.pkl')

record_total = len(taste_profile)
print(f"Total records: {record_total:,}")

distinct_users = taste_profile['user_id'].nunique()
print(f"Unique users: {distinct_users:,}")

distinct_songs = taste_profile['song_id'].nunique()
print(f"Unique songs: {distinct_songs:,}")

# Preview the first rows with the rich notebook renderer.
print("\nSample data:")
display(taste_profile.head())

Loading taste profile data...
Total records: 48,373,586
Unique users: 1,019,318
Unique songs: 384,546

Sample data:


Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


## 3. Select a Random User

Pick a random user with at least 10 listening records for a meaningful test.

In [3]:
# Minimum number of listening records a user needs to be eligible.
MIN_SONGS_PER_USER = 10
# Fixed seed so that Restart & Run All always selects the same user.
RANDOM_SEED = 42

# Count songs per user
user_counts = taste_profile.groupby('user_id').size()

# Filter users with at least MIN_SONGS_PER_USER songs
active_users = user_counts[user_counts >= MIN_SONGS_PER_USER].index.tolist()

# Fail with a clear message instead of an opaque NumPy error when nobody qualifies.
if not active_users:
    raise ValueError(
        f"No users with at least {MIN_SONGS_PER_USER} listening records were found"
    )

# Select a random user. Drawing an index (rather than calling choice() on the
# list) avoids converting the whole list to an array just to take one item.
rng = np.random.default_rng(RANDOM_SEED)
random_user_id = active_users[rng.integers(len(active_users))]

print(f"Selected user: {random_user_id}")
print(f"This user has listened to {user_counts[random_user_id]} songs")

Selected user: 166a595131fb455aa1c47a54810827f8b6f0bc46
This user has listened to 112 songs


## 4. Prepare User History

Extract the listening history for the selected user and format it for the recommender.

In [4]:
# Pull out the selected user's rows and order them from most to least played.
is_selected_user = taste_profile['user_id'] == random_user_id
user_history_df = (
    taste_profile[is_selected_user]
    .copy()
    .sort_values('play_count', ascending=False)
)

# Summary statistics over the user's play counts.
play_counts = user_history_df['play_count']
print(f"User listening history ({len(user_history_df)} songs):")
print(f"Total plays: {play_counts.sum()}")
print(f"Average plays per song: {play_counts.mean():.2f}")
print("\nTop 10 most played songs:")
display(user_history_df.head(10))

# Reshape into the list-of-dicts layout handed to the recommender:
#   [{'song_id': 'SO...', 'play_count': 5}, ...]
history_columns = ['song_id', 'play_count']
user_history = user_history_df[history_columns].to_dict(orient='records')

print(f"\nFormatted {len(user_history)} records for recommender")

User listening history (112 songs):
Total plays: 167
Average plays per song: 1.49

Top 10 most played songs:


Unnamed: 0,user_id,song_id,play_count
39788689,166a595131fb455aa1c47a54810827f8b6f0bc46,SOVRIPE12A6D4FEA19,6
39788619,166a595131fb455aa1c47a54810827f8b6f0bc46,SOGELNH12AB017F92C,5
39788691,166a595131fb455aa1c47a54810827f8b6f0bc46,SOVYPMX12AF72A1C26,5
39788610,166a595131fb455aa1c47a54810827f8b6f0bc46,SOEGKIL12AB017F90A,5
39788645,166a595131fb455aa1c47a54810827f8b6f0bc46,SOKICUR12AB017F936,4
39788643,166a595131fb455aa1c47a54810827f8b6f0bc46,SOKCMEU12A8AE48894,4
39788604,166a595131fb455aa1c47a54810827f8b6f0bc46,SODFUZR12AB017F927,4
39788620,166a595131fb455aa1c47a54810827f8b6f0bc46,SOGFNYP12AB017F946,4
39788680,166a595131fb455aa1c47a54810827f8b6f0bc46,SOSVPQJ12A8AE48888,4
39788671,166a595131fb455aa1c47a54810827f8b6f0bc46,SORHBWY12A58A7BB05,4



Formatted 112 records for recommender


## 5. Initialize Recommender System

Load the recommender with pre-generated embeddings.

In [5]:
# Build the recommender from the embeddings and metadata pickle files.
print("Initializing recommender system...")
recommender_paths = {
    'embeddings_path': '../data/song_embeddings.pkl',
    'metadata_path': '../data/songs_metadata.pkl',
}
recommender = ContentBasedRecommender(**recommender_paths)
print("✓ Recommender ready!")

Initializing recommender system...
Loading embeddings from ../data/song_embeddings.pkl...
Loading metadata from ../data/songs_metadata.pkl...
Building NearestNeighbors index for 10000 songs...
✓ Recommender ready!


## 6. Calculate User Embedding

Create a weighted average embedding based on the user's listening history.

In [6]:
# Ask the recommender for a single profile vector built from the user's history.
print("Calculating user profile embedding...")
user_embedding = recommender.calculate_user_embedding(user_history)

# A None result is treated as "nothing in the history could be matched".
if user_embedding is None:
    print("⚠ Warning: Could not calculate user embedding (no matching songs found)")
else:
    print(f"✓ User embedding shape: {user_embedding.shape}")
    print(f"  Embedding dimension: {len(user_embedding)}")
    # NOTE(review): this reports the length of the history passed in, which may
    # be larger than the number of songs that actually had an embedding — confirm.
    print(f"  Based on {len(user_history)} songs")

Calculating user profile embedding...
✓ User embedding shape: (384,)
  Embedding dimension: 384
  Based on 112 songs


## 7. Generate Recommendations

Find the 5 nearest songs to the user's profile using cosine similarity.

In [7]:
# Number of songs to request; it also drives the banner text so the two
# cannot drift apart.
N_RECOMMENDATIONS = 5
# Width of the banner rules and of the rule printed after each recommendation.
RULE_WIDTH = 60

# The embedding step signals "no matching songs" by leaving user_embedding as
# None. Stop here with a clear message rather than handing None to recommend().
if user_embedding is None:
    raise ValueError(
        "Cannot generate recommendations: user_embedding is None "
        "(no matching songs found for this user)"
    )

print("Generating recommendations...")
recommendations = recommender.recommend(
    user_embedding, n_recommendations=N_RECOMMENDATIONS
)

print(f"\n{'=' * RULE_WIDTH}")
print(f"TOP {N_RECOMMENDATIONS} RECOMMENDATIONS FOR USER: {random_user_id}")
print(f"{'=' * RULE_WIDTH}\n")

# Each recommendation is read as a mapping with 'title', 'artist_name',
# 'similarity' and 'song_id' keys; ranks are shown starting from 1.
for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec['title']}")
    print(f"   Artist: {rec['artist_name']}")
    print(f"   Similarity Score: {rec['similarity']:.4f}")
    print(f"   Song ID: {rec['song_id']}")
    print("-" * RULE_WIDTH)

Generating recommendations...

TOP 5 RECOMMENDATIONS FOR USER: 166a595131fb455aa1c47a54810827f8b6f0bc46

1. Jam 4 U
   Artist: Redman
   Similarity Score: 0.9400
   Song ID: SOPOJVI12A58A7E706
------------------------------------------------------------
2. I Got A Seecret
   Artist: Redman
   Similarity Score: 0.7437
   Song ID: SOERJUK12AF72A49F7
------------------------------------------------------------
3. The Modern World
   Artist: The Jam
   Similarity Score: 0.7354
   Song ID: SOQMDPN12AF72A25EE
------------------------------------------------------------
4. I Am The Club (Explicit Album Version)
   Artist: Plies
   Similarity Score: 0.7308
   Song ID: SORTPHZ12A8C13D374
------------------------------------------------------------
5. Muh- F***a
   Artist: Redman
   Similarity Score: 0.7123
   Song ID: SOATEEN12A67020394
------------------------------------------------------------


## 8. Display Recommendations as DataFrame

Show the recommendations in a clean table format.

In [8]:
# Tabulate the recommendations, number the rows from 1 so the index reads as a
# rank, and show the columns in a fixed order.
display_columns = ['title', 'artist_name', 'similarity', 'song_id']
recommendations_df = pd.DataFrame(recommendations)
recommendations_df.index += 1
display(recommendations_df[display_columns])

Unnamed: 0,title,artist_name,similarity,song_id
1,Jam 4 U,Redman,0.939965,SOPOJVI12A58A7E706
2,I Got A Seecret,Redman,0.743656,SOERJUK12AF72A49F7
3,The Modern World,The Jam,0.735357,SOQMDPN12AF72A25EE
4,I Am The Club (Explicit Album Version),Plies,0.730846,SORTPHZ12A8C13D374
5,Muh- F***a,Redman,0.712285,SOATEEN12A67020394
