In [1]:
# ------------------------------------------------------
# 1. Install Spotipy
# ------------------------------------------------------
!pip install spotipy




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# ------------------------------------------------------
# 2. Import Libraries
# ------------------------------------------------------
import spotipy
import json
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth


In [2]:
# ------------------------------------------------------
# 3. Credentials
# Replace with your values from https://developer.spotify.com/dashboard
# ------------------------------------------------------
CLIENT_ID = "XXXXXXXXXXXX"
CLIENT_SECRET = "XXXXXXXXXXXX"
REDIRECT_URI = "http://localhost:8888/callback"  # must match your app's settings

In [3]:
client_credentials_manager = SpotifyClientCredentials(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET
)
sp = spotipy.Spotify(auth_manager=client_credentials_manager)

In [4]:
sp

<spotipy.client.Spotify at 0x26cafc4fa10>


## 🔄 Data Acquisition and Preparation

Before we build the recommender system, we first **collect and organize the data**:

1. **Connect to Spotify API** – using the `spotipy` library and proper authentication.  
2. **Artist Data** – search for a curated list of top artists with `sp.search`, retrieving their `artist_id`, followers, and popularity.  
3. **Album Data** – for each artist, fetch their albums using `sp.artist_albums`, filtering to keep only solo albums (to avoid redundant or duplicate collaborations).  
4. **Track Data** – for each album, retrieve the track list using `sp.album_tracks`. Keep only tracks without collaborations for cleaner analysis.  
5. **Audio Features** – enrich the track table by joining with a Kaggle dataset that contains detailed audio features (danceability, energy, valence, etc.).  
6. **Data Modeling** – the result is a **star schema** with three clean tables:  
   - `artist_df` – one row per artist (no redundancy).  
   - `album_df` – one row per album.  
   - `track_df` – one row per track, enriched with audio features.  

This structured approach ensures there is **no redundancy** across tables and prepares the data for efficient analysis and machine learning.


In [5]:
artists_to_check = ["Taylor Swift", "Drake", "The Weeknd", "Ed Sheeran", "Ariana Grande",
                    "Bad Bunny", "BTS", "Eminem", "Justin Bieber", "Rihanna",
                    "Kanye West", "Adele", "Shakira", "Billie Eilish", "Kendrick Lamar",
                    "Post Malone", "Coldplay", "Harry Styles", "J Balvin", "Bruno Mars", "Dua Lipa"]

artist_data = []
for name in artists_to_check:
    results = sp.search(q=name, type="artist", limit=1)
    artist = results["artists"]["items"][0]
    artist_data.append({
        "artist_id": artist["id"],
        "artist_name": artist["name"],
        "followers": artist["followers"]["total"],
        "popularity": artist["popularity"]
    })

In [6]:
artist_df = pd.DataFrame(artist_data)

In [7]:
artist_df.sort_values(by=['popularity','followers'], ascending=[False,False], inplace=True)

In [8]:
artist_df = artist_df.reset_index(drop=True)

In [9]:
artist_df

Unnamed: 0,artist_id,artist_name,followers,popularity
0,06HL4z0CvFAxyc27GXpf02,Taylor Swift,143458138,100
1,3TVXtAsR1Inumwj472S9r4,Drake,101996862,99
2,4q3ewBCX7sLwd24euuV69X,Bad Bunny,100915885,99
3,1Xyo4u8uXC1ZmMpatF05PJ,The Weeknd,111354683,97
4,1uNFoZAHBGtllmzznpCI3s,Justin Bieber,84501721,96
5,6qqNVTkY8uBg9cP3Jd7DAH,Billie Eilish,117397976,94
6,66CXWjxzNUsdJxJ2JdwvnR,Ariana Grande,106874595,94
7,6eUKZXaKkcviH0Ku9w2n3V,Ed Sheeran,122189989,93
8,0du5cEVh5yTK9QJze8zA0C,Bruno Mars,74768735,93
9,5pKCCKE2ajJHZ9KAiaK11H,Rihanna,68326604,93


In [10]:
all_tracks = []
all_albums = []

for artist in artist_data:
    albums = sp.artist_albums(artist["artist_id"], album_type="album,single", limit=50)
    
    for album in albums["items"]:
        # print(album["id"])
        # break
        if len(album["artists"]) == 1:                           #Only including solo albums for each artist
         
            tracks = sp.album_tracks(album["id"])
            

            all_albums.append({
                    "album_id": album["id"],
                    "album_name": album["name"],
                    "artist_id": album["artists"][0]['id'],
                    "release_date": album["release_date"],
                     "total_tracks": album['total_tracks']
                })
            
            for track in tracks["items"]:
                
                #features = sp.audio_features([track["id"]])[0]
    
                if len(track["artists"]) == 1:                  #Only including tracks where there are no collaborations
                    all_tracks.append({
                        "track_id": track["id"],
                        "track_name": track["name"],
                        "duration": track["duration_ms"]/60000,
                        "album_id": album["id"],
                        "artist_id": artist["artist_id"],
                        "available_markets" : len(track['available_markets']),
                       # "energy": features["energy"]
                    })
                else:
                     continue
        else:
            continue

album_df = pd.DataFrame(all_albums)
track_df = pd.DataFrame(all_tracks)

In [11]:
df_kaggle = pd.read_csv('dataset.csv')

In [12]:
df_kaggle.columns

Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')

In [13]:
df_kaggle.drop(columns=['Unnamed: 0'], inplace=True)

In [14]:
track_df = track_df.merge(df_kaggle[['track_id', 'popularity', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre']], on='track_id', how='left')

In [15]:
track_df = track_df[track_df['popularity'].notna()]

In [16]:
track_df = track_df.reset_index(drop=True)

In [17]:
track_df.columns

Index(['track_id', 'track_name', 'duration', 'album_id', 'artist_id',
       'available_markets', 'popularity', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')

In [18]:
track_df = track_df.drop_duplicates(subset=['track_id'])

In [19]:
track_df

Unnamed: 0,track_id,track_name,duration,album_id,artist_id,available_markets,popularity,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,3hUxzQpSfdDqwM3ZTFQY0K,august,4.365367,2fenSS68JI1h4Fo296JfGr,06HL4z0CvFAxyc27GXpf02,184,86.0,False,0.532,0.623,...,-9.208,1.0,0.0331,0.5380,0.000073,0.0925,0.403,89.937,4.0,pop
1,1dGr1c8CrMLDpV6mPbImSI,Lover,3.688433,1NAmidJlEaVgA3MpcPFYGq,06HL4z0CvFAxyc27GXpf02,184,85.0,False,0.359,0.543,...,-7.582,1.0,0.0919,0.4920,0.000016,0.1180,0.453,68.534,4.0,pop
2,1R0a2iXumgCiFb7HEZ7gUE,Don’t Blame Me,3.940217,6DEjYFkNZh67HP7R9PSZvv,06HL4z0CvFAxyc27GXpf02,184,88.0,False,0.615,0.534,...,-6.719,0.0,0.0386,0.1060,0.000018,0.0607,0.193,135.917,4.0,pop
3,1u8c2t2Cy7UBoG4ArRcF5g,Blank Space,3.863767,1yGbNOtRIgdIiGHOEBaZWf,06HL4z0CvFAxyc27GXpf02,180,85.0,False,0.760,0.703,...,-5.412,1.0,0.0540,0.1030,0.000000,0.0913,0.570,95.997,4.0,pop
4,3fVnlF4pGqWI9flVENcT28,Wildest Dreams,3.674000,5fy0X0JmZRZnVa2UEicIOl,06HL4z0CvFAxyc27GXpf02,180,80.0,False,0.553,0.664,...,-7.417,1.0,0.0741,0.0709,0.005600,0.1060,0.467,140.060,4.0,pop
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,6zEgnpM0qYmHLDnh8WPejL,Amarillo,2.628883,2mX8ktJoWvyidWBU9U8Jis,1vyhD5VmyZ7KMfW5gqLgo5,184,69.0,False,0.641,0.857,...,-5.725,0.0,0.3010,0.0130,0.005340,0.0695,0.961,122.728,5.0,latino
331,2C2TGgFzrTRIOdQS1vUN5h,Ginza,2.851550,4cGc9Eeb3Gjff2Aq5ILLEf,1vyhD5VmyZ7KMfW5gqLgo5,183,70.0,False,0.730,0.809,...,-6.406,0.0,0.0876,0.2080,0.001300,0.0804,0.825,101.965,4.0,latino
334,6Ges5C2IE738iJh4HyQizQ,Ay Vamos,3.773550,4loQMkHOAbqVdyb59KBbsp,1vyhD5VmyZ7KMfW5gqLgo5,166,69.0,False,0.718,0.711,...,-5.746,0.0,0.1100,0.1570,0.000055,0.1000,0.920,173.916,4.0,latino
336,0KKkJNfGyhkQ5aFogxQAPU,That's What I Like,3.444883,4PgleR09JVnm3zY1fW3XBA,0du5cEVh5yTK9QJze8zA0C,184,83.0,False,0.853,0.560,...,-4.961,1.0,0.0406,0.0130,0.000000,0.0944,0.860,134.066,4.0,dance


## Machine Learning

In [20]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity


# Choose numeric audio features for similarity
FEATURE_COLS = [
    "danceability", "energy", "loudness", "speechiness",
    "acousticness", "instrumentalness", "liveness",
    "valence", "tempo", "duration"
]

# Keep only rows that have these features + IDs/names
needed_cols = ["track_id", "track_name", "artist_id"] + FEATURE_COLS
tracks = track_df.loc[:, [c for c in needed_cols if c in track_df.columns]].copy()

# Drop dupes and obvious bad IDs
tracks = tracks.drop_duplicates(subset=["track_id"]).reset_index(drop=True)
tracks = tracks[tracks["track_id"].notna()].copy()

# Handle missing values in features (fill with column medians)
for c in FEATURE_COLS:
    if tracks[c].isna().any():
        tracks[c] = tracks[c].fillna(tracks[c].median())

# Optional: ensure numeric dtypes
for c in FEATURE_COLS:
    tracks[c] = pd.to_numeric(tracks[c], errors="coerce")



In [21]:
#Rescaling the numeric audio features to have mean of 0 and SD of 1 so all features have same importance

scaler = StandardScaler()
X = scaler.fit_transform(tracks[FEATURE_COLS].values)

In [22]:
# kNN index (we'll query with k+1 to drop the seed itself)
knn = NearestNeighbors(
    n_neighbors=11,              # 10 recs + 1 self
    metric="cosine",
    algorithm="brute"            # brute is fine for cosine on medium data
)
knn.fit(X)

# Keep references handy
id_to_row = {tid: i for i, tid in enumerate(tracks["track_id"])} #Builds a dictionary which has index stored as a value for each key (spotify track id)

In [23]:
def pick_seed_row(track_name_query: str, artist_id_hint: str | None = None) -> int:
    # Case-insensitive name match
    mask = tracks["track_name"].str.lower().str.strip() == track_name_query.lower().strip()
    subset = tracks[mask].copy()

    if artist_id_hint is not None:
        subset = subset[subset["artist_id"] == artist_id_hint]

    if subset.empty:
        # Fallback: contains() match
        mask2 = tracks["track_name"].str.lower().str.contains(track_name_query.lower().strip(), na=False)
        subset = tracks[mask2].copy()

        if artist_id_hint is not None:
            subset = subset[subset["artist_id"] == artist_id_hint]

    if subset.empty:
        raise ValueError(f"No track found for query='{track_name_query}' (artist_id_hint={artist_id_hint}).")

    # Prefer most popular if available, otherwise the first one
    if "popularity" in track_df.columns:
        subset = subset.merge(track_df[["track_id", "popularity"]], on="track_id", how="left")
        subset = subset.sort_values("popularity", ascending=False)

    seed_track_id = subset.iloc[0]["track_id"]
    return id_to_row[seed_track_id]


In [25]:
def recommend_similar_by_trackname(track_name: str,
                                   artist_id_hint: str | None = None,
                                   k: int = 10) -> pd.DataFrame:
    seed_idx = pick_seed_row(track_name, artist_id_hint)
    seed_vec = X[seed_idx].reshape(1, -1)

    distances, indices = knn.kneighbors(seed_vec, n_neighbors=k+1)  # +1 because first is the seed
    distances = distances.flatten()
    indices = indices.flatten()

    # Drop the seed itself (distance==0)
    keep = indices != seed_idx
    indices = indices[keep][:k]
    distances = distances[keep][:k]

    recs = tracks.iloc[indices][["track_id", "track_name", "artist_id"]].copy()
    recs["similarity"] = (1.0 - distances).round(4)

    # Optional: enrich with extra columns for display if you have them
    extra_cols = []
    for c in ["popularity", "album_id", "track_genre"]:
        if c in track_df.columns:
            extra_cols.append(c)
    if extra_cols:
        recs = recs.merge(track_df[["track_id"] + extra_cols], on="track_id", how="left")

    return recs


In [26]:
# ==========================
# Interactive Recommender
# ==========================

# Ask user for a song name
song_query = input("Enter a song name: ").strip()
try:
    k = int(input("How many recommendations? (default=10): ") or 10)
except ValueError:
    k = 10

# Run recommender
try:
    recommendations = recommend_similar_by_trackname(song_query, k=k)
    
    # Print results
    print("\n🎵 Recommendations based on:", song_query)
    display(recommendations)   # works nicely inside Jupyter
except ValueError as e:
    print("Error:", e)


Enter a song name:  lover
How many recommendations? (default=10):  4



🎵 Recommendations based on: lover


Unnamed: 0,track_id,track_name,artist_id,similarity,popularity,album_id,track_genre
0,1Ty11iX5DIhZRRD9zCHP3l,Melt My Heart to Stone,4dpARuHxo51G3z768sgnrY,0.8261,52.0,3uftDqGs13LsE1s8nn5XSe,british
1,4RVwu0g32PAqgUiJoXsdF8,Happier Than Ever,6qqNVTkY8uBg9cP3Jd7DAH,0.7949,88.0,0JGOiO34nwfUdDrD612dOp,pop
2,3hUxzQpSfdDqwM3ZTFQY0K,august,06HL4z0CvFAxyc27GXpf02,0.7674,86.0,2fenSS68JI1h4Fo296JfGr,pop
3,6VObnIkLVruX4UVyxWhlqm,Skyfall,4dpARuHxo51G3z768sgnrY,0.7641,78.0,6TwN6Lq9glwnG8kNp6chHY,british
