In [6]:
# Install spotipy if not installed
!pip install spotipy

import spotipy
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd

# --- AUTHENTICATION ---
client_id = "7f7305da4b824870aa236c2b1a1c33af"
client_secret = "d2f4fe1868b84321a089d6e08d7761b0"
redirect_uri = "http://127.0.0.1:8888/callback"

scope = (
    "user-read-recently-played "
    "user-top-read "
    "playlist-read-private "
    "user-library-read"
)

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=redirect_uri,
    scope=scope
))

# --- FUNCTIONS TO GET YOUR DATA ---

def get_recent_tracks(limit=50):
    """Get your recently played tracks."""
    results = sp.current_user_recently_played(limit=limit)
    data = []
    for item in results['items']:
        track = item['track']
        data.append({
            'track_name': track['name'],
            'artist': track['artists'][0]['name'],
            'album': track['album']['name'],
            'duration_ms': track['duration_ms'],
            'played_at': item['played_at']
        })
    return pd.DataFrame(data)


def get_top_tracks(limit=50, term='medium_term'):
    """Get your most listened tracks."""
    results = sp.current_user_top_tracks(limit=limit, time_range=term)
    data = []
    for track in results['items']:
        data.append({
            'track_name': track['name'],
            'artist': track['artists'][0]['name'],
            'popularity': track['popularity'],
            'album': track['album']['name'],
            'duration_ms': track['duration_ms'],
            'id': track['id']
        })
    return pd.DataFrame(data)


def get_playlist_tracks(playlist_id):
    """Get tracks from one playlist."""
    results = sp.playlist_tracks(playlist_id)
    data = []
    for item in results['items']:
        track = item['track']
        data.append({
            'track_name': track['name'],
            'artist': track['artists'][0]['name'],
            'album': track['album']['name'],
            'duration_ms': track['duration_ms'],
            'id': track["id"]
        })
    return pd.DataFrame(data)


# --- FETCH YOUR DATA ---

print("Fetching your recent tracks...")
df_recent = get_recent_tracks()

print("Fetching your top tracks...")
df_top = get_top_tracks()

# Example playlist: replace with any playlist ID from your profile
# df_playlist = get_playlist_tracks("PLAYLIST_ID_HERE")

# --- SAVE DATASETS ---
df_recent.to_csv("my_recent_spotify_tracks.csv", index=False)
df_top.to_csv("my_top_spotify_tracks.csv", index=False)

print("Datasets created successfully!")


Fetching your recent tracks...
Fetching your top tracks...
Datasets created successfully!


# Spotify Genre-Based Playlist Generator (Clustering)
**Multi-Genre support with Classical demo**

This notebook loads your Spotify track dataset, clusters tracks per genre using audio features, automatically assigns a mood label to each cluster, visualizes clusters (PCA) and saves top-N playlists per cluster.


In [15]:
# Run this only if you need to install libraries (uncomment)
# !pip install scikit-learn matplotlib pandas

import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

print("Libraries loaded")


Libraries loaded


In [16]:
# Change path to the folder where your notebook and datasets live (VS Code: current workspace)
# Example windows path (use raw string r"...") or forward slashes:
# path = r"C:\Users\DELL\OneDrive\Desktop\Data Science Project"
path = os.getcwd()   # use current notebook folder by default
print("Working directory:", path)
print("Files in working directory:")
print(sorted(os.listdir(path)))

# If your CSVs are elsewhere, update `path` above and re-run this cell.


Working directory: c:\Users\DELL\OneDrive\Desktop\Data Science Project
Files in working directory:
['.cache', 'Untitled-1.ipynb', 'my_recent_spotify_tracks.csv', 'my_top_spotify_tracks.csv', 'playlists_output', 'train.csv']


In [18]:
# Pick the CSV with audio features. Common names you might have:
candidates = [f for f in os.listdir(path) if f.lower().endswith('.csv')]
print("CSV files found:", candidates)

# Change this to the exact file name if multiple CSVs appear
# Example: fname = "my_top_spotify_tracks.csv"
# If you want auto-selection, uncomment the heuristics below.

# Auto-select dataset that contains key audio features
preferred_cols = {'danceability','energy','valence','tempo'}
chosen = None
for f in candidates:
    try:
        tmp = pd.read_csv(os.path.join(path, f), nrows=5)
        cols = set([c.lower() for c in tmp.columns])
        if preferred_cols.issubset(cols):
            chosen = f
            break
    except Exception:
        continue

if chosen is None:
    # fallback: use first CSV in folder
    chosen = candidates[0] if candidates else None

if chosen is None:
    raise FileNotFoundError("No CSV file found. Upload your dataset CSV and rerun.")
    
print("Loading dataset:", chosen)
df = pd.read_csv(os.path.join(path, chosen))
print("Dataset shape:", df.shape)
display(df.head())


CSV files found: ['my_recent_spotify_tracks.csv', 'my_top_spotify_tracks.csv']
Loading dataset: my_recent_spotify_tracks.csv
Dataset shape: (50, 5)


Unnamed: 0,track_name,artist,album,duration_ms,played_at
0,Vannam Konda,S. P. Balasubrahmanyam,Sigaram (Original Motion Picture Soundtrack),113711,2025-11-12T05:29:25.036Z
1,Nithiyathil Erupeerum,S. P. Balasubrahmanyam,Sigaram (Original Motion Picture Soundtrack),86674,2025-11-11T17:23:46.631Z
2,Pulikku Perandhavane,S. P. Balasubrahmanyam,Sigaram (Original Motion Picture Soundtrack),109244,2025-11-11T17:21:20.968Z
3,Vannam Konda,S. P. Balasubrahmanyam,Sigaram (Original Motion Picture Soundtrack),305136,2025-11-11T17:19:28.131Z
4,Idho Idho En Pallavi,S. P. Balasubrahmanyam,Sigaram (Original Motion Picture Soundtrack),278987,2025-11-11T17:14:22.661Z


In [20]:
# Keep only rows with the required features
df_clean = df.dropna(subset=available_feats).reset_index(drop=True)

# Ensure numeric conversion
for f in available_feats:
    df_clean[f] = pd.to_numeric(df_clean[f], errors='coerce')
df_clean = df_clean.dropna(subset=available_feats).reset_index(drop=True)

# Deduplicate by track_id or (track_name, artists) if possible
if 'track_id' in df_clean.columns:
    df_clean = df_clean.drop_duplicates(subset=['track_id']).reset_index(drop=True)
elif set(['track_name','artists']).issubset(df_clean.columns):
    df_clean = df_clean.drop_duplicates(subset=['track_name','artists']).reset_index(drop=True)

print("Cleaned dataset shape:", df_clean.shape)
display(df_clean[ ['track_name','artists'] + available_feats ].head())


KeyError: ['popularity']