# Adding Genre Data via Spotipy and Spotify API Access
- Adding genre information lets me do more in-depth analysis of my listening data.

In [7]:
import os
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Make the variables defined in the .env file available before reading them
load_dotenv()

# Spotify API credentials, read from the environment (None when unset)
client_id = os.environ.get('SPOTIPY_CLIENT_ID')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET')

# Build the Spotify client used by the rest of the notebook
auth_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = spotipy.Spotify(auth_manager=auth_manager)

In [None]:
import pandas as pd
import os
import json

# --- Paths ---
# The genre cache is stored in the same folder as the streaming-history
# exports, so it has to be excluded when the history files are collected below.
data_path = '../data/Spotify Extended Streaming History'
genre_map_path = os.path.join(data_path, 'artist_genre_map.json')


def _load_genre_cache(path):
    """Return the cached artist -> genre-list mapping stored at `path`.

    Returns an empty dict when the file does not exist or is empty, so the
    caller can simply fetch every artist that is not in the mapping yet.
    Raises json.JSONDecodeError if the file exists but is not valid JSON.
    """
    if not os.path.exists(path):
        return {}
    print("Loading artist_genre_map from cache...")
    if os.stat(path).st_size == 0:  # Check if the file is empty
        print("The artist_genre_map.json file is empty. Initializing an empty dictionary.")
        return {}
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _save_genre_cache(path, mapping):
    """Write `mapping` to `path` as JSON.

    The data is written to a sibling temp file and then moved over the target
    with os.replace, so an interrupted write does not leave a truncated cache.
    The temp file does not end in '.json', so the history loader ignores it.
    """
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(mapping, f, indent=2)
    os.replace(tmp_path, path)


def _fetch_genres(artist):
    """Return the genre list of the top Spotify search hit for `artist` ([] if no hit).

    Uses the notebook-level `sp` client. Any exception from the API call
    propagates to the caller.
    """
    # NOTE(review): only the first search result is used; it is not checked
    # against the requested name, so it may be a different artist.
    results = sp.search(q=f'artist:{artist}', type='artist', limit=1)
    items = results['artists']['items']
    return items[0]['genres'] if items else []


# --- Load the streaming history ---
# os.listdir returns names in arbitrary order; sort so the row order of `df`
# is the same on every run.
history_files = sorted(
    name for name in os.listdir(data_path)
    if name.endswith('.json') and name != os.path.basename(genre_map_path)
)
if not history_files:
    raise FileNotFoundError(f"No streaming-history .json files found in {data_path!r}")

dfs = [pd.read_json(os.path.join(data_path, name)) for name in history_files]
df = pd.concat(dfs, ignore_index=True)

# Clean up the data
df['ts'] = pd.to_datetime(df['ts'])
df['year'] = df['ts'].dt.year
df['month'] = df['ts'].dt.month
df['hour'] = df['ts'].dt.hour

df = df.rename(columns={'master_metadata_album_artist_name': 'artist', 'master_metadata_track_name': 'track'})

# Get all unique artists from your listening history
unique_artists = df['artist'].dropna().unique()
print(f"Found {len(unique_artists)} unique artists in your listening history")

# --- Build / complete the artist -> genres map ---
artist_genre_map = _load_genre_cache(genre_map_path)

# Only artists that are not cached yet are looked up. This also covers a
# missing or empty cache file (everything is fetched) and artists that appeared
# in the history after the cache was first written.
missing_artists = [artist for artist in unique_artists if artist not in artist_genre_map]

if missing_artists:
    print("Generating artist_genre_map from Spotify API...")
    failed = 0
    try:
        for processed, artist in enumerate(missing_artists, start=1):
            try:
                artist_genre_map[artist] = _fetch_genres(artist)
            except Exception as e:
                # Do not cache failures: the artist stays out of the map and is
                # retried on the next run instead of being stored as "no genres".
                print(f"Error with {artist}: {e}")
                failed += 1

            if processed % 50 == 0:
                print(f"Processed {processed}/{len(missing_artists)} artists")
                # Checkpoint so a crash or kernel restart keeps the work done so far
                _save_genre_cache(genre_map_path, artist_genre_map)
    finally:
        # Runs on normal completion and on interruption (e.g. KeyboardInterrupt)
        _save_genre_cache(genre_map_path, artist_genre_map)
    print("Saved artist_genre_map to JSON.")
    if failed:
        print(f"{failed} artists could not be fetched and will be retried on the next run.")

ValueError: All arrays must be of the same length

In [None]:
# --- Tag every play with its artist's first listed genre ---
def _primary_genre(artist_name):
    """Return the first genre stored for the artist, or 'Unknown' if none is stored."""
    cached_genres = artist_genre_map.get(artist_name)
    if not cached_genres:
        return 'Unknown'
    return cached_genres[0]


df['genre'] = df['artist'].map(_primary_genre)

# Keep only the plays whose genre could be resolved, and report how many were dropped
before = len(df)
has_genre = df['genre'].ne('Unknown')
df = df[has_genre]
after = len(df)
print(f"Dropped {before - after:,} plays with unknown genres")

# --- Summary stats ---
genre_counts = df['genre'].value_counts()
total_plays = len(df)

print(f"\nTotal plays: {total_plays:,}")
print(f"Unique genres: {df['genre'].nunique()}")
print(f"\nTop 10 genres by play count:\n{genre_counts.head(10)}")

# --- Sample of artists with genres ---
# One row per distinct (artist, genre) pair, first 20 shown
print("\nSample of artists with their genres:")
print(df[['artist', 'genre']].drop_duplicates().head(20))

Dropped 120,702 plays with unknown genres

Total plays: 118,263
Unique genres: 378

Top 10 genres by play count:
genre
rap                     41052
rage rap                14310
melodic rap             13662
hip hop                  6361
r&b                      2723
chicago drill            1980
metal                    1800
alternative hip hop      1796
southern hip hop         1765
experimental hip hop     1678
Name: count, dtype: int64

Sample of artists with their genres:
                                             artist              genre
0                                     Drowning Pool           nu metal
1                                       DevilDriver       groove metal
2                                         Disturbed              metal
3                                         Metallica              metal
4                          Rage Against The Machine          rap metal
5                                       Linkin Park           nu metal
8                   