In [1]:
import pandas as pd
import os
from pathlib import Path
from dotenv import load_dotenv
import spotipy
from tqdm import tqdm
import numpy as np
from spotipy.oauth2 import SpotifyClientCredentials
import time

tqdm.pandas()

In [92]:
request_timer = 0.15
person = "rob"
data_source = "./data/" + person

In [93]:
load_dotenv()
ID = os.getenv("ID_B")
SECRET = os.getenv("SECRET_B")

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=ID, client_secret=SECRET))

### 1. History cleaning
- load all .json files containing the extended history
- drop unwanted columns
- rename columns

In [94]:
paths = Path(data_source).glob("Streaming*.json")
df = pd.concat(map(pd.read_json, paths))
df = df.drop(["ip_addr_decrypted", "user_agent_decrypted", "episode_name", "episode_show_name", "spotify_episode_uri", "offline", "incognito_mode", "master_metadata_album_artist_name"], axis=1)
df = df.rename({"master_metadata_track_name":"track_name", "master_metadata_album_album_name": "album_name", "spotify_track_uri":"track_uri"}, axis=1)
df = df.dropna(subset="track_uri").reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125013 entries, 0 to 125012
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ts                 125013 non-null  object 
 1   username           125013 non-null  object 
 2   platform           125013 non-null  object 
 3   ms_played          125013 non-null  int64  
 4   conn_country       125013 non-null  object 
 5   track_name         125013 non-null  object 
 6   album_name         125013 non-null  object 
 7   track_uri          125013 non-null  object 
 8   reason_start       125013 non-null  object 
 9   reason_end         118138 non-null  object 
 10  shuffle            125013 non-null  bool   
 11  skipped            17002 non-null   object 
 12  offline_timestamp  125013 non-null  float64
dtypes: bool(1), float64(1), int64(1), object(10)
memory usage: 11.6+ MB


### 2. Unique track uris
> **Some uris point to the same track!** \
> E.g. track with two artists, both realease it under different titles, still same track. **But** the isrc code is still the same. This can be used to combine the data.
- extract unique track uris
- download data for every uri
    - build tracks dataframe
    - build artists dataframe
    - build build tracks - artists dataframe; join on isrc

`TRACK <-isrc-> TRACK_ARTIST <-artist_uri-> ARTIST`

In [95]:
unique_uris = df["track_uri"].unique()
len(unique_uris)

12893

In [96]:
process = True
if process:
    artist_data = []
    artist_track_data = []
    tracks_data = []
    for chunk in tqdm(np.array_split(unique_uris, len(unique_uris)/45)):
        try:
            result = sp.tracks(chunk, market="DE")["tracks"]
            time.sleep(request_timer)
            result_nonull = []
            for i, track in enumerate(result):
                if track != None:
                    track["orig_uri"] = chunk[i]
                    tracks_data.append(track)
                    for artist in track["artists"]:
                        artist_data.append([artist["name"], artist["uri"]])
                        artist_track_data.append([artist["uri"], track["external_ids"].get("isrc", None)])
                    
        except Exception as e:
            print(e)
            break

    artists_tracks = pd.DataFrame(artist_track_data, columns=["artist_uri", "isrc"]).dropna()
    artists_tracks.to_csv("./spotify_downloads/" + person + "/artist_track.csv", index=False)

100%|█████████████████████████████████████████| 286/286 [01:31<00:00,  3.12it/s]


In [97]:
df_artists = pd.DataFrame(sum(pd.DataFrame(tracks_data)["artists"], [])).drop(["external_urls", "href", "type"], axis=1).drop_duplicates().reset_index(drop=True)
df_artists.to_csv("./spotify_downloads/" + person + "/artists.csv", index=False)
print(len(df_artists), "unique artists")
df_artists.head(3)

6740 unique artists


Unnamed: 0,id,name,uri
0,7dGJo4pcD2V6oG8kP0tJRR,Eminem,spotify:artist:7dGJo4pcD2V6oG8kP0tJRR
1,5pKCCKE2ajJHZ9KAiaK11H,Rihanna,spotify:artist:5pKCCKE2ajJHZ9KAiaK11H
2,3hImYGOkGn8Rsh9Vygcds2,Maps,spotify:artist:3hImYGOkGn8Rsh9Vygcds2


In [98]:
df_tracks = pd.DataFrame(tracks_data).drop(["album", "artists", "disc_number", "is_playable", "is_local", "preview_url", "href", "id", "track_number", "type", "linked_from", "restrictions"], axis=1)
df_tracks = df_tracks.join(pd.DataFrame(df_tracks["external_ids"].values.tolist())).drop(["external_ids", "external_urls"], axis=1).dropna().reset_index(drop=True)
df_tracks.to_csv("./spotify_downloads/" + person + "/tracks_unified_isrc.csv", index=False)
df_tracks.head(3)

Unnamed: 0,duration_ms,explicit,name,popularity,uri,orig_uri,isrc
0,250760,True,Mockingbird,86,spotify:track:561jH07mF1jHuk7KlaeF0s,spotify:track:17baAghWcrewNOcc9dCewx,USIR10400813
1,250188,True,The Monster,78,spotify:track:48RrDBpOSSl1aLVCalGl5C,spotify:track:5U8hKxSaDXB8cVeLFQjvwx,USUM71314082
2,272417,True,Survival,60,spotify:track:3stOygN0I7CIvkEB2LJGbv,spotify:track:29Key5Lj0YlIMH8JzRDy6U,USUM71312879


In [99]:
df_tracks.loc[df_tracks.uri != df_tracks.orig_uri]

df_test = df_tracks.copy()
df_test["orig_track_name"] = df_test.orig_uri.progress_map(lambda r: df.loc[df.track_uri == r].head(1).track_name.values[0])
df_test.loc[(df_test.name != df_test.orig_track_name) & (df_test.uri != df_test.orig_uri)].orig_track_name.isna().count()

100%|█████████████████████████████████████| 12891/12891 [02:30<00:00, 85.75it/s]


560

In [100]:
df_test.to_csv("./spotify_downloads/" + person + "/tracks")

In [101]:
print("Unique ISRCs:", len(df_tracks.isrc.unique()))
print("Unique URIs :", len(df_tracks.uri.unique()))

Unique ISRCs: 11551
Unique URIs : 11859


To correctly associate an individual play with the corresponding song, the history must be extended with the tracks isrc. \
**First check:** how many track_uris are missing in the tracks dataframe.

In [102]:
print("Plays overall:    ", len(df))
print("Plays with isrc:  ", len(df.loc[df.track_uri.isin(df_tracks.uri)]))
print("Plays without isrc:", len(df.loc[~df.track_uri.isin(df_tracks.uri)]))

Plays overall:     125013
Plays with isrc:   87482
Plays without isrc: 37531


In [103]:
uris_fix = []
for chunk in np.array_split(unique_uris, len(unique_uris)/45):
    uris_fix = [*uris_fix, *chunk]

print("Plays with chunks:", len(uris_fix))
print("Plays without chunks:", len(df.track_uri.unique()))

Plays with chunks: 12893
Plays without chunks: 12893


### 3. Add ISRC to history

In [104]:
tracks_isrc = pd.read_csv("./spotify_downloads/" + person + "/tracks_unified_isrc.csv")
print(len(tracks_isrc))
tracks_isrc.head(3)

12891


Unnamed: 0,duration_ms,explicit,name,popularity,uri,orig_uri,isrc
0,250760,True,Mockingbird,86,spotify:track:561jH07mF1jHuk7KlaeF0s,spotify:track:17baAghWcrewNOcc9dCewx,USIR10400813
1,250188,True,The Monster,78,spotify:track:48RrDBpOSSl1aLVCalGl5C,spotify:track:5U8hKxSaDXB8cVeLFQjvwx,USUM71314082
2,272417,True,Survival,60,spotify:track:3stOygN0I7CIvkEB2LJGbv,spotify:track:29Key5Lj0YlIMH8JzRDy6U,USUM71312879


In [105]:
tracks_isrc_unique = tracks_isrc.drop_duplicates("uri")
print(len(tracks_isrc_unique))
tracks_isrc_unique.to_csv("./spotify_downloads/" + person + "/tracks_isrc_unique.csv", index=False)
tracks_isrc_unique.head(3)

11859


Unnamed: 0,duration_ms,explicit,name,popularity,uri,orig_uri,isrc
0,250760,True,Mockingbird,86,spotify:track:561jH07mF1jHuk7KlaeF0s,spotify:track:17baAghWcrewNOcc9dCewx,USIR10400813
1,250188,True,The Monster,78,spotify:track:48RrDBpOSSl1aLVCalGl5C,spotify:track:5U8hKxSaDXB8cVeLFQjvwx,USUM71314082
2,272417,True,Survival,60,spotify:track:3stOygN0I7CIvkEB2LJGbv,spotify:track:29Key5Lj0YlIMH8JzRDy6U,USUM71312879


In [106]:
tqdm.pandas()
def try_get(array):
    try:
        return array[0]
    except:
        return None
#df["isrc"] = df.track_uri.progress_apply(lambda r: df_tracks[(df_tracks.orig_uri == r) | (df_tracks.uri == r)].isrc.iloc[0])
df["isrc"] = df.track_uri.progress_apply(lambda r: try_get(df_tracks[(df_tracks.orig_uri == r) | (df_tracks.uri == r)].isrc.values))

100%|██████████████████████████████████| 125013/125013 [03:59<00:00, 522.91it/s]


In [91]:
df.isrc

0         QMUY41500182
1         QMUY41500182
2         GBARL1501358
3         GB28K1500110
4         USRC11600042
              ...     
112616    QM24S2400597
112617    QM24S2403030
112618    QZK6H2208200
112619    QZK6H2208200
112620    USUM72401994
Name: isrc, Length: 112621, dtype: object

In [None]:
df[df.track_name == "Can't Feel My Face"].track_uri.unique()

In [None]:
df[df.track_name == "Can't Feel My Face"].apply(lambda r: tracks_isrc[(tracks_isrc.name == r.track_name) & (tracks_isrc.duration_ms == r.duration_ms)].isrc, axis=1)

In [None]:
df[df.track_name == "Can't Feel My Face"].apply(lambda r: tracks_isrc[tracks_isrc.uri == r.uri].isrc, axis=1)

In [107]:
df.to_csv("./spotify_downloads/" + person + "/history_isrc.csv", index=False)

### 4. Track features

In [19]:
audio_features = pd.DataFrame(columns=[
    "uri",
    "acousticness",
    "danceability",
    "duration_ms",
    "energy",
    "instrumentalness",
    "key",
    "liveness",
    "loudness",
    "mode",
    "speechiness",
    "tempo",
    "time_signature",
    "valence",
])

In [20]:
tracks_isrc_unique = pd.read_csv("./spotify_downloads/" + person + "/tracks_isrc_unique.csv")
tracks_isrc_unique.head(3)

Unnamed: 0,duration_ms,explicit,name,popularity,uri,orig_uri,isrc
0,166138,False,Light It Up (feat. Nyla & Fuse ODG) [Remix],77,spotify:track:1pjvlYOMIg1NhGQbM6iwrY,spotify:track:6lDo13SSgTv0WbyUQKgnjk,QMUY41500182
1,180066,False,White Tiger - Single Version,54,spotify:track:64I0PKLFEKlcvc7fEVUGq0,spotify:track:64I0PKLFEKlcvc7fEVUGq0,GBARL1501358
2,193333,False,Bang My Head (feat. Sia & Fetty Wap),56,spotify:track:53Y0kdCa1CZ9gRqEuknfwy,spotify:track:53Y0kdCa1CZ9gRqEuknfwy,GB28K1500110


In [22]:
process = True
if process:
    for chunk in tqdm(np.array_split(tracks_isrc_unique, len(tracks_isrc_unique)/90)):
        try:
            time.sleep(request_timer)
            uris = chunk["uri"].to_numpy()
            result = sp.audio_features(uris)
            result_nonull = []
            for obj in result:
                if obj != None:
                    result_nonull.append(obj)
            chunk_df = pd.DataFrame(result_nonull)
            audio_features = pd.concat([audio_features, chunk_df], join="inner")
        except Exception as e:
            print(e)
            break
    audio_features.to_csv("./spotify_downloads/" + person + "/audio_features.csv", index=False)

100%|████████████████████████████████████████████████████████████████████████████████| 148/148 [00:47<00:00,  3.09it/s]


In [23]:
audio_features_dropped = audio_features.drop_duplicates(subset="uri")
audio_features_dropped.to_csv("./spotify_downloads/" + person + "/audio_features.csv", index=False)

In [24]:
missed = len(tracks_isrc_unique) - len(audio_features_dropped)
print(missed, "tracks missing")

8 tracks missing


In [25]:
tracks_isrc_unique = pd.read_csv("./spotify_downloads/" + person + "/tracks_isrc_unique.csv")

In [26]:
tracks_with_features = tracks_isrc_unique.merge(audio_features_dropped, how="outer", on="uri")
tracks_with_features.to_csv("./spotify_downloads/" + person + "/tracks_with_features.csv")

### 5. Genres

In [108]:
artists = pd.read_csv("./spotify_downloads/" + person + "/artists.csv")
artists.head(3)

Unnamed: 0,id,name,uri
0,7dGJo4pcD2V6oG8kP0tJRR,Eminem,spotify:artist:7dGJo4pcD2V6oG8kP0tJRR
1,5pKCCKE2ajJHZ9KAiaK11H,Rihanna,spotify:artist:5pKCCKE2ajJHZ9KAiaK11H
2,3hImYGOkGn8Rsh9Vygcds2,Maps,spotify:artist:3hImYGOkGn8Rsh9Vygcds2


In [109]:
request = True
if request:
    artist_genres = []
    for chunk in tqdm(np.array_split(artists["uri"], len(artists)/45)):
        try:
            uris = chunk.to_numpy()
            result = sp.artists(uris)["artists"]
            time.sleep(request_timer)
            for artist in result:
                artist_genres.append([artist["uri"], artist["genres"]])
                
        except Exception as e:
            print(e)
            break

100%|█████████████████████████████████████████| 149/149 [00:53<00:00,  2.77it/s]


In [110]:
artist_genres_arrays = pd.DataFrame(artist_genres, columns=["uri", "genres"])
artist_genres_arrays.head(3)

Unnamed: 0,uri,genres
0,spotify:artist:7dGJo4pcD2V6oG8kP0tJRR,"[detroit hip hop, hip hop, rap]"
1,spotify:artist:5pKCCKE2ajJHZ9KAiaK11H,"[barbadian pop, pop, urban contemporary]"
2,spotify:artist:3hImYGOkGn8Rsh9Vygcds2,"[northamptonshire indie, nu gaze, shoegaze]"


In [111]:
genres = pd.DataFrame(sum(artist_genres_arrays["genres"].tolist(), []), columns=["genre"]).drop_duplicates()

genre_artist_data = []
for i, artist in artist_genres_arrays.iterrows():
    artist_genres = artist["genres"]
    for genre_id in genres[genres["genre"].isin(artist_genres)].to_numpy():
        genre_artist_data.append([artist["uri"], genre_id[0]])

artist_to_genre = pd.DataFrame(genre_artist_data, columns=["uri", "genre"])
artist_to_genre.to_csv("./spotify_downloads/" + person + "/artist_genre.csv", index=False)

In [112]:
artist_genre = pd.read_csv("./spotify_downloads/" + person + "/artist_genre.csv")
artist_genre.head(3)

Unnamed: 0,uri,genre
0,spotify:artist:7dGJo4pcD2V6oG8kP0tJRR,detroit hip hop
1,spotify:artist:7dGJo4pcD2V6oG8kP0tJRR,hip hop
2,spotify:artist:7dGJo4pcD2V6oG8kP0tJRR,rap


In [113]:
genres.to_csv("./spotify_downloads/" + person + "/genres.csv", index=False)
genres.head(3)

Unnamed: 0,genre
0,detroit hip hop
1,hip hop
2,rap
