In [1]:
import pandas as pd
import os
from pathlib import Path
from dotenv import load_dotenv
import spotipy
from tqdm import tqdm
import numpy as np
from spotipy.oauth2 import SpotifyClientCredentials
import time

In [9]:
request_timer = 0.15
person = "selina"
data_source = "./data/" + person

In [10]:
load_dotenv()
ID = os.getenv("ID_B")
SECRET = os.getenv("SECRET_B")

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=ID, client_secret=SECRET))

### 1. History cleaning
- load all .json files containing the extended history
- drop unwanted columns
- rename columns

In [22]:
paths = Path(data_source).glob("Streaming*.json")
df = pd.concat(map(pd.read_json, paths))
df = df.drop(["ip_addr_decrypted", "user_agent_decrypted", "episode_name", "episode_show_name", "spotify_episode_uri", "offline", "incognito_mode", "master_metadata_album_artist_name"], axis=1)
df = df.rename({"master_metadata_track_name":"track_name", "master_metadata_album_album_name": "album_name", "spotify_track_uri":"track_uri"}, axis=1)
df = df.dropna(subset="track_uri")
df.head(3)

Unnamed: 0,ts,username,platform,ms_played,conn_country,track_name,album_name,track_uri,reason_start,reason_end,shuffle,skipped,offline_timestamp
0,2021-12-28T22:07:51Z,selina.schuchmann,"Android OS 11 API 30 (samsung, SM-A528B)",23,DE,Womanizer,Circus (Deluxe Version),spotify:track:4fixebDZAVToLbUCuEloa2,fwdbtn,fwdbtn,False,,1640729000000.0
1,2021-12-28T22:07:53Z,selina.schuchmann,"Android OS 11 API 30 (samsung, SM-A528B)",1438,DE,Boom Boom Pow,THE E.N.D. (THE ENERGY NEVER DIES),spotify:track:3oDFtOhcN08qeDPAK6MEQG,fwdbtn,fwdbtn,False,,1640729000000.0
2,2021-12-28T22:07:53Z,selina.schuchmann,"Android OS 11 API 30 (samsung, SM-A528B)",328,DE,Shut Up And Drive,Good Girl Gone Bad: Reloaded,spotify:track:4Tn2llBm1g0UlWctmgPL8Z,fwdbtn,fwdbtn,False,,1640729000000.0


### 2. Unique track uris
> **Some uris point to the same track!** \
> E.g. track with two artists, both realease it under different titles, still same track. **But** the isrc code is still the same. This can be used to combine the data.
- extract unique track uris
- download data for every uri
    - build tracks dataframe
    - build artists dataframe
    - build build tracks - artists dataframe; join on isrc

`TRACK <-isrc-> TRACK_ARTIST <-artist_uri-> ARTIST`

In [23]:
unique_uris = df["track_uri"].unique()
len(unique_uris)

36190

In [18]:
process = True
if process:
    artist_data = []
    artist_track_data = []
    tracks_data = []
    for chunk in tqdm(np.array_split(unique_uris, len(unique_uris)/45)):
        try:
            result = sp.tracks(chunk, market="DE")["tracks"]
            time.sleep(request_timer)
            result_nonull = []
            for track in result:
                if track != None:
                    tracks_data.append(track)
                    for artist in track["artists"]:
                        artist_data.append([artist["name"], artist["uri"]])
                        artist_track_data.append([artist["uri"], track["external_ids"].get("isrc", None)])
                    
        except Exception as e:
            print(e)
            break

    artists_tracks = pd.DataFrame(artist_track_data, columns=["artist_uri", "isrc"]).dropna()
    artists_tracks.to_csv("./spotify_downloads/" + person + "/artist_track.csv", index=False)

100%|█████████████████████████████████████████| 804/804 [05:08<00:00,  2.61it/s]


In [46]:
df_artists = pd.DataFrame(sum(pd.DataFrame(tracks_data)["artists"], [])).drop(["external_urls", "href", "type"], axis=1).drop_duplicates().reset_index(drop=True)
df_artists.to_csv("./spotify_downloads/" + person + "/artists.csv", index=False)
print(len(df_artists), "unique artists")
df_artists.head(3)

18463 unique artists


Unnamed: 0,id,name,uri
0,26dSoYclwsYLMAKD3tpOr4,Britney Spears,spotify:artist:26dSoYclwsYLMAKD3tpOr4
1,1yxSLGMDHlW21z4YXirZDS,Black Eyed Peas,spotify:artist:1yxSLGMDHlW21z4YXirZDS
2,5pKCCKE2ajJHZ9KAiaK11H,Rihanna,spotify:artist:5pKCCKE2ajJHZ9KAiaK11H


In [41]:
df_tracks = pd.DataFrame(tracks_data).drop(["album", "artists", "disc_number", "is_playable", "is_local", "preview_url", "href", "id", "track_number", "type", "linked_from", "restrictions"], axis=1)
df_tracks = df_tracks.join(pd.DataFrame(df_tracks["external_ids"].values.tolist())).drop(["external_ids", "external_urls"], axis=1).dropna().reset_index(drop=True)
df_tracks.to_csv("./spotify_downloads/" + person + "/tracks_unified_isrc.csv", index=False)
df_tracks.head(3)

Unnamed: 0,duration_ms,explicit,name,popularity,uri,isrc
0,224400,False,Womanizer,72,spotify:track:4fixebDZAVToLbUCuEloa2,USJI10800838
1,251440,True,Boom Boom Pow,64,spotify:track:3oDFtOhcN08qeDPAK6MEQG,USUM70955624
2,212280,False,Shut Up And Drive,65,spotify:track:4Tn2llBm1g0UlWctmgPL8Z,USUM70734703


In [47]:
print("Unique ISRCs:", len(df_tracks.isrc.unique()))
print("Unique URIs :", len(df_tracks.uri.unique()))

Unique ISRCs: 32578
Unique URIs : 34200


To correctly associate an individual play with the corresponding song, the history must be extended with the tracks isrc. \
**First check:** how many track_uris are missing in the tracks dataframe.

In [59]:
print("Plays overall:    ", len(df))
print("Plays with isrc:  ", len(df.loc[df.track_uri.isin(df_tracks.uri)]))
print("Plays without isrc:", len(df.loc[~df.track_uri.isin(df_tracks.uri)]))

Plays overall:     233983
Plays with isrc:   206457
Plays without isrc: 27526


In [60]:
df.loc[~df.track_uri.isin(df_tracks.uri)]

Unnamed: 0,ts,username,platform,ms_played,conn_country,track_name,album_name,track_uri,reason_start,reason_end,shuffle,skipped,offline_timestamp
5,2021-12-28T22:07:59Z,selina.schuchmann,"Android OS 11 API 30 (samsung, SM-A528B)",1637,DE,Baby Boy (feat. Beyoncé ),Dutty Rock,spotify:track:1uVfUdVv0h9MWia3tdZo5G,fwdbtn,fwdbtn,False,,1.640729e+12
12,2021-12-28T22:08:08Z,selina.schuchmann,"Android OS 11 API 30 (samsung, SM-A528B)",1134,DE,Milkshake,Tasty,spotify:track:2cMTIlktg3M9mXYqCPqw1J,fwdbtn,fwdbtn,False,,1.640729e+12
49,2021-12-29T08:30:40Z,selina.schuchmann,"Android OS 11 API 30 (samsung, SM-A528B)",4191,DE,Fever,King Of The Dancehall,spotify:track:4C1Smq2t3MVOPzsUPWW0oX,fwdbtn,fwdbtn,False,,1.640767e+12
57,2021-12-29T08:39:18Z,selina.schuchmann,"Android OS 11 API 30 (samsung, SM-A528B)",219166,DE,Kids (2 Finger an den Kopf),Zum Glück in die Zukunft II,spotify:track:6lxTIMeCLEkj48TlCRG3XX,clickrow,fwdbtn,False,,1.640767e+12
72,2021-12-29T15:26:59Z,selina.schuchmann,"Android OS 11 API 30 (samsung, SM-A528B)",732,DE,OMG!,Zum Glück in die Zukunft II,spotify:track:6ngfhauAytTZsaQgZyJIeK,fwdbtn,fwdbtn,False,,1.640792e+12
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4182,2024-06-04T12:36:04Z,selina.schuchmann,android,155151,DE,MILLION DOLLAR BABY,MILLION DOLLAR BABY,spotify:track:7fzHQizxTqy8wTXwlrgPQQ,clickrow,trackdone,True,False,1.717504e+09
4198,2024-06-05T11:35:07Z,selina.schuchmann,android,155151,DE,MILLION DOLLAR BABY,MILLION DOLLAR BABY,spotify:track:7fzHQizxTqy8wTXwlrgPQQ,clickrow,trackdone,True,False,1.717587e+09
4204,2024-06-06T05:13:32Z,selina.schuchmann,android,23487,DE,Unity,Unity,spotify:track:47qYqGPgMTh3l1PRKBHEfI,clickrow,endplay,False,True,1.717651e+09
4208,2024-06-06T05:13:56Z,selina.schuchmann,android,5071,DE,Time Lapse,Time Lapse,spotify:track:3v8aPisqoGRZEwwf2rCeXQ,clickrow,endplay,False,True,1.717651e+09


In [64]:
tracks_fix = pd.DataFrame(tracks_data).drop(["album", "artists", "disc_number", "is_playable", "is_local", "preview_url", "href", "id", "track_number", "type", "linked_from", "restrictions", "external_urls"], axis=1)
tracks_fix.head(3)

Unnamed: 0,duration_ms,explicit,external_ids,name,popularity,uri
0,224400,False,{'isrc': 'USJI10800838'},Womanizer,72,spotify:track:4fixebDZAVToLbUCuEloa2
1,251440,True,{'isrc': 'USUM70955624'},Boom Boom Pow,64,spotify:track:3oDFtOhcN08qeDPAK6MEQG
2,212280,False,{'isrc': 'USUM70734703'},Shut Up And Drive,65,spotify:track:4Tn2llBm1g0UlWctmgPL8Z


In [65]:
tracks_fix.loc[tracks_fix.uri == "spotify:track:47qYqGPgMTh3l1PRKBHEfI"]

Unnamed: 0,duration_ms,explicit,external_ids,name,popularity,uri


In [68]:
uris_fix = []
for chunk in np.array_split(unique_uris, len(unique_uris)/45):
    uris_fix = [*uris_fix, *chunk]

print("Plays with chunks:", len(uris_fix))
print("Plays without chunks:", len(df.track_uri.unique()))

Plays with chunks: 36190
Plays without chunks: 36190


In [72]:
uris_fix.index("spotify:track:3VvBPkc24zC7x05mgJTyGO")

36121