In [36]:
import pandas as pd
import os
from pathlib import Path
from dotenv import load_dotenv
import spotipy
from tqdm import tqdm
import numpy as np
from spotipy.oauth2 import SpotifyClientCredentials
import time

In [37]:
request_timer = 1

In [2]:
paths = Path("./data/ole").glob("endsong*.json")
df = pd.concat(map(pd.read_json, paths))
df = df[df["spotify_track_uri"].notnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 78786 entries, 0 to 15904
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ts                                 78786 non-null  object 
 1   username                           78786 non-null  object 
 2   platform                           78786 non-null  object 
 3   ms_played                          78786 non-null  int64  
 4   conn_country                       78786 non-null  object 
 5   ip_addr_decrypted                  78786 non-null  object 
 6   user_agent_decrypted               78693 non-null  object 
 7   master_metadata_track_name         78786 non-null  object 
 8   master_metadata_album_artist_name  78786 non-null  object 
 9   master_metadata_album_album_name   78786 non-null  object 
 10  spotify_track_uri                  78786 non-null  object 
 11  episode_name                       0 non-null      object 


![](./docs/er.svg)

## Preparation
1. fetch Track Data from SpotifyWeb Api
    - [ ] id
    - [ ] duration
    - [ ] music analysis
    - [ ] audio features
2. artist data

In [44]:
load_dotenv()
ID = os.getenv("ID_B")
SECRET = os.getenv("SECRET_B")

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=ID, client_secret=SECRET))

## 1. Tracks Data

### 1.1 Existing data from the extended history

Drop all duplicate entries in the history, therey removing multiple plays.
However this is not sufficient to properly eliminate duplicate references to the same track, since one track can be played from different albums.
For Example, when a song is at first released as a single, and later within an album. Individual plays appear with a different `spotify_track_uri`, altough they should be counted as one.

To fix this, the online tool [Datablist](https://datablist.com) is used, to find and remove duplicates.
An entry is considered a duplicate if the artist, album name, and track name are identical.
Subsequently, the track uri's are merged into a comma-separated string, which is then transformed into an array for futher processing.
Tough the column is then no longer atomic, is is now easier to assign play entries, with different uris to the same song.

1. Import `unique_tracks.csv` into Datablist
2. Find duplicates based on `artist`, `track` and `album`
3. Merge `spotify_track_uri`, drop `album`

<div class="alert alert-block alert-info">
<b>TODO:</b>    
This should be automated, for processing new data.
</div>

In [4]:
unique_tracks = df[df["episode_name"].isnull()][["master_metadata_track_name", "master_metadata_album_album_name", "spotify_track_uri", "master_metadata_album_artist_name"]].drop_duplicates()
unique_tracks = unique_tracks.dropna()
unique_tracks.info()
#unique_tracks.to_csv("unique_tracks.csv")

<class 'pandas.core.frame.DataFrame'>
Index: 22705 entries, 0 to 15819
Data columns (total 4 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   master_metadata_track_name         22705 non-null  object
 1   master_metadata_album_album_name   22705 non-null  object
 2   spotify_track_uri                  22705 non-null  object
 3   master_metadata_album_artist_name  22705 non-null  object
dtypes: object(4)
memory usage: 886.9+ KB


In [6]:
tracks_cleaned = pd.read_csv("./processed/ole/unique_cleaned.csv")
tracks_cleaned["spotify_track_uri"] = tracks_cleaned["spotify_track_uri"].apply(lambda r: [s for s in r.split(",")])
print("Unique tracks after cleaning:", len(tracks_cleaned))
tracks_cleaned.head()

Unique tracks after cleaning: 21206


Unnamed: 0,master_metadata_track_name,master_metadata_album_album_name,spotify_track_uri,master_metadata_album_artist_name
0,STFU,"Long Time, No See.",[spotify:track:7DOq1mhfUI49SNVAlx8E1v],Aim Vision
1,Wo bist Du? (feat. Alligatoah),Wo bist Du? (feat. Alligatoah),"[spotify:track:7Dbq3XgIm824Od54EHIhBC, spotify...",Swiss
2,What I Know Is All Quicksand - rookery live tapes,rookery live tapes,[spotify:track:71E4QM6O81q0DX79Cg6EGk],Giant Rooks
3,Bentley Coupe,Bentley Coupe,[spotify:track:7aksJRV1H0pCds7FSiZ8ZI],Social House
4,Tauchen (feat. KYMA),Malu,[spotify:track:6yBIrzVvtYoL0meYNmDEHa],Tom Thaler & Basil


This separation still is not enough, because having two keys for every track makes it difficult to aggregate the data.
Therefore every uri in the `extended history`, that is related to another track, is replaced by the first uri in the `spotify_track_uri` array above.

<div class="alert alert-block alert-warning">
This only has to be done once. The resulting dataframe is strored as a `.csv`. 
</div>

In [8]:
process = False
if process:
    lookup = tracks_cleaned["spotify_track_uri"].to_frame()
    lookup.columns = ["from"]
    lookup["to"] = lookup["from"].map(lambda r: r[0])
    lookup.head()
    
    tqdm.pandas()
    df["spotify_track_uri"] = df["spotify_track_uri"].progress_map(lambda x: lookup[lookup["from"].str.join(" ").str.contains(x)]["to"].item())
    df.to_csv("./processed/ole/history.csv")

---
Once the tracks have been processed, the next stage is to prepare the relevant metrics.
The Spotify web API provides information about every track, namely audio features and analysis.

### 1.2 Audio Features

This is an example response for the audio features of "Time is Running Out" by "Muse".
```json
"acousticness": 0.00242,
"analysis_url": "https://api.spotify.com/v1/audio-analysis/2takcwOaAZWiXQijPHIx7B",
"danceability": 0.585,
"duration_ms": 237040,
"energy": 0.842,
"id": "2takcwOaAZWiXQijPHIx7B",
"instrumentalness": 0.00686,
"key": 9,
"liveness": 0.0866,
"loudness": -5.883,
"mode": 0,
"speechiness": 0.0556,
"tempo": 118.211,
"time_signature": 4,
"track_href": "https://api.spotify.com/v1/tracks/2takcwOaAZWiXQijPHIx7B",
"type": "audio_features",
"uri": "spotify:track:2takcwOaAZWiXQijPHIx7B",
"valence": 0.428
```

Since the uri's are now cleaned, we can just request the auto features via the spotify api and append them to our dataframe.

In [39]:
audio_features = pd.DataFrame(columns=[
    "uri",
    "acousticness",
    "danceability",
    "duration_ms",
    "energy",
    "instrumentalness",
    "key",
    "liveness",
    "loudness",
    "mode",
    "speechiness",
    "tempo",
    "time_signature",
    "valence",
])
audio_features.head()

Unnamed: 0,uri,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence


<div class="alert alert-block alert-warning">
Use with caution! The spotify rate limit currently block any api call. (At least to the audio-features endpoint)
</div>

In [38]:
import time
for chunk in tqdm(np.array_split(unique_tracks, len(unique_tracks)/90)):
    try:
        time.sleep(request_timer)
        uris = chunk["spotify_track_uri"].to_numpy()
        result = sp.audio_features(uris)
        result_nonull = []
        for obj in result:
            if obj != None:
                result_nonull.append(obj)
        chunk_df = pd.DataFrame(result_nonull)
        audio_features = pd.concat([audio_features, chunk_df], join="inner")
    except Exception as e:
        print(e)
        break
    

NameError: name 'unique_tracks' is not defined

In [27]:
audio_features.rename({"uri": "spotify_track_uri"}, inplace=True, axis=1)
audio_features

Unnamed: 0,spotify_track_uri,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,spotify:track:7DOq1mhfUI49SNVAlx8E1v,0.0699,0.817,147252,0.674,0.00639,3,0.12,-7.787,0,0.0476,119.999,4,0.79


In [28]:
unique_cleaned = pd.read_csv("./processed/ole/unique_cleaned.csv")

In [29]:
tracks_features = unique_cleaned.merge(audio_features, how="outer", on="spotify_track_uri")
tracks_features[tracks_features["energy"].notnull()]

Unnamed: 0,master_metadata_track_name,master_metadata_album_album_name,spotify_track_uri,master_metadata_album_artist_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
19648,STFU,"Long Time, No See.",spotify:track:7DOq1mhfUI49SNVAlx8E1v,Aim Vision,0.0699,0.817,147252,0.674,0.00639,3,0.12,-7.787,0,0.0476,119.999,4,0.79


## 2. Artists and Genres

In order to gain insight into the different genres listened to, the artist data must be queried.
Due to the nature of an n to n relationship, the artist and genre data is modelled as shown in the ER diagram above.

The problem is that only the main artist is given in the extended history, not the features. Also, the artist is only referenced by name and a spotify_id is required to request further data.
However, the tracks api also contains all information about the artist that contributed to the track. So in order to get the artist ids, we have to querry all tracks.

In [37]:
unique_cleaned.head()

Unnamed: 0,master_metadata_track_name,master_metadata_album_album_name,spotify_track_uri,master_metadata_album_artist_name
0,STFU,"Long Time, No See.",spotify:track:7DOq1mhfUI49SNVAlx8E1v,Aim Vision
1,Wo bist Du? (feat. Alligatoah),Wo bist Du? (feat. Alligatoah),"spotify:track:7Dbq3XgIm824Od54EHIhBC,spotify:t...",Swiss
2,What I Know Is All Quicksand - rookery live tapes,rookery live tapes,spotify:track:71E4QM6O81q0DX79Cg6EGk,Giant Rooks
3,Bentley Coupe,Bentley Coupe,spotify:track:7aksJRV1H0pCds7FSiZ8ZI,Social House
4,Tauchen (feat. KYMA),Malu,spotify:track:6yBIrzVvtYoL0meYNmDEHa,Tom Thaler & Basil


In [93]:
df_artist = pd.DataFrame(columns=["artist_name", "artist_uri", "genres"])
df_artist_track = pd.DataFrame(columns=["artist_uri", "track_uri"])

In [94]:
request = False
if request:
    artist_data = []
    artist_track_data = []
    for chunk in tqdm(np.array_split(unique_cleaned, len(unique_cleaned)/45)):
        try:
            uris = chunk["spotify_track_uri"].map(lambda x: x.split(",")[0]).to_numpy()
            result = sp.tracks(uris, market="DE")["tracks"]
            time.sleep(request_timer)
            result_nonull = []
            for track in result:
                if track != None:
                    for artist in track["artists"]:
                        artist_data.append([artist["name"], artist["uri"], artist.get("genres", None)])
                        artist_track_data.append([artist["uri"], track["uri"]])
                
        except Exception as e:
            print(e)
            break

  return bound(*args, **kwds)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 471/471 [09:33<00:00,  1.22s/it]


In [24]:
load_from_file = True

if load_from_file:
    df_artist = pd.read_csv("./processed/ole/artist.csv")
    df_artist_track = pd.read_csv("./processed/ole/artist_track.csv")
else:    
    df_artist_track = pd.DataFrame(artist_track_data, columns=["artist_uri", "track_uri"])
    df_artist_track.to_csv("./processed/ole/artist_track.csv")
    df_artist_track.info()
    
    df_artist = pd.DataFrame(artist_data, columns=["artist_name", "artist_uri", "genres"]).drop_duplicates()
    df_artist.to_csv("./processed/ole/artist.csv")
    df_artist.head()

In [27]:
df_artist = df_artist[["artist_name", "artist_uri"]]
df_artist = df_artist.drop_duplicates("artist_uri")
df_artist

Unnamed: 0,artist_name,artist_uri
0,Aim Vision,spotify:artist:2sis3xwzmvAPHBEOrztf0B
1,Swiss,spotify:artist:702Fgj9mIQ9bzfp9i9Dg7B
2,Swiss & Die Andern,spotify:artist:3d8f0YBZivistZ4Ohauncb
3,Alligatoah,spotify:artist:0r0R5nIjDY04TfxRM10Bcb
4,Giant Rooks,spotify:artist:5wD0owYApRtYmjPWavWKvb
...,...,...
11458,Remoe,spotify:artist:2Q570lQPWiuP2dCOP69jO3
11459,Kidaf,spotify:artist:5JBZrdeCKdPc2ScgCceX1s
11460,Reggie ‘N’ Bollie,spotify:artist:3qCIifFHqOTzTAC1Sww8ms
11461,ICEDEALER,spotify:artist:1jY586SXHigiZ1m3wJpjdh


Now that each artist has its URI, it is possible to request more specific data about them.

In [49]:
request = True
if request:
    artist_genres = []
    for chunk in tqdm(np.array_split(df_artist, len(df_artist)/45)):
        try:
            uris = chunk["artist_uri"]
            result = sp.artists(uris)["artists"]
            time.sleep(request_timer)
            for artist in result:
                artist_genres.append([artist["uri"], artist["genres"]])
                
        except Exception as e:
            print(e)
            break

100%|█████████████████████████████████████████| 254/254 [05:14<00:00,  1.24s/it]


In [51]:
df_genres_a = pd.DataFrame(artist_genres, columns=["artist_uri", "genres"])
df_genres_a.head()

Unnamed: 0,artist_uri,genres
0,spotify:artist:2sis3xwzmvAPHBEOrztf0B,[]
1,spotify:artist:702Fgj9mIQ9bzfp9i9Dg7B,[]
2,spotify:artist:3d8f0YBZivistZ4Ohauncb,"[antideutsche, german alternative rap]"
3,spotify:artist:0r0R5nIjDY04TfxRM10Bcb,[german pop]
4,spotify:artist:5wD0owYApRtYmjPWavWKvb,[]


In [61]:
df_genres = pd.DataFrame(sum(df_genres_a["genres"].tolist(), []), columns=["genre"]).drop_duplicates()
df_genres

Unnamed: 0,genre
0,antideutsche
1,german alternative rap
2,german pop
3,rap calme
6,modern indie pop
...,...
18821,turkish jazz
18824,koto
18826,harp
18828,electroacoustic composition


In [81]:
genre_artist = []
for i, artist in df_genres_a.iterrows():
    artist_genres = artist["genres"]
    for genre_id in df_genres[df_genres["genre"].isin(artist_genres)].to_numpy():
        genre_artist.append([artist["artist_uri"], genre_id[0]])

In [82]:
df_genre_artist = pd.DataFrame(genre_artist, columns=["artist_uri", "genre_id"])
df_genre_artist

Unnamed: 0,artist_uri,genre_id
0,spotify:artist:3d8f0YBZivistZ4Ohauncb,antideutsche
1,spotify:artist:3d8f0YBZivistZ4Ohauncb,german alternative rap
2,spotify:artist:0r0R5nIjDY04TfxRM10Bcb,german pop
3,spotify:artist:46OlTXwi8hanoxXHTE7E5z,rap calme
4,spotify:artist:1b6KZ6XeJLiFJkFghmkbe8,antideutsche
...,...,...
18859,spotify:artist:3qCIifFHqOTzTAC1Sww8ms,uk dancehall
18860,spotify:artist:0WOxhx4hikIsyF3CRPLC8W,chicago rap
18861,spotify:artist:0WOxhx4hikIsyF3CRPLC8W,rap
18862,spotify:artist:0WOxhx4hikIsyF3CRPLC8W,trap


In [95]:
df_history = pd.read_csv("./processed/ole/history.csv").drop(["Unnamed: 0", "username", "platform", "ip_addr_decrypted", "user_agent_decrypted", "master_metadata_album_artist_name", "master_metadata_album_album_name", "episode_name", "episode_show_name", "spotify_episode_uri", "offline", "offline_timestamp", "incognito_mode"], axis=1)
df_history.head()

Unnamed: 0,ts,ms_played,conn_country,master_metadata_track_name,spotify_track_uri,reason_start,reason_end,shuffle,skipped
0,2022-09-30T07:27:34Z,147252,DE,STFU,spotify:track:7DOq1mhfUI49SNVAlx8E1v,trackdone,trackdone,True,
1,2022-09-30T07:31:17Z,223778,DE,Wo bist Du? (feat. Alligatoah),spotify:track:7Dbq3XgIm824Od54EHIhBC,trackdone,trackdone,True,
2,2022-09-30T07:38:50Z,412963,DE,What I Know Is All Quicksand - rookery live tapes,spotify:track:71E4QM6O81q0DX79Cg6EGk,trackdone,trackdone,True,
3,2022-09-30T07:41:43Z,174397,DE,Bentley Coupe,spotify:track:7aksJRV1H0pCds7FSiZ8ZI,trackdone,trackdone,True,
4,2022-09-30T07:45:21Z,219893,DE,Tauchen (feat. KYMA),spotify:track:6yBIrzVvtYoL0meYNmDEHa,trackdone,trackdone,True,
