In [1]:
import pandas as pd
import os
from pathlib import Path
from dotenv import load_dotenv
import spotipy
from tqdm import tqdm
import numpy as np
from spotipy.oauth2 import SpotifyClientCredentials
import time

In [2]:
request_timer = 0.33
person = "neele"
data_source = "./data/" + person

In [3]:
paths = Path(data_source).glob("Streaming*.json")
df = pd.concat(map(pd.read_json, paths))
df = df[df["spotify_track_uri"].notnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 112621 entries, 0 to 13725
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   ts                                 112621 non-null  object
 1   username                           112621 non-null  object
 2   platform                           112621 non-null  object
 3   ms_played                          112621 non-null  int64 
 4   conn_country                       112621 non-null  object
 5   ip_addr_decrypted                  112609 non-null  object
 6   user_agent_decrypted               108498 non-null  object
 7   master_metadata_track_name         112621 non-null  object
 8   master_metadata_album_artist_name  112621 non-null  object
 9   master_metadata_album_album_name   112621 non-null  object
 10  spotify_track_uri                  112621 non-null  object
 11  episode_name                       0 non-null       object

![](./docs/er.svg)

## Preparation
1. fetch Track Data from SpotifyWeb Api
    - [ ] id
    - [ ] duration
    - [ ] music analysis
    - [ ] audio features
2. artist data

In [4]:
load_dotenv()
ID = os.getenv("ID_B")
SECRET = os.getenv("SECRET_B")

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=ID, client_secret=SECRET))

## 1. Tracks Data

### 1.1 Existing data from the extended history

Drop all duplicate entries in the history, therey removing multiple plays.
However this is not sufficient to properly eliminate duplicate references to the same track, since one track can be played from different albums.
For Example, when a song is at first released as a single, and later within an album. Individual plays appear with a different `spotify_track_uri`, altough they should be counted as one.

To fix this, the online tool [Datablist](https://datablist.com) is used, to find and remove duplicates.
An entry is considered a duplicate if the artist, album name, and track name are identical.
Subsequently, the track uri's are merged into a comma-separated string, which is then transformed into an array for futher processing.
Tough the column is then no longer atomic, is is now easier to assign play entries, with different uris to the same song.

1. Import `unique_tracks.csv` into Datablist
2. Find duplicates based on `artist`, `track` and `album`
3. Merge `spotify_track_uri`, drop `album`

<div class="alert alert-block alert-info">
<b>TODO:</b>    
This should be automated, for processing new data.
</div>

In [5]:
unique_tracks = df[df["episode_name"].isnull()][["master_metadata_track_name", "master_metadata_album_album_name", "spotify_track_uri", "master_metadata_album_artist_name"]].drop_duplicates()
unique_tracks = unique_tracks.dropna()
unique_tracks.info()
unique_tracks.to_csv("./processed/" + person + "/unique_tracks.csv")

<class 'pandas.core.frame.DataFrame'>
Index: 55309 entries, 1 to 59
Data columns (total 4 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   master_metadata_track_name         55309 non-null  object
 1   master_metadata_album_album_name   55309 non-null  object
 2   spotify_track_uri                  55309 non-null  object
 3   master_metadata_album_artist_name  55309 non-null  object
dtypes: object(4)
memory usage: 2.1+ MB


In [6]:
tracks_cleaned = pd.read_csv("./processed/" + person + "/unique_tracks.csv")
tracks_cleaned["spotify_track_uri"] = tracks_cleaned["spotify_track_uri"].apply(lambda r: [s for s in r.split(",")])
print("Unique tracks after cleaning:", len(tracks_cleaned))
tracks_cleaned.head()
tracks_cleaned.to_csv("./processed/" + person + "/tracks_cleaned.csv")

Unique tracks after cleaning: 55309


This separation still is not enough, because having two keys for every track makes it difficult to aggregate the data.
Therefore every uri in the `extended history`, that is related to another track, is replaced by the first uri in the `spotify_track_uri` array above.

<div class="alert alert-block alert-warning">
This only has to be done once. The resulting dataframe is strored as a `.csv`. 
</div>

In [7]:
process = True
if process:
    lookup = tracks_cleaned["spotify_track_uri"].to_frame()
    lookup.columns = ["from"]
    lookup["to"] = lookup["from"].map(lambda r: r[0])
    lookup.head()
    
    tqdm.pandas()
    df["spotify_track_uri"] = df["spotify_track_uri"].progress_map(lambda x: lookup[lookup["from"].str.join(" ").str.contains(x)]["to"].item())
    df.to_csv("./processed/" + person + "/history.csv")
    df.to_json("./processed/" + person + "/history.json")

  0%|▏                                                                         | 1212/699842 [00:36<5:46:38, 33.59it/s]


KeyboardInterrupt: 

In [8]:
df.reset_index().drop(["index"], axis=1).to_json("./processed/" + person + "/history.json", orient="records")

---
Once the tracks have been processed, the next stage is to prepare the relevant metrics.
The Spotify web API provides information about every track, namely audio features and analysis.

### 1.2 Audio Features

This is an example response for the audio features of "Time is Running Out" by "Muse".
```json
"acousticness": 0.00242,
"analysis_url": "https://api.spotify.com/v1/audio-analysis/2takcwOaAZWiXQijPHIx7B",
"danceability": 0.585,
"duration_ms": 237040,
"energy": 0.842,
"id": "2takcwOaAZWiXQijPHIx7B",
"instrumentalness": 0.00686,
"key": 9,
"liveness": 0.0866,
"loudness": -5.883,
"mode": 0,
"speechiness": 0.0556,
"tempo": 118.211,
"time_signature": 4,
"track_href": "https://api.spotify.com/v1/tracks/2takcwOaAZWiXQijPHIx7B",
"type": "audio_features",
"uri": "spotify:track:2takcwOaAZWiXQijPHIx7B",
"valence": 0.428
```

Since the uri's are now cleaned, we can just request the auto features via the spotify api and append them to our dataframe.

In [9]:
audio_features = pd.DataFrame(columns=[
    "uri",
    "acousticness",
    "danceability",
    "duration_ms",
    "energy",
    "instrumentalness",
    "key",
    "liveness",
    "loudness",
    "mode",
    "speechiness",
    "tempo",
    "time_signature",
    "valence",
])
audio_features.head()

Unnamed: 0,uri,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence


<div class="alert alert-block alert-warning">
Use with caution! The spotify rate limit currently block any api call. (At least to the audio-features endpoint)
</div>

In [12]:
import time
for chunk in tqdm(np.array_split(unique_tracks, len(unique_tracks)/90)):
    try:
        time.sleep(request_timer)
        uris = chunk["spotify_track_uri"].to_numpy()
        result = sp.audio_features(uris)
        result_nonull = []
        for obj in result:
            if obj != None:
                result_nonull.append(obj)
        chunk_df = pd.DataFrame(result_nonull)
        audio_features = pd.concat([audio_features, chunk_df], join="inner")
    except Exception as e:
        print(e)
        break
    

100%|███████████████████████████████████████████████████████████████| 81/81 [00:39<00:00,  2.04it/s]


In [13]:
audio_features.rename({"uri": "spotify_track_uri"}, inplace=True, axis=1)
audio_features

Unnamed: 0,spotify_track_uri,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,spotify:track:1SSzM044DUbVpYNNHWIhYJ,0.169000,0.718,161882,0.667,0.000000,9,0.0674,-5.008,0,0.0551,89.029,4,0.480
1,spotify:track:30f7TNgvsS1AWtyuVqMucH,0.032100,0.601,210685,0.923,0.000000,10,0.1580,-3.958,1,0.0717,94.059,4,0.572
2,spotify:track:1SN1vSPsr4V4tIUTR3s8YW,0.469000,0.672,212424,0.578,0.000000,9,0.3120,-6.621,1,0.0569,113.945,4,0.548
3,spotify:track:2i3WO0WkrlnrA1I4GdUmxh,0.271000,0.711,227720,0.747,0.000000,10,0.3280,-6.165,0,0.0610,100.010,4,0.572
4,spotify:track:7qiZfU4dY1lWllzX7mPBI3,0.581000,0.825,233713,0.652,0.000000,1,0.0931,-3.183,0,0.0802,95.977,4,0.931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,spotify:track:0A9AWAY2WlJZTmSDPRlmKK,0.635000,0.581,262750,0.765,0.000000,3,0.4270,-5.715,1,0.0953,135.062,4,0.183
86,spotify:track:6ryWXzeSbDX5cebOrgjMKL,0.006000,0.771,178305,0.883,0.006010,1,0.1350,-5.242,0,0.0600,127.006,4,0.892
87,spotify:track:6WsaEpK2Bky9ZGCCaQPyg4,0.233000,0.540,153304,0.892,0.000093,11,0.1950,-4.985,1,0.1100,106.594,3,0.152
88,spotify:track:19OJLSiZod7YFmahNngVta,0.000003,0.534,361853,0.971,0.778000,9,0.0769,-4.089,0,0.0456,150.000,4,0.916


In [14]:
unique_cleaned = pd.read_csv("./processed/" + person + "/unique_tracks.csv")

In [15]:
tracks_features = unique_cleaned.merge(audio_features, how="outer", on="spotify_track_uri")
tracks_features[tracks_features["energy"].notnull()]
tracks_features.to_csv("./processed/" + person + "/tracks_features.csv")

In [16]:
tracks_features.head()

Unnamed: 0.1,Unnamed: 0,master_metadata_track_name,master_metadata_album_album_name,spotify_track_uri,master_metadata_album_artist_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,0,Vermissen (feat. Henning May),Vermissen (feat. Henning May),spotify:track:1SSzM044DUbVpYNNHWIhYJ,Juju,0.169,0.718,161882,0.667,0.0,9,0.0674,-5.008,0,0.0551,89.029,4,0.48
1,0,Vermissen (feat. Henning May),Vermissen (feat. Henning May),spotify:track:1SSzM044DUbVpYNNHWIhYJ,Juju,0.169,0.718,161882,0.667,0.0,9,0.0674,-5.008,0,0.0551,89.029,4,0.48
2,4,Unforgettable,Streets Of Gold,spotify:track:30f7TNgvsS1AWtyuVqMucH,Nico Santos,0.0321,0.601,210685,0.923,0.0,10,0.158,-3.958,1,0.0717,94.059,4,0.572
3,4,Unforgettable,Streets Of Gold,spotify:track:30f7TNgvsS1AWtyuVqMucH,Nico Santos,0.0321,0.601,210685,0.923,0.0,10,0.158,-3.958,1,0.0717,94.059,4,0.572
4,5,Fast Car,Blue,spotify:track:1SN1vSPsr4V4tIUTR3s8YW,Jonas Blue,0.469,0.672,212424,0.578,0.0,9,0.312,-6.621,1,0.0569,113.945,4,0.548


## 2. Artists and Genres

In order to gain insight into the different genres listened to, the artist data must be queried.
Due to the nature of an n to n relationship, the artist and genre data is modelled as shown in the ER diagram above.

The problem is that only the main artist is given in the extended history, not the features. Also, the artist is only referenced by name and a spotify_id is required to request further data.
However, the tracks api also contains all information about the artist that contributed to the track. So in order to get the artist ids, we have to querry all tracks.

In [17]:
unique_cleaned.head()

Unnamed: 0.1,Unnamed: 0,master_metadata_track_name,master_metadata_album_album_name,spotify_track_uri,master_metadata_album_artist_name
0,0,Vermissen (feat. Henning May),Vermissen (feat. Henning May),spotify:track:1SSzM044DUbVpYNNHWIhYJ,Juju
1,4,Unforgettable,Streets Of Gold,spotify:track:30f7TNgvsS1AWtyuVqMucH,Nico Santos
2,5,Fast Car,Blue,spotify:track:1SN1vSPsr4V4tIUTR3s8YW,Jonas Blue
3,6,I Believe I'm Fine,Uncovered,spotify:track:2i3WO0WkrlnrA1I4GdUmxh,Robin Schulz
4,7,Shape of You,÷,spotify:track:7qiZfU4dY1lWllzX7mPBI3,Ed Sheeran


In [18]:
df_artist = pd.DataFrame(columns=["artist_name", "artist_uri", "genres"])


In [19]:
request = True
if request:
    artist_data = []
    artist_track_data = []
    for chunk in tqdm(np.array_split(unique_cleaned, len(unique_cleaned)/45)):
        try:
            uris = chunk["spotify_track_uri"].map(lambda x: x.split(",")[0]).to_numpy()
            result = sp.tracks(uris, market="DE")["tracks"]
            time.sleep(request_timer)
            result_nonull = []
            for track in result:
                if track != None:
                    for artist in track["artists"]:
                        artist_data.append([artist["name"], artist["uri"], artist.get("genres", None)])
                        artist_track_data.append([artist["uri"], track["uri"]])
                
        except Exception as e:
            print(e)
            break

100%|█████████████████████████████████████████████████████████████| 162/162 [01:23<00:00,  1.94it/s]


In [20]:
load_from_file = False

if load_from_file:
    df_artist = pd.read_csv("./processed/" + person + "/artist.csv")
    df_artist_track = pd.read_csv("./processed/" + person + "/artist_track.csv")
else:    
    df_artist_track = pd.DataFrame(artist_track_data, columns=["artist_uri", "track_uri"])
    df_artist_track.to_csv("./processed/" + person + "/artist_track.csv")
    df_artist_track.info()
    
    df_artist = pd.DataFrame(artist_data, columns=["artist_name", "artist_uri", "genres"]).drop_duplicates()
    df_artist.to_csv("./processed/" + person + "/artist.csv")
    df_artist.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10026 entries, 0 to 10025
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   artist_uri  10026 non-null  object
 1   track_uri   10026 non-null  object
dtypes: object(2)
memory usage: 156.8+ KB


In [21]:
df_artist = df_artist[["artist_name", "artist_uri"]]
df_artist = df_artist.drop_duplicates("artist_uri")
df_artist

Unnamed: 0,artist_name,artist_uri
0,Juju,spotify:artist:4sg4no0TXdsrM1s4SVUwNF
1,Henning May,spotify:artist:1cH0i2n6YLrWFDDemqztay
2,Nico Santos,spotify:artist:3A9B6c1CrSPauiOblw7pWz
3,Jonas Blue,spotify:artist:1HBjj22wzbscIZ9sEb5dyf
4,Dakota,spotify:artist:2zzpznMuhKlKlqh1ma7Sms
...,...,...
9998,Paul Engemann,spotify:artist:3Rfb8wuY9YHVntDXJqPW6r
10001,Clarke 99,spotify:artist:2oAPxiUkLs3EdFAtYX4Qr5
10006,Sante Sansone,spotify:artist:5fAwPpS78sokZdpktoSUE8
10012,Zerb,spotify:artist:6mDl7lQiLxT0iQ8LYhAlWy


Now that each artist has its URI, it is possible to request more specific data about them.

In [22]:
request = True
if request:
    artist_genres = []
    for chunk in tqdm(np.array_split(df_artist, len(df_artist)/45)):
        try:
            uris = chunk["artist_uri"]
            result = sp.artists(uris)["artists"]
            time.sleep(request_timer)
            for artist in result:
                artist_genres.append([artist["uri"], artist["genres"]])
                
        except Exception as e:
            print(e)
            break

100%|█████████████████████████████████████████████████████████████| 101/101 [00:51<00:00,  1.94it/s]


In [23]:
df_genres_a = pd.DataFrame(artist_genres, columns=["artist_uri", "genres"])
df_genres_a.head()

Unnamed: 0,artist_uri,genres
0,spotify:artist:4sg4no0TXdsrM1s4SVUwNF,"[frauenrap, german hip hop]"
1,spotify:artist:1cH0i2n6YLrWFDDemqztay,[]
2,spotify:artist:3A9B6c1CrSPauiOblw7pWz,[german pop]
3,spotify:artist:1HBjj22wzbscIZ9sEb5dyf,"[pop, pop dance, tropical house, uk dance]"
4,spotify:artist:2zzpznMuhKlKlqh1ma7Sms,[]


In [24]:
df_genres = pd.DataFrame(sum(df_genres_a["genres"].tolist(), []), columns=["genre"]).drop_duplicates()
df_genres

Unnamed: 0,genre
0,frauenrap
1,german hip hop
2,german pop
3,pop
4,pop dance
...,...
7782,croatian indie
7783,croatian rock
7784,yugoslav rock
7791,deep east coast hip hop


In [25]:
genre_artist = []
for i, artist in df_genres_a.iterrows():
    artist_genres = artist["genres"]
    for genre_id in df_genres[df_genres["genre"].isin(artist_genres)].to_numpy():
        genre_artist.append([artist["artist_uri"], genre_id[0]])

In [26]:
df_genre_artist = pd.DataFrame(genre_artist, columns=["artist_uri", "genre_id"])
df_genre_artist.to_csv("./processed/" + person + "/artist_genre.csv")

In [27]:
df_history = pd.read_csv("./processed/" + person + "/history.csv").drop(["Unnamed: 0", "username", "platform", "ip_addr_decrypted", "user_agent_decrypted", "master_metadata_album_artist_name", "master_metadata_album_album_name", "episode_name", "episode_show_name", "spotify_episode_uri", "offline", "offline_timestamp", "incognito_mode"], axis=1)
df_history

FileNotFoundError: [Errno 2] No such file or directory: './processed/marina/history.csv'