# Processing scraped track credits data (internal Spotify API)

In [1]:
import pandas as pd
from helpers.data import (
    create_data_path,
    write_series_to_file_as_prettified_json,
)
from helpers.spotify_util import (
    get_spotify_track_link,
)
import os

data_folder = create_data_path("top200_Jan_2017_to_June_2023")

In [2]:
credits_api_resp = pd.read_json(os.path.join(data_folder, "credits.jsonl"), lines=True).set_index('track_id')

In [3]:
credits_api_resp.head()

Unnamed: 0_level_0,status_code,content,content_type,url,timestamp
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01ARBHp9XYwUOiQHIR2F3Y,200,{'trackUri': 'spotify:track:01ARBHp9XYwUOiQHIR...,json,https://spclient.wg.spotify.com/track-credits-...,2023-07-19 15:49:35.452185+00:00
6QQ51riWstSxoNWBdOYRuV,200,{'trackUri': 'spotify:track:6QQ51riWstSxoNWBdO...,json,https://spclient.wg.spotify.com/track-credits-...,2023-07-19 15:49:35.553219+00:00
1xYzN6PXSwoOChUJnq8eJU,200,{'trackUri': 'spotify:track:1xYzN6PXSwoOChUJnq...,json,https://spclient.wg.spotify.com/track-credits-...,2023-07-19 15:49:35.572632+00:00
3GAz3o5o3lZGUOPUSPG8ul,200,{'trackUri': 'spotify:track:3GAz3o5o3lZGUOPUSP...,json,https://spclient.wg.spotify.com/track-credits-...,2023-07-19 15:49:35.542785+00:00
7GXr7pcHuyCMgYeZ1mb9OW,200,{'trackUri': 'spotify:track:7GXr7pcHuyCMgYeZ1m...,json,https://spclient.wg.spotify.com/track-credits-...,2023-07-19 15:49:35.654708+00:00


In [4]:
credits_df = pd.DataFrame.from_records(credits_api_resp['content'], index=credits_api_resp.index)
credits_df.head()

Unnamed: 0_level_0,trackUri,trackTitle,roleCredits,extendedCredits,sourceNames
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01ARBHp9XYwUOiQHIR2F3Y,spotify:track:01ARBHp9XYwUOiQHIR2F3Y,I Wish,"[{'roleTitle': 'Performers', 'artists': [{'uri...",[],[Alan 1]
6QQ51riWstSxoNWBdOYRuV,spotify:track:6QQ51riWstSxoNWBdOYRuV,Callaloo,"[{'roleTitle': 'Performers', 'artists': [{'uri...",[],[Antarktis Records]
1xYzN6PXSwoOChUJnq8eJU,spotify:track:1xYzN6PXSwoOChUJnq8eJU,Intro,"[{'roleTitle': 'Performers', 'artists': [{'uri...",[],[Rec. 118 / Mal Luné Music]
3GAz3o5o3lZGUOPUSPG8ul,spotify:track:3GAz3o5o3lZGUOPUSPG8ul,Intro : DT sugA (Feat. DJ Friz),"[{'roleTitle': 'Performers', 'artists': [{'uri...",[],[BIGHIT MUSIC]
7GXr7pcHuyCMgYeZ1mb9OW,spotify:track:7GXr7pcHuyCMgYeZ1mb9OW,A+,"[{'roleTitle': 'Performers', 'artists': [{'uri...",[],[AlphapopOriginal]


In [5]:
credits_df.extendedCredits.apply(len).value_counts()

0    163681
Name: extendedCredits, dtype: int64

Looks like the extended credits aren't used at all at this moment.

In [6]:
example_credits = credits_df.loc['6mICuAdrwEjh6Y6lroV2Kg']
example_credits

trackUri                        spotify:track:6mICuAdrwEjh6Y6lroV2Kg
trackTitle                                   Chantaje (feat. Maluma)
roleCredits        [{'roleTitle': 'Performers', 'artists': [{'uri...
extendedCredits                                                   []
sourceNames                [Sony Music Latin, Sony Music Publishing]
Name: 6mICuAdrwEjh6Y6lroV2Kg, dtype: object

In [7]:
temp_data_folder_path = 'data' # create data folder in directory of this notebook -> store JSON files etc. there for easier inspection (with code editor) than in notebook output

In [8]:
write_series_to_file_as_prettified_json(example_credits, os.path.join(temp_data_folder_path, "example_credits.json"))

In [9]:
get_spotify_track_link(example_credits.name) # check the link and open the song page for reference

'https://open.spotify.com/track/6mICuAdrwEjh6Y6lroV2Kg'

We see that internally, Spotify collects more detailed information about song credits than what is actually displayed in the web app. Even 'creator URIs' are mentioned, which do not exist at all in the public API!

Furthermore, it is convenient, that in the internal API the artist ID is (at least sometimes) also included if an artist also appears as a producer (see the example). 

However, unfortunately the data still looks messy: in the example, Shakira is mentioned as performer, writer, and producer, but the link to her artist page is missing for the writer entry is missing. This makes it more difficult to automatically match artist name and artist ID...

In [10]:
role_credits = pd.DataFrame(credits_df.roleCredits, index=credits_df.index)
role_credits.head()

Unnamed: 0_level_0,roleCredits
track_id,Unnamed: 1_level_1
01ARBHp9XYwUOiQHIR2F3Y,"[{'roleTitle': 'Performers', 'artists': [{'uri..."
6QQ51riWstSxoNWBdOYRuV,"[{'roleTitle': 'Performers', 'artists': [{'uri..."
1xYzN6PXSwoOChUJnq8eJU,"[{'roleTitle': 'Performers', 'artists': [{'uri..."
3GAz3o5o3lZGUOPUSPG8ul,"[{'roleTitle': 'Performers', 'artists': [{'uri..."
7GXr7pcHuyCMgYeZ1mb9OW,"[{'roleTitle': 'Performers', 'artists': [{'uri..."


In [11]:
role_credits.iloc[0].roleCredits

[{'roleTitle': 'Performers',
  'artists': [{'uri': 'spotify:artist:0z3xCl4Nsx6ok5sJy8xIJN',
    'name': 'Alan 1',
    'imageUri': 'https://i.scdn.co/image/ab677762000078e645c984e8c82f9ce15ebf1f51',
    'subroles': ['main artist'],
    'weight': 0.8999999761581421}]},
 {'roleTitle': 'Writers', 'artists': []},
 {'roleTitle': 'Producers', 'artists': []}]

In [12]:
def create_role_credits_rows(track_id: str, role_credits: list):
    roles = []
    for role in role_credits:
        title = role['roleTitle']
        for i, artist in enumerate(role['artists']):
            roles.append({
                **artist,
                'roleTitle': title[:-1],
                'pos': i + 1,
                'track_id': track_id
            })
    return roles

roles = []

for track_id, role_credits in credits_df.roleCredits.items():
    roles.extend(create_role_credits_rows(track_id, role_credits))

roles = pd.DataFrame(roles)
roles.head()

Unnamed: 0,uri,name,imageUri,subroles,weight,roleTitle,pos,track_id,externalUrl,creatorUri
0,spotify:artist:0z3xCl4Nsx6ok5sJy8xIJN,Alan 1,https://i.scdn.co/image/ab677762000078e645c984...,[main artist],0.9,Performer,1,01ARBHp9XYwUOiQHIR2F3Y,,
1,spotify:artist:3wLF9BqfbAd67GNKJlqq9o,Champion J.R,https://i.scdn.co/image/ab677762000078e6aae517...,[main artist],0.9,Performer,1,6QQ51riWstSxoNWBdOYRuV,,
2,spotify:artist:0K6fzyWWBati6Zlr6qveA8,Erik Emanuel,https://i.scdn.co/image/ab677762000078e645c984...,[featured artist],0.7999,Performer,2,6QQ51riWstSxoNWBdOYRuV,,
3,spotify:artist:6Te49r3A6f5BiIgBRxH7FH,Ninho,https://i.scdn.co/image/ab677762000078e6d2b377...,[main artist],0.9,Performer,1,1xYzN6PXSwoOChUJnq8eJU,,
4,spotify:artist:3I1nR638eYPb9CV6SdN6F1,Kozbeatz,https://i.scdn.co/image/ab677762000078e6afc079...,[composer],0.6992,Writer,1,1xYzN6PXSwoOChUJnq8eJU,,


In [13]:
# get missingness stats for all columns
1 - roles.isna().mean()

uri            0.913244
name           1.000000
imageUri       0.913244
subroles       1.000000
weight         1.000000
roleTitle      1.000000
pos            1.000000
track_id       1.000000
externalUrl    0.037715
creatorUri     0.037715
dtype: float64

Does the data only contain artist URIs in the `uri` column?

In [14]:
first_uri_chars = roles.uri.str.slice(0, len('spotify:artist'))
first_uri_chars.value_counts()


spotify:artist    751743
Name: uri, dtype: int64

Is there a relationship between the `externalUrl` and `creatorUri` columns?

In [15]:
roles[roles.externalUrl.notna()].shape[0]

31045

In [16]:
roles[roles.externalUrl.notna()].shape[0]

31045

In [17]:
roles[roles.creatorUri.notna()].shape[0]

31045

In [18]:
roles[roles.externalUrl.notna()].iloc[0].externalUrl

'https://artists.spotify.com/songwriter/1jSqmWuKCLxIcJolEbQTRP'

In [19]:
roles.externalUrl.str.startswith('https://artists.spotify.com/songwriter/').value_counts()

True    31045
Name: externalUrl, dtype: int64

In [20]:
roles[roles.externalUrl.notna() & (roles.externalUrl.str.startswith('https://artists.spotify.com/songwriter/') != True)].shape[0]

0

In [21]:
roles[roles.externalUrl.notna()].iloc[0].creatorUri

'spotify:songwriter:1jSqmWuKCLxIcJolEbQTRP'

In [22]:
roles[roles.creatorUri.notna() & (roles.creatorUri.str.startswith('spotify:songwriter:') != True)].shape[0]

0

Looks like there is a clear relationship between the `externalUrl` and `creatorUri`: Every external URL in the collected data relates to a particular creator URI! The external URL is not really external: It points to another Spotify page (separate from the Spotify web app) where one can get even more insights about particular songwriters whose tracks can be found on Spotify.

In [23]:
roles.roleTitle.value_counts()

Writer       410483
Performer    249329
Producer     163345
Name: roleTitle, dtype: int64

In [24]:
roles.subroles.apply(len).value_counts()

1    667138
2     84128
0     71414
3       452
4        24
6         1
Name: subroles, dtype: int64

In [25]:
# remove duplicates in every subroles array
roles.subroles = roles.subroles.apply(lambda x: list(set(x)))

## Exploring data by role title (Writer, Producer, Performer)

In [26]:
writers = roles[roles.roleTitle == 'Writer'].drop(columns=['roleTitle'])

In [27]:
producers = roles[roles.roleTitle == 'Producer'].drop(columns=['roleTitle'])

In [28]:
performers = roles[roles.roleTitle == 'Performer'].drop(columns=['roleTitle'])

In [29]:
writers.subroles.value_counts()

[composer]                      169127
[composer, lyricist]             72813
[]                               71414
[lyricist]                       44093
[author]                         24012
[writer]                         17954
[composer, author]                7738
[composer, writer]                1914
[writer, lyricist]                 896
[composer, writer, lyricist]       337
[composer, writer, author]          87
[writer, author]                    44
[lyricist, author]                  38
[composer, lyricist, author]        16
Name: subroles, dtype: int64

In [30]:
producers.subroles.value_counts()

[producer]    163345
Name: subroles, dtype: int64

In [31]:
performers.subroles.value_counts()

[main artist]                          214940
[featured artist]                       32064
[remixer]                                 781
[conductor]                               587
[remixer, main artist]                    332
[orchestra]                               315
[featured artist, main artist]            192
[featured artist, remixer]                 39
[conductor, main artist]                   34
[orchestra, main artist]                   30
[orchestra, featured artist]                5
[conductor, orchestra]                      4
[conductor, orchestra, main artist]         3
[orchestra, remixer]                        2
[orchestra, remixer, main artist]           1
Name: subroles, dtype: int64

In [32]:
1 - writers.isna().mean()

uri            0.826024
name           1.000000
imageUri       0.826024
subroles       1.000000
weight         1.000000
pos            1.000000
track_id       1.000000
externalUrl    0.075630
creatorUri     0.075630
dtype: float64

In [33]:
1 - producers.isna().mean()

uri            1.0
name           1.0
imageUri       1.0
subroles       1.0
weight         1.0
pos            1.0
track_id       1.0
externalUrl    0.0
creatorUri     0.0
dtype: float64

In [34]:
1 - performers.isna().mean()

uri            1.0
name           1.0
imageUri       1.0
subroles       1.0
weight         1.0
pos            1.0
track_id       1.0
externalUrl    0.0
creatorUri     0.0
dtype: float64

Looks like only songwriters don't have a Spotify artist URI in some cases, while **all** performers and producers have one (which is AWESOME). Also, the creator URI is only relevant (i.e. available in some rare cases) for the songwriters.