## Album data

In [1]:
import pandas as pd
from helpers.data import (
    create_data_path,
    load_parquet_files_in_dir,
)
import os
from tqdm import tqdm
from datetime import date

# Location of the "top200_Jan_2017_to_June_2023" dataset, resolved by the project
# helper. NOTE(review): presumably a directory path, since it is joined with
# "albums" via os.path.join in the next cell — confirm against helpers.data.
data_folder = create_data_path("top200_Jan_2017_to_June_2023")

In [2]:
# Read the parquet files found under <data_folder>/albums and show the names
# under which the resulting tables are stored.
album_data_dir = os.path.join(data_folder, "albums")
album_df_dict = load_parquet_files_in_dir(album_data_dir)
album_df_dict.keys()

dict_keys(['copyrights', 'artists', 'markets', 'original_responses', 'images', 'metadata'])

## Metadata

In [3]:
# Core album attributes (type, label, name, release date, track count, ids),
# indexed by album_id.
album_metadata = album_df_dict["metadata"]
album_metadata.head()

Unnamed: 0_level_0,album_type,label,name,release_date,release_date_precision,total_tracks,upc_id,amgid_id
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6bUxh58rYTL67FS8dyTKMN,album,Sony Music Latin,El Dorado,2017-05-26,day,13,886446480060,
1FkaJUwfqLdQdSmRPBlw6l,single,Sony Music Latin,Vente Pa' Ca (feat. Maluma),2016-09-22,day,1,886446092928,
0YLrAWUbY0nyM7PFtqnYld,album,Sony Music Latin,Primera Cita,2016-08-26,day,14,886445722550,
2LYwooMTH1iJeBvWyXXWUf,album,Universal Music Group,Energía,2016-06-24,day,15,602547952387,
2zrLk90b4qjmrxRZKyIY7X,single,Universal Music Group,Shaky Shaky,2016-04-08,day,1,602547885456,


In [4]:
# Ten most frequent record labels.
# value_counts() sorts by frequency (descending) by default, so no additional
# sort is needed; head(10) takes the first ten rows positionally.
album_metadata["label"].value_counts().head(10)

Sony Music Entertainment       2323
Universal Music Group          1778
Columbia                       1519
WM Finland                     1473
Universal Music Oy             1196
RCA Records Label               949
Universal Music Italia srL.     745
Universal Music AB              722
Universal Music A/S             676
WM Sweden                       622
Name: label, dtype: int64

In [5]:
# Earliest release date present in the metadata.
# NOTE(review): the recorded result (1840-06-14) looks like an outlier — verify
# whether such early dates are genuine before using release_date in analysis.
album_metadata["release_date"].min()

Timestamp('1840-06-14 00:00:00')

## Artists

In [6]:
# Album-to-artist table; count how many rows fall on each `pos` value.
album_artists = album_df_dict["artists"]
album_artists["pos"].value_counts()

1    103247
2     22756
3      6092
4         9
5         6
Name: pos, dtype: int64

In [7]:
# Preview the first rows of the album/artist table (album_id index with
# artist_id and pos columns).
album_artists.head()

Unnamed: 0_level_0,artist_id,pos
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6bUxh58rYTL67FS8dyTKMN,0EmeFodog0BfCgMzAIvKQp,1
1FkaJUwfqLdQdSmRPBlw6l,7slfeZO9LsJbWgpkIoXBUJ,1
0YLrAWUbY0nyM7PFtqnYld,0eecdvMrqBftK0M1VKhaF4,1
2LYwooMTH1iJeBvWyXXWUf,1vyhD5VmyZ7KMfW5gqLgo5,1
2zrLk90b4qjmrxRZKyIY7X,4VMYDCV2IEDYJArk749S6m,1


In [8]:
# Copyright statements attached to each album (free text plus a type code).
album_copyrights = album_df_dict["copyrights"]
album_copyrights.head()

Unnamed: 0_level_0,text,type
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6bUxh58rYTL67FS8dyTKMN,(P) 2016 Wati-B under exclusive license to Son...,P
1FkaJUwfqLdQdSmRPBlw6l,(P) 2016 Sony Music Entertainment US Latin LLC,P
0YLrAWUbY0nyM7PFtqnYld,(P) 2016 Sony Music Entertainment US Latin LLC,P
2LYwooMTH1iJeBvWyXXWUf,© 2016 Capitol Latin,C
2LYwooMTH1iJeBvWyXXWUf,℗ 2016 Capitol Latin,P


In [9]:
# Number of distinct index values (album_id) that have at least one copyright row;
# an album can appear several times, e.g. once per copyright type.
album_copyrights.index.nunique()

102944

In [10]:
# Row count per copyright type code.
album_copyrights["type"].value_counts()

P    100860
C     86506
Name: type, dtype: int64

In [11]:
# Ten most frequent copyright statements.
# value_counts() sorts by frequency (descending) by default, so no additional
# sort is needed; head(10) takes the first ten rows positionally.
album_copyrights["text"].value_counts().head(10)

2019 Super Cassettes Industries Private Limited                                                                    252
© 2018 Universal Music Spain S.L.U./ Gestmusic Endemol, SAU/ Radio Televisión Española, Sociedad Anónima S.M.E.    184
℗ 2018 Universal Music Spain S.L.U./ Gestmusic Endemol, SAU/ Radio Televisión Española, Sociedad Anónima S.M.E.    181
NMC United Entertainment Ltd.                                                                                      176
2020 Super Cassettes Industries Private Limited                                                                    152
© 2017 Warner Music Finland                                                                                        145
Music Nation Records Co. Ltd.                                                                                      144
℗ 2017 Warner Music Finland                                                                                        144
© 2019 Warner Music Finland                     

In [12]:
# Market codes listed for each album, one row per (album_id, market) pair.
album_markets = album_df_dict["markets"]
album_markets.head()

Unnamed: 0_level_0,market
album_id,Unnamed: 1_level_1
6bUxh58rYTL67FS8dyTKMN,AD
6bUxh58rYTL67FS8dyTKMN,AE
6bUxh58rYTL67FS8dyTKMN,AG
6bUxh58rYTL67FS8dyTKMN,AL
6bUxh58rYTL67FS8dyTKMN,AM


In [13]:
# Number of album rows per market code.
album_markets["market"].value_counts()

NO    73438
FI    73344
DK    73293
CW    73249
DE    73204
      ...  
IQ    71750
LY    71678
CD    71627
US    71585
BY    65533
Name: market, Length: 184, dtype: int64

In [14]:
# Cover-art entries per album: image URL together with its pixel width and height.
album_imgs = album_df_dict["images"]
album_imgs.head()

Unnamed: 0_level_0,url,width,height
album_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6bUxh58rYTL67FS8dyTKMN,https://i.scdn.co/image/ab67616d0000b273d05d3a...,640,640
6bUxh58rYTL67FS8dyTKMN,https://i.scdn.co/image/ab67616d00001e02d05d3a...,300,300
6bUxh58rYTL67FS8dyTKMN,https://i.scdn.co/image/ab67616d00004851d05d3a...,64,64
1FkaJUwfqLdQdSmRPBlw6l,https://i.scdn.co/image/ab67616d0000b2733d90f5...,640,640
1FkaJUwfqLdQdSmRPBlw6l,https://i.scdn.co/image/ab67616d00001e023d90f5...,300,300


In [15]:
# Distribution of image widths; anything other than the common sizes is a one-off.
album_imgs["width"].value_counts()

300    103221
64     103220
640    103219
582         1
273         1
58          1
435         1
218         1
46          1
476         1
223         1
48          1
540         1
253         1
54          1
350         1
Name: width, dtype: int64

In [16]:
# Distribution of image heights, for comparison with the widths.
album_imgs["height"].value_counts()

300    103223
64     103222
640    103221
63          2
600         1
347         1
297         1
636         1
298         1
Name: height, dtype: int64

## Original Responses

In [17]:
# Raw API responses kept for provenance: source description, content and
# retrieval timestamp.
original_responses = album_df_dict["original_responses"]
original_responses.head()

Unnamed: 0,source,content,timestamp
0,"Spotify API (spotipy client (v2.23.0, method/f...","[{'album_id': '6bUxh58rYTL67FS8dyTKMN', 'album...",2023-08-01 20:48:05.175839
1,"Spotify API (spotipy client (v2.23.0, method/f...","[{'album_id': '68mjWwD3sLh4wONFIb2Upq', 'album...",2023-08-01 20:48:05.518130
2,"Spotify API (spotipy client (v2.23.0, method/f...","[{'album_id': '4YV0amQ85n4nCeSYyqW6Rz', 'album...",2023-08-01 20:48:05.843177
3,"Spotify API (spotipy client (v2.23.0, method/f...","[{'album_id': '3cz9fxBXppuImbufikGk2D', 'album...",2023-08-01 20:48:06.189859
4,"Spotify API (spotipy client (v2.23.0, method/f...","[{'album_id': '5dpzYigmd97HcZOxWMpv9h', 'album...",2023-08-01 20:48:06.501223


In [18]:
# Number of stored responses per source description.
original_responses["source"].value_counts()

Spotify API (spotipy client (v2.23.0, method/function: 'albums')    5163
Name: source, dtype: int64

In [19]:
# Time window over which the responses were collected (first, last).
original_responses["timestamp"].min(), original_responses["timestamp"].max()

(Timestamp('2023-08-01 20:48:05.175839'),
 Timestamp('2023-08-01 21:09:51.697537'))