In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
df = pd.read_csv('Dataset_with_Genres.csv', index_col=0)
df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,...,0.744,0.151,0.127,104.851,3,['Uli'],[],['Uli'],UliCarve,"['nuevo_regional_mexicano', 'sierreno', 'regio..."
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,...,0.0,0.148,0.655,102.009,1,['Fernando Pessoa'],[],['Fernando Pessoa'],Fernando PessoaCapítulo 2.16 - Banquero Anarqu...,
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,...,0.0218,0.212,0.457,130.418,5,['Ignacio Corsini'],[],['Ignacio Corsini'],Ignacio CorsiniVivo para Quererte - Remasterizado,"['vintage_tango', 'tango']"
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,...,0.918,0.104,0.397,169.98,3,['Ignacio Corsini'],[],['Ignacio Corsini'],Ignacio CorsiniEl Prisionero - Remasterizado,"['vintage_tango', 'tango']"
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,...,0.13,0.311,0.196,103.22,4,['Dick Haymes'],[],['Dick Haymes'],Dick HaymesLady of the Evening,"['lounge', 'big_band', 'deep_adult_standards',..."


In [3]:
col_list = df.columns

In [4]:
col_list

Index(['id', 'name', 'popularity', 'duration_ms', 'explicit', 'artists',
       'id_artists', 'release_date', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature', 'artists_upd_v1',
       'artists_upd_v2', 'artists_upd', 'artists_song',
       'consolidates_genre_lists'],
      dtype='object')

In [5]:
df.dtypes

id                           object
name                         object
popularity                    int64
duration_ms                   int64
explicit                      int64
artists                      object
id_artists                   object
release_date                 object
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
time_signature                int64
artists_upd_v1               object
artists_upd_v2               object
artists_upd                  object
artists_song                 object
consolidates_genre_lists     object
dtype: object

In [6]:
df.shape

(523475, 25)

In [7]:
# remove the unnecessary [''] from any useful entries (they weren't lists)
# credit for solution: https://stackoverflow.com/questions/38147447/how-to-remove-square-bracket-from-pandas-dataframe
df['artists'] = df['artists'].str.strip("[' ']").astype(object)
df['id_artists'] = df['id_artists'].str.strip("[' ']").astype(object)
df['consolidates_genre_lists'] = df['consolidates_genre_lists'].str.strip("[' ']").astype(str)

In [8]:
# drop unnecessary columns
to_drop = ['artists_upd_v1', 'artists_upd_v2','artists_upd', 'artists_song']
df.drop(to_drop, axis=1, inplace=True)

In [9]:
df.shape

(523475, 21)

In [10]:
# rename columns
df.rename(columns={'id': 'track_id', 
                   'name': 'track_name',
                   'artists':'artist_name',
                   'id_artists':'artist_id',
                   'consolidates_genre_lists':'genre'}, inplace=True)

In [11]:
# pre NaN-drop check
df.isna().sum()

track_id            0
track_name          1
popularity          0
duration_ms         0
explicit            0
artist_name         0
artist_id           0
release_date        0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
genre               0
dtype: int64

In [12]:
# drop NaN
df.dropna(axis=0, inplace=True)
df.isna().sum()

track_id            0
track_name          0
popularity          0
duration_ms         0
explicit            0
artist_name         0
artist_id           0
release_date        0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
genre               0
dtype: int64

In [13]:
df.shape

(523474, 21)

In [14]:
# create natural key 
df['track_natural_key'] = df.track_name + "_" + df.artist_name.map(str)


In [15]:
# Transform duration_ms to duration_mins
# credit to drop milliseconds from datetime format https://stackoverflow.com/questions/31487732/simple-way-to-drop-milliseconds-from-python-datetime-datetime-object

converted = pd.to_datetime(df['duration_ms'], unit='ms')
df['duration_mins'] = pd.Series([val.time().replace(microsecond=0) for val in converted])

df.head()

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre,track_natural_key,duration_mins
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.451,0.674,0.744,0.151,0.127,104.851,3,"nuevo_regional_mexicano', 'sierreno', 'regiona...",Carve_Uli,00:02:06
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,Fernando Pessoa,14jtPCOoNZwquk5wd9DxrY,1922-06-01,0.695,0.263,...,0.957,0.797,0.0,0.148,0.655,102.009,1,,Capítulo 2.16 - Banquero Anarquista_Fernando P...,00:01:38
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.434,0.177,...,0.0512,0.994,0.0218,0.212,0.457,130.418,5,"vintage_tango', 'tango",Vivo para Quererte - Remasterizado_Ignacio Cor...,00:03:01
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.321,0.0946,...,0.0504,0.995,0.918,0.104,0.397,169.98,3,"vintage_tango', 'tango",El Prisionero - Remasterizado_Ignacio Corsini,00:02:56
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,1922,0.402,0.158,...,0.039,0.989,0.13,0.311,0.196,103.22,4,"lounge', 'big_band', 'deep_adult_standards', '...",Lady of the Evening_Dick Haymes,00:02:43


In [16]:
# convert release_date to datetime object
df['release_date'] = pd.to_datetime(df.release_date, format='%Y-%m-%d')

# add new column for seasons (borrowed from https://stackoverflow.com/questions/60285557/extract-seasons-from-datetime-pandas)
    # numeric trick is to wasp the winter separation at 12-31 / 01-01. 
    # By subtracting the end of winter, which is 03-21, and take modulo, you effectively change 01-01 to - 320 = 980 mod 1300, 
    # which is larger than the last day of autumn. So now your winter season is in one chunk instead of two.

date_offset = (df.release_date.dt.month*100 + df.release_date.dt.day - 320)%1300

df['release_season'] = pd.cut(date_offset, [0, 300, 602, 900, 1300], 
                                     labels=['spring', 'summer', 'autumn', 'winter'],
                                     include_lowest = True)
df.head()

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre,track_natural_key,duration_mins,release_season
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.674,0.744,0.151,0.127,104.851,3,"nuevo_regional_mexicano', 'sierreno', 'regiona...",Carve_Uli,00:02:06,winter
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,Fernando Pessoa,14jtPCOoNZwquk5wd9DxrY,1922-06-01,0.695,0.263,...,0.797,0.0,0.148,0.655,102.009,1,,Capítulo 2.16 - Banquero Anarquista_Fernando P...,00:01:38,spring
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.434,0.177,...,0.994,0.0218,0.212,0.457,130.418,5,"vintage_tango', 'tango",Vivo para Quererte - Remasterizado_Ignacio Cor...,00:03:01,spring
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.321,0.0946,...,0.995,0.918,0.104,0.397,169.98,3,"vintage_tango', 'tango",El Prisionero - Remasterizado_Ignacio Corsini,00:02:56,spring
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,1922-01-01,0.402,0.158,...,0.989,0.13,0.311,0.196,103.22,4,"lounge', 'big_band', 'deep_adult_standards', '...",Lady of the Evening_Dick Haymes,00:02:43,winter


In [17]:
t_dup = df[df.duplicated('track_natural_key')]
len(t_dup)

0

In [18]:
df[df['track_id'] == '4kbj5MwxO1bq9wjT5g9HaA']['track_name']

81500    Shut Up and Dance
Name: track_name, dtype: object

In [19]:
df.shape

(523474, 24)

In [20]:
df.dtypes

track_id                     object
track_name                   object
popularity                    int64
duration_ms                   int64
explicit                      int64
artist_name                  object
artist_id                    object
release_date         datetime64[ns]
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
time_signature                int64
genre                        object
track_natural_key            object
duration_mins                object
release_season             category
dtype: object

In [21]:
df['genre'].value_counts(ascending=False).rename_axis('genre').reset_index(name='count')

Unnamed: 0,genre,count
0,,260295
1,,16508
2,"vintage_tango', 'tango",2114
3,"filmi', 'classic_bollywood', 'desi_pop', 'sufi",2065
4,"filmi', 'deep_indian_pop",1527
...,...,...
18904,"trova', 'bolero_cubano', 'spanish_pop', 'nova_...",1
18905,"late_romantic_era', 'classical', 'classical_ce...",1
18906,"modern_salsa', 'tropical', 'latin_pop', 'ranch...",1
18907,"classical', 'galante_era', 'baroque', 'italian...",1


In [22]:
no_genre = df[df['genre'] == '']
len(no_genre)

16508

In [23]:
# drop no_genre songs from 
df.drop(df[df['genre'] == ''].index, inplace=True)
df.drop(df[df['genre'] == 'nan'].index, inplace=True)

len(df)

246671

In [24]:
# separate genres connected by ', ' using split then explode
# credit for solution: https://stackoverflow.com/questions/71175458/splitting-row-into-multiple-rows-in-pandas-dataframe
df = (
    df.assign(genre=df['genre'].str.split("', '")).explode('genre').reset_index(drop=True)
)

df

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre,track_natural_key,duration_mins,release_season
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.674,0.744000,0.1510,0.1270,104.851,3,nuevo_regional_mexicano,Carve_Uli,00:02:06,winter
1,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.674,0.744000,0.1510,0.1270,104.851,3,sierreno,Carve_Uli,00:02:06,winter
2,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.674,0.744000,0.1510,0.1270,104.851,3,regional_mexican,Carve_Uli,00:02:06,winter
3,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.674,0.744000,0.1510,0.1270,104.851,3,corrido,Carve_Uli,00:02:06,winter
4,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.434,0.177,...,0.994,0.021800,0.2120,0.4570,130.418,5,vintage_tango,Vivo para Quererte - Remasterizado_Ignacio Cor...,00:03:01,spring
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1233361,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,ROLE MODEL,1dy5WNgIKQU6ezkpZs4y8z,2020-10-21,0.765,0.663,...,0.141,0.000297,0.0924,0.6860,150.091,4,indie_cafe_pop,blind_ROLE MODEL,00:03:07,autumn
1233362,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,ROLE MODEL,1dy5WNgIKQU6ezkpZs4y8z,2020-10-21,0.765,0.663,...,0.141,0.000297,0.0924,0.6860,150.091,4,alternative_r&b,blind_ROLE MODEL,00:03:07,autumn
1233363,27Y1N4Q4U3EfDU5Ubw8ws2,What They'll Say About Us,70,187601,0,FINNEAS,37M5pPGs6V1fchFJSgCguX,2020-09-02,0.535,0.314,...,0.895,0.000150,0.0874,0.0663,145.095,4,pop,What They'll Say About Us_FINNEAS,00:02:22,summer
1233364,27Y1N4Q4U3EfDU5Ubw8ws2,What They'll Say About Us,70,187601,0,FINNEAS,37M5pPGs6V1fchFJSgCguX,2020-09-02,0.535,0.314,...,0.895,0.000150,0.0874,0.0663,145.095,4,la_indie,What They'll Say About Us_FINNEAS,00:02:22,summer


In [25]:
df.shape

(1233366, 24)

In [54]:
df['genre'].value_counts(ascending=False).rename_axis('genre').reset_index(name='count').head(50)

Unnamed: 0,genre,count
0,rock,25861
1,adult_standards,20698
2,classic_rock,19049
3,mellow_gold,15962
4,classical,15349
5,filmi,14923
6,album_rock,14848
7,folk_rock,14452
8,soft_rock,14016
9,vocal_jazz,11754


In [27]:
df[df['artist_name'] == 'Olivia Rodrigo']

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre,track_natural_key,duration_mins,release_season
356234,4Yxc55NX3tAXC2mHRAhtcW,"All I Want - From ""High School Musical: The Mu...",80,177323,0,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2019-11-27,0.376,0.43,...,0.0902,0.0,0.0912,0.129,77.599,3,pop,"All I Want - From ""High School Musical: The Mu...",00:02:57,autumn
356235,4Yxc55NX3tAXC2mHRAhtcW,"All I Want - From ""High School Musical: The Mu...",80,177323,0,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2019-11-27,0.376,0.43,...,0.0902,0.0,0.0912,0.129,77.599,3,social_media_pop,"All I Want - From ""High School Musical: The Mu...",00:02:57,autumn
356236,4Yxc55NX3tAXC2mHRAhtcW,"All I Want - From ""High School Musical: The Mu...",80,177323,0,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2019-11-27,0.376,0.43,...,0.0902,0.0,0.0912,0.129,77.599,3,post-teen_pop,"All I Want - From ""High School Musical: The Mu...",00:02:57,autumn
361545,7lPN2DXiMsVn7XUKtOW1CS,drivers license,99,242014,1,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2021-01-08,0.585,0.436,...,0.721,1.3e-05,0.105,0.132,143.874,4,pop,drivers license_Olivia Rodrigo,00:04:02,winter
361546,7lPN2DXiMsVn7XUKtOW1CS,drivers license,99,242014,1,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2021-01-08,0.585,0.436,...,0.721,1.3e-05,0.105,0.132,143.874,4,social_media_pop,drivers license_Olivia Rodrigo,00:04:02,winter
361547,7lPN2DXiMsVn7XUKtOW1CS,drivers license,99,242014,1,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2021-01-08,0.585,0.436,...,0.721,1.3e-05,0.105,0.132,143.874,4,post-teen_pop,drivers license_Olivia Rodrigo,00:04:02,winter
361608,61KpQadow081I2AsbeLcsb,deja vu,90,215508,1,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2021-04-01,0.439,0.61,...,0.593,1.1e-05,0.341,0.172,181.088,4,pop,deja vu_Olivia Rodrigo,00:03:35,spring
361609,61KpQadow081I2AsbeLcsb,deja vu,90,215508,1,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2021-04-01,0.439,0.61,...,0.593,1.1e-05,0.341,0.172,181.088,4,social_media_pop,deja vu_Olivia Rodrigo,00:03:35,spring
361610,61KpQadow081I2AsbeLcsb,deja vu,90,215508,1,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2021-04-01,0.439,0.61,...,0.593,1.1e-05,0.341,0.172,181.088,4,post-teen_pop,deja vu_Olivia Rodrigo,00:03:35,spring
640796,1v6svH1Fyx9C1nIt1mA2DT,All I Want,75,177323,0,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2020-01-10,0.376,0.43,...,0.0902,0.0,0.0912,0.129,77.599,3,pop,All I Want_Olivia Rodrigo,00:02:57,winter


In [62]:
df[df['artist_name'] == 'Nirvana'][0:9]

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre,track_natural_key,duration_mins,release_season
250938,5muVpPu8Fj9fXfDbbqDdrZ,Love Buzz,65,215203,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.479,0.865,...,3.1e-05,0.228,0.182,0.77,137.902,4,permanent_wave,Love Buzz_Nirvana,00:03:35,spring
250939,5muVpPu8Fj9fXfDbbqDdrZ,Love Buzz,65,215203,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.479,0.865,...,3.1e-05,0.228,0.182,0.77,137.902,4,grunge,Love Buzz_Nirvana,00:03:35,spring
250940,5muVpPu8Fj9fXfDbbqDdrZ,Love Buzz,65,215203,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.479,0.865,...,3.1e-05,0.228,0.182,0.77,137.902,4,alternative_rock,Love Buzz_Nirvana,00:03:35,spring
250941,5muVpPu8Fj9fXfDbbqDdrZ,Love Buzz,65,215203,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.479,0.865,...,3.1e-05,0.228,0.182,0.77,137.902,4,rock,Love Buzz_Nirvana,00:03:35,spring
251185,7pETV41GUutaZ6KMHMAYIH,Blew,60,174134,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.411,0.882,...,6e-06,0.0147,0.298,0.531,129.848,4,permanent_wave,Blew_Nirvana,00:02:54,spring
251186,7pETV41GUutaZ6KMHMAYIH,Blew,60,174134,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.411,0.882,...,6e-06,0.0147,0.298,0.531,129.848,4,grunge,Blew_Nirvana,00:02:54,spring
251187,7pETV41GUutaZ6KMHMAYIH,Blew,60,174134,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.411,0.882,...,6e-06,0.0147,0.298,0.531,129.848,4,alternative_rock,Blew_Nirvana,00:02:54,spring
251188,7pETV41GUutaZ6KMHMAYIH,Blew,60,174134,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.411,0.882,...,6e-06,0.0147,0.298,0.531,129.848,4,rock,Blew_Nirvana,00:02:54,spring
251198,0EY1Z9UmZnYZyM7zHs6C0j,School,59,162116,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.21,0.943,...,1.2e-05,0.493,0.0739,0.13,164.844,4,permanent_wave,School_Nirvana,00:02:42,spring


In [29]:
df[df['artist_name'] == 'Nirvana']['genre'].unique()

array(['permanent_wave', 'grunge', 'alternative_rock', 'rock'],
      dtype=object)

In [30]:
df[df['artist_name'] == 'Frank Ocean']

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre,track_natural_key,duration_mins,release_season
329577,3CgZCQyuyxHRMWB9BTwmni,Swim Good,66,257187,0,Frank Ocean,2h93pZq0e7k5yf4dywlkpM,2011-01-01,0.653,0.668,...,0.00116,0.00269,0.0731,0.7110,160.048,4,lgbtq+_hip_hop,Swim Good_Frank Ocean,00:04:17,winter
329578,3CgZCQyuyxHRMWB9BTwmni,Swim Good,66,257187,0,Frank Ocean,2h93pZq0e7k5yf4dywlkpM,2011-01-01,0.653,0.668,...,0.00116,0.00269,0.0731,0.7110,160.048,4,neo_soul,Swim Good_Frank Ocean,00:04:17,winter
329579,3CgZCQyuyxHRMWB9BTwmni,Swim Good,66,257187,0,Frank Ocean,2h93pZq0e7k5yf4dywlkpM,2011-01-01,0.653,0.668,...,0.00116,0.00269,0.0731,0.7110,160.048,4,hip_hop,Swim Good_Frank Ocean,00:04:17,winter
329580,3CgZCQyuyxHRMWB9BTwmni,Swim Good,66,257187,0,Frank Ocean,2h93pZq0e7k5yf4dywlkpM,2011-01-01,0.653,0.668,...,0.00116,0.00269,0.0731,0.7110,160.048,4,pop,Swim Good_Frank Ocean,00:04:17,winter
329581,3CgZCQyuyxHRMWB9BTwmni,Swim Good,66,257187,0,Frank Ocean,2h93pZq0e7k5yf4dywlkpM,2011-01-01,0.653,0.668,...,0.00116,0.00269,0.0731,0.7110,160.048,4,alternative_r&b,Swim Good_Frank Ocean,00:04:17,winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
891478,41cpvQ2GyGb2BRdIRSsTqK,Moon River,68,188324,0,Frank Ocean,2h93pZq0e7k5yf4dywlkpM,2018-02-14,0.240,0.116,...,0.87700,0.00092,0.1000,0.0937,77.349,3,lgbtq+_hip_hop,Moon River_Frank Ocean,00:04:02,winter
891479,41cpvQ2GyGb2BRdIRSsTqK,Moon River,68,188324,0,Frank Ocean,2h93pZq0e7k5yf4dywlkpM,2018-02-14,0.240,0.116,...,0.87700,0.00092,0.1000,0.0937,77.349,3,neo_soul,Moon River_Frank Ocean,00:04:02,winter
891480,41cpvQ2GyGb2BRdIRSsTqK,Moon River,68,188324,0,Frank Ocean,2h93pZq0e7k5yf4dywlkpM,2018-02-14,0.240,0.116,...,0.87700,0.00092,0.1000,0.0937,77.349,3,hip_hop,Moon River_Frank Ocean,00:04:02,winter
891481,41cpvQ2GyGb2BRdIRSsTqK,Moon River,68,188324,0,Frank Ocean,2h93pZq0e7k5yf4dywlkpM,2018-02-14,0.240,0.116,...,0.87700,0.00092,0.1000,0.0937,77.349,3,pop,Moon River_Frank Ocean,00:04:02,winter


In [31]:
df[df['artist_name'] == 'Frank Ocean']['genre'].unique()

array(['lgbtq+_hip_hop', 'neo_soul', 'hip_hop', 'pop', 'alternative_r&b'],
      dtype=object)

In [32]:
# artist_id 6DIS6PRrLS3wbnZsf7vYic is artist WALK THE MOON
df[df['artist_id'] == '6DIS6PRrLS3wbnZsf7vYic']['genre'].unique()

array(['rock', 'pop_rock', 'indie_pop', 'modern_alternative_rock',
       'modern_rock', 'indie_poptimism'], dtype=object)

In [33]:
df[df['genre'] == 'disco']['release_date']

13010     1932-01-01
21470     1936-01-01
24409     1938-01-01
47606     1947-01-01
75244     1954-01-01
             ...    
1227433   1958-01-01
1231742   1974-01-01
1231835   1976-01-01
1231870   1977-01-01
1232152   1982-04-18
Name: release_date, Length: 4239, dtype: datetime64[ns]

In [34]:
genre_list = ['rock', 'classical', 'latin', 'soul', 'pop', 'jazz', 
             'folk', 'funk', 'motown', 'rap', 'hip_hop', 'metal', 
             'singer-songwriter', 'country', 'blues','r&b','edm']
print(f'We have {len(genre_list)} genres.')

We have 17 genres.


In [35]:
new_df = df[df['genre'].isin(genre_list)]
new_df['genre'].value_counts(ascending=False).rename_axis('genre').reset_index(name='count')

Unnamed: 0,genre,count
0,rock,25861
1,classical,15349
2,latin,11063
3,soul,9768
4,pop,8851
5,jazz,8330
6,folk,7658
7,funk,6169
8,motown,5572
9,rap,5517


In [36]:
# sort df by genre and popularity
new_df.sort_values(['genre','popularity'], ascending=[True,False]).shape


(133145, 24)

In [37]:
# sort new_df by popularity, groupby genre then save top 3000 from each genre
# https://stackoverflow.com/questions/41825978/sorting-columns-and-selecting-top-n-rows-in-each-group-pandas-dataframe
sorted_df_3K = new_df.sort_values('popularity',ascending = False).groupby('genre').head(3000)

In [38]:
sorted_df_3K.shape

(51000, 24)

In [39]:
sorted_df_3K.columns

Index(['track_id', 'track_name', 'popularity', 'duration_ms', 'explicit',
       'artist_name', 'artist_id', 'release_date', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'genre', 'track_natural_key', 'duration_mins', 'release_season'],
      dtype='object')

In [40]:
sorted_df_3K['genre'].value_counts(ascending=False).rename_axis('genre').reset_index(name='count')

Unnamed: 0,genre,count
0,r&b,3000
1,singer-songwriter,3000
2,jazz,3000
3,rock,3000
4,latin,3000
5,hip_hop,3000
6,rap,3000
7,folk,3000
8,motown,3000
9,metal,3000


In [41]:
sorted_df_3K[sorted_df_3K['artist_id'] == '1Xyo4u8uXC1ZmMpatF05PJ']

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre,track_natural_key,duration_mins,release_season
358712,5QO79kh1waicV47BqGRL3g,Save Your Tears,97,215627,1,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2020-03-20,0.68,0.826,...,0.0212,1.2e-05,0.543,0.644,118.051,4,pop,Save Your Tears_The Weeknd,00:03:35,spring
358722,0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,96,200040,0,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2020-03-20,0.514,0.73,...,0.00146,9.5e-05,0.0897,0.334,171.005,4,pop,Blinding Lights_The Weeknd,00:03:20,spring
340700,7fBv7CLKzipRk6EC6TWHOB,The Hills,87,242253,1,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2015-08-28,0.585,0.564,...,0.0671,0.0,0.135,0.137,113.003,4,pop,The Hills_The Weeknd,00:04:02,summer
359151,7szuecWAPwGoV1e5vGu8tl,In Your Eyes,86,237520,1,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2020-03-20,0.667,0.719,...,0.00285,8.1e-05,0.0736,0.717,100.021,4,pop,In Your Eyes_The Weeknd,00:03:57,spring
352231,09mEdoA6zrmBPgTEN5qXmN,Call Out My Name,83,228373,0,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2018-03-30,0.461,0.593,...,0.17,0.0,0.307,0.175,134.17,3,pop,Call Out My Name_The Weeknd,00:03:48,spring
340762,22VdIZQfgXJea34mQxlt81,Can't Feel My Face,82,213520,0,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2015-08-28,0.705,0.769,...,0.113,0.0,0.105,0.583,107.949,4,pop,Can't Feel My Face_The Weeknd,00:03:33,summer
360082,6bnF93Rx87YqUBLSgjiMU8,Heartless,81,198267,1,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2020-03-20,0.537,0.746,...,0.0236,1e-06,0.156,0.252,170.062,4,pop,Heartless_The Weeknd,00:03:18,spring
340843,4PhsKqMdgMEUSstTDAmMpg,Often,80,249040,1,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2015-08-28,0.572,0.631,...,0.222,0.0,0.135,0.0713,134.078,4,pop,Often_The Weeknd,00:04:09,summer
337791,2PIvq1pGrUjY007X5y1UpM,"Earned It (Fifty Shades Of Grey) - From The ""F...",79,252227,0,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2014-12-23,0.659,0.381,...,0.385,0.0,0.0972,0.426,119.844,3,pop,"Earned It (Fifty Shades Of Grey) - From The ""F...",00:04:12,winter
345092,2Ch7LmS7r2Gy2kc64wv3Bz,Die For You,79,260253,0,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2016-11-25,0.582,0.525,...,0.111,0.0,0.134,0.51,133.249,4,pop,Die For You_The Weeknd,00:04:20,autumn


In [42]:
# try bringing in albums and r_albums_artists sqlite tables, then make the inner joins in sql?
# encoding error solution https://stackoverflow.com/questions/23508153/python-encoding-could-not-decode-to-utf8
import sqlite3

cnx = sqlite3.connect('spotify.sqlite')
cnx.text_factory = lambda x: str(x, 'latin1')
albums_sqlite = pd.read_sql_query("SELECT * FROM albums", cnx)

r_albums_tracks_sqlite = pd.read_sql_query("SELECT * FROM r_albums_tracks", cnx)

In [50]:
albums_sqlite.rename(columns={'id':'album_id','name':'album_name'}, inplace=True)


In [51]:
albums_sqlite.columns

Index(['album_id', 'album_name', 'album_group', 'album_type', 'release_date',
       'popularity'],
      dtype='object')

In [46]:
sorted_df_3K.shape

(51000, 24)

In [47]:
df_1 = pd.merge(sorted_df_3K, r_albums_tracks_sqlite, on='track_id', how='inner')
df_1.shape

(44118, 25)

In [48]:
df_1[df_1['artist_id'] == '1Xyo4u8uXC1ZmMpatF05PJ']

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,genre,track_natural_key,duration_mins,release_season,album_id
2,5QO79kh1waicV47BqGRL3g,Save Your Tears,97,215627,1,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2020-03-20,0.68,0.826,...,1.2e-05,0.543,0.644,118.051,4,pop,Save Your Tears_The Weeknd,00:03:35,spring,4yP0hdKOZPNshxUOjY0cZj
5,0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,96,200040,0,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2020-03-20,0.514,0.73,...,9.5e-05,0.0897,0.334,171.005,4,pop,Blinding Lights_The Weeknd,00:03:20,spring,4yP0hdKOZPNshxUOjY0cZj
65,7fBv7CLKzipRk6EC6TWHOB,The Hills,87,242253,1,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2015-08-28,0.585,0.564,...,0.0,0.135,0.137,113.003,4,pop,The Hills_The Weeknd,00:04:02,summer,0P3oVJBFOv3TDXlYRhGL7s
122,7szuecWAPwGoV1e5vGu8tl,In Your Eyes,86,237520,1,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2020-03-20,0.667,0.719,...,8.1e-05,0.0736,0.717,100.021,4,pop,In Your Eyes_The Weeknd,00:03:57,spring,4yP0hdKOZPNshxUOjY0cZj
330,09mEdoA6zrmBPgTEN5qXmN,Call Out My Name,83,228373,0,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2018-03-30,0.461,0.593,...,0.0,0.307,0.175,134.17,3,pop,Call Out My Name_The Weeknd,00:03:48,spring,4qZBW3f2Q8y0k1A84d4iAO
495,22VdIZQfgXJea34mQxlt81,Can't Feel My Face,82,213520,0,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2015-08-28,0.705,0.769,...,0.0,0.105,0.583,107.949,4,pop,Can't Feel My Face_The Weeknd,00:03:33,summer,0P3oVJBFOv3TDXlYRhGL7s
582,6bnF93Rx87YqUBLSgjiMU8,Heartless,81,198267,1,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2020-03-20,0.537,0.746,...,1e-06,0.156,0.252,170.062,4,pop,Heartless_The Weeknd,00:03:18,spring,4yP0hdKOZPNshxUOjY0cZj
685,4PhsKqMdgMEUSstTDAmMpg,Often,80,249040,1,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2015-08-28,0.572,0.631,...,0.0,0.135,0.0713,134.078,4,pop,Often_The Weeknd,00:04:09,summer,0P3oVJBFOv3TDXlYRhGL7s
861,2PIvq1pGrUjY007X5y1UpM,"Earned It (Fifty Shades Of Grey) - From The ""F...",79,252227,0,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2014-12-23,0.659,0.381,...,0.0,0.0972,0.426,119.844,3,pop,"Earned It (Fifty Shades Of Grey) - From The ""F...",00:04:12,winter,61Ba3txRZWfiX6ZTEZlFCV
926,2Ch7LmS7r2Gy2kc64wv3Bz,Die For You,79,260253,0,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,2016-11-25,0.582,0.525,...,0.0,0.134,0.51,133.249,4,pop,Die For You_The Weeknd,00:04:20,autumn,2ODvWsOgouMbaA5xf0RkJe


In [52]:
df_all = pd.merge(df_1, albums_sqlite, on='album_id', how='inner')
df_all.shape

(44118, 30)

In [53]:
# check how many songs per genre after all merge
df_all['genre'].value_counts(ascending=False).rename_axis('genre').reset_index(name='count')

Unnamed: 0,genre,count
0,singer-songwriter,2894
1,country,2868
2,jazz,2844
3,folk,2838
4,funk,2817
5,soul,2804
6,motown,2761
7,hip_hop,2727
8,rap,2723
9,rock,2627


In [60]:
# create the dataframes for writing

genre    44118
dtype: int64

In [None]:
genre_data = sorted_df_3K[['genre']].copy()
album_data = sorted_df_3K[['album_id','album_name','release_season']].copy()
track_features = sorted_df_3K[[
    'track_natural_key',
    'track_name',
    'artist_name',
    'album_id',
    'acousticness',
    'danceability',
    'duration_mins',
    'duration_ms',
    'energy',
    'genre',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
    'time_signature',
    'valence'
]].copy()

In [None]:
# ideas for next time
# don't limit the songs to the genre list, 
    # see the full extent of genres and how frequent they come by, how is mellow gold is 4th most popular over a generic genre??
# sort by descending popularity per genre and grab the top 3000 popular songs (or some other number)