In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
df = pd.read_csv('Dataset_with_Genres.csv', index_col=0)
df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,...,0.744,0.151,0.127,104.851,3,['Uli'],[],['Uli'],UliCarve,"['nuevo_regional_mexicano', 'sierreno', 'regio..."
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,...,0.0,0.148,0.655,102.009,1,['Fernando Pessoa'],[],['Fernando Pessoa'],Fernando PessoaCapítulo 2.16 - Banquero Anarqu...,
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,...,0.0218,0.212,0.457,130.418,5,['Ignacio Corsini'],[],['Ignacio Corsini'],Ignacio CorsiniVivo para Quererte - Remasterizado,"['vintage_tango', 'tango']"
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,...,0.918,0.104,0.397,169.98,3,['Ignacio Corsini'],[],['Ignacio Corsini'],Ignacio CorsiniEl Prisionero - Remasterizado,"['vintage_tango', 'tango']"
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,...,0.13,0.311,0.196,103.22,4,['Dick Haymes'],[],['Dick Haymes'],Dick HaymesLady of the Evening,"['lounge', 'big_band', 'deep_adult_standards',..."


In [3]:
col_list = df.columns

In [4]:
col_list

Index(['id', 'name', 'popularity', 'duration_ms', 'explicit', 'artists',
       'id_artists', 'release_date', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature', 'artists_upd_v1',
       'artists_upd_v2', 'artists_upd', 'artists_song',
       'consolidates_genre_lists'],
      dtype='object')

In [5]:
df.dtypes

id                           object
name                         object
popularity                    int64
duration_ms                   int64
explicit                      int64
artists                      object
id_artists                   object
release_date                 object
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
time_signature                int64
artists_upd_v1               object
artists_upd_v2               object
artists_upd                  object
artists_song                 object
consolidates_genre_lists     object
dtype: object

In [6]:
df.shape

(523475, 25)

In [7]:
# remove the unnecessary [''] from any useful entries (they weren't lists)
# credit for solution: https://stackoverflow.com/questions/38147447/how-to-remove-square-bracket-from-pandas-dataframe
df['artists'] = df['artists'].str.strip("[' ']").astype(object)
df['id_artists'] = df['id_artists'].str.strip("[' ']").astype(object)
df['consolidates_genre_lists'] = df['consolidates_genre_lists'].str.strip("[' ']").astype(str)

In [8]:
# drop unnecessary columns
to_drop = ['artists_upd_v1', 'artists_upd_v2','artists_upd', 'artists_song']
df.drop(to_drop, axis=1, inplace=True)

In [9]:
df.shape

(523475, 21)

In [10]:
# rename columns
df.rename(columns={'id': 'track_id', 
                   'name': 'track_name',
                   'artists':'artist_name',
                   'id_artists':'artist_id',
                   'consolidates_genre_lists':'genres'}, inplace=True)

In [11]:
# pre NaN-drop check
df.isna().sum()

track_id            0
track_name          1
popularity          0
duration_ms         0
explicit            0
artist_name         0
artist_id           0
release_date        0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
genres              0
dtype: int64

In [12]:
# drop NaN
df.dropna(axis=0, inplace=True)
df.isna().sum()

track_id            0
track_name          0
popularity          0
duration_ms         0
explicit            0
artist_name         0
artist_id           0
release_date        0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
genres              0
dtype: int64

In [13]:
df.shape

(523474, 21)

In [14]:
# create natural key 
df['track_natural_key'] = df.track_name + "_" + df.artist_name.map(str)


In [15]:
# Transform duration_ms to duration_mins
# credit to drop milliseconds from datetime format https://stackoverflow.com/questions/31487732/simple-way-to-drop-milliseconds-from-python-datetime-datetime-object

converted = pd.to_datetime(df['duration_ms'], unit='ms')
df['duration_mins'] = pd.Series([val.time().replace(microsecond=0) for val in converted])

df.head()

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres,track_natural_key,duration_mins
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.451,0.674,0.744,0.151,0.127,104.851,3,"nuevo_regional_mexicano', 'sierreno', 'regiona...",Carve_Uli,00:02:06
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,Fernando Pessoa,14jtPCOoNZwquk5wd9DxrY,1922-06-01,0.695,0.263,...,0.957,0.797,0.0,0.148,0.655,102.009,1,,Capítulo 2.16 - Banquero Anarquista_Fernando P...,00:01:38
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.434,0.177,...,0.0512,0.994,0.0218,0.212,0.457,130.418,5,"vintage_tango', 'tango",Vivo para Quererte - Remasterizado_Ignacio Cor...,00:03:01
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.321,0.0946,...,0.0504,0.995,0.918,0.104,0.397,169.98,3,"vintage_tango', 'tango",El Prisionero - Remasterizado_Ignacio Corsini,00:02:56
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,1922,0.402,0.158,...,0.039,0.989,0.13,0.311,0.196,103.22,4,"lounge', 'big_band', 'deep_adult_standards', '...",Lady of the Evening_Dick Haymes,00:02:43


In [16]:
# convert release_date to datetime object
df['release_date'] = pd.to_datetime(df.release_date, format='%Y-%m-%d')

# add new column for seasons (borrowed from https://stackoverflow.com/questions/60285557/extract-seasons-from-datetime-pandas)
    # numeric trick is to wasp the winter separation at 12-31 / 01-01. 
    # By subtracting the end of winter, which is 03-21, and take modulo, you effectively change 01-01 to - 320 = 980 mod 1300, 
    # which is larger than the last day of autumn. So now your winter season is in one chunk instead of two.

date_offset = (df.release_date.dt.month*100 + df.release_date.dt.day - 320)%1300

df['release_season'] = pd.cut(date_offset, [0, 300, 602, 900, 1300], 
                                     labels=['spring', 'summer', 'autumn', 'winter'],
                                     include_lowest = True)
df.head()

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres,track_natural_key,duration_mins,release_season
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.674,0.744,0.151,0.127,104.851,3,"nuevo_regional_mexicano', 'sierreno', 'regiona...",Carve_Uli,00:02:06,winter
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,Fernando Pessoa,14jtPCOoNZwquk5wd9DxrY,1922-06-01,0.695,0.263,...,0.797,0.0,0.148,0.655,102.009,1,,Capítulo 2.16 - Banquero Anarquista_Fernando P...,00:01:38,spring
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.434,0.177,...,0.994,0.0218,0.212,0.457,130.418,5,"vintage_tango', 'tango",Vivo para Quererte - Remasterizado_Ignacio Cor...,00:03:01,spring
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.321,0.0946,...,0.995,0.918,0.104,0.397,169.98,3,"vintage_tango', 'tango",El Prisionero - Remasterizado_Ignacio Corsini,00:02:56,spring
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,1922-01-01,0.402,0.158,...,0.989,0.13,0.311,0.196,103.22,4,"lounge', 'big_band', 'deep_adult_standards', '...",Lady of the Evening_Dick Haymes,00:02:43,winter


In [17]:
t_dup = df[df.duplicated('track_natural_key')]
len(t_dup)

0

In [40]:
df[df['track_id'] == '4kbj5MwxO1bq9wjT5g9HaA']['genres'].unique()

array(['rock', 'pop_rock', 'indie_pop', 'modern_alternative_rock',
       'modern_rock', 'indie_poptimism'], dtype=object)

In [19]:
df[df['track_id'] == '4kbj5MwxO1bq9wjT5g9HaA']['track_name']

81500    Shut Up and Dance
Name: track_name, dtype: object

In [20]:
df.shape

(523474, 24)

In [21]:
df.dtypes

track_id                     object
track_name                   object
popularity                    int64
duration_ms                   int64
explicit                      int64
artist_name                  object
artist_id                    object
release_date         datetime64[ns]
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
time_signature                int64
genres                       object
track_natural_key            object
duration_mins                object
release_season             category
dtype: object

In [22]:
df['genres'].value_counts(ascending=False).rename_axis('genres').reset_index(name='count')

Unnamed: 0,genres,count
0,,260295
1,,16508
2,"vintage_tango', 'tango",2114
3,"filmi', 'classic_bollywood', 'desi_pop', 'sufi",2065
4,"filmi', 'deep_indian_pop",1527
...,...,...
18904,"r&b', 'trap', 'southern_hip_hop', 'chicago_rap...",1
18905,"classic_mandopop', 'taiwan_pop', 'zhongguo_fen...",1
18906,"post-romantic_era', 'italian_opera', 'classica...",1
18907,"lounge', 'jazz_fusion', 'jazz_piano', 'cool_ja...",1


In [23]:
no_genre = df[df['genres'] == '']
len(no_genre)

16508

In [24]:
# drop no_genre songs from 
df.drop(df[df['genres'] == ''].index, inplace=True)
df.drop(df[df['genres'] == 'nan'].index, inplace=True)

len(df)

246671

In [25]:
# separate genres connected by ', ' using split then explode
# credit for solution: https://stackoverflow.com/questions/71175458/splitting-row-into-multiple-rows-in-pandas-dataframe
df = (
    df.assign(genres=df['genres'].str.split("', '")).explode('genres').reset_index(drop=True)
)

df

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres,track_natural_key,duration_mins,release_season
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.674,0.744000,0.1510,0.1270,104.851,3,nuevo_regional_mexicano,Carve_Uli,00:02:06,winter
1,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.674,0.744000,0.1510,0.1270,104.851,3,sierreno,Carve_Uli,00:02:06,winter
2,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.674,0.744000,0.1510,0.1270,104.851,3,regional_mexican,Carve_Uli,00:02:06,winter
3,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.674,0.744000,0.1510,0.1270,104.851,3,corrido,Carve_Uli,00:02:06,winter
4,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.434,0.177,...,0.994,0.021800,0.2120,0.4570,130.418,5,vintage_tango,Vivo para Quererte - Remasterizado_Ignacio Cor...,00:03:01,spring
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1233361,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,ROLE MODEL,1dy5WNgIKQU6ezkpZs4y8z,2020-10-21,0.765,0.663,...,0.141,0.000297,0.0924,0.6860,150.091,4,indie_cafe_pop,blind_ROLE MODEL,00:03:07,autumn
1233362,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,ROLE MODEL,1dy5WNgIKQU6ezkpZs4y8z,2020-10-21,0.765,0.663,...,0.141,0.000297,0.0924,0.6860,150.091,4,alternative_r&b,blind_ROLE MODEL,00:03:07,autumn
1233363,27Y1N4Q4U3EfDU5Ubw8ws2,What They'll Say About Us,70,187601,0,FINNEAS,37M5pPGs6V1fchFJSgCguX,2020-09-02,0.535,0.314,...,0.895,0.000150,0.0874,0.0663,145.095,4,pop,What They'll Say About Us_FINNEAS,00:02:22,summer
1233364,27Y1N4Q4U3EfDU5Ubw8ws2,What They'll Say About Us,70,187601,0,FINNEAS,37M5pPGs6V1fchFJSgCguX,2020-09-02,0.535,0.314,...,0.895,0.000150,0.0874,0.0663,145.095,4,la_indie,What They'll Say About Us_FINNEAS,00:02:22,summer


In [26]:
df.shape

(1233366, 24)

In [27]:
df['genres'].value_counts(ascending=False).rename_axis('genres').reset_index(name='count')

Unnamed: 0,genres,count
0,rock,25861
1,adult_standards,20698
2,classic_rock,19049
3,mellow_gold,15962
4,classical,15349
...,...,...
2830,christelijk,1
2831,trio_cubano,1
2832,rva_indie,1
2833,steelpan,1


In [28]:
genre_list = [
    'soundtrack','indie','jazz','pop','electronic',
    'folk','hip_hop','rock','alternative','classical',
    'rap','world','soul','blues','R&B',
    'reggae','ska','dance','country','opera'
]
# ppp = ["'zydeco'"]
print(f'We have {len(genre_list)} genres.')

We have 20 genres.


In [29]:
new_df = df[df['genres'].isin(genre_list)]

In [30]:
new_df.shape

(101830, 24)

In [31]:
new_df

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres,track_natural_key,duration_mins,release_season
245,2wAfHM7Whz67VFbdanhZlk,Nobody Knows You When You're Down and Out,41,177133,0,Bessie Smith,5ESobCkc6JI4tIMxQttqeg,1923-01-01,0.614,0.0423,...,0.9960,0.002930,0.1830,0.2110,89.822,4,blues,Nobody Knows You When You're Down and Out_Bess...,00:02:57,winter
260,3eMrYc092k7SIJfWJ7oasR,Weather Bird,38,161933,0,"Louis Armstrong', 'Earl Hines","19eLuQmk9aCobbVDHc6eek', '2mY5u4CceAPrpBnse1WpFr",1923-01-01,0.831,0.2620,...,0.9840,0.912000,0.2040,0.9010,104.606,4,jazz,"Weather Bird_Louis Armstrong', 'Earl Hines",00:02:41,winter
266,2AZgaYZSwUosJD71J2N2Zo,'Tain't Nobody's Bizness If I Do,30,206600,0,Bessie Smith,5ESobCkc6JI4tIMxQttqeg,1923-01-01,0.537,0.0443,...,0.9960,0.000265,0.1520,0.1370,80.468,4,blues,'Tain't Nobody's Bizness If I Do_Bessie Smith,00:03:26,winter
278,6XTvSCqGLLH0vzQQRn55hz,"Empty Bed Blues, Pt. 1",27,181173,0,Bessie Smith,5ESobCkc6JI4tIMxQttqeg,1923-01-01,0.709,0.0620,...,0.9950,0.003410,0.1200,0.5530,85.854,4,blues,"Empty Bed Blues, Pt. 1_Bessie Smith",00:03:01,winter
283,6qRvnXftofjYJm1Mg98UWL,Need a Little Sugar in My Bowl,27,167640,0,Bessie Smith,5ESobCkc6JI4tIMxQttqeg,1923-01-01,0.693,0.0270,...,0.9920,0.000000,0.1340,0.4020,75.749,4,blues,Need a Little Sugar in My Bowl_Bessie Smith,00:02:47,winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1233342,1uviKYHZuM4uINK33F7sCt,Fix It to Break It,70,198799,0,Clinton Kane,7okSU80WTrn4LXlyXYbX3P,2020-03-27,0.493,0.4610,...,0.8450,0.000000,0.1150,0.3500,51.414,4,pop,Fix It to Break It_Clinton Kane,00:04:32,spring
1233349,1fXmDeiCb3ABt5CzkMxp4u,Lotus Inn,63,195868,0,"""Why Don't We""",2jnIB6XdLvnJUeNTy5A0J2,2020-12-04,0.578,0.7400,...,0.0119,0.000000,0.1800,0.4950,155.984,4,pop,"Lotus Inn_""Why Don't We""",00:03:52,autumn
1233356,27kcZEJvhkb1rzZS9gCpdA,remember the mornings,67,202355,0,Clinton Kane,7okSU80WTrn4LXlyXYbX3P,2020-11-27,0.590,0.4430,...,0.6180,0.000000,0.1100,0.3420,149.447,4,pop,remember the mornings_Clinton Kane,00:04:33,autumn
1233360,0NuWgxEp51CutD2pJoF4OM,blind,72,153293,0,ROLE MODEL,1dy5WNgIKQU6ezkpZs4y8z,2020-10-21,0.765,0.6630,...,0.1410,0.000297,0.0924,0.6860,150.091,4,pop,blind_ROLE MODEL,00:03:07,autumn


In [32]:
new_df['genres'].value_counts(ascending=False).rename_axis('genres').reset_index(name='count')

Unnamed: 0,genres,count
0,rock,25861
1,classical,15349
2,soul,9768
3,pop,8851
4,jazz,8330
5,folk,7658
6,rap,5517
7,hip_hop,5184
8,country,3753
9,opera,3286


In [33]:
new_df[new_df['artist_name'] == 'Olivia Rodrigo']

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres,track_natural_key,duration_mins,release_season
356234,4Yxc55NX3tAXC2mHRAhtcW,"All I Want - From ""High School Musical: The Mu...",80,177323,0,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2019-11-27,0.376,0.43,...,0.0902,0.0,0.0912,0.129,77.599,3,pop,"All I Want - From ""High School Musical: The Mu...",00:02:57,autumn
361545,7lPN2DXiMsVn7XUKtOW1CS,drivers license,99,242014,1,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2021-01-08,0.585,0.436,...,0.721,1.3e-05,0.105,0.132,143.874,4,pop,drivers license_Olivia Rodrigo,00:04:02,winter
361608,61KpQadow081I2AsbeLcsb,deja vu,90,215508,1,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2021-04-01,0.439,0.61,...,0.593,1.1e-05,0.341,0.172,181.088,4,pop,deja vu_Olivia Rodrigo,00:03:35,spring
640796,1v6svH1Fyx9C1nIt1mA2DT,All I Want,75,177323,0,Olivia Rodrigo,1McMsnEElThX1knmY4oliG,2020-01-10,0.376,0.43,...,0.0902,0.0,0.0912,0.129,77.599,3,pop,All I Want_Olivia Rodrigo,00:02:57,winter


In [38]:
new_df[new_df['artist_name'] == 'Olivia Rodrigo']['genres'].unique()

array(['pop'], dtype=object)

In [34]:
new_df[new_df['artist_name'] == 'Nirvana']

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres,track_natural_key,duration_mins,release_season
250941,5muVpPu8Fj9fXfDbbqDdrZ,Love Buzz,65,215203,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.479,0.865,...,0.000031,0.228000,0.1820,0.7700,137.902,4,rock,Love Buzz_Nirvana,00:03:35,spring
251188,7pETV41GUutaZ6KMHMAYIH,Blew,60,174134,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.411,0.882,...,0.000006,0.014700,0.2980,0.5310,129.848,4,rock,Blew_Nirvana,00:02:54,spring
251201,0EY1Z9UmZnYZyM7zHs6C0j,School,59,162116,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.210,0.943,...,0.000012,0.493000,0.0739,0.1300,164.844,4,rock,School_Nirvana,00:02:42,spring
251347,5IeTFRymTDiza7DciBD1Gk,Negative Creep,58,175745,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.370,0.947,...,0.000008,0.000175,0.0762,0.4180,81.744,4,rock,Negative Creep_Nirvana,00:02:55,spring
251637,5TZbtpZcTWKEn81Du4hiTb,Mr. Moustache,55,203929,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-01,0.194,0.898,...,0.000006,0.141000,0.1080,0.4820,177.191,4,rock,Mr. Moustache_Nirvana,00:03:23,spring
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1119595,3bvBBhAyIJ31LQF0D6T3zz,Flashbulb,4,133040,0,Nirvana,7dIxU1XgxBIa3KJAWzaFAC,1968-01-01,0.514,0.660,...,0.045100,0.214000,0.1780,0.6380,89.257,4,rock,Flashbulb_Nirvana,00:04:17,winter
1119599,40oMOoydQQZakQIxU8TGZu,St John's Wood Affair,5,257027,0,Nirvana,7dIxU1XgxBIa3KJAWzaFAC,1968-01-01,0.396,0.368,...,0.238000,0.024800,0.1610,0.0833,101.366,4,rock,St John's Wood Affair_Nirvana,00:02:13,winter
1119603,5uzryIN6Yq5kapHD6GXAad,C Side In Ocho Rios,4,133893,0,Nirvana,7dIxU1XgxBIa3KJAWzaFAC,1968-01-01,0.774,0.238,...,0.896000,0.273000,0.1040,0.7650,98.619,4,rock,C Side In Ocho Rios_Nirvana,00:02:49,winter
1186614,1LXCcreL7xDtzp9qD9vaBi,Love Buzz (Live at Pine Street Theatre),35,177761,0,Nirvana,6olE6TJLqED3rqDCT0FyPh,1989-06-15,0.407,0.952,...,0.000050,0.146000,0.1060,0.8100,152.945,4,rock,Love Buzz (Live at Pine Street Theatre)_Nirvana,00:04:47,spring


In [37]:
new_df[new_df['artist_name']== 'Frank Ocean']['genres'].unique()

array(['hip_hop', 'pop'], dtype=object)

In [36]:
# ideas for next time
# don't limit the songs to the genre list, 
    # see the full extent of genres and how frequent they come by, how mellow gold is 4th most popular over a generic genre??
# sort by descending popularity per genre and grab the top 3000 popular songs (or some other number)