In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
df = pd.read_csv('Dataset_with_Genres.csv', index_col=0)
df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,...,0.744,0.151,0.127,104.851,3,['Uli'],[],['Uli'],UliCarve,"['nuevo_regional_mexicano', 'sierreno', 'regio..."
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,...,0.0,0.148,0.655,102.009,1,['Fernando Pessoa'],[],['Fernando Pessoa'],Fernando PessoaCapítulo 2.16 - Banquero Anarqu...,
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,...,0.0218,0.212,0.457,130.418,5,['Ignacio Corsini'],[],['Ignacio Corsini'],Ignacio CorsiniVivo para Quererte - Remasterizado,"['vintage_tango', 'tango']"
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,...,0.918,0.104,0.397,169.98,3,['Ignacio Corsini'],[],['Ignacio Corsini'],Ignacio CorsiniEl Prisionero - Remasterizado,"['vintage_tango', 'tango']"
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,...,0.13,0.311,0.196,103.22,4,['Dick Haymes'],[],['Dick Haymes'],Dick HaymesLady of the Evening,"['lounge', 'big_band', 'deep_adult_standards',..."


In [3]:
col_list = df.columns

In [4]:
col_list

Index(['id', 'name', 'popularity', 'duration_ms', 'explicit', 'artists',
       'id_artists', 'release_date', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature', 'artists_upd_v1',
       'artists_upd_v2', 'artists_upd', 'artists_song',
       'consolidates_genre_lists'],
      dtype='object')

In [5]:
df.dtypes

id                           object
name                         object
popularity                    int64
duration_ms                   int64
explicit                      int64
artists                      object
id_artists                   object
release_date                 object
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
time_signature                int64
artists_upd_v1               object
artists_upd_v2               object
artists_upd                  object
artists_song                 object
consolidates_genre_lists     object
dtype: object

In [6]:
df.shape

(523475, 25)

In [7]:
# remove the unnecessary [''] from any useful entries (they weren't lists)
# credit for solution: https://stackoverflow.com/questions/38147447/how-to-remove-square-bracket-from-pandas-dataframe
df['artists'] = df['artists'].str.strip("[' ']").astype(object)
df['id_artists'] = df['id_artists'].str.strip("[' ']").astype(object)
df['consolidates_genre_lists'] = df['consolidates_genre_lists'].str.strip("[' ']").astype(str)

In [8]:
# drop unnecessary columns
to_drop = ['artists_upd_v1', 'artists_upd_v2','artists_upd', 'artists_song']
df.drop(to_drop, axis=1, inplace=True)

In [9]:
df.shape

(523475, 21)

In [10]:
# rename columns
df.rename(columns={'id': 'track_id', 
                   'name': 'track_name',
                   'artists':'artist_name',
                   'id_artists':'artist_id',
                   'genres':'genre',
                   'consolidates_genre_lists':'genres'}, inplace=True)

In [11]:
# pre NaN-drop check
df.isna().sum()

track_id            0
track_name          1
popularity          0
duration_ms         0
explicit            0
artist_name         0
artist_id           0
release_date        0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
genres              0
dtype: int64

In [12]:
# drop NaN
df.dropna(axis=0, inplace=True)
df.isna().sum()

track_id            0
track_name          0
popularity          0
duration_ms         0
explicit            0
artist_name         0
artist_id           0
release_date        0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
genres              0
dtype: int64

In [13]:
df.shape

(523474, 21)

In [14]:
# create natural key 
df['track_natural_key'] = df.track_name + "_" + df.artist_name.map(str)


In [15]:
# Transform duration_ms to duration_mins
# credit to drop milliseconds from datetime format https://stackoverflow.com/questions/31487732/simple-way-to-drop-milliseconds-from-python-datetime-datetime-object

converted = pd.to_datetime(df['duration_ms'], unit='ms')
df['duration_mins'] = pd.Series([val.time().replace(microsecond=0) for val in converted])

df.head()

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres,track_natural_key,duration_mins
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.451,0.674,0.744,0.151,0.127,104.851,3,"nuevo_regional_mexicano', 'sierreno', 'regiona...",Carve_Uli,00:02:06
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,Fernando Pessoa,14jtPCOoNZwquk5wd9DxrY,1922-06-01,0.695,0.263,...,0.957,0.797,0.0,0.148,0.655,102.009,1,,Capítulo 2.16 - Banquero Anarquista_Fernando P...,00:01:38
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.434,0.177,...,0.0512,0.994,0.0218,0.212,0.457,130.418,5,"vintage_tango', 'tango",Vivo para Quererte - Remasterizado_Ignacio Cor...,00:03:01
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.321,0.0946,...,0.0504,0.995,0.918,0.104,0.397,169.98,3,"vintage_tango', 'tango",El Prisionero - Remasterizado_Ignacio Corsini,00:02:56
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,1922,0.402,0.158,...,0.039,0.989,0.13,0.311,0.196,103.22,4,"lounge', 'big_band', 'deep_adult_standards', '...",Lady of the Evening_Dick Haymes,00:02:43


In [16]:
# convert release_date to datetime object
df['release_date'] = pd.to_datetime(df.release_date, format='%Y-%m-%d')

# add new column for seasons (borrowed from https://stackoverflow.com/questions/60285557/extract-seasons-from-datetime-pandas)
    # numeric trick is to wasp the winter separation at 12-31 / 01-01. 
    # By subtracting the end of winter, which is 03-21, and take modulo, you effectively change 01-01 to - 320 = 980 mod 1300, 
    # which is larger than the last day of autumn. So now your winter season is in one chunk instead of two.

date_offset = (df.release_date.dt.month*100 + df.release_date.dt.day - 320)%1300

df['release_season'] = pd.cut(date_offset, [0, 300, 602, 900, 1300], 
                                     labels=['spring', 'summer', 'autumn', 'winter'],
                                     include_lowest = True)
df.head()

Unnamed: 0,track_id,track_name,popularity,duration_ms,explicit,artist_name,artist_id,release_date,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genres,track_natural_key,duration_mins,release_season
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,Uli,45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.674,0.744,0.151,0.127,104.851,3,"nuevo_regional_mexicano', 'sierreno', 'regiona...",Carve_Uli,00:02:06,winter
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,Fernando Pessoa,14jtPCOoNZwquk5wd9DxrY,1922-06-01,0.695,0.263,...,0.797,0.0,0.148,0.655,102.009,1,,Capítulo 2.16 - Banquero Anarquista_Fernando P...,00:01:38,spring
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.434,0.177,...,0.994,0.0218,0.212,0.457,130.418,5,"vintage_tango', 'tango",Vivo para Quererte - Remasterizado_Ignacio Cor...,00:03:01,spring
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,Ignacio Corsini,5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.321,0.0946,...,0.995,0.918,0.104,0.397,169.98,3,"vintage_tango', 'tango",El Prisionero - Remasterizado_Ignacio Corsini,00:02:56,spring
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,1922-01-01,0.402,0.158,...,0.989,0.13,0.311,0.196,103.22,4,"lounge', 'big_band', 'deep_adult_standards', '...",Lady of the Evening_Dick Haymes,00:02:43,winter


In [17]:
t_dup = df[df.duplicated('track_natural_key')]
len(t_dup)

0

In [19]:
df[df['track_id'] == '4kbj5MwxO1bq9wjT5g9HaA']['track_name']

81500    Shut Up and Dance
Name: track_name, dtype: object

In [20]:
df.shape

(523474, 24)

In [21]:
df.dtypes

track_id                     object
track_name                   object
popularity                    int64
duration_ms                   int64
explicit                      int64
artist_name                  object
artist_id                    object
release_date         datetime64[ns]
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
time_signature                int64
genres                       object
track_natural_key            object
duration_mins                object
release_season             category
dtype: object

In [22]:
df['genre'].value_counts(ascending=False).rename_axis('genre').reset_index(name='count')

KeyError: 'genre'

In [None]:
no_genre = df[df['genre'] == '']
len(no_genre)

In [None]:
# drop no_genre songs from 
df.drop(df[df['genre'] == ''].index, inplace=True)
df.drop(df[df['genre'] == 'nan'].index, inplace=True)

len(df)

In [None]:
# separate genres connected by ', ' using split then explode
# credit for solution: https://stackoverflow.com/questions/71175458/splitting-row-into-multiple-rows-in-pandas-dataframe
df = (
    df.assign(genre=df['genre'].str.split("', '")).explode('genre').reset_index(drop=True)
)

df

In [None]:
df.shape

In [None]:
df['genre'].value_counts(ascending=False).rename_axis('genre').reset_index(name='count')[50:90]

In [None]:
df[df['artist_name'] == 'Olivia Rodrigo']

In [None]:
df[df['artist_name'] == 'Nirvana']

In [None]:
df[df['artist_name'] == 'Nirvana']['genre'].unique()

In [None]:
df[df['artist_name'] == 'Frank Ocean']

In [None]:
df[df['artist_name'] == 'Frank Ocean']['genre'].unique()

In [None]:
# artist_id 6DIS6PRrLS3wbnZsf7vYic is artist WALK THE MOON
df[df['artist_id'] == '6DIS6PRrLS3wbnZsf7vYic']['genre'].unique()

In [None]:
df[df['genre'] == 'disco']['release_date']

In [None]:
# ideas for next time
# don't limit the songs to the genre list, 
    # see the full extent of genres and how frequent they come by, how is mellow gold is 4th most popular over a generic genre??
# sort by descending popularity per genre and grab the top 3000 popular songs (or some other number)