In [1]:
# Imports
import sqlite3
import pandas as pd
import datetime as dt
from sqlalchemy import create_engine
import psycopg2 
from config import db_password

This notebook cleans and transforms data from the Kaggle dataset: "8+ M. Spotify Tracks, Genre, Audio Features".

https://www.kaggle.com/datasets/maltegrosse/8-m-spotify-tracks-genre-audio-features

In [2]:
# Create SQL connection to our sqlite database
cnx = sqlite3.connect('spotify.sqlite')
cnx.text_factory = lambda x: str(x, 'latin1')
Albums = pd.read_sql_query("SELECT * FROM albums", cnx)
Artists = pd.read_sql_query("SELECT * FROM artists", cnx)
R_artist_genre = pd.read_sql_query("SELECT * FROM r_artist_genre", cnx)
R_albums_tracks = pd.read_sql_query("SELECT * FROM r_albums_tracks", cnx)
R_albums_artists = pd.read_sql_query("SELECT * FROM r_albums_artists", cnx)
Audio_features = pd.read_sql_query("SELECT * FROM audio_features", cnx)
Tracks = pd.read_sql_query("SELECT * FROM tracks", cnx)

# Close the SQL connection
cnx.close()

How many rows does each table have?

In [3]:
print(
    len(Albums),
    len(Artists),
    len(R_artist_genre),
    len(R_albums_tracks),
    len(R_albums_artists),
    len(Audio_features),
    len(Tracks)
)

4820754 1066031 487386 9900173 921486 8740043 8741672


### Preparing the Albums dataframe

In [4]:
# See what columns and datatypes we're dealing with
Albums[Albums['id']=='5dGWwsZ9iB2Xc3UKR0gif2']

Unnamed: 0,id,name,album_group,album_type,release_date,popularity
158350,5dGWwsZ9iB2Xc3UKR0gif2,Justice,,album,1616112000000,100


In [5]:
Albums.dtypes

id              object
name            object
album_group     object
album_type      object
release_date     int64
popularity       int64
dtype: object

In [6]:
Albums.isna().sum()

id              0
name            0
album_group     0
album_type      0
release_date    0
popularity      0
dtype: int64

In [7]:
# Keep only album types (no singles, etc.)
Albums = Albums[Albums['album_type'] == 'album']
Albums.shape

(3612835, 6)

In [8]:
# What is album_group and can we get rid of it?
Albums['album_group'].value_counts(ascending=False).rename_axis('album_group').reset_index(name='count')

# The only unique values in album_group are blank entries, so this column can be dropped.

Unnamed: 0,album_group,count
0,,3612835


In [9]:
# Drop album popularity, album_type and album_group. 
# Rename id and name.
Albums.drop(['popularity','album_group','album_type'], axis=1, inplace=True)
Albums.rename(columns={'id':'album_id','name':'album_name'}, inplace=True)
Albums[Albums['album_id']=='5dGWwsZ9iB2Xc3UKR0gif2']

Unnamed: 0,album_id,album_name,release_date
158350,5dGWwsZ9iB2Xc3UKR0gif2,Justice,1616112000000


In [10]:
# Confirm column drops.
Albums.shape

(3612835, 3)

#### Tangent: null release dates

On the first attempt to convert the release_date column, the to_datetime function raised OutOfBound errors when using unit='ms'. The list below, nats, are the album_id's that came up as NaT when to_datetime(errors='coerce'). These will be used to drill down and see what values are returning NaT and if these rows can be dropped.

In [11]:
# Fetched this list of album_ids that came up as NaT when converting ms to date via to_datetime on first try. 
nats = ['63unDXoEwu8EDdK7OyycPQ', '1PfOqxDAaRE9FgM3JFSNi9', '4BZ3fjaLDnugPWmijavGh4', '0OSEpze4gRdRheCQb2tdnT', '5qy4dDcFMNkNTeuGHx7zoz',
 '0SmiOYBHHcxQ2KpPz5APdH', '41pEBMLtapOU6coMypN96S', '1SC42SOTNZOIq2iQ6dDBXO', '0dclVuD1w2eKTnMoEijxIc', '7BGk6YTdIFZiWzwtlkPGtU',
 '5ZTjSfJusJX5NmR4nCBDWr', '55GvEdVhcqtXphA7yqj2LL', '2sJq3bJwdMfDk9Ek2pnmbj', '6AANPXUJhBYA34MaGKd1iO', '4Uf5mmlLZ0ONOxByWsMxVN',
 '6ueVQWgN4debvl6cnVBnKF', '5KwayTbJpDgiFJOvGncsld', '4QYIKLFws9hq2ml6IYRd2D', '4F6ifn8pwK2bQ5S0WgrdM9', '06mWm3nCaNyACElnHE3KHj',
 '2eqnYkYvtWKOk1HO97m2ux', '0BuzeeynKUCIf8EkwlWigw', '0lcZAR1pbnpFJyju1BKG8A', '6JH0MqTu9Jczm7m3tK3XfT', '6bwMFSTRZLXxzgT2ONWTkH',
 '5Xb7o38gPfdPNt29aZdA82', '5gzMLjYxNXocCV4U1JwT3J', '0YUF0VkGtaBhaby01YOTwn', '5mQS4O3BUHKfVWcM1utxyd', '6Hp6QqterTD6IkcNiIXmC9',
 '1MPS4Ofaxxec57hKv5rDiV', '1GJNf1s72ibGmeTC8TitLU', '60xVw7fFlzN802dCboS9Fe', '3kzvwLED2Dx7GW9BOCMNqs', '5ZR3wCUhSc0Bbb6V4dRGfT',
 '6dLozgIdB1MIlndWxmfDJJ', '34CSyWrRJ1ec2FNkeQmp2y', '09F4nf8iQJP3r3QZxNKEsh', '0YJWLbpl6QglAj5BH7p0jN', '2FQhcClwpVbfBMHYcE2m3M',
 '3rJbElD53kTUZigRv4lH80', '0PCaLFCIITetsmuepBmgkZ', '19W2i8u4waA7ZFWuAOLgD8', '7FbVQLsmXxPnuykb9aX3ox', '70K7Q5kbWIKOWuJh9tqmuy',
 '01Yr7yJaPBtXHxWA3tt6W7', '1d3MHof1R92EIIICxkbTUe', '0WEaVS1elgw9ZyZ2KganFX', '76qlsmlAtTUEJMbubNu8KJ', '1JvOXKmtRT788naHmohJYE',
 '76fWk2i2FjZNmNAELcQj0r', '48bp8F6eXvWyRbwcftyIy9', '4nHIRGcWs0g6TNdmCvmxdk', '4tgrpJygWQGOjLM6CTDXSE', '1xt4AcOEYQBIYL988rSvDF', 
 '1qXA0QwhDcldAZ6iYmEAVK', '2pFDpkH5saykRdLsuRTScK', '4YmHvQ0BQXZKeQQFAMcnz1', '0KtjJIEVpBaCYbTCF9o4aL', 
 '1sjQZGg2rfTRYd96cSqDLI', '4EZqpwUvLyNRa7EVgu3wG0', '2HahlYh4MQV8AN9s1C2qa3', '0Cq4vxCY8tdAu1APjtcv7Y', '68AYUbgkfV1EtoR9USiR8A', '1PUKw9RUkR00Z4VjUYj20M', '709aJBwQqGTO6x1KdTF9c4',
 '64qH1SlvRj8kMyBB2Msk97', '58YsankVf0WD6T3gTzOVj6', '45HmXw64xSXEyoP6Fn6ITi', '7nef1k8XDyx7fZAXw7lLJR',
 '1kiUAV9LDw7spIirvBraRp', '0tqtYGZI6KLTSi6ux1DPnU', '6OmCOzIqyYHWt8BlnJHeFK', '78v7GWjRO8DEONPZ807Ak8',
 '4Ye6LKFbvzBfBTdBRLx9HK', '7sxxL9XTJUEwi5fnkkuZxu', '6wt6Ta2PPojzNVaOsu3H5L', '3aO3GbNM44Zz5IeiB9GWN5', '1I3ou8ROUNbzQtNkKKl6ms', '2ajnhw8rVGZcmYJFDFvZx7',
 '33Iz37Y2rQAQvIXJAzoGbi', '1y69Vvr7sJ4eceuFmBChto', '0nL9ZHe5pkrM6MRmSLUme5', '1IDPvr891AJCq1Ovjbv6FM', '6btGgxOaF1bJDleb94Cn7K', '3e4cMfOEf8kMpO2H7nJuqs',
 '1XpX0tQz6iG4n0nOaJcqkh', '5WhAvShZ4cdO3qvW3HTxmg', '1uy0l5dYmBL4j00Pvw2JHW', '6fgfw1WrwN0L0wAgaUZ28D',
 '0D97tWAlL2uP7jtnxHQSoD', '22eru1PSNbwchaKLAhvbRY', '5I4PV0eZ1OJqmraNcfX7vw', '1TVxOSAEpRhfKUOJ99T05u',
 '6S5R9D12jArY0529nbmB9k', '2i8By0hzBzrz1E6d29dIGt', '0zJhfP1GMX6eJmX9cqGFCx', '2RwlWIdADtuYx3Z4MmUaxw',
 '6NpUPJCbtcMsAu5o6g98lJ', '6OLGJhqrx21G7M4CSF9QyN', '1LS4LZk78kuxXGkFHMbobK', '20z7Pj9IH27MblTdY8V8kh',
 '7BH9NxLaqaDgOS0im7igIK', '0WgKPHEusRvJRgFYlkxhyW', '6Ro8Q90tqmV5BrZt8QdKr4', '1jddJxfP3CutD6OvK6jwSf',
 '1WZFg6HbGo4FCo2PNq2oSN', '3dtkhIZGdXr9BRT5731YYC', '6HmpulBVlgKBPdlHUpcc1A', '4KVyXjlue24CMftqA0zBLu',
 '238vIlHHy2OWZJvpnyJYSi', '3SioZzRwB3nBetV5kM9mYg', '0jHXfxpWfxib7Y1dFIhmWU', '2HYlNU6U2zqYPJF7boTGxh',
 '0NX2ZNXWXlA2zOzkEE6sM8', '6HKrA26NYCLW0bIJVVE2uu', '6pQXLCpH9bAj9X3mPMbNRe', '2pbOC1tUcxXzJwPx2afJFO',
 '2UcjwljQ3Zj2mkYmJ3XlQk', '0zQrzxnWnPzOkBmLCpFhlL', '2ypI5IrJsWqnr1yCBTtJUk', '1G88bOocuTBCeOg7OoRodd',
 '5e3gxJkLY8lhHYxhyvsT3U', '07DExY4oVtBSKbKuQlwvMy', '7C54sONTDef2Mg1ZrCK8Q3', '1vHYe43mmPA07Yn25ut5LB',
 '5LpwBrQi1KECw4ozAduc9K', '2itCnKbYsZN5ceN2161ylT', '4RVeBSK9g8T93XjeLMT7e8', '0mV682A6rgxJRwolZuxf4s',
 '2cViZVn8J9tpKVVhjYrIlz', '5Ht11Ie67AjnhR7EVD2VlX', '28WSXIxwCo4HnBw4EMM6zK', '26AkkdbLeIw9lyYlmf4jPO',
 '13CbIRyYQTvYJS2IXz6b95', '0bosldZ429fWqibuBqRvXq', '5FldfvxBKpi6tKVuPfX34x', '0U8uAMRVaT80P5silJnoQa',
 '3289uSGqh0KXHHu6NXNZE3', '6tAhoWZx9S7fXdwLcHf8Ny', '25Gnr7p2AqtvlizFgLJYre', '34pNsdzFZkPx9Wlclp5g4y',
 '5ZOwddFKN36wUuX4uQ1gb1', '5djMmALRlGYBLJspEkB4rq', '0dcHxEA1DOhGBmlw67CiKx', '69ffhg7xlu2eMmEVYmuVI6', '46W8zd6m5Gm8VTbU43Pbju']

In [12]:
df_nats = Albums[Albums['album_id'].isin(nats)]
df_nats

Unnamed: 0,album_id,album_name,release_date
61959,63unDXoEwu8EDdK7OyycPQ,Life is ...,-62135769600000
139926,1PfOqxDAaRE9FgM3JFSNi9,A MENOS CUARTO,-62135769600000
163510,4BZ3fjaLDnugPWmijavGh4,2017 Midwest Clinic: Clovis North Wind Ensembl...,-62135769600000
188743,0OSEpze4gRdRheCQb2tdnT,2017 Midwest Clinic: T.A. Howard Middle School...,-62135769600000
188758,5qy4dDcFMNkNTeuGHx7zoz,2017 Midwest Clinic: Wylie High School Wind Sy...,-62135769600000
...,...,...,...
2115360,5ZOwddFKN36wUuX4uQ1gb1,Fantasma,-62135769600000
2115361,5djMmALRlGYBLJspEkB4rq,yo...,-62135769600000
2115362,0dcHxEA1DOhGBmlw67CiKx,Historia,-62135769600000
2115363,69ffhg7xlu2eMmEVYmuVI6,Directo al alma - al2,-62135769600000


In [13]:
df_nats[df_nats['release_date'] != -62135769600000]

Unnamed: 0,album_id,album_name,release_date
1439504,0zJhfP1GMX6eJmX9cqGFCx,Twisted Reality,-61733491200000
1439505,2RwlWIdADtuYx3Z4MmUaxw,Mastermind Madness,-61733491200000
1439506,6NpUPJCbtcMsAu5o6g98lJ,Pressure In a Vortex,-61733491200000
1439507,6OLGJhqrx21G7M4CSF9QyN,Wake Up,-61733491200000
1439508,1LS4LZk78kuxXGkFHMbobK,Cybernetic Revolt,-61733491200000
1439509,20z7Pj9IH27MblTdY8V8kh,Space Race,-61733491200000
1439513,7BH9NxLaqaDgOS0im7igIK,Artificial Megafactory,-61733491200000
1439514,0WgKPHEusRvJRgFYlkxhyW,Deadly Maneuvers,-61733491200000
1439515,6Ro8Q90tqmV5BrZt8QdKr4,Crawling Under the Surface,-61733491200000
1439516,1jddJxfP3CutD6OvK6jwSf,Finish the Job,-61733491200000


The NaT values correspond to dates that are likely erroneous or omitted the year of release, which won't be useful anyway and can be dropped.

#### Convert release_date to date then drop all the NaT songs

In [14]:
Albums['release_date'] = pd.to_datetime(Albums.release_date, unit='ms', errors='coerce')
Albums.head()

Unnamed: 0,album_id,album_name,release_date
0,2jKoVlU7VAmExKJ1Jh3w9P,"Alkaholik (feat. Erik Sermon, J Ro & Tash)",2000-04-02
2,6YjKAkDYmlasMqYw73iB0w,Bitch Please II,2000-05-23
4,3UOuBNEin5peSRqdzvlnWM,Still D.R.E.,1999-11-16
5,2g8HN35AnVGIk7B8yMucww,Big Poppa - 2005 Remaster,1994-09-13
6,7iL6o9tox1zgHpKUfh9vuC,In Da Club,2003-02-06


In [15]:
Albums.dtypes

album_id                object
album_name              object
release_date    datetime64[ns]
dtype: object

In [16]:
# new column for seasons (solution borrowed from https://stackoverflow.com/questions/60285557/extract-seasons-from-datetime-pandas)
    # Sping start March 20
    # Summer start June 20
    # Fall start Sept 22
    # Winter start Dec 20
date_offset = (Albums.release_date.dt.month*100 + Albums.release_date.dt.day - 320)%1300

Albums['release_season'] = pd.cut(date_offset, [0, 300, 602, 900, 1300], 
                                     labels=['spring', 'summer', 'autumn', 'winter'],
                                     include_lowest = True)
Albums

Unnamed: 0,album_id,album_name,release_date,release_season
0,2jKoVlU7VAmExKJ1Jh3w9P,"Alkaholik (feat. Erik Sermon, J Ro & Tash)",2000-04-02,spring
2,6YjKAkDYmlasMqYw73iB0w,Bitch Please II,2000-05-23,spring
4,3UOuBNEin5peSRqdzvlnWM,Still D.R.E.,1999-11-16,autumn
5,2g8HN35AnVGIk7B8yMucww,Big Poppa - 2005 Remaster,1994-09-13,summer
6,7iL6o9tox1zgHpKUfh9vuC,In Da Club,2003-02-06,winter
...,...,...,...,...
4820746,7tdpEWykBDTGHraFcc7y5K,Tarpuricusum Sarata - Captain Planet Remix,2015-02-23,winter
4820747,5pU9azu0VQIUk2nTUVxwxY,Cumbia del Olvido,2015-10-16,autumn
4820750,6ClotRUApTN25L3OnkzQjb,Acknowledge the Power (Instrumental),2017-12-18,autumn
4820752,08ITn8qaxbnEyk3N08IC47,Lunarion,2018-11-12,autumn


In [17]:
Albums.release_date.isnull().sum()

147

In [18]:
Albums.shape

(3612835, 4)

In [19]:
Albums.dropna(axis=0, subset=['release_date'], inplace=True)

In [20]:
Albums.shape

(3612688, 4)

In [21]:
Albums.isna().sum()

album_id          0
album_name        0
release_date      0
release_season    0
dtype: int64

In [22]:
Albums.head(2)

Unnamed: 0,album_id,album_name,release_date,release_season
0,2jKoVlU7VAmExKJ1Jh3w9P,"Alkaholik (feat. Erik Sermon, J Ro & Tash)",2000-04-02,spring
2,6YjKAkDYmlasMqYw73iB0w,Bitch Please II,2000-05-23,spring


### Preparing the Artists dataframe

In [23]:
Artists.head(2)

Unnamed: 0,name,id,popularity,followers
0,Xzibit,4tujQJicOnuZRLiBFdp3Ou,69,1193665
1,Erick Sermon,2VX0o9LDIVmKIgpnwdJpOJ,54,142007


In [24]:
Artists.dtypes

name          object
id            object
popularity     int64
followers      int64
dtype: object

In [25]:
Artists.isna().sum()

name          0
id            0
popularity    0
followers     0
dtype: int64

In [26]:
# Drop artist popularity, we're only interested in track popularity.
Artists.drop(['popularity'], axis=1, inplace=True)

In [27]:
# Rename columns
Artists.rename(columns={'id':'artist_id','name':'artist_name'}, inplace=True)

In [28]:
Artists.head(2)

Unnamed: 0,artist_name,artist_id,followers
0,Xzibit,4tujQJicOnuZRLiBFdp3Ou,1193665
1,Erick Sermon,2VX0o9LDIVmKIgpnwdJpOJ,142007


### Preparing R_artist_genre dataframe

In [29]:
R_artist_genre.head(2)

Unnamed: 0,genre_id,artist_id
0,detroit hip hop,4tujQJicOnuZRLiBFdp3Ou
1,g funk,4tujQJicOnuZRLiBFdp3Ou


In [30]:
R_artist_genre.dtypes

genre_id     object
artist_id    object
dtype: object

In [31]:
# Rename genre_id to genre
R_artist_genre.rename(columns={'genre_id':'genre'}, inplace=True)

Tangent: Searching for generic genres

In [32]:
# genre_id when filtered as below, will return based on the entire search (i.e. searching 'funk' will not catch 'g funk')
R_artist_genre[R_artist_genre['genre'] == "g funk"]

Unnamed: 0,genre,artist_id
1,g funk,4tujQJicOnuZRLiBFdp3Ou
21,g funk,6DPYiyq5kWVQS4RGwxzPC7
26,g funk,7hJcb9fa4alzcOq3EaNPoG
32,g funk,1Oa0bMld0A3u5OTYfMzp5h
111,g funk,3zNM2tRfTX6LI1lN2PlrTt
...,...,...
473873,g funk,6kEwu0twnfzngQt3Gy8taP
475932,g funk,1Wfh3Tz3xOB4JjqTk2zT5K
478325,g funk,7vPLHiyrVGx5OmaSPqJNGw
478791,g funk,3drEaBmm4UexsiDfz5AzXV


In [76]:
# Code below catches more types, is that more useful for us?
R_artist_genre[R_artist_genre['genre'].str.contains('reggaeton')]

Unnamed: 0,genre,artist_id
12679,reggaeton,2jSGzJw0ebJLu7OLVSOcBP
12960,reggaeton,3SUT1jjM5hzZj9TLfLZGIP
13588,reggaeton,4VsbQvC84B8Z3IsBY1HLQy
13593,reggaeton,7gR53ad9JjCweCv4f9MWEX
14202,reggaeton,5bv5RplEOwdCvhq0EILh9E
...,...,...
486616,pop reggaeton,1r3tpVQ08jFAA2P4xzV5o4
486617,reggaeton,0OROVBEZCocg0FcgJpyBse
487069,reggaeton flow,7f02bxFbZIOVdSbYRNYvLT
487072,reggaeton,2QUvtLq6oQaX0LNsYY2fas


In [75]:
R_artist_genre[R_artist_genre['genre']=='worship']

Unnamed: 0,genre,artist_id
9284,worship,3vcFXwLEUdfWMu7gTQKyot
9671,worship,7ua35iM0VjwfuHopuQDScm
12430,worship,6APm8EjxOHSYM5B4i3vT3q
12614,worship,5VX8hxrcfJWwaTLiqGUHG3
12832,worship,5wpEBloInversG3zp3CVAk
...,...,...
478438,worship,0S4zLQlUn398bugWd6Inmq
478581,worship,3kBjtO0TFSKuP98rLaaZkX
479075,worship,4yMGs8CtlMVF7RV2XU539m
480846,worship,636Yuw9EBCEsZDAyWJibEJ


In [83]:
R_artist_genre['genre'].value_counts(ascending=False).rename_axis('genre').reset_index(name='count').head(50)

Unnamed: 0,genre,count
0,dance pop,591
1,pop,585
2,rock,579
3,electro house,572
4,latin,515
5,classical performance,508
6,hip hop,498
7,edm,492
8,tropical,490
9,pop rap,490


In [80]:
genre_list = ['rock', 'house', 'latin', 'soul', 'pop', 'jazz', 
             'folk', 'funk', 'metal', 'rap', 'hip hop', 'metal', 
             'trap', 'country', 'k-pop','r&b','edm']

In [81]:
# binning R_artist_genre to specific genres like in genre_list doesn't give many tracks back...
binned = R_artist_genre[R_artist_genre['genre'].isin(genre_list)]

In [82]:
binned['genre'].value_counts(ascending=False).rename_axis('genre').reset_index(name='count')

Unnamed: 0,genre,count
0,pop,585
1,rock,579
2,latin,515
3,hip hop,498
4,edm,492
5,rap,466
6,funk,403
7,trap,375
8,k-pop,374
9,folk,358


In [44]:
reduced = []
for genre in genre_list:
   reduced.append(R_artist_genre[R_artist_genre['genre'].str.contains(genre)])

reduced = pd.concat(reduced)

In [61]:
reduced

Unnamed: 0,genre,artist_id
481,classic soundtrack,6AmG7SNdlV9boPZ7dFVnLY
536,french soundtrack,4e0rPkkLDkX1DtXml9teNU
732,classic soundtrack,3llWEEOW2xNfiKsXXWjrwl
779,classic soundtrack,2e7myQwGa7Vyp7sss8brVC
1026,classic soundtrack,1oQg1ritjU9ESAioz8ufYW
...,...,...
482075,french opera,6jbhsJATFJF8mIjJE2Im0h
483245,italian opera,5kdTUTQWRRbRIfAMjHsRrm
485018,german opera,6xt9i6cvoL0DbNTcwPPzNG
485347,opera,1n9yPqM0i14Ny5KLZ3QJfZ


In [62]:
# however searching for substrings gives back too many alternative genres that may not be constructive to 
reduced['genre'].value_counts(ascending=False).rename_axis('genre').reset_index(name='count').head(50)

Unnamed: 0,genre,count
0,dance pop,1182
1,pop rap,980
2,indie rock,976
3,pop rock,930
4,indie folk,920
5,country rock,904
6,modern alternative rock,862
7,indie poptimism,822
8,pop dance,820
9,indie pop,818


### Preparing R_albums_tracks dataframe

In [None]:
R_albums_tracks.head(2)

### Preparing R_albums_artists dataframe

In [None]:
R_albums_artists.head(2)

### Preparing Audio_features dataframe

In [None]:
Audio_features.head(2)

In [None]:
# Drop anaylsis_url 
Audio_features.drop(['analysis_url'], axis=1, inplace=True)

In [None]:
# Rename id to track_id
Audio_features.rename(columns={'id':'track_id'}, inplace=True)

In [None]:
Audio_features.head()

In [None]:
# Transform duration_ms to duration_mins
# credit to drop milliseconds from datetime format https://stackoverflow.com/questions/31487732/simple-way-to-drop-milliseconds-from-python-datetime-datetime-object

converted = pd.to_datetime(Audio_features['duration'], unit='ms')
Audio_features['duration_mins'] = pd.Series([val.time().replace(microsecond=0) for val in converted])

Audio_features.head()

### Preparing Tracks dataframe

In [None]:
Tracks.head(2)

In [None]:
# Rename columns, drop disc_number, explicit, duration (duplicate with audiofeatures), audio_feature_id, preview_url, track_number, is_playable
Tracks.rename(columns={'id':'track_id','name':'track_name'}, inplace=True)
Tracks.drop(['disc_number','explicit','duration','audio_feature_id','preview_url','track_number','is_playable'], axis=1, inplace=True)
Tracks.head(2)

### Writing dataframes to database

Data to be written into local database then exported as CSV after necessary joins before being hosted on an AWS S3 bucket. From there, the data can be fed into the machine learning model and dashboard. 

In [None]:
# Create connection to database 
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/spotify_db"

# instantiate engine
engine = create_engine(db_string)

Albums.to_sql(name='albums', con=engine, if_exists='replace', index=False)
Artists.to_sql(name='artists', con=engine, if_exists='replace', index=False)
Audio_features.to_sql(name='audio_features', con=engine, if_exists='replace', index=False)
R_artist_genre.to_sql(name='r_artist_genre', con=engine, if_exists='replace', index=False)
R_albums_tracks.to_sql(name='r_albums_tracks', con=engine, if_exists='replace', index=False)
R_albums_artists.to_sql(name='r_albums_artists', con=engine, if_exists='replace', index=False)
Tracks.to_sql(name='tracks', con=engine, if_exists='replace', index=False)