In [62]:
# Imports
import datetime as dt
import sqlite3
from urllib.parse import quote

import pandas as pd
import psycopg2
from sqlalchemy import create_engine

from config import db_password

This notebook cleans and transforms data from the Kaggle dataset: "8+ M. Spotify Tracks, Genre, Audio Features".

https://www.kaggle.com/datasets/maltegrosse/8-m-spotify-tracks-genre-audio-features

In [2]:
# Create SQL connection to our sqlite database and load every table into its
# own DataFrame.
cnx = sqlite3.connect('spotify.sqlite')
try:
    # Decode TEXT values as latin-1 instead of sqlite3's default decoding
    # (presumably because the dump contains bytes that are not valid UTF-8 --
    # TODO confirm).
    cnx.text_factory = lambda x: str(x, 'latin1')
    Albums = pd.read_sql_query("SELECT * FROM albums", cnx)
    Artists = pd.read_sql_query("SELECT * FROM artists", cnx)
    R_artist_genre = pd.read_sql_query("SELECT * FROM r_artist_genre", cnx)
    R_albums_tracks = pd.read_sql_query("SELECT * FROM r_albums_tracks", cnx)
    R_albums_artists = pd.read_sql_query("SELECT * FROM r_albums_artists", cnx)
    Audio_features = pd.read_sql_query("SELECT * FROM audio_features", cnx)
    Tracks = pd.read_sql_query("SELECT * FROM tracks", cnx)
finally:
    # Close the SQL connection even if one of the reads above fails, so the
    # database file handle is never leaked.
    cnx.close()

How many rows does each table have?

In [3]:
# Row count of each loaded table, printed on one line in load order.
print(*(
    len(frame)
    for frame in (
        Albums,
        Artists,
        R_artist_genre,
        R_albums_tracks,
        R_albums_artists,
        Audio_features,
        Tracks,
    )
))

4820754 1066031 487386 9900173 921486 8740043 8741672


### Preparing the Albums dataframe

In [4]:
# Look at one known album row to see which columns and value formats we're dealing with.
Albums.loc[Albums['id'] == '5dGWwsZ9iB2Xc3UKR0gif2']

Unnamed: 0,id,name,album_group,album_type,release_date,popularity
158350,5dGWwsZ9iB2Xc3UKR0gif2,Justice,,album,1616112000000,100


In [5]:
# List the dtype of each Albums column.
Albums.dtypes

id              object
name            object
album_group     object
album_type      object
release_date     int64
popularity       int64
dtype: object

In [6]:
# Count missing values per Albums column.
Albums.isna().sum()

id              0
name            0
album_group     0
album_type      0
release_date    0
popularity      0
dtype: int64

In [7]:
# Keep only album types (no singles, etc.).
# .copy() detaches the filtered rows from the original frame, so the in-place
# drops/renames and column assignments applied to Albums further down work on
# an independent DataFrame (avoids SettingWithCopyWarning).
Albums = Albums[Albums['album_type'] == 'album'].copy()
Albums.shape

(3612835, 6)

In [8]:
# What is album_group and can we get rid of it?
# Finding: the only value present in album_group is a blank entry, so this
# column can be dropped.
(
    Albums['album_group']
    .value_counts(ascending=False)
    .rename_axis('album_group')
    .reset_index(name='count')
)

Unnamed: 0,album_group,count
0,,3612835


In [9]:
# Remove album popularity, album_type and album_group, then give the generic
# id/name columns album-specific names.
Albums = (
    Albums
    .drop(columns=['popularity', 'album_group', 'album_type'])
    .rename(columns={'id': 'album_id', 'name': 'album_name'})
)
Albums.loc[Albums['album_id'] == '5dGWwsZ9iB2Xc3UKR0gif2']

Unnamed: 0,album_id,album_name,release_date
158350,5dGWwsZ9iB2Xc3UKR0gif2,Justice,1616112000000


In [10]:
# Confirm column drops: the column count in the shape should have gone down by three.
Albums.shape

(3612835, 3)

#### Tangent: null release dates

On the first attempt to convert the release_date column, `pd.to_datetime` raised out-of-bounds datetime errors when using unit='ms'. The list below, `nats`, contains the album_ids whose release_date came back as NaT when `to_datetime` was re-run with errors='coerce'. It is used to drill down into which raw values produce NaT and to decide whether those rows can be dropped.

In [11]:
# Album IDs whose release_date came back as NaT on the first attempt to convert
# milliseconds to dates with to_datetime. The list was captured from that
# earlier run and is hard-coded here so the offending rows can be looked up by album_id.
nats = ['63unDXoEwu8EDdK7OyycPQ', '1PfOqxDAaRE9FgM3JFSNi9', '4BZ3fjaLDnugPWmijavGh4', '0OSEpze4gRdRheCQb2tdnT', '5qy4dDcFMNkNTeuGHx7zoz',
 '0SmiOYBHHcxQ2KpPz5APdH', '41pEBMLtapOU6coMypN96S', '1SC42SOTNZOIq2iQ6dDBXO', '0dclVuD1w2eKTnMoEijxIc', '7BGk6YTdIFZiWzwtlkPGtU',
 '5ZTjSfJusJX5NmR4nCBDWr', '55GvEdVhcqtXphA7yqj2LL', '2sJq3bJwdMfDk9Ek2pnmbj', '6AANPXUJhBYA34MaGKd1iO', '4Uf5mmlLZ0ONOxByWsMxVN',
 '6ueVQWgN4debvl6cnVBnKF', '5KwayTbJpDgiFJOvGncsld', '4QYIKLFws9hq2ml6IYRd2D', '4F6ifn8pwK2bQ5S0WgrdM9', '06mWm3nCaNyACElnHE3KHj',
 '2eqnYkYvtWKOk1HO97m2ux', '0BuzeeynKUCIf8EkwlWigw', '0lcZAR1pbnpFJyju1BKG8A', '6JH0MqTu9Jczm7m3tK3XfT', '6bwMFSTRZLXxzgT2ONWTkH',
 '5Xb7o38gPfdPNt29aZdA82', '5gzMLjYxNXocCV4U1JwT3J', '0YUF0VkGtaBhaby01YOTwn', '5mQS4O3BUHKfVWcM1utxyd', '6Hp6QqterTD6IkcNiIXmC9',
 '1MPS4Ofaxxec57hKv5rDiV', '1GJNf1s72ibGmeTC8TitLU', '60xVw7fFlzN802dCboS9Fe', '3kzvwLED2Dx7GW9BOCMNqs', '5ZR3wCUhSc0Bbb6V4dRGfT',
 '6dLozgIdB1MIlndWxmfDJJ', '34CSyWrRJ1ec2FNkeQmp2y', '09F4nf8iQJP3r3QZxNKEsh', '0YJWLbpl6QglAj5BH7p0jN', '2FQhcClwpVbfBMHYcE2m3M',
 '3rJbElD53kTUZigRv4lH80', '0PCaLFCIITetsmuepBmgkZ', '19W2i8u4waA7ZFWuAOLgD8', '7FbVQLsmXxPnuykb9aX3ox', '70K7Q5kbWIKOWuJh9tqmuy',
 '01Yr7yJaPBtXHxWA3tt6W7', '1d3MHof1R92EIIICxkbTUe', '0WEaVS1elgw9ZyZ2KganFX', '76qlsmlAtTUEJMbubNu8KJ', '1JvOXKmtRT788naHmohJYE',
 '76fWk2i2FjZNmNAELcQj0r', '48bp8F6eXvWyRbwcftyIy9', '4nHIRGcWs0g6TNdmCvmxdk', '4tgrpJygWQGOjLM6CTDXSE', '1xt4AcOEYQBIYL988rSvDF', 
 '1qXA0QwhDcldAZ6iYmEAVK', '2pFDpkH5saykRdLsuRTScK', '4YmHvQ0BQXZKeQQFAMcnz1', '0KtjJIEVpBaCYbTCF9o4aL', 
 '1sjQZGg2rfTRYd96cSqDLI', '4EZqpwUvLyNRa7EVgu3wG0', '2HahlYh4MQV8AN9s1C2qa3', '0Cq4vxCY8tdAu1APjtcv7Y', '68AYUbgkfV1EtoR9USiR8A', '1PUKw9RUkR00Z4VjUYj20M', '709aJBwQqGTO6x1KdTF9c4',
 '64qH1SlvRj8kMyBB2Msk97', '58YsankVf0WD6T3gTzOVj6', '45HmXw64xSXEyoP6Fn6ITi', '7nef1k8XDyx7fZAXw7lLJR',
 '1kiUAV9LDw7spIirvBraRp', '0tqtYGZI6KLTSi6ux1DPnU', '6OmCOzIqyYHWt8BlnJHeFK', '78v7GWjRO8DEONPZ807Ak8',
 '4Ye6LKFbvzBfBTdBRLx9HK', '7sxxL9XTJUEwi5fnkkuZxu', '6wt6Ta2PPojzNVaOsu3H5L', '3aO3GbNM44Zz5IeiB9GWN5', '1I3ou8ROUNbzQtNkKKl6ms', '2ajnhw8rVGZcmYJFDFvZx7',
 '33Iz37Y2rQAQvIXJAzoGbi', '1y69Vvr7sJ4eceuFmBChto', '0nL9ZHe5pkrM6MRmSLUme5', '1IDPvr891AJCq1Ovjbv6FM', '6btGgxOaF1bJDleb94Cn7K', '3e4cMfOEf8kMpO2H7nJuqs',
 '1XpX0tQz6iG4n0nOaJcqkh', '5WhAvShZ4cdO3qvW3HTxmg', '1uy0l5dYmBL4j00Pvw2JHW', '6fgfw1WrwN0L0wAgaUZ28D',
 '0D97tWAlL2uP7jtnxHQSoD', '22eru1PSNbwchaKLAhvbRY', '5I4PV0eZ1OJqmraNcfX7vw', '1TVxOSAEpRhfKUOJ99T05u',
 '6S5R9D12jArY0529nbmB9k', '2i8By0hzBzrz1E6d29dIGt', '0zJhfP1GMX6eJmX9cqGFCx', '2RwlWIdADtuYx3Z4MmUaxw',
 '6NpUPJCbtcMsAu5o6g98lJ', '6OLGJhqrx21G7M4CSF9QyN', '1LS4LZk78kuxXGkFHMbobK', '20z7Pj9IH27MblTdY8V8kh',
 '7BH9NxLaqaDgOS0im7igIK', '0WgKPHEusRvJRgFYlkxhyW', '6Ro8Q90tqmV5BrZt8QdKr4', '1jddJxfP3CutD6OvK6jwSf',
 '1WZFg6HbGo4FCo2PNq2oSN', '3dtkhIZGdXr9BRT5731YYC', '6HmpulBVlgKBPdlHUpcc1A', '4KVyXjlue24CMftqA0zBLu',
 '238vIlHHy2OWZJvpnyJYSi', '3SioZzRwB3nBetV5kM9mYg', '0jHXfxpWfxib7Y1dFIhmWU', '2HYlNU6U2zqYPJF7boTGxh',
 '0NX2ZNXWXlA2zOzkEE6sM8', '6HKrA26NYCLW0bIJVVE2uu', '6pQXLCpH9bAj9X3mPMbNRe', '2pbOC1tUcxXzJwPx2afJFO',
 '2UcjwljQ3Zj2mkYmJ3XlQk', '0zQrzxnWnPzOkBmLCpFhlL', '2ypI5IrJsWqnr1yCBTtJUk', '1G88bOocuTBCeOg7OoRodd',
 '5e3gxJkLY8lhHYxhyvsT3U', '07DExY4oVtBSKbKuQlwvMy', '7C54sONTDef2Mg1ZrCK8Q3', '1vHYe43mmPA07Yn25ut5LB',
 '5LpwBrQi1KECw4ozAduc9K', '2itCnKbYsZN5ceN2161ylT', '4RVeBSK9g8T93XjeLMT7e8', '0mV682A6rgxJRwolZuxf4s',
 '2cViZVn8J9tpKVVhjYrIlz', '5Ht11Ie67AjnhR7EVD2VlX', '28WSXIxwCo4HnBw4EMM6zK', '26AkkdbLeIw9lyYlmf4jPO',
 '13CbIRyYQTvYJS2IXz6b95', '0bosldZ429fWqibuBqRvXq', '5FldfvxBKpi6tKVuPfX34x', '0U8uAMRVaT80P5silJnoQa',
 '3289uSGqh0KXHHu6NXNZE3', '6tAhoWZx9S7fXdwLcHf8Ny', '25Gnr7p2AqtvlizFgLJYre', '34pNsdzFZkPx9Wlclp5g4y',
 '5ZOwddFKN36wUuX4uQ1gb1', '5djMmALRlGYBLJspEkB4rq', '0dcHxEA1DOhGBmlw67CiKx', '69ffhg7xlu2eMmEVYmuVI6', '46W8zd6m5Gm8VTbU43Pbju']

In [22]:
# Pull the albums listed in `nats` so their raw release_date values can be inspected.
in_nats = Albums['album_id'].isin(nats)
df_nats = Albums.loc[in_nats]
df_nats

Unnamed: 0,album_id,album_name,release_date
61959,63unDXoEwu8EDdK7OyycPQ,Life is ...,-62135769600000
139926,1PfOqxDAaRE9FgM3JFSNi9,A MENOS CUARTO,-62135769600000
163510,4BZ3fjaLDnugPWmijavGh4,2017 Midwest Clinic: Clovis North Wind Ensembl...,-62135769600000
188743,0OSEpze4gRdRheCQb2tdnT,2017 Midwest Clinic: T.A. Howard Middle School...,-62135769600000
188758,5qy4dDcFMNkNTeuGHx7zoz,2017 Midwest Clinic: Wylie High School Wind Sy...,-62135769600000
...,...,...,...
2115360,5ZOwddFKN36wUuX4uQ1gb1,Fantasma,-62135769600000
2115361,5djMmALRlGYBLJspEkB4rq,yo...,-62135769600000
2115362,0dcHxEA1DOhGBmlw67CiKx,Historia,-62135769600000
2115363,69ffhg7xlu2eMmEVYmuVI6,Directo al alma - al2,-62135769600000


In [21]:
# Show the rows whose release_date differs from the most common raw value seen above.
df_nats.loc[df_nats['release_date'].ne(-62135769600000)]

Unnamed: 0,album_id,album_name,release_date
1439504,0zJhfP1GMX6eJmX9cqGFCx,Twisted Reality,-61733491200000
1439505,2RwlWIdADtuYx3Z4MmUaxw,Mastermind Madness,-61733491200000
1439506,6NpUPJCbtcMsAu5o6g98lJ,Pressure In a Vortex,-61733491200000
1439507,6OLGJhqrx21G7M4CSF9QyN,Wake Up,-61733491200000
1439508,1LS4LZk78kuxXGkFHMbobK,Cybernetic Revolt,-61733491200000
1439509,20z7Pj9IH27MblTdY8V8kh,Space Race,-61733491200000
1439513,7BH9NxLaqaDgOS0im7igIK,Artificial Megafactory,-61733491200000
1439514,0WgKPHEusRvJRgFYlkxhyW,Deadly Maneuvers,-61733491200000
1439515,6Ro8Q90tqmV5BrZt8QdKr4,Crawling Under the Surface,-61733491200000
1439516,1jddJxfP3CutD6OvK6jwSf,Finish the Job,-61733491200000


The NaT values correspond to release dates that are likely erroneous or that omit the year of release. These rows won't be useful for the analysis and can be dropped.

#### Convert release_date to date, then drop all the albums with a NaT release date

In [28]:
# Read release_date as epoch milliseconds; values that cannot be converted
# are coerced to NaT instead of raising.
Albums['release_date'] = pd.to_datetime(
    Albums['release_date'],
    unit='ms',
    errors='coerce',
)
Albums.head()

Unnamed: 0,album_id,album_name,release_date
0,2jKoVlU7VAmExKJ1Jh3w9P,"Alkaholik (feat. Erik Sermon, J Ro & Tash)",2000-04-02
2,6YjKAkDYmlasMqYw73iB0w,Bitch Please II,2000-05-23
4,3UOuBNEin5peSRqdzvlnWM,Still D.R.E.,1999-11-16
5,2g8HN35AnVGIk7B8yMucww,Big Poppa - 2005 Remaster,1994-09-13
6,7iL6o9tox1zgHpKUfh9vuC,In Da Club,2003-02-06


In [66]:
# Re-check the Albums dtypes after the release_date conversion.
Albums.dtypes

album_id                  object
album_name                object
release_date      datetime64[ns]
release_season          category
dtype: object

In [72]:
# New column for seasons (solution adapted from https://stackoverflow.com/questions/60285557/extract-seasons-from-datetime-pandas)
#     Spring starts March 20
#     Summer starts June 20
#     Fall starts Sept 22
#     Winter starts Dec 20
# Encode each date as month*100 + day (e.g. June 20 -> 620), shift so that
# March 20 maps to 0, and wrap the dates before March 20 (Jan 1 - Mar 19) to
# the top of the scale with the modulo.
date_offset = (Albums['release_date'].dt.month * 100 + Albums['release_date'].dt.day - 320) % 1300

# right=False makes every bin include its left edge and exclude its right one:
# [0, 300) spring, [300, 602) summer, [602, 900) autumn, [900, 1300) winter.
# That way each season begins exactly on the start date listed above; with
# right-closed bins June 20, Sept 22 and Dec 20 were assigned to the season
# that had just ended. Rows with a NaT release_date get a missing season.
Albums['release_season'] = pd.cut(date_offset, [0, 300, 602, 900, 1300],
                                  labels=['spring', 'summer', 'autumn', 'winter'],
                                  right=False)
Albums

Unnamed: 0,album_id,album_name,release_date,release_season
0,2jKoVlU7VAmExKJ1Jh3w9P,"Alkaholik (feat. Erik Sermon, J Ro & Tash)",2000-04-02,spring
2,6YjKAkDYmlasMqYw73iB0w,Bitch Please II,2000-05-23,spring
4,3UOuBNEin5peSRqdzvlnWM,Still D.R.E.,1999-11-16,autumn
5,2g8HN35AnVGIk7B8yMucww,Big Poppa - 2005 Remaster,1994-09-13,summer
6,7iL6o9tox1zgHpKUfh9vuC,In Da Club,2003-02-06,winter
...,...,...,...,...
4820746,7tdpEWykBDTGHraFcc7y5K,Tarpuricusum Sarata - Captain Planet Remix,2015-02-23,winter
4820747,5pU9azu0VQIUk2nTUVxwxY,Cumbia del Olvido,2015-10-16,autumn
4820750,6ClotRUApTN25L3OnkzQjb,Acknowledge the Power (Instrumental),2017-12-18,autumn
4820752,08ITn8qaxbnEyk3N08IC47,Lunarion,2018-11-12,autumn


In [29]:
# Count the release dates that are null (NaT).
Albums.release_date.isnull().sum()

147

In [31]:
# Row/column count before the rows with a null release_date are dropped.
Albums.shape

(3612835, 3)

In [35]:
# Discard every album row whose release_date is missing.
Albums = Albums.dropna(axis='index', subset=['release_date'])

In [36]:
# Row/column count after dropping the rows with a null release_date.
Albums.shape

(3612688, 3)

In [40]:
# Re-count missing values per column after the drop.
Albums.isna().sum()

album_id        0
album_name      0
release_date    0
dtype: int64

In [73]:
# Preview the first two rows of the prepared Albums frame.
Albums.head(2)

Unnamed: 0,album_id,album_name,release_date,release_season
0,2jKoVlU7VAmExKJ1Jh3w9P,"Alkaholik (feat. Erik Sermon, J Ro & Tash)",2000-04-02,spring
2,6YjKAkDYmlasMqYw73iB0w,Bitch Please II,2000-05-23,spring


### Preparing the Artists dataframe

In [37]:
# Preview the first two rows of Artists.
Artists.head(2)

Unnamed: 0,name,id,popularity,followers
0,Xzibit,4tujQJicOnuZRLiBFdp3Ou,69,1193665
1,Erick Sermon,2VX0o9LDIVmKIgpnwdJpOJ,54,142007


In [38]:
# List the dtype of each Artists column.
Artists.dtypes

name          object
id            object
popularity     int64
followers      int64
dtype: object

In [39]:
# Count missing values per Artists column.
Artists.isna().sum()

name          0
id            0
popularity    0
followers     0
dtype: int64

In [74]:
# Artist popularity is not needed -- we're only interested in track popularity.
Artists = Artists.drop(columns=['popularity'])

In [41]:
# Give the generic id/name columns artist-specific names.
artist_column_names = {'id': 'artist_id', 'name': 'artist_name'}
Artists = Artists.rename(columns=artist_column_names)

In [75]:
# Preview the first two rows of Artists after the drop and rename.
Artists.head(2)

Unnamed: 0,artist_name,artist_id,followers
0,Xzibit,4tujQJicOnuZRLiBFdp3Ou,1193665
1,Erick Sermon,2VX0o9LDIVmKIgpnwdJpOJ,142007


### Preparing R_artist_genre dataframe

In [43]:
# Preview the first two rows of the artist-to-genre table.
R_artist_genre.head(2)

Unnamed: 0,genre_id,artist_id
0,detroit hip hop,4tujQJicOnuZRLiBFdp3Ou
1,g funk,4tujQJicOnuZRLiBFdp3Ou


In [44]:
# List the dtype of each R_artist_genre column.
R_artist_genre.dtypes

genre_id     object
artist_id    object
dtype: object

In [45]:
# The genre_id column is renamed to genre.
R_artist_genre = R_artist_genre.rename(columns={'genre_id': 'genre'})

#### Tangent: Searching for generic genres

In [46]:
# An equality filter matches the whole genre string only
# (i.e. filtering on 'funk' this way would not catch 'g funk').
R_artist_genre.loc[R_artist_genre['genre'].eq("g funk")]

Unnamed: 0,genre,artist_id
1,g funk,4tujQJicOnuZRLiBFdp3Ou
21,g funk,6DPYiyq5kWVQS4RGwxzPC7
26,g funk,7hJcb9fa4alzcOq3EaNPoG
32,g funk,1Oa0bMld0A3u5OTYfMzp5h
111,g funk,3zNM2tRfTX6LI1lN2PlrTt
...,...,...
473873,g funk,6kEwu0twnfzngQt3Gy8taP
475932,g funk,1Wfh3Tz3xOB4JjqTk2zT5K
478325,g funk,7vPLHiyrVGx5OmaSPqJNGw
478791,g funk,3drEaBmm4UexsiDfz5AzXV


In [47]:
# A substring match catches more genre variants -- is that more useful for us?
has_funk = R_artist_genre['genre'].str.contains('funk')
R_artist_genre.loc[has_funk]

Unnamed: 0,genre,artist_id
1,g funk,4tujQJicOnuZRLiBFdp3Ou
21,g funk,6DPYiyq5kWVQS4RGwxzPC7
26,g funk,7hJcb9fa4alzcOq3EaNPoG
32,g funk,1Oa0bMld0A3u5OTYfMzp5h
111,g funk,3zNM2tRfTX6LI1lN2PlrTt
...,...,...
487252,funky house,1dA7pt23MNLlDsLpABATtG
487257,funk,3S34Unhn5yRcaH5K9aU5Et
487284,afro-funk,7zxcZlBnk8OpLSPAm4Jeyk
487285,cumbia funk,7zxcZlBnk8OpLSPAm4Jeyk


### Preparing R_albums_tracks dataframe

In [48]:
# Preview the first two rows of the album-to-track table.
R_albums_tracks.head(2)

Unnamed: 0,album_id,track_id
0,6os2Mv58OYnQClPf7B9E1s,3HnrHGLE9u2MjHtdobfWl9
1,6os2Mv58OYnQClPf7B9E1s,4lDjkpUrpWlMFofIpzuExK


### Preparing R_albums_artists dataframe

In [49]:
# Preview the first two rows of the album-to-artist table.
R_albums_artists.head(2)

Unnamed: 0,album_id,artist_id
0,6os2Mv58OYnQClPf7B9E1s,2HS2wQTJXpA65XWOKlAVxk
1,5XXN1tFQg7D7U1NSVh5fjf,3VBpsrUi2vV7Uj87ONHu7Z


### Preparing Audio_features dataframe

In [50]:
# Preview the first two rows of Audio_features.
Audio_features.head(2)

Unnamed: 0,id,acousticness,analysis_url,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,2jKoVlU7VAmExKJ1Jh3w9P,0.18,https://api.spotify.com/v1/audio-analysis/2jKo...,0.893,219160,0.514,0.0,11,0.0596,-5.08,1,0.283,95.848,4,0.787
1,4JYUDRtPZuVNi7FAnbHyux,0.272,https://api.spotify.com/v1/audio-analysis/4JYU...,0.52,302013,0.847,0.0,9,0.325,-5.3,1,0.427,177.371002,4,0.799


In [52]:
# The 50 most frequent genres with the number of rows each appears in.
(
    R_artist_genre['genre']
    .value_counts(ascending=False)
    .rename_axis('genre')
    .reset_index(name='count')
    .head(50)
)

Unnamed: 0,genre,count
0,dance pop,591
1,pop,585
2,rock,579
3,electro house,572
4,latin,515
5,classical performance,508
6,hip hop,498
7,edm,492
8,pop rap,490
9,tropical,490


In [53]:
# Drop analysis_url
Audio_features = Audio_features.drop(columns=['analysis_url'])

In [54]:
# The id column becomes track_id.
Audio_features = Audio_features.rename(columns={'id': 'track_id'})

In [55]:
# Preview Audio_features after the drop and rename.
Audio_features.head()

Unnamed: 0,track_id,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,2jKoVlU7VAmExKJ1Jh3w9P,0.18,0.893,219160,0.514,0.0,11,0.0596,-5.08,1,0.283,95.848,4,0.787
1,4JYUDRtPZuVNi7FAnbHyux,0.272,0.52,302013,0.847,0.0,9,0.325,-5.3,1,0.427,177.371002,4,0.799
2,6YjKAkDYmlasMqYw73iB0w,0.0783,0.918,288200,0.586,0.0,1,0.145,-2.89,1,0.133,95.516998,4,0.779
3,2YlvHjDb4Tyxl4A1IcDhAe,0.584,0.877,243013,0.681,0.0,1,0.119,-6.277,0,0.259,94.834999,4,0.839
4,3UOuBNEin5peSRqdzvlnWM,0.17,0.814,270667,0.781,0.000518,11,0.052,-3.33,1,0.233,93.445,4,0.536


In [83]:
# Transform the millisecond `duration` column into a clock-style HH:MM:SS value.
# NOTE: despite its name, duration_mins holds datetime.time objects, not a
# number of minutes; the column name is kept so existing consumers keep working.
#
# The milliseconds are read as an offset from the epoch, floored to whole
# seconds (drops the sub-second part) and reduced to the time-of-day part.
# This is vectorised instead of looping over every Timestamp in Python, and
# the result carries Audio_features' own index, so the values stay on the
# right rows even if that index is not 0..n-1.
# Caveat: a duration of 24 hours or more wraps around to the next day's time.
converted = pd.to_datetime(Audio_features['duration'], unit='ms')
Audio_features['duration_mins'] = converted.dt.floor('s').dt.time

Audio_features.head()

Unnamed: 0,track_id,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,duration_mins
0,2jKoVlU7VAmExKJ1Jh3w9P,0.18,0.893,219160,0.514,0.0,11,0.0596,-5.08,1,0.283,95.848,4,0.787,00:03:39
1,4JYUDRtPZuVNi7FAnbHyux,0.272,0.52,302013,0.847,0.0,9,0.325,-5.3,1,0.427,177.371002,4,0.799,00:05:02
2,6YjKAkDYmlasMqYw73iB0w,0.0783,0.918,288200,0.586,0.0,1,0.145,-2.89,1,0.133,95.516998,4,0.779,00:04:48
3,2YlvHjDb4Tyxl4A1IcDhAe,0.584,0.877,243013,0.681,0.0,1,0.119,-6.277,0,0.259,94.834999,4,0.839,00:04:03
4,3UOuBNEin5peSRqdzvlnWM,0.17,0.814,270667,0.781,0.000518,11,0.052,-3.33,1,0.233,93.445,4,0.536,00:04:30


### Preparing Tracks dataframe

In [56]:
# Preview the first two rows of Tracks.
Tracks.head(2)

Unnamed: 0,id,disc_number,duration,explicit,audio_feature_id,name,preview_url,track_number,popularity,is_playable
0,1dizvxctg9dHEyaYTFufVi,1,275893,1,1dizvxctg9dHEyaYTFufVi,Gz And Hustlas (feat. Nancy Fletcher),,12,0,
1,2g8HN35AnVGIk7B8yMucww,1,252746,1,2g8HN35AnVGIk7B8yMucww,Big Poppa - 2005 Remaster,https://p.scdn.co/mp3-preview/770e023eb0318270...,13,77,


In [57]:
# Give id/name track-specific names, then remove the columns we don't need:
# disc_number, explicit, duration (duplicate with audiofeatures),
# audio_feature_id, preview_url, track_number, is_playable.
Tracks = (
    Tracks
    .rename(columns={'id': 'track_id', 'name': 'track_name'})
    .drop(columns=[
        'disc_number',
        'explicit',
        'duration',
        'audio_feature_id',
        'preview_url',
        'track_number',
        'is_playable',
    ])
)
Tracks.head(2)

Unnamed: 0,track_id,track_name,popularity
0,1dizvxctg9dHEyaYTFufVi,Gz And Hustlas (feat. Nancy Fletcher),0
1,2g8HN35AnVGIk7B8yMucww,Big Poppa - 2005 Remaster,77


### Writing dataframes to database

The cleaned dataframes are written to a local PostgreSQL database. After the necessary joins, the data is exported as CSV and hosted on an AWS S3 bucket. From there, it can be fed into the machine learning model and dashboard.

In [84]:
# Create connection to database.
# The password is percent-encoded (safe='' encodes every reserved character)
# so that characters such as "@", ":" or "/" in it cannot be misread as part
# of the URL structure.
db_string = f"postgresql://postgres:{quote(db_password, safe='')}@127.0.0.1:5432/spotify_db"

# instantiate engine
engine = create_engine(db_string)

# Target table name -> DataFrame to write, in the order the tables are written.
frames_by_table = {
    'albums': Albums,
    'artists': Artists,
    'audio_features': Audio_features,
    'r_artist_genre': R_artist_genre,
    'r_albums_tracks': R_albums_tracks,
    'r_albums_artists': R_albums_artists,
    'tracks': Tracks,
}

try:
    for table_name, frame in frames_by_table.items():
        # if_exists='replace' drops and recreates the table on every run;
        # chunksize sends the rows in batches instead of all at once, which
        # matters for the multi-million-row frames.
        frame.to_sql(name=table_name, con=engine, if_exists='replace',
                     index=False, chunksize=100_000)
finally:
    # Release the engine's pooled connections whether or not a write failed.
    engine.dispose()