In [33]:
import os

import pandas as pd
import sqlalchemy as sqla

In [34]:

# Database URL: read from the SPOTIFY_DB_URL environment variable when it is set,
# so real credentials do not have to be written into the notebook. The fallback
# is the original local development database, which keeps existing runs working.
DB_URL = os.environ.get("SPOTIFY_DB_URL", "mysql://root:root@127.0.0.1:3310/spotify_db")

engine = sqla.create_engine(DB_URL)
# Single connection reused by the queries and loads in the rest of the notebook.
conn = engine.connect()


In [35]:

def get_songs_data():
    """Load every row of the ``spotify_songs`` table.

    Returns:
        pandas.DataFrame: Result of ``SELECT * FROM spotify_songs``, read
        through the notebook-level ``conn`` connection.
    """
    return pd.read_sql("SELECT * FROM spotify_songs", conn)

def get_data_by_id(id):
    """Fetch the rows of the ``Songs`` table whose ``song_id`` equals ``id``.

    The id is sent as a bound parameter rather than being formatted into the
    SQL text, so a crafted value cannot alter the statement.

    Args:
        id: Value to match against ``song_id``. The name shadows the built-in
            ``id``; it is kept so existing keyword callers keep working.

    Returns:
        pandas.DataFrame: The matching rows, read through the notebook-level
        ``conn`` connection.
    """
    query = sqla.text("SELECT * FROM Songs WHERE song_id = :song_id")
    return pd.read_sql(query, conn, params={"song_id": id})

# Quick check of the lookup helper: fetch song_id 1 and show it as the cell output.
get_data_by_id(id=1)

Unnamed: 0,song_id,track_name,artist_id,released_year,released_month,released_day,streams,bpm,key,mode
0,1,Seven (feat. Latto) (Explicit Ver.),1,2023,7,14,141381703,125,B,Major


In [95]:
# dimensions = ["Track", "Artist", "Release Date", "Playlist"]
# metrics = ["Streams", "In Spotify Playlists", "In Spotify Charts", "In Apple Playlists", "In Apple Charts", "In Deezer Playlists", "In Deezer Charts", "In Shazam Charts"]

# The source dataset is one flat table; the statements below define the dimension table and the two fact tables built from it:

# DDL for the dimension table: descriptive attributes (track, artist, release
# date, Spotify playlist) keyed by ID. IF NOT EXISTS makes the statement a
# no-op when the table is already present, so it is safe to re-run.
create_dimensions_table = """
CREATE TABLE IF NOT EXISTS Dimensions_table (
    ID INTEGER PRIMARY KEY,
    Track TEXT,
    Artist TEXT,
    Release_Date Date,
    Spotify_Playlist TEXT);
"""

    
# DDL for the streaming fact table: per-ID stream total plus playlist/chart
# counts for each platform. IF NOT EXISTS makes the statement safe to re-run.
# Streams is BIGINT because cumulative stream totals of popular tracks can
# exceed the maximum of MySQL's signed 32-bit INT (2,147,483,647).
create_streaming_facts_table = """
CREATE TABLE IF NOT EXISTS Streaming_facts_table (
    ID INTEGER PRIMARY KEY,
    Streams BIGINT,
    In_Spotify_Playlists INTEGER,
    In_Spotify_Charts INTEGER,
    In_Apple_Playlists INTEGER,
    In_Apple_Charts INTEGER,
    In_Deezer_Playlists INTEGER,
    In_Deezer_Charts INTEGER,
    In_Shazam_Charts INTEGER
    );
"""

# DDL for the song fact table: derived per-ID metrics. IF NOT EXISTS makes the
# statement safe to re-run.
# - Play_Count is BIGINT because stream totals can exceed the maximum of
#   MySQL's signed 32-bit INT (2,147,483,647).
# - Average_Plays_Per_Day is declared here because the cell that populates this
#   table writes that column as well.
create_song_facts_table = """
CREATE TABLE IF NOT EXISTS Song_facts_table (
    ID INTEGER PRIMARY KEY,
    Play_Count BIGINT,
    Time_Since_Launched INTEGER,
    Average_Plays_Per_Day DOUBLE
    );
"""

In [96]:

# Run the CREATE TABLE statements in order, printing a confirmation after each.
# The first failure skips the remaining statements and is printed, not raised.
try:
    ddl_steps = (
        (create_dimensions_table, "Table 'Dimensions' created successfully."),
        (create_streaming_facts_table, "Table 'Streaming Facts' created successfully."),
        (create_song_facts_table, "Table 'Song Facts' created successfully."),
    )
    for ddl_statement, success_message in ddl_steps:
        conn.execute(sqla.text(ddl_statement))
        print(success_message)
except Exception as e:
    print(f"An error occurred while creating tables: {e}")

Table 'Dimensions' created successfully.
Table 'Streaming Facts' created successfully.
Table 'Song Facts' created successfully.


In [74]:
# Populate Dimensions_table from the source songs.
songs_data = get_songs_data()

# pd.to_datetime assembles dates from columns named year / month / day, so the
# three released_* columns are renamed to those names first.
date_df = songs_data[["released_year", "released_month", "released_day"]].rename(
    columns={"released_year": "year", "released_month": "month", "released_day": "day"}
)

# Column names and order follow the Dimensions_table definition.
dimensions_data = pd.DataFrame(
    {
        "ID": songs_data["id"],
        "Track": songs_data["track_name"],
        "Artist": songs_data["artist_s_name"],
        "Release_Date": pd.to_datetime(date_df),
        "Spotify_Playlist": songs_data["in_spotify_playlists"],
    }
)

# if_exists="replace" drops the existing table before inserting, so re-running
# this cell does not duplicate rows.
# NOTE(review): pandas then recreates the table from the frame's dtypes, so the
# column types / PRIMARY KEY from the CREATE TABLE statement are presumably not
# kept — confirm whether that matters here.
dimensions_data.to_sql("Dimensions_table", conn, if_exists="replace", index=False)

dimensions_data

Unnamed: 0,ID,Track,Artist,Release_Date,Spotify_Playlist
0,1,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2023-07-14,553
1,2,LALA,Myke Towers,2023-03-23,1474
2,3,vampire,Olivia Rodrigo,2023-06-30,1397
3,4,Cruel Summer,Taylor Swift,2019-08-23,7858
4,5,WHERE SHE GOES,Bad Bunny,2023-05-18,3133
...,...,...,...,...,...
948,949,My Mind & Me,Selena Gomez,2022-11-03,953
949,950,Bigger Than The Whole Sky,Taylor Swift,2022-10-21,1180
950,951,A Veces (feat. Feid),"Feid, Paulo Londra",2022-11-03,573
951,952,En La De Ella,"Feid, Sech, Jhayco",2022-10-20,1320


In [72]:
# Read Dimensions_table back from the database to inspect what is stored.
pd.read_sql_query(sql="SELECT * FROM Dimensions_table", con=conn)

Unnamed: 0,ID,Track,Artist,Release_Date,Spotify_Playlist
0,1,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2023-07-14,553
1,2,LALA,Myke Towers,2023-03-23,1474
2,3,vampire,Olivia Rodrigo,2023-06-30,1397
3,4,Cruel Summer,Taylor Swift,2019-08-23,7858
4,5,WHERE SHE GOES,Bad Bunny,2023-05-18,3133
...,...,...,...,...,...
2854,949,My Mind & Me,Selena Gomez,2022-11-03,953
2855,950,Bigger Than The Whole Sky,Taylor Swift,2022-10-21,1180
2856,951,A Veces (feat. Feid),"Feid, Paulo Londra",2022-11-03,573
2857,952,En La De Ella,"Feid, Sech, Jhayco",2022-10-20,1320


In [75]:

# Populate Streaming_facts_table from the source songs.

# Source column -> fact-table column, in the order of the table definition.
stream_fact_columns = {
    "id": "ID",
    "streams": "Streams",
    "in_spotify_playlists": "In_Spotify_Playlists",
    "in_spotify_charts": "In_Spotify_Charts",
    "in_apple_playlists": "In_Apple_Playlists",
    "in_apple_charts": "In_Apple_Charts",
    "in_deezer_playlists": "In_Deezer_Playlists",
    "in_deezer_charts": "In_Deezer_Charts",
    "in_shazam_charts": "In_Shazam_Charts",
}
stream_facts_data = songs_data[list(stream_fact_columns)].rename(columns=stream_fact_columns)

# The song-facts load has to coerce `streams` to numeric, i.e. the raw column
# can hold non-numeric values. Apply the same coercion here so Streams is
# stored as a number (unparseable values become NULL) instead of raw text.
# The nullable Int64 dtype keeps the values integral despite missing entries.
stream_facts_data["Streams"] = pd.to_numeric(
    stream_facts_data["Streams"], errors="coerce"
).astype("Int64")

# if_exists="replace" drops the existing table first, so re-running this cell
# does not duplicate rows.
stream_facts_data.to_sql("Streaming_facts_table", conn, if_exists="replace", index=False)

953

In [76]:
# Read Streaming_facts_table back from the database to inspect what is stored.
pd.read_sql_query(sql="SELECT * FROM Streaming_facts_table", con=conn)


Unnamed: 0,ID,Streams,In_Spotify_Playlists,In_Spotify_Charts,In_Apple_Playlists,In_Apple_Charts,In_Deezer_Playlists,In_Deezer_Charts,In_Shazam_Charts
0,1,141381703,553,147,43,263,45,10,826
1,2,133716286,1474,48,48,126,58,14,382
2,3,140003974,1397,113,94,207,91,14,949
3,4,800840817,7858,100,116,207,125,12,548
4,5,303236322,3133,50,84,133,87,15,425
...,...,...,...,...,...,...,...,...,...
948,949,91473363,953,0,61,13,37,1,0
949,950,121871870,1180,0,4,0,8,0,0
950,951,73513683,573,0,2,0,7,0,0
951,952,133895612,1320,0,29,26,17,0,0


In [110]:
# The last table to populate is the Song Facts table.

# Whole days between now and each release date; relies on the Release_Date
# column of dimensions_data built earlier in the notebook.
days_since_release = (pd.to_datetime("today") - dimensions_data["Release_Date"]).dt.days

song_facts_data = pd.DataFrame(
    {
        "ID": songs_data["id"],
        # errors="coerce" turns unparseable stream values into NaN.
        "Play_Count": pd.to_numeric(songs_data["streams"], errors="coerce"),
    }
).assign(
    Time_Since_Launched=days_since_release,
    Average_Plays_Per_Day=lambda facts: facts["Play_Count"] / facts["Time_Since_Launched"],
)

# if_exists="replace" drops the existing table first, so re-running this cell
# does not duplicate rows.
song_facts_data.to_sql("Song_facts_table", conn, if_exists="replace", index=False)


953

In [111]:
# Read Song_facts_table back from the database to inspect what is stored.
pd.read_sql_query(sql="SELECT * FROM Song_facts_table", con=conn)


Unnamed: 0,ID,Play_Count,Time_Since_Launched,Average_Plays_Per_Day
0,1,141381703.0,288,490908.690972
1,2,133716286.0,401,333457.072319
2,3,140003974.0,302,463589.317881
3,4,800840817.0,1709,468601.999415
4,5,303236322.0,345,878945.860870
...,...,...,...,...
948,949,91473363.0,541,169082.001848
949,950,121871870.0,554,219985.324910
950,951,73513683.0,541,135884.811460
951,952,133895612.0,555,241253.354955
