In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the most-streamed-songs CSV into `df`, the raw frame every later cell builds on.
raw_csv_path = "/content/drive/MyDrive/DV_A3/most_streamed_spotify_songs_2024.csv"
df = pd.read_csv(filepath_or_buffer=raw_csv_path)

In [None]:
# Parse the release dates (stored as day-month-year text) into real timestamps
# so they can be compared chronologically.
df['Release Date'] = pd.to_datetime(df['Release Date'], format='%d-%m-%Y')

# Keep only tracks released before 1 January 2023.
# - An explicit Timestamp avoids the day/month ambiguity of a '01-01-2023' string.
# - .copy() makes the subset an independent frame, so later column assignments
#   on it do not raise pandas' SettingWithCopyWarning.
release_cutoff = pd.Timestamp(year=2023, month=1, day=1)
df_filtered = df[df['Release Date'] < release_cutoff].copy()
print(df_filtered.head())

                                               Track  \
2                                  For the Last Time   
3  ...And to Those I Love, Thanks for Sticking Ar...   
4                                             Avalon   
5                                        1000 Blunts   
6                                        Matte Black   

                               Album Name       Artist Release Date  \
2                       For the Last Time  $uicideboy$   2017-05-09   
3             STOP STARING AT THE SHADOWS  $uicideboy$   2020-02-14   
4                                  Avalon  $uicideboy$   2021-05-21   
5  Sing Me a Lullaby, My Sweet Temptation  $uicideboy$   2022-07-29   
6  Sing Me a Lullaby, My Sweet Temptation  $uicideboy$   2022-07-29   

           ISRC All Time Rank  Track Score  Spotify Streams  \
2  QM8DG1703420         4,585         19.4      305049963.0   
3  QZAPK1900020         1,709         35.3      675082623.0   
4  QZAPK2000176         3,723         22.2     

In [None]:
# Save the pre-2023 subset next to the notebook. The frame's index is written
# as the first CSV column (pandas' default, spelled out here).
df_filtered.to_csv(path_or_buf='most_streamed_spotify_songs_till2022.csv', index=True)

In [None]:
!pip install ipympl



In [None]:
!pip install plotly



In [None]:
import plotly.express as px

def format_streams(value):
    """Abbreviate a numeric count with a magnitude suffix.

    Args:
        value: The count to format (int or float).

    Returns:
        A string with one decimal place and a suffix: "T" for trillions,
        "B" for billions, "M" for millions, "K" for thousands
        (e.g. ``"3.2T"``, ``"268.1B"``). Counts below one thousand are
        rendered as a whole number (e.g. ``"0"``, ``"950"``).
    """
    magnitudes = ((1e12, "T"), (1e9, "B"), (1e6, "M"), (1e3, "K"))
    for threshold, suffix in magnitudes:
        if value >= threshold:
            return f"{value / threshold:.1f}{suffix}"
    # Anything smaller used to fall off the end of the function and return
    # None; always hand back a printable string instead.
    return f"{value:.0f}"

# Columns holding the per-platform play/view counts that get aggregated below.
platform_columns = [
    "Spotify Streams",
    "YouTube Views",
    "TikTok Views",
    "Soundcloud Streams",
    "Pandora Streams",
    "Shazam Counts",
]

# Work on an independent copy so the column assignment below neither modifies
# df_filtered nor triggers pandas' SettingWithCopyWarning.
df = df_filtered.copy()

# Coerce every platform column to numbers; entries that cannot be parsed, or
# that are missing, count as 0.
df[platform_columns] = df[platform_columns].apply(pd.to_numeric, errors="coerce").fillna(0)

# One row per platform with its grand total, smallest first.
platform_totals = df[platform_columns].sum().reset_index()
platform_totals.columns = ["Platform", "Total Streams"]
platform_totals = platform_totals.sort_values(by="Total Streams", ascending=True)

# Human-readable label (e.g. "1.5T") used as the bar text.
platform_totals["Formatted Streams"] = platform_totals["Total Streams"].apply(format_streams)

print(platform_totals)

# Horizontal bar chart of the per-platform totals; each bar carries its
# abbreviated total as a text label.
axis_labels = {"Total Streams": "Total Streams", "Platform": "Platforms"}
fig = px.bar(
    data_frame=platform_totals,
    y="Platform",
    x="Total Streams",
    text="Formatted Streams",
    labels=axis_labels,
    orientation="h",
    title="Total Streams by Platform (Linear Scale)",
)

layout_settings = {
    "xaxis": {"title": "Total Streams", "type": "linear"},
    "yaxis_title": "Platforms",
    "showlegend": False,
    "height": 400,
    "template": "plotly_white",
}
fig.update_layout(**layout_settings)

# Draw the labels just beyond the end of each bar.
fig.update_traces(textposition="outside", texttemplate="%{text}")
fig.show()


             Platform  Total Streams Formatted Streams
5       Shazam Counts   8.046175e+09              8.0B
3  Soundcloud Streams   1.631401e+10             16.3B
4     Pandora Streams   2.681359e+11            268.1B
1       YouTube Views   1.274672e+12              1.3T
0     Spotify Streams   1.532293e+12              1.5T
2        TikTok Views   3.232355e+12              3.2T




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Add the six per-platform counts together row by row to get one overall
# figure per track.
combined_sources = [
    "Spotify Streams",
    "YouTube Views",
    "TikTok Views",
    "Soundcloud Streams",
    "Pandora Streams",
    "Shazam Counts",
]
df["Combined Streams"] = df[combined_sources].sum(axis="columns")
print(df.head())


                                               Track  \
2                                  For the Last Time   
3  ...And to Those I Love, Thanks for Sticking Ar...   
4                                             Avalon   
5                                        1000 Blunts   
6                                        Matte Black   

                               Album Name       Artist Release Date  \
2                       For the Last Time  $uicideboy$   2017-05-09   
3             STOP STARING AT THE SHADOWS  $uicideboy$   2020-02-14   
4                                  Avalon  $uicideboy$   2021-05-21   
5  Sing Me a Lullaby, My Sweet Temptation  $uicideboy$   2022-07-29   
6  Sing Me a Lullaby, My Sweet Temptation  $uicideboy$   2022-07-29   

           ISRC All Time Rank  Track Score  Spotify Streams  \
2  QM8DG1703420         4,585         19.4      305049963.0   
3  QZAPK1900020         1,709         35.3      675082623.0   
4  QZAPK2000176         3,723         22.2     

In [None]:
# Rank every track by its combined total, highest first, and keep the ten leaders.
df_sorted = df.sort_values("Combined Streams", ascending=False)
top_songs = df_sorted.iloc[:10]
print(top_songs)

                          Track                 Album Name  \
1817   Monkeys Spinning Monkeys          SteamFun - Single   
3276                Love You So   The King Khan & BBQ Show   
1859                      Oh No                Cold As Ice   
628                  Funny Song   Vintage Oddities, Vol. 4   
3352                  Aesthetic                  Aesthetic   
792                   Spongebob                  Spongebob   
3021            She Share Story            She Share Story   
3269  STAY (with Justin Bieber)  STAY (with Justin Bieber)   
787                      Pieces                     Pieces   
693    love nwantiti (ah ah ah)             CKay The First   

                        Artist Release Date          ISRC All Time Rank  \
1817             Kevin MacLeod   2011-05-16  USUAN1400011         1,018   
3276  The King Khan & BBQ Show   2005-01-01  USA3D0700101         1,918   
1859                    Kreepa   2019-12-13  USUYG1287925           473   
628            Ca

In [None]:
import plotly.express as px
import pandas as pd

# Build a long-form table from `top_songs`: one record per (song, platform)
# pair, in song-major order.
platform_names = [
    "Spotify Streams",
    "YouTube Views",
    "TikTok Views",
    "Soundcloud Streams",
    "Pandora Streams",
    "Shazam Counts",
]
sunburst_data = [
    {"Song": song_row["Track"], "Platform": name, "Streams": song_row[name]}
    for _, song_row in top_songs.iterrows()
    for name in platform_names
]
sunburst_df = pd.DataFrame(sunburst_data)

# Total streams per song across the listed platforms, attached to every row of that song.
song_totals = (
    sunburst_df.groupby("Song", as_index=False)["Streams"]
    .sum()
    .rename(columns={"Streams": "Total Streams"})
)
sunburst_df = pd.merge(sunburst_df, song_totals, how="left", on="Song")

# Swap exact zeros for a tiny positive value before plotting
# (presumably so zero-stream platforms do not upset the chart - TODO confirm).
sunburst_df["Streams"] = sunburst_df["Streams"].replace(to_replace=0, value=1e-9)

# Integer copy of the counts with the placeholder turned back into 0.
sunburst_df["Display Streams"] = (
    sunburst_df["Streams"].replace(to_replace=1e-9, value=0).astype(int)
)

# Two-level sunburst: songs on the inner ring, platforms on the outer ring.
fig = px.sunburst(
    data_frame=sunburst_df,
    path=["Song", "Platform"],
    values="Streams",
    color="Streams",
    color_continuous_scale="Blues",
    title="Sunburst Chart of Top 10 Songs by Streams on Each Platform",
)

# "toImage" keeps the download button in the modebar.
fig.update_layout(modebar_add=["toImage"], showlegend=True)

fig.show()


Removing TikTok, because its view counts are disproportionately large and dominate the combined totals

In [None]:
# Same row-wise total as "Combined Streams", but leaving the TikTok column out.
non_tiktok_sources = [
    "Spotify Streams",
    "YouTube Views",
    "Soundcloud Streams",
    "Pandora Streams",
    "Shazam Counts",
]
df["Combined Streams wo TikTok"] = df[non_tiktok_sources].sum(axis="columns")
print(df.head())

                                               Track  \
2                                  For the Last Time   
3  ...And to Those I Love, Thanks for Sticking Ar...   
4                                             Avalon   
5                                        1000 Blunts   
6                                        Matte Black   

                               Album Name       Artist Release Date  \
2                       For the Last Time  $uicideboy$   2017-05-09   
3             STOP STARING AT THE SHADOWS  $uicideboy$   2020-02-14   
4                                  Avalon  $uicideboy$   2021-05-21   
5  Sing Me a Lullaby, My Sweet Temptation  $uicideboy$   2022-07-29   
6  Sing Me a Lullaby, My Sweet Temptation  $uicideboy$   2022-07-29   

           ISRC All Time Rank  Track Score  Spotify Streams  \
2  QM8DG1703420         4,585         19.4      305049963.0   
3  QZAPK1900020         1,709         35.3      675082623.0   
4  QZAPK2000176         3,723         22.2     

In [None]:
# Re-rank the tracks by their TikTok-free total and keep the ten highest.
df_sorted = df.sort_values("Combined Streams wo TikTok", ascending=False)
top_songs = df_sorted.iloc[:10]

# Build a long-form table: one record per (song, platform) pair, in song-major order.
platform_names = [
    "Spotify Streams",
    "YouTube Views",
    "Soundcloud Streams",
    "Pandora Streams",
    "Shazam Counts",
]
sunburst_data = [
    {"Song": song_row["Track"], "Platform": name, "Streams": song_row[name]}
    for _, song_row in top_songs.iterrows()
    for name in platform_names
]
sunburst_df = pd.DataFrame(sunburst_data)

# Total streams per song across the listed platforms, attached to every row of that song.
song_totals = (
    sunburst_df.groupby("Song", as_index=False)["Streams"]
    .sum()
    .rename(columns={"Streams": "Total Streams"})
)
sunburst_df = pd.merge(sunburst_df, song_totals, how="left", on="Song")

# Swap exact zeros for a tiny positive value before plotting
# (presumably so zero-stream platforms do not upset the chart - TODO confirm).
sunburst_df["Streams"] = sunburst_df["Streams"].replace(to_replace=0, value=1e-9)

# Integer copy of the counts with the placeholder turned back into 0.
sunburst_df["Display Streams"] = (
    sunburst_df["Streams"].replace(to_replace=1e-9, value=0).astype(int)
)

# Two-level sunburst: songs on the inner ring, platforms on the outer ring.
fig = px.sunburst(
    data_frame=sunburst_df,
    path=["Song", "Platform"],
    values="Streams",
    color="Streams",
    color_continuous_scale="Blues",
    title="Sunburst Chart of Top 10 Songs by Streams on Each Platform, excluding TikTok",
)

fig.show()
