In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read the streaming CSV from the relative data directory and preview it.
spotify_df = pd.read_csv(
    filepath_or_buffer="../data/Cleaned_Spotify_2024_Global_Streaming_Data.csv"
)
spotify_df.head(n=5)  # n=5 is the default, so this matches a bare .head()

In [None]:
# Mean "Total Streams (Millions)" for each genre, ordered from highest to lowest.
avg_streams = (
    spotify_df
    .groupby("Genre")["Total Streams (Millions)"]
    .mean()
    .sort_values(ascending=False)
)
avg_streams

In [None]:
# First-pass bar chart of the per-genre averages (no labels yet).
plt.bar(avg_streams.index, avg_streams.values)
# Render explicitly so the cell output is the figure, not the BarContainer repr.
plt.show()

In [None]:
# Per-genre averages as a bar chart, with axis labels and a title.
plt.bar(avg_streams.index, avg_streams.values)
plt.gca().set(
    xlabel="Genre",
    ylabel="Average Total Streams (Millions)",
    title="Average Total Streams by Genre",
)
plt.show()

In [None]:
# Per-genre averages on a 10x6 canvas with the genre tick labels rotated 45 degrees.
plt.figure(figsize=(10, 6))
plt.bar(avg_streams.index, avg_streams.values)
plt.gca().set(
    xlabel="Genre",
    ylabel="Average Total Streams (Millions)",
    title="Average Total Streams by Genre",
)

plt.xticks(rotation=45, ha="center")
plt.tight_layout()
plt.show()

In [None]:
# Per-genre averages on a 10x6 canvas with the top and right frame lines hidden.
plt.figure(figsize=(10, 6))
ax = plt.gca()
ax.bar(avg_streams.index, avg_streams.values)

# Drop the frame edges that carry no axis.
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)

ax.set(
    xlabel="Genre",
    ylabel="Average Total Streams (Millions)",
    title="Average Total Streams by Genre",
)
plt.xticks(rotation=45, ha="center")
plt.tight_layout()
plt.show()

In [None]:
# Mean "Total Streams (Millions)" for each artist, ordered from highest to lowest.
avg_artist_streams = (
    spotify_df
    .groupby("Artist")["Total Streams (Millions)"]
    .mean()
    .sort_values(ascending=False)
)
avg_artist_streams

In [None]:
# Bar chart of the per-artist averages with one artist drawn in green and
# every other bar in grey.
# NOTE(review): the rank stated in the title is hard-coded text, not computed
# from avg_artist_streams — confirm it still matches the data.
plt.figure(figsize=(10, 6))
ax = plt.gca()

special_artist = "Ariana Grande"
colors = [
    "grey" if artist != special_artist else "green"
    for artist in avg_artist_streams.index
]
ax.bar(avg_artist_streams.index, avg_artist_streams.values, color=colors, width=0.8)

# Hide the frame edges that carry no axis.
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)

ax.set(
    xlabel="Artist",
    ylabel="Average Total Streams (Millions)",
    title="Ariana Grande Ranks 2nd",
)
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# Build a per-bar colour list that paints the highest-average artist green and
# everything else grey.
# NOTE(review): nothing in this cell draws with `colors`; it only rebinds the name.
max_index = avg_artist_streams.idxmax()
highlight_position = avg_artist_streams.index.get_loc(max_index)

colors = [
    "green" if position == highlight_position else "grey"
    for position in range(len(avg_artist_streams))
]

In [None]:
# Structural check of the frame: column names first, then dtypes and non-null counts.
# A bare `spotify_df.columns` that is not the cell's last expression is silently
# discarded by the notebook, so print it to make the names actually appear.
print(spotify_df.columns.tolist())
spotify_df.info()

In [None]:
# First-pass scatter of last-30-day streams (x) against total streams (y).
plt.scatter(
    spotify_df["Streams Last 30 Days (Millions)"],
    spotify_df["Total Streams (Millions)"],
)
# Call show(): the bare `plt.show` only referenced the function and echoed its repr.
plt.show()

In [None]:
# Styled scatter of last-30-day streams (x) against total streams (y).
plt.figure(figsize=(10, 6))
ax = plt.gca()

ax.set(
    xlabel="Streams this Month",
    ylabel="Total Streams",
    title="No Apparent Correlation Between Monthly and Lifetime Streams",
)

plt.scatter(
    spotify_df["Streams Last 30 Days (Millions)"],
    spotify_df["Total Streams (Millions)"],
    s=25,
    alpha=0.8,
    color="#133105",
)

# Hide the frame edges that carry no axis.
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Scatter of last-30-day streams (x) against total streams (y) where rows of one
# featured genre are drawn dark and all other rows in a pale tint.
plt.figure(figsize=(10, 6))
ax = plt.gca()

featured_genre = "Pop"
colors = [
    "#EEF3EC" if genre != featured_genre else "#133105"
    for genre in spotify_df["Genre"]
]
ax.set(xlabel="Streams this Month", ylabel="Total Streams")

plt.scatter(
    spotify_df["Streams Last 30 Days (Millions)"],
    spotify_df["Total Streams (Millions)"],
    s=25,
    alpha=0.8,
    color=colors,
)

ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
ax.set_title(
    "Do Pop songs Achieve Higher Monthly Listens or Total Streams",
    fontsize=18,
    pad=35,
)

# Thin and fade every frame line (the hidden ones are simply unaffected visually).
for spine in ax.spines.values():
    spine.set(linewidth=0.25, alpha=0.5)

plt.tight_layout()
plt.show()

In [None]:
# Mean "Total Streams (Millions)" per release year, rounded to two decimals.
yearly_avg = (
    spotify_df
    .groupby("Release Year")["Total Streams (Millions)"]
    .mean()
    .round(2)
)
yearly_avg

In [None]:
# Line chart of the per-release-year averages with 2022 called out.
# NOTE(review): the horizontal line height (2375) and the label position (2225)
# are hard-coded rather than read from yearly_avg — confirm they match the data.
plt.figure(figsize=(10, 6))
ax = plt.gca()

ax.plot(
    yearly_avg.index,
    yearly_avg.values,
    color="#b31f1f",
    linestyle="--",
    linewidth=0.5,
    marker="|",
)

ax.set(xlabel="Release Year", ylabel="Average Total Streams")
ax.set_title("Did Listeners Stream Less in 2022?", fontsize=18, pad=20)

ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)

# Vertical marker at 2022, a default-styled horizontal line, and a text label.
ax.axvline(2022, color="Green", linewidth=0.5, alpha=0.7)
ax.axhline(2375)
ax.text(2022 + 0.05, 2225, "Dip In Total Streams", color="Green", fontsize=8)

ax.set_ylim(0, 3500)

plt.tight_layout()
plt.show()

In [None]:
# Sum of the chosen value column within each group of the chosen category column.
category, values = "Platform Type", "Total Streams (Millions)"

platform_totals = (
    spotify_df
    .groupby(category)[values]
    .sum()
)
platform_totals



In [None]:
# Pie chart of each platform type's share of the summed totals in platform_totals.
# NOTE(review): the wedges are built from summed "Total Streams (Millions)", not
# user counts, so the title's "Users" wording may need revisiting.
plt.figure(figsize=(8,8))
plt.pie(
    platform_totals,
    labels=None,  # wedge names go in the legend instead of on the chart
    startangle=110,
    colors = ["Gray","Orange"],
    autopct="%1.1f%%")

plt.title("Near Even Split of Premium/Free Users")

plt.legend(
    labels=platform_totals.index,
    loc="lower right",
    title = "Platform"
)

# Call tight_layout(): the bare `plt.tight_layout` was a no-op attribute reference.
plt.tight_layout()
plt.show()

In [None]:
# Pie chart of each platform type's share of the summed totals in platform_totals,
# built on an explicit figure/axes pair so the percentage labels can be restyled.
# NOTE(review): the wedges are built from summed "Total Streams (Millions)", not
# user counts, so the title's "Users" wording may need revisiting.
fig, ax = plt.subplots(figsize=(8,8))


wedges, texts, autotexts = ax.pie(
    platform_totals,
    autopct="%1.1f%%",
    startangle=110,
    colors = ["Gray","Orange"]
    )

ax.set_title(
    "Near Even Split of Premium/Free Users",
    fontsize = 30, 
    pad=15,
    loc="center"
)

# Make the in-wedge percentage labels large, white and bold.
for autotext in autotexts:
    autotext.set_fontsize(16)
    autotext.set_color("white")
    autotext.set_fontweight("bold")

# Two-column legend anchored by its lower centre near the top of the axes.
ax.legend(
    wedges,
    platform_totals.index,
    loc="lower center",
    bbox_to_anchor = (0.5, 0.95),
    ncol=2
)

# Call tight_layout(): the bare `plt.tight_layout` was a no-op attribute reference.
fig.tight_layout()
plt.show()

In [None]:
# 20-bin histogram of the "Avg Stream Duration (Min)" column.
plt.figure(figsize=(10, 6))
plt.hist(
    spotify_df["Avg Stream Duration (Min)"],
    bins=20,
    edgecolor="white",
    color="Green",
)
ax = plt.gca()

ax.set(xlabel="Average Stream Duration (Minutes)", ylabel="Number of Tracks")
ax.set_title("Distribution of Average Stream Duration", fontsize=16, pad=20)

ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
# Thin and fade every frame line.
for spine in ax.spines.values():
    spine.set(linewidth=0.25, alpha=0.5)

plt.show()

In [None]:
# Side-by-side box plots of "Skip Rate (%)" for Free rows and Premium rows.
plt.boxplot([
    spotify_df.loc[spotify_df["Platform Type"] == platform, "Skip Rate (%)"]
    for platform in ("Free", "Premium")
])

# Boxes sit at x positions 1 and 2, in the order the groups were passed.
plt.xticks([1, 2], ["Free", "Premium"])
plt.gca().set(
    xlabel="Platform Type",
    ylabel="Skip Rate (%)",
    title="Skip Rate (%) by Platform Type",
)

plt.show()

In [None]:
# Correlation matrix of the numeric columns, drawn as a heatmap on a fixed
# -1..1 colour scale.
numeric_spotify_df = spotify_df.select_dtypes(include=[np.number])
corr = numeric_spotify_df.corr()

im = plt.imshow(corr, vmin=-1, vmax=1, cmap="coolwarm")
plt.colorbar(im, label="Pearson Correlation")

# Label both axes with the column names, one tick per matrix row/column.
plt.xticks(range(len(corr.columns)), corr.columns, rotation = 45, ha = "right")
plt.yticks(range(len(corr.columns)), corr.columns)

# Call show(): the bare `plt.show` only referenced the function and echoed its repr.
plt.show()

In [None]:
# Number of rows per genre, most frequent first.
# (value_counts already orders by count; the explicit descending sort is kept as written.)
genre_counts = (
    spotify_df["Genre"]
    .value_counts()
    .sort_values(ascending=False)
)
genre_counts