In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the raw CSV. Set the SPOTIFY_CSV environment variable to point at
# another copy of the file; when it is unset, the original local path is used.
DATA_PATH = os.environ.get(
    "SPOTIFY_CSV",
    "C:/Users/VIVEK/OneDrive/Desktop/Data Analytics Project/Project Spotify/spotify-2023.csv",
)

# The file is decoded as ISO-8859-1 (Latin-1).
spotify = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

In [None]:
# Lift the cap on displayed columns so wide previews are shown in full,
# then preview the first five rows.
pd.options.display.max_columns = None
spotify.head(n=5)

In [None]:
# Show the dtype of every column in the frame.
spotify.dtypes

In [None]:
# Collapse the three release-date component columns into a single datetime
# column named `released_date`, then drop the components.
#
# The guard makes the cell safe to re-run: once the component columns have been
# dropped, executing it again is a no-op instead of raising a KeyError.
date_parts = {
    "released_year": "year",
    "released_month": "month",
    "released_day": "day",
}

if set(date_parts).issubset(spotify.columns):
    # pd.to_datetime can assemble dates from a frame whose columns are named
    # year / month / day, so no string concatenation or zero-padding is needed.
    spotify["released_date"] = pd.to_datetime(
        spotify[list(date_parts)].rename(columns=date_parts)
    )
    spotify = spotify.drop(columns=list(date_parts))

# Last expression of the cell, so the resulting dtypes are actually displayed.
spotify.dtypes

In [None]:
# Preview the first rows of the frame.
spotify.head()

In [None]:
# Print the number of missing values in each of the two columns checked here.
for column in ("in_shazam_charts", "key"):
    print(spotify[column].isna().sum())

In [None]:
# Replace missing values in the Shazam chart column with 0.
shazam_filled = spotify['in_shazam_charts'].fillna(0)
spotify['in_shazam_charts'] = shazam_filled

In [None]:
# Convert the Shazam chart column to integers: render every value as text,
# remove ',' characters, map a literal 'nan' value to '0', then cast.
shazam_text = spotify['in_shazam_charts'].astype(str)
shazam_text = shazam_text.str.replace(',', '', regex=False)
shazam_text = shazam_text.replace('nan', '0')
spotify['in_shazam_charts'] = shazam_text.astype(int)

# Tracks with a missing key are labelled 'Unknown'.
spotify['key'] = spotify['key'].fillna('Unknown')

In [None]:
# List the column names of the frame.
spotify.columns

In [None]:
# Rename the artist column to a plain identifier so it can be used with
# attribute access (spotify.artist_name).
spotify = spotify.rename(columns={"artist(s)_name": "artist_name"})

In [None]:
pd.set_option('display.max_colwidth', 1000)

# Convert `streams` to integers, treating any value that cannot be parsed as a
# number as 0. The original cell patched one row by artist name
# ("Edison Lighthouse", presumably the row with a malformed value); detecting
# unparseable values directly also covers any other bad entry.
streams_numeric = pd.to_numeric(spotify['streams'], errors='coerce')
malformed_streams = streams_numeric.isna()

# Explicit int64: a bare `int` can resolve to a 32-bit integer on some
# platforms, which may be too small for large stream counts.
spotify['streams'] = streams_numeric.fillna(0).astype('int64')

# Show the rows whose stream count had to be replaced with 0.
spotify.loc[malformed_streams, ['track_name', 'artist_name', 'streams']]

## Questions

In [None]:
# 1. What are the top 10 most streamed tracks overall?
# Order the three display columns by stream count, largest first, and keep ten rows.
ranking_columns = ['track_name', 'artist_name', 'streams']
tracks_by_streams = spotify[ranking_columns].sort_values(by='streams', ascending=False)
most_streamed_tracks = tracks_by_streams.head(10)
most_streamed_tracks

In [None]:
# 2. Which artists have the most tracks in Spotify playlists?
# Count the non-null track names per artist and keep the ten largest counts.
tracks_per_artist = spotify.groupby("artist_name")["track_name"].count()
artist_with_most_tracks = tracks_per_artist.sort_values(ascending=False).head(10)
artist_with_most_tracks

In [None]:
# 3. How has the number of releases changed over the years?
# Derive the release year from the date column and count tracks per year.
spotify["release_year"] = spotify["released_date"].dt.year
releases_per_year = (
    spotify.groupby("release_year")["track_name"]
    .count()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=releases_per_year, x='release_year', y='track_name', marker='o', ax=ax)
ax.set_title('Number of Track Released Every Year', fontsize=18)
ax.set_xlabel('Release Year', fontsize=14)
ax.set_ylabel('Number of Tracks Released', fontsize=14)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# 4. Which month sees the highest number of releases on average?
# NOTE(review): this counts total releases per calendar month across the whole
# frame; confirm whether a per-year average was intended.
release_dates = spotify["released_date"]
spotify['released_month'] = release_dates.dt.month_name()
spotify['released_month_num'] = release_dates.dt.month

# The month number is carried through the grouping so the rows can be ordered
# by month number rather than by month name.
month_keys = ['released_month_num', 'released_month']
monthly_counts = spotify.groupby(month_keys)["track_name"].count().reset_index()
most_releases_monthwise = monthly_counts.sort_values(by='released_month_num')

fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(data=most_releases_monthwise, x='released_month', y='track_name', marker='o', ax=ax)
ax.set_title('Number of Tracks released Every Month')
ax.set_xlabel('Months')
ax.set_ylabel('Number of tracks Released')
ax.grid()
plt.show()

In [None]:
# 5. What is the correlation between danceability, energy, and valence?
mood_features = ['danceability_%', 'energy_%', 'valence_%']
correlation = spotify[mood_features].corr()
# Reading the matrix: +1 is a perfect positive correlation, 0 is no
# correlation, -1 is a perfect negative correlation.

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation, vmin=-1, vmax=1, cmap='coolwarm', annot=True, fmt='.2f', ax=ax)
ax.set_title("Correlation between Danceability, Energy, and Valence")
fig.tight_layout()
plt.show()

In [None]:
# 6. What are the average audio features (danceability, energy, etc.) by genre/artist?
# The averages are computed per artist (the grouping key used below).
per_artist_features = [
    'danceability_%',
    'valence_%',
    'energy_%',
    'acousticness_%',
    'instrumentalness_%',
    'liveness_%',
    'speechiness_%',
]
spotify.groupby('artist_name')[per_artist_features].mean().reset_index()

In [None]:
# 7. How do audio features differ between top 100 and bottom 100 streamed songs?
# Take the 100 rows with the most streams and the 100 rows with the fewest.
top_100 = spotify.sort_values('streams', ascending=False).head(100)
bottom_100 = spotify.sort_values('streams', ascending=True).head(100)

audio_features = [
    'danceability_%',
    'valence_%',
    'energy_%',
    'acousticness_%',
    'instrumentalness_%',
    'liveness_%',
    'speechiness_%',
]

# One column of feature means per group, placed side by side for plotting.
top_avg = top_100[audio_features].mean().to_frame(name='Top 100')
bottom_avg = bottom_100[audio_features].mean().to_frame(name='Bottom 100')
differ = pd.concat([top_avg, bottom_avg], axis=1)

ax = differ.plot.bar(figsize=(12, 8))
ax.set_title('Average Audio Features: Top 100 vs Bottom 100 Streamed Tracks')
ax.set_xlabel('Audio Features')
ax.set_ylabel('Average Percentage')
plt.xticks(rotation=45)
ax.legend()
ax.grid()
plt.show()

In [None]:
# 8. Distribution of BPMs across all songs.
# Histogram of the bpm column in 30 bins with a kernel density curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(x=spotify['bpm'], bins=30, kde=True, color='skyblue', edgecolor='black', ax=ax)
ax.set_title('Distribution of BPM Across All Songs', fontsize=16)
ax.set_xlabel('BPM (Beats Per Minute)', fontsize=12)
ax.set_ylabel('Number of Songs', fontsize=12)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# 9. Which keys (musical) are most common in popular tracks?
# "Popular" here means the 100 rows with the highest stream counts.
popular_tracks = spotify.sort_values('streams', ascending=False).head(100)
key_counts = popular_tracks['key'].value_counts()
most_common_keys = key_counts.sort_values(ascending=False)

# The key names are passed as both x and hue so each bar takes its own
# colour from the palette.
key_names = most_common_keys.index
ax = sns.barplot(x=key_names, y=most_common_keys.values, hue=key_names, palette='viridis')
ax.set_title('Most Common Keys in Top 100 Streamed Tracks', fontsize=16)
ax.set_xlabel('Musical Keys')
ax.set_ylabel('Number of Tracks')
plt.xticks(rotation=45)
ax.grid(axis='y')
plt.show()

In [None]:
# 10. Compare the average streams for songs released in different years.
# Mean stream count per release year, as a two-column frame for plotting.
streams_by_year = spotify.groupby('release_year')['streams']
avg_streams = streams_by_year.mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x='release_year', y='streams', data=avg_streams, marker='o', ax=ax)
ax.set_title('Average Streams by Release Year', fontsize=16)
ax.set_xlabel('Release Year')
ax.set_ylabel('Average Streams')
ax.grid(True)
plt.show()

In [None]:
# 11. Do songs with high instrumentalness get fewer streams?
# Bucket instrumentalness into five 20-point ranges covering 0 to 100;
# include_lowest=True keeps a value of exactly 0 in the first bucket.
bins = list(range(0, 101, 20))
labels = ['0 - 20', '20 - 40', '40 - 60', '60 - 80', '80-100']
spotify['instrumentalness_bin'] = pd.cut(
    spotify['instrumentalness_%'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Mean streams per bucket; observed=True leaves out buckets with no rows.
instrumentalness_groups = spotify.groupby('instrumentalness_bin', observed=True)
avg_streams = instrumentalness_groups['streams'].mean().reset_index()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='instrumentalness_bin', y='streams', hue='instrumentalness_bin',
            data=avg_streams, palette='magma', ax=ax)
ax.set_title('Average Streams vs Instrumentalness Levels')
ax.set_xlabel('Instrumentalness % Range')
ax.set_ylabel('Average Streams')
ax.grid(True, axis='y')
fig.tight_layout()
plt.show()


In [None]:
# 12. Are songs with higher speechiness more likely to appear in charts?
# Bucket speechiness into five 20-point ranges covering 0 to 100.
bins = list(range(0, 101, 20))
labels = ['0 - 20', '20 - 40', '40 - 60', '60 - 80', '80 - 100']
spotify['speech_bins'] = pd.cut(
    spotify['speechiness_%'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Chart column -> legend label, in the order the lines are drawn.
chart_labels = {
    'in_spotify_charts': 'Spotify Chart',
    'in_apple_charts': 'Apple Chart',
    'in_deezer_charts': 'Deezer Chart',
    'in_shazam_charts': 'Shazam Chart',
}

# Mean of each chart column per speechiness bucket.
speechiness_charts = (
    spotify.groupby('speech_bins', observed=True)[list(chart_labels)]
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(12, 6))
for chart_column, chart_label in chart_labels.items():
    sns.lineplot(data=speechiness_charts, x='speech_bins', y=chart_column,
                 marker='o', label=chart_label, ax=ax)
ax.set_title('Speechiness Chart Appearances')
ax.set_xlabel('Speechiness % Range')
ax.set_ylabel('Avg Charts')
ax.grid()
ax.legend()
plt.show()

In [None]:
# 13. What is the relationship between acousticness and valence?
# Correlation matrix of the two columns: -1 is a perfect negative correlation,
# 0 is no correlation, 1 is a perfect positive correlation.
pair_columns = ['acousticness_%', 'valence_%']
correlation = spotify[pair_columns].corr()

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(data=correlation, vmin=-1, vmax=1, cmap='coolwarm', annot=True, fmt='.2f', ax=ax)
ax.set_title('Correlation between Accoustic and Valence Music', fontsize=18)
plt.show()

In [None]:
# 14. Top 10 songs that appear in both Apple and Spotify playlists.
# NOTE(review): the question says "playlists" but the columns used are the
# *_charts ones (and the plot title says "Charts") - confirm which is intended.

# A song is identified by title AND artist: de-duplicating on the title alone
# would discard distinct songs by different artists that share a name.
song_key = ['track_name', 'artist_name']
chart_columns = ['in_spotify_charts', 'in_apple_charts']
top_tracks = spotify[song_key + chart_columns].drop_duplicates(subset=song_key)

# Keep only songs with a positive value in both chart columns.
on_both_charts = (top_tracks['in_spotify_charts'] > 0) & (top_tracks['in_apple_charts'] > 0)

# .assign builds a new frame, so no column is written into a filtered slice
# (which is what triggers SettingWithCopyWarning).
top_tracks = top_tracks[on_both_charts].assign(
    top_playlists=lambda df: df['in_spotify_charts'] + df['in_apple_charts'],
    # Unique bar label, so two songs with the same title are not merged into one bar.
    track_label=lambda df: df['track_name'] + ' (' + df['artist_name'] + ')',
)

top_10_songs = top_tracks.sort_values('top_playlists', ascending=False).head(10)

plt.figure(figsize=(12, 8))
sns.barplot(data=top_10_songs, x='track_label', y='top_playlists')
plt.xlabel('Track (artist)')
plt.xticks(rotation=90)
plt.title('Top 10 Songs in Both Apple and Spotify Charts', fontsize = 16)
plt.grid()
plt.show()

In [None]:
# 15. Identify and plot outliers in terms of streams.
# Outliers are rows whose stream count lies more than 1.5 * IQR below the
# first quartile or above the third quartile.
spotify['streams'] = pd.to_numeric(spotify['streams'], errors='coerce')
stream_counts = spotify['streams']

q1 = stream_counts.quantile(0.25)
q3 = stream_counts.quantile(0.75)
IQR = q3 - q1

whisker = 1.5 * IQR
lower_bound = q1 - whisker
upper_bound = q3 + whisker

is_outlier = stream_counts.lt(lower_bound) | stream_counts.gt(upper_bound)
outliers = spotify[is_outlier]

# All songs in translucent green, with the outliers drawn over them in red.
fig, ax = plt.subplots(figsize=(12, 6))
sns.stripplot(data=spotify, x='streams', color='green', alpha=0.3, label='All Songs', ax=ax)
sns.stripplot(data=outliers, x='streams', color='red', label='Outliers', ax=ax)
ax.set_title('Outliers In Terms of Streams')
ax.set_xlabel('Streams')
ax.set_ylabel('Songs')
ax.grid(axis='y')
ax.legend()
plt.show()