In [1]:
#importing necessary libraries
!pip install pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)



In [2]:
# Detect the text encoding of the dataset file with the chardet library
import chardet

with open('Most Streamed Spotify Songs 2024.csv', mode='rb') as file:
    # Only the first 10,000 bytes of the file are sampled for the detection
    result = chardet.detect(file.read(10000))

# Report the encoding chardet guessed for the sample
print(result['encoding'])


FileNotFoundError: [Errno 2] No such file or directory: 'Most Streamed Spotify Songs 2024.csv'

In [None]:
# Show every column whenever a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Load the dataset, decoding the file as latin-1
spotify_songs = pd.read_csv('Most Streamed Spotify Songs 2024.csv', encoding='latin-1')

# Preview the first rows using the notebook's rich table rendering
# (a bare last expression) instead of a plain-text print of the frame
spotify_songs.head()


In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
spotify_songs.shape

In [None]:
# General information about the dataset, as reported by DataFrame.info()
spotify_songs.info()

In [None]:
# List the column labels of the dataset (this shows the names, not a count)
spotify_songs.columns

## Data Cleaning

In [None]:
# Missing-data check: count the null entries in each column and
# list the columns with the most gaps first
(
    spotify_songs
    .isna()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# missingno is used here to visualise the missing data in the dataset
import missingno as msno

# Presumably draws one bar per column showing how complete it is — confirm against the missingno docs
msno.bar(spotify_songs)
plt.show()

In [None]:
# Drop the columns that are not needed for this analysis.
# One non-mutating drop with errors='ignore' keeps the cell idempotent:
# re-running it does not raise KeyError once the columns are already gone.
spotify_songs = spotify_songs.drop(
    columns=['ISRC', 'TIDAL Popularity', 'Soundcloud Streams'],
    errors='ignore',
)

# Show the remaining column labels
spotify_songs.columns

In [None]:
# Recount the missing values per column, largest count first
spotify_songs.isna().sum().sort_values(ascending=False)

In [None]:
# Count the fully duplicated rows (this cell only counts them; nothing is removed here)

spotify_songs.duplicated().sum()

In [None]:
# Remove repeated rows, retaining the first occurrence of each (the pandas default),
# then confirm that no duplicated rows remain
spotify_songs = spotify_songs.drop_duplicates()
spotify_songs.duplicated().sum()

In [None]:
# Strip thousands separators (e.g. "1,234,567" -> "1234567") so the count columns
# can later be converted to numbers. The descriptive text columns are left alone:
# a comma inside a track, album or artist name is part of the name and must stay.
text_columns = ['Track', 'Album Name', 'Artist']
spotify_songs = spotify_songs.apply(
    lambda column: column
    if column.name in text_columns
    else column.map(lambda x: x.replace(',', '') if isinstance(x, str) else x)
)
spotify_songs.head(20)

In [None]:
# Parse 'Release Date' (month/day/year strings) into datetimes, then derive the
# release year and month from the parsed values as two additional columns
spotify_songs = spotify_songs.assign(**{
    'Release Date': lambda df: pd.to_datetime(df['Release Date'], format='%m/%d/%Y'),
    'Year': lambda df: df['Release Date'].dt.year,
    'Month': lambda df: df['Release Date'].dt.month,
})

In [None]:
# Mean imputation to handle missing numerical values.
# NOTE(review): the columns are selected by position (everything from the 7th
# column onwards), so the selection depends on the column order left by the
# earlier cells — confirm that only numeric columns fall inside this slice.
columns_to_impute = spotify_songs.columns[6:]

# Convert the selected columns to numeric, coercing unparseable entries to NaN
spotify_songs[columns_to_impute] = spotify_songs[columns_to_impute].apply(pd.to_numeric, errors='coerce')

# Turn infinities into NaN *before* the means are computed: a single inf would make
# its column mean infinite and every imputed value with it. np.nan is used rather
# than pd.NA because pd.NA can turn float columns into object dtype.
spotify_songs = spotify_songs.replace([np.inf, -np.inf], np.nan)

# Impute the missing values with the mean of each selected column
spotify_songs[columns_to_impute] = spotify_songs[columns_to_impute].fillna(spotify_songs[columns_to_impute].mean())

# Print the count of missing values in each column
print(spotify_songs.isnull().sum())


In [None]:
# Preview the first 10 rows of the dataset
spotify_songs.head(10)

In [None]:
# Display the 'Release Date' column
spotify_songs['Release Date']

In [None]:
# Statistical summary of the dataset, as computed by DataFrame.describe()
spotify_songs.describe()

## Feature Engineering
### From the dataset, we can extract columns related to track, artist, release date, Spotify streams, and popularity metrics

In [None]:
# Data transformation: select the columns used for feature engineering
features = spotify_songs[[
    'Track',
    'Album Name',
    'Artist',
    'Release Date',
    'Spotify Streams',
    'Spotify Popularity',
    'YouTube Views',
    'TikTok Views',
    'Shazam Counts',
    'Explicit Track',
    'Year',
]]

# Inspect the data types and null counts of the selection
features.info()

### Create New Features
#### Streaming Velocity
To calculate the streaming velocity (Spotify streams per year), divide each track's Spotify streams by the number of years since its release, i.e. the difference between the current year and the release year.

In [None]:
from datetime import datetime

# Work on an explicit copy of the selected columns so that the new feature
# columns are added to `features` only
features = spotify_songs[['Track', 'Album Name', 'Artist', 'Release Date', 'Spotify Streams',
               'Spotify Popularity', 'YouTube Views', 'TikTok Views', 'Shazam Counts','Explicit Track', 'Year']].copy()

# Calendar year at execution time (the velocities therefore depend on when the notebook is run)
current_year = datetime.now().year

# Number of years since the song was released; 0 (released this year) becomes 1
# to avoid division by zero. The result of replace() is assigned back instead of
# calling replace(..., inplace=True) on the selected column: that form is chained
# assignment and does not update `features` when pandas Copy-on-Write is active.
features['Years Since Release'] = (current_year - features['Year']).replace(0, 1)

# Streaming velocity: Spotify streams per year since release
features['Spotify Streaming Velocity'] = features['Spotify Streams'] / features['Years Since Release']

# Safety net: turn any remaining infinities into NaN. np.nan is used rather than
# pd.NA because pd.NA can turn float columns into object dtype.
features = features.replace([np.inf, -np.inf], np.nan)

# View the updated features
features[['Track', 'Artist', 'Spotify Streams', 'Years Since Release', 'Spotify Streaming Velocity']].head(10)


### Cross-Platform Presence
Create a new cross-platform presence feature by summing YouTube views, TikTok views, and Shazam counts.

In [None]:
# Cross-platform presence: YouTube views plus TikTok views plus Shazam counts
features['Cross-Platform Presence'] = (
    features['YouTube Views']
    .add(features['TikTok Views'])
    .add(features['Shazam Counts'])
)

# Preview the inputs next to the new combined column
features[['Track', 'Artist', 'YouTube Views', 'TikTok Views', 'Shazam Counts', 'Cross-Platform Presence']].head(10)


### Data Aggregation
a. Group by Artist
Calculating the total streams, average popularity, and cross-platform reach for each artist:

In [None]:
# Per-artist summary: total Spotify streams, mean Spotify popularity and
# total cross-platform presence, with the output columns named directly
# through named aggregation
artist_agg = (
    features
    .groupby('Artist')
    .agg(**{
        'Total Spotify Streams': ('Spotify Streams', 'sum'),
        'Average Spotify Popularity': ('Spotify Popularity', 'mean'),
        'Total Cross-Platform Presence': ('Cross-Platform Presence', 'sum'),
    })
    .reset_index()
)

# View the aggregated data
artist_agg.head(10)

In [None]:
# Per-album summary: total Spotify streams, mean Spotify popularity and
# total cross-platform presence, with the output columns named directly
# through named aggregation.
# NOTE(review): the result is stored in `artist_agg`, which replaces any value
# previously held under that name — consider a dedicated name such as `album_agg`.
artist_agg = (
    features
    .groupby('Album Name')
    .agg(**{
        'Total Spotify Streams': ('Spotify Streams', 'sum'),
        'Average Spotify Popularity': ('Spotify Popularity', 'mean'),
        'Total Cross-Platform Presence': ('Cross-Platform Presence', 'sum'),
    })
    .reset_index()
)

# View the aggregated data
artist_agg.head(10)

In [None]:
# Summary per 'Explicit Track' value: mean Spotify streams and mean Spotify
# popularity, with the output columns named directly through named aggregation
explicit_agg = (
    features
    .groupby('Explicit Track')
    .agg(**{
        'Average Spotify Streams': ('Spotify Streams', 'mean'),
        'Average Spotify Popularity': ('Spotify Popularity', 'mean'),
    })
    .reset_index()
)

# View the aggregated data
explicit_agg.head()



In [None]:

# Mean Spotify streams for each value of the 'Explicit Track' flag
Explicit_content = features.groupby('Explicit Track')['Spotify Streams'].mean()

# Pull out only the first wedge. The list is built from the number of groups
# because plt.pie requires one explode entry per wedge; a fixed two-item list
# fails whenever the flag does not have exactly two distinct values.
explode = [0.1 if position == 0 else 0 for position in range(len(Explicit_content))]

# Set the seaborn style before drawing so it is in effect when the figure is created
sns.set_style('darkgrid')

plt.pie(Explicit_content, labels=Explicit_content.index, autopct='%1.1f%%', explode=explode, shadow=True)
plt.title('Share of Average Spotify Streams by Explicit Track Flag')
plt.axis('equal')
plt.show()

## Top 10 Most Streamed Spotify Songs

In [None]:


# Number of songs to chart; used for both the selection and the title so they cannot drift apart
top_n = 10

# Rank rows by Spotify streams, keep only the highest-streamed row for each track
# name, then take the first `top_n`. The bar plot draws one bar per distinct
# 'Track' value, so de-duplicating by name first guarantees exactly `top_n` bars
# (rows sharing a track name would otherwise collapse into a single bar).
top_songs = (
    features
    .sort_values(by='Spotify Streams', ascending=False)
    .drop_duplicates(subset='Track', keep='first')
    .head(top_n)
)

plt.figure(figsize=(10, 6))
sns.barplot(x='Spotify Streams', y='Track', data=top_songs, palette="icefire")

plt.title(f'Top {top_n} Most Streamed Songs on Spotify')
plt.xlabel('Spotify Streams')
plt.ylabel('Track')
plt.show()


## Top Artists by Total Streams

In [None]:

# Total Spotify streams and YouTube views per artist, keeping the five
# artists with the highest Spotify totals
top_artists = (
    features
    .groupby('Artist')[['Spotify Streams', 'YouTube Views']]
    .sum()
    .sort_values(by='Spotify Streams', ascending=False)
    .reset_index()
    .head(5)
)

plt.figure(figsize=(12, 6))

# Reshape to long form (one row per artist/platform pair) so the platform can be used as the bar hue
top_artists_melted = top_artists.melt(
    id_vars=['Artist'],
    var_name='Platform',
    value_name='Streams',
)
sns.barplot(
    x='Streams',
    y='Artist',
    hue='Platform',
    data=top_artists_melted,
    palette='Set2',
)
plt.title('Top Artists by Total Streams (Spotify & YouTube)')
plt.xlabel('Total Streams')
plt.ylabel('Artist')
plt.show()



In [None]:
# Names of the five artists with the largest summed Spotify streams
top_5_artists = (
    features
    .groupby('Artist')['Spotify Streams']
    .sum()
    .reset_index()
    .sort_values(by='Spotify Streams', ascending=False)
    .head(5)
    ['Artist']
)

# Keep only the rows that belong to those five artists
filtered_features = features[features['Artist'].isin(top_5_artists)]

# Spotify streams summed per (release year, artist) pair
artist_popularity = (
    filtered_features
    .groupby(['Year', 'Artist'])['Spotify Streams']
    .sum()
    .reset_index()
)

# Line chart with one line per artist across release years
plt.figure(figsize=(12, 6))
sns.lineplot(
    x='Year',
    y='Spotify Streams',
    hue='Artist',
    data=artist_popularity,
    marker='o',
)
plt.title('Top 5 Artist Popularity Over Time on Spotify')
plt.xlabel('Year')
plt.ylabel('Spotify Streams')
plt.xticks(rotation=45)
plt.legend(loc='upper left')
plt.show()
