In [639]:
#import pandas library for data wrangling
import pandas as pd

In [641]:
#read the data in the csv file
spotifyclean = pd.read_csv('spotify-2023.csv', encoding = 'latin1')
spotifyclean

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,My Mind & Me,Selena Gomez,1,2022,11,3,953,0,91473363,61,...,144,A,Major,60,24,39,57,0,8,3
949,Bigger Than The Whole Sky,Taylor Swift,1,2022,10,21,1180,0,121871870,4,...,166,F#,Major,42,7,24,83,1,12,6
950,A Veces (feat. Feid),"Feid, Paulo Londra",2,2022,11,3,573,0,73513683,2,...,92,C#,Major,80,81,67,4,0,8,6
951,En La De Ella,"Feid, Sech, Jhayco",3,2022,10,20,1320,0,133895612,29,...,97,C#,Major,82,67,77,8,0,12,5


# CHECK FOR NULL VALUES

In [643]:
#check for any null values 
spotifyclean.isnull().any()

track_name              False
artist(s)_name          False
artist_count            False
released_year           False
released_month          False
released_day            False
in_spotify_playlists    False
in_spotify_charts       False
streams                 False
in_apple_playlists      False
in_apple_charts         False
in_deezer_playlists     False
in_deezer_charts        False
in_shazam_charts         True
bpm                     False
key                      True
mode                    False
danceability_%          False
valence_%               False
energy_%                False
acousticness_%          False
instrumentalness_%      False
liveness_%              False
speechiness_%           False
dtype: bool

In [644]:
#checks for null values in the columns 'in_shazam_charts' and 'key'.
shazamchartsNA = spotifyclean[spotifyclean['in_shazam_charts'].isna()]
keyNA = spotifyclean[spotifyclean['key'].isna()]
#checks for rows that have both the columnns as null values
collectiveNA = (spotifyclean[spotifyclean['key'].isna() & spotifyclean['in_shazam_charts'].isna()]).shape[0]

In [645]:
#print the resulting number of null values in the dataset.
NAvalues = shazamchartsNA.shape[0] + keyNA.shape[0]
print(f"The Spotify dataset has {NAvalues} rows with null values.\n")
print(f"The null values are in the columns 'in_shazam_charts' with {shazamchartsNA.shape[0]} values and 'key' with {keyNA.shape[0]} values.\n")
print(f"The dataset also has {collectiveNA} rows that have both the columns as null values.")

The Spotify dataset has 145 rows with null values.

The null values are in the columns 'in_shazam_charts' with 50 values and 'key' with 95 values.

The dataset also has 9 rows that have both the columns as null values.


In [646]:
#print the remaining rows after dropping the null values
spotifyclean = spotifyclean.dropna()
print(f"After dropping the rows with null values, we are now working with {spotifyclean.shape[0]} songs.")

After dropping the rows with null values, we are now working with 817 songs.


# CHECK FOR DUPLICATES

In [648]:
#checks for duplicate songs (rows that have the same song name and artist)
spotifyclean[spotifyclean.duplicated(subset=['track_name', 'artist(s)_name'], keep = False)].sort_values(by='track_name')

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
372,About Damn Time,Lizzo,1,2022,7,15,2332,2,723894473,0,...,109,A#,Minor,84,72,74,10,0,34,7
764,About Damn Time,Lizzo,1,2022,4,14,9021,0,723894473,242,...,109,A#,Minor,84,72,74,10,0,34,7
345,SPIT IN MY FACE!,ThxSoMch,1,2022,10,31,629,14,303216294,32,...,94,G#,Major,73,65,79,5,2,11,6
482,SPIT IN MY FACE!,ThxSoMch,1,2022,10,31,573,0,301869854,1,...,166,C#,Major,70,57,57,9,20,11,7
512,Take My Breath,The Weeknd,1,2021,8,6,2597,0,130655803,17,...,121,A#,Minor,70,35,77,1,0,26,4
616,Take My Breath,The Weeknd,1,2021,8,6,6392,0,432702334,174,...,121,G#,Major,75,53,74,2,0,11,5


In [649]:
#print the resulting number of duplicated rows
print(f"The Spotify dataset contains 3 duplicates which must be removed.")

The Spotify dataset contains 3 duplicates which must be removed.


In [650]:
spotifyclean = spotifyclean.drop_duplicates(subset=['track_name', 'artist(s)_name'], keep = 'last')
print(f"After dropping the duplicated rows, we are now working with {spotifyclean.shape[0]} songs.")

After dropping the duplicated rows, we are now working with 814 songs.


# CHECK FOR UNWANTED VALUES

In [652]:
#checks the datatypes of each column if each column has the correct datatype
spotifyclean.dtypes

track_name              object
artist(s)_name          object
artist_count             int64
released_year            int64
released_month           int64
released_day             int64
in_spotify_playlists     int64
in_spotify_charts        int64
streams                 object
in_apple_playlists       int64
in_apple_charts          int64
in_deezer_playlists     object
in_deezer_charts         int64
in_shazam_charts        object
bpm                      int64
key                     object
mode                    object
danceability_%           int64
valence_%                int64
energy_%                 int64
acousticness_%           int64
instrumentalness_%       int64
liveness_%               int64
speechiness_%            int64
dtype: object

In [653]:
#since the column 'streams' has mixed datatypes, drop the values that aren't numbers
spotifyclean['streams'] = spotifyclean['streams'].astype(float, errors = 'ignore')
#drop the index of the unwanted value but sort the values first so its easier to find the index
spotifyclean = (spotifyclean.sort_values(by='streams', ascending = False)).drop(index=574)

In [654]:
print(f"After dropping the unwanted values, we are left with {spotifyclean.shape[0]} songs")

After dropping the unwanted values, we are left with 813 songs


# SORTING OF ROWS

In [671]:
#Sort the values by the number of streams in descending order
spotifyclean = spotifyclean.sort_values(by='streams', ascending = False).reset_index()
spotifyclean

Unnamed: 0,level_0,index,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,0,33,Anti-Hero,Taylor Swift,1,2022,10,21,9082,56,...,97,E,Major,64,51,63,12,0,19,5
1,1,253,Glimpse of Us,Joji,1,2022,6,10,6330,6,...,170,G#,Major,44,27,32,89,0,14,5
2,2,455,Seek & Destroy,SZA,1,2022,12,9,1007,0,...,152,C#,Major,65,35,65,44,18,21,7
3,3,98,Summertime Sadness,Lana Del Rey,1,2011,1,1,20333,52,...,112,C#,Minor,56,24,66,7,0,12,3
4,4,891,"Come Back Home - From ""Purple Hearts""",Sofia Carson,1,2022,7,12,367,0,...,145,G,Major,56,43,53,24,0,12,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
808,808,750,Falling,Harry Styles,1,2019,12,13,7556,0,...,110,E,Major,57,6,27,84,0,9,3
809,809,366,Revenge,XXXTENTACION,1,2017,8,25,3600,11,...,140,B,Minor,75,18,25,78,0,11,26
810,810,744,Right On,Lil Baby,1,2022,4,8,1116,0,...,166,D,Major,70,22,61,2,0,10,34
811,811,515,Best Friends,The Weeknd,1,2022,1,7,1292,0,...,87,E,Minor,49,49,59,44,0,35,21


# SAVING THE CLEANED DATASET

In [684]:
#write the dataset in a different csv for later use
spotifyclean.to_csv('Spotify-2023_Cleaned')

In [686]:
#read the saved csv file to check if it actually saved
pd.read_csv('Spotify-2023_Cleaned')

Unnamed: 0.1,Unnamed: 0,level_0,index,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,0,0,33,Anti-Hero,Taylor Swift,1,2022,10,21,9082,...,97,E,Major,64,51,63,12,0,19,5
1,1,1,253,Glimpse of Us,Joji,1,2022,6,10,6330,...,170,G#,Major,44,27,32,89,0,14,5
2,2,2,455,Seek & Destroy,SZA,1,2022,12,9,1007,...,152,C#,Major,65,35,65,44,18,21,7
3,3,3,98,Summertime Sadness,Lana Del Rey,1,2011,1,1,20333,...,112,C#,Minor,56,24,66,7,0,12,3
4,4,4,891,"Come Back Home - From ""Purple Hearts""",Sofia Carson,1,2022,7,12,367,...,145,G,Major,56,43,53,24,0,12,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
808,808,808,750,Falling,Harry Styles,1,2019,12,13,7556,...,110,E,Major,57,6,27,84,0,9,3
809,809,809,366,Revenge,XXXTENTACION,1,2017,8,25,3600,...,140,B,Minor,75,18,25,78,0,11,26
810,810,810,744,Right On,Lil Baby,1,2022,4,8,1116,...,166,D,Major,70,22,61,2,0,10,34
811,811,811,515,Best Friends,The Weeknd,1,2022,1,7,1292,...,87,E,Minor,49,49,59,44,0,35,21
