## This notebook cleans spotify_top_songs_audio_features.csv, sourced from Kaggle. The primary objectives are to handle missing values, correct data types, remove duplicates, and check for out-of-bounds values so the data is ready for detailed analysis.

In [1]:
import pandas as pd

In [2]:
# Disable pandas' display truncation: a limit of None renders every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Read the raw CSV (path is relative to the notebook's working directory) and preview the first rows.
raw_csv_path = 'spotify_top_songs_audio_features.csv'
spotify_df = pd.read_csv(raw_csv_path)
spotify_df.head()

Unnamed: 0,id,artist_names,track_name,source,key,mode,time_signature,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,loudness,tempo,duration_ms,weeks_on_chart,streams
0,000xQL6tZNLJzIrtIgxqSl,"ZAYN, PARTYNEXTDOOR",Still Got Time (feat. PARTYNEXTDOOR),RCA Records Label,G,Major,4 beats,0.748,0.627,0.0639,0.131,0.0,0.0852,0.524,-6.029,120.963,188491,17,107527761
1,003eoIwxETJujVWmNFMoZy,Alessia Cara,Growing Pains,Def Jam Recordings,C#/Db,Minor,4 beats,0.353,0.755,0.733,0.0822,0.0,0.39,0.437,-6.276,191.153,193680,2,9944865
2,003vvx7Niy0yvhvHt4a68B,The Killers,Mr. Brightside,Island Records,C#/Db,Major,4 beats,0.352,0.911,0.0747,0.00121,0.0,0.0995,0.236,-5.23,148.033,222973,125,512388123
3,00B7TZ0Xawar6NZ00JFomN,"Cardi B, Chance the Rapper",Best Life (feat. Chance The Rapper),Atlantic/KSR,A,Major,4 beats,0.62,0.625,0.553,0.287,0.0,0.314,0.665,-7.438,167.911,284856,2,11985346
4,00Blm7zeNqgYLPtW6zg8cj,"Post Malone, The Weeknd",One Right Now (with The Weeknd),Republic Records,C#/Db,Major,4 beats,0.687,0.781,0.053,0.0361,0.0,0.0755,0.688,-4.806,97.014,193507,30,301860377


## Data Cleaning and Preparation

In [4]:
# Print column names, non-null counts and dtypes to spot missing values or wrongly typed columns.
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6513 entries, 0 to 6512
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                6513 non-null   object 
 1   artist_names      6513 non-null   object 
 2   track_name        6513 non-null   object 
 3   source            6513 non-null   object 
 4   key               6513 non-null   object 
 5   mode              6513 non-null   object 
 6   time_signature    6513 non-null   object 
 7   danceability      6513 non-null   float64
 8   energy            6513 non-null   float64
 9   speechiness       6513 non-null   float64
 10  acousticness      6513 non-null   float64
 11  instrumentalness  6513 non-null   float64
 12  liveness          6513 non-null   float64
 13  valence           6513 non-null   float64
 14  loudness          6513 non-null   float64
 15  tempo             6513 non-null   float64
 16  duration_ms       6513 non-null   int64  


In [7]:
# (rows, columns) of the frame before any rows are dropped.
spotify_df.shape

(6513, 19)

In [8]:
# Flag every row whose `id` repeats an earlier row's id, then count the flags
# (0 means every id is unique).
id_is_repeat = spotify_df['id'].duplicated()
id_is_repeat.sum()

0

In [9]:
# Boolean mask of missing cells, summed per column to give the number of nulls in each.
missing_mask = spotify_df.isnull()
missing_mask.sum()

id                  0
artist_names        0
track_name          0
source              0
key                 0
mode                0
time_signature      0
danceability        0
energy              0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
loudness            0
tempo               0
duration_ms         0
weeks_on_chart      0
streams             0
dtype: int64

#### Checking whether any values fall outside the limits defined for these columns (0 to 1 for the audio-feature columns, with separate bounds for loudness and tempo).

In [10]:
# Columns whose values are expected to lie within [0, 1].
columns = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence']

# Build one "(col < 0) | (col > 1)" clause per column and OR the clauses together.
# The separator is a space-delimited `|` so the generated expression uses a single
# operator throughout and does not depend on parentheses to separate the tokens.
query = " | ".join([f"({col} < 0) | ({col} > 1)" for col in columns])

# Rows violating any bound; an empty result means every value is in range.
spotify_df.query(query)

Unnamed: 0,id,artist_names,track_name,source,key,mode,time_signature,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,loudness,tempo,duration_ms,weeks_on_chart,streams


In [11]:
# Bounds used to flag implausible rows: loudness outside [-60, 0] or tempo outside [50, 220].
query_loudness = "(loudness < -60) | (loudness > 0)"
query_tempo = "( tempo < 50) | (tempo > 220)"

# Both conditions must be part of the expression handed to DataFrame.query.
# Writing `query_loudness or query_tempo` would be evaluated by Python first and,
# because a non-empty string is truthy, yield only `query_loudness` - the tempo
# filter would never run. Combine them into a single expression instead.
out_of_range = spotify_df.query(f"({query_loudness}) | ({query_tempo})")

# Remove the flagged rows by index label.
spotify_df = spotify_df.drop(out_of_range.index)

In [13]:
# (rows, columns) after the out-of-range rows were dropped.
spotify_df.shape

(6509, 19)

In [16]:
# Frequency of each label in the `key` column, to check for unexpected categories.
spotify_df['key'].value_counts()

key
C#/Db    942
C        705
G        595
B        567
G#/Ab    553
F        531
A        523
D        523
F#/Gb    501
A#/Bb    465
E        424
D#/Eb    180
Name: count, dtype: int64

In [17]:
# Frequency of each label in the `mode` column, to check for unexpected categories.
spotify_df['mode'].value_counts()

mode
Major    3745
Minor    2764
Name: count, dtype: int64

#### Changing the value '1' in the time_signature column to '1 beats' so that it follows the same format as the other labels

In [18]:
# Frequency of each `time_signature` label, to find labels that do not follow the common format.
spotify_df['time_signature'].value_counts()

time_signature
4 beats    6053
3 beats     324
5 beats     107
1            25
Name: count, dtype: int64

In [22]:
# Select the rows whose label is the bare string '1' and rewrite only those cells.
is_bare_one = spotify_df['time_signature'] == '1'
spotify_df.loc[is_bare_one, 'time_signature'] = '1 beats'

In [23]:
# Re-check the label frequencies to confirm the relabelling took effect.
spotify_df['time_signature'].value_counts()

time_signature
4 beats    6053
3 beats     324
5 beats     107
1 beats      25
Name: count, dtype: int64

#### Inserting a `duration` column (a pandas timedelta, displayed as `0 days hh:mm:ss.ffffff`) and dropping the column with duration in ms

In [42]:
# Interpret the millisecond integers as timedeltas.
duration_td = pd.to_timedelta(spotify_df['duration_ms'], unit='ms')

# Append the new column, then remove the millisecond column it replaces.
spotify_df['duration'] = duration_td
spotify_df = spotify_df.drop(columns='duration_ms')
spotify_df.head()

Unnamed: 0,id,artist_names,track_name,source,key,mode,time_signature,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,loudness,tempo,weeks_on_chart,streams,duration
0,000xQL6tZNLJzIrtIgxqSl,"ZAYN, PARTYNEXTDOOR",Still Got Time (feat. PARTYNEXTDOOR),RCA Records Label,G,Major,4 beats,0.748,0.627,0.0639,0.131,0.0,0.0852,0.524,-6.029,120.963,17,107527761,0 days 00:03:08.491000
1,003eoIwxETJujVWmNFMoZy,Alessia Cara,Growing Pains,Def Jam Recordings,C#/Db,Minor,4 beats,0.353,0.755,0.733,0.0822,0.0,0.39,0.437,-6.276,191.153,2,9944865,0 days 00:03:13.680000
2,003vvx7Niy0yvhvHt4a68B,The Killers,Mr. Brightside,Island Records,C#/Db,Major,4 beats,0.352,0.911,0.0747,0.00121,0.0,0.0995,0.236,-5.23,148.033,125,512388123,0 days 00:03:42.973000
3,00B7TZ0Xawar6NZ00JFomN,"Cardi B, Chance the Rapper",Best Life (feat. Chance The Rapper),Atlantic/KSR,A,Major,4 beats,0.62,0.625,0.553,0.287,0.0,0.314,0.665,-7.438,167.911,2,11985346,0 days 00:04:44.856000
4,00Blm7zeNqgYLPtW6zg8cj,"Post Malone, The Weeknd",One Right Now (with The Weeknd),Republic Records,C#/Db,Major,4 beats,0.687,0.781,0.053,0.0361,0.0,0.0755,0.688,-4.806,97.014,30,301860377,0 days 00:03:13.507000


In [45]:
# Move `duration` from the end of the frame to column position 3.
col = spotify_df['duration']
spotify_df = spotify_df.drop(columns='duration')
spotify_df.insert(loc=3, column='duration', value=col)
spotify_df.head()

Unnamed: 0,id,artist_names,track_name,duration,source,key,mode,time_signature,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,loudness,tempo,weeks_on_chart,streams
0,000xQL6tZNLJzIrtIgxqSl,"ZAYN, PARTYNEXTDOOR",Still Got Time (feat. PARTYNEXTDOOR),0 days 00:03:08.491000,RCA Records Label,G,Major,4 beats,0.748,0.627,0.0639,0.131,0.0,0.0852,0.524,-6.029,120.963,17,107527761
1,003eoIwxETJujVWmNFMoZy,Alessia Cara,Growing Pains,0 days 00:03:13.680000,Def Jam Recordings,C#/Db,Minor,4 beats,0.353,0.755,0.733,0.0822,0.0,0.39,0.437,-6.276,191.153,2,9944865
2,003vvx7Niy0yvhvHt4a68B,The Killers,Mr. Brightside,0 days 00:03:42.973000,Island Records,C#/Db,Major,4 beats,0.352,0.911,0.0747,0.00121,0.0,0.0995,0.236,-5.23,148.033,125,512388123
3,00B7TZ0Xawar6NZ00JFomN,"Cardi B, Chance the Rapper",Best Life (feat. Chance The Rapper),0 days 00:04:44.856000,Atlantic/KSR,A,Major,4 beats,0.62,0.625,0.553,0.287,0.0,0.314,0.665,-7.438,167.911,2,11985346
4,00Blm7zeNqgYLPtW6zg8cj,"Post Malone, The Weeknd",One Right Now (with The Weeknd),0 days 00:03:13.507000,Republic Records,C#/Db,Major,4 beats,0.687,0.781,0.053,0.0361,0.0,0.0755,0.688,-4.806,97.014,30,301860377


In [46]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
spotify_df.to_csv(path_or_buf='cleaned_spotify_top_songs_audio_features.csv', index=False)