In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
from scripts.data_cleaning import get_data, clean_data, check_missing_values

# Load the data

In [2]:
dataset = get_data("../data/spotify_data.csv")
dataset.replace("None", np.nan, inplace=True)
dataset.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3
1,1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4
2,2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4
3,3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4
4,4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4


# Data cleaning

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159764 entries, 0 to 1159763
Data columns (total 20 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Unnamed: 0        1159764 non-null  int64  
 1   artist_name       1159749 non-null  object 
 2   track_name        1159763 non-null  object 
 3   track_id          1159764 non-null  object 
 4   popularity        1159764 non-null  int64  
 5   year              1159764 non-null  int64  
 6   genre             1159764 non-null  object 
 7   danceability      1159764 non-null  float64
 8   energy            1159764 non-null  float64
 9   key               1159764 non-null  int64  
 10  loudness          1159764 non-null  float64
 11  mode              1159764 non-null  int64  
 12  speechiness       1159764 non-null  float64
 13  acousticness      1159764 non-null  float64
 14  instrumentalness  1159764 non-null  float64
 15  liveness          1159764 non-null  float64
 16  

In [4]:
dataset.shape

(1159764, 20)

In [5]:
check_missing_values(dataset)

There are 16 missing values in the dataset


16

We can get some descriptive statistics of the dataset

In [15]:
pd.set_option("display.float_format", "{:.3f}".format)
dataset.describe()
dataset.head()

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.043,0.694,0.0,0.115,0.139,133.406,240166,3
1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.026,0.477,0.0,0.097,0.515,140.182,216387,4
2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.032,0.338,0.0,0.089,0.145,139.832,158960,4
3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,-9.845,1,0.036,0.807,0.0,0.08,0.508,204.961,304293,4
4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,-5.419,0,0.03,0.073,0.019,0.11,0.217,171.864,244320,4


we can now remove the missing values
we also want to remove the redundant `id` column as it does not provide any useful information

we can see that `loudness` goes up to 6.172, which is not possible according to the Spotify API documentation.

we can also see that `time_signature` has a minimum value of 0 while the minimum value should be 3.

In [7]:
dataset = clean_data(dataset)
assert check_missing_values(dataset) == 0

No missing values in the dataset


In [8]:
dataset.head()
dataset.describe()

Unnamed: 0,popularity,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
count,1144681.0,1144681.0,1144681.0,1144681.0,1144681.0,1144681.0,1144681.0,1144681.0,1144681.0,1144681.0,1144681.0,1144681.0,1144681.0,1144681.0,1144681.0
mean,18.419,2011.953,0.54,0.642,5.289,-8.926,0.634,0.092,0.319,0.251,0.223,0.458,121.619,250022.154,3.922
std,15.896,6.803,0.183,0.269,3.555,5.598,0.482,0.125,0.353,0.364,0.201,0.268,29.509,148231.968,0.335
min,0.0,2000.0,0.022,0.0,0.0,-58.1,0.0,0.022,0.0,0.0,0.006,0.0,30.766,15000.0,3.0
25%,5.0,2006.0,0.416,0.458,2.0,-10.756,0.0,0.037,0.006,0.0,0.098,0.229,99.021,181707.0,4.0
50%,15.0,2012.0,0.552,0.695,5.0,-7.425,1.0,0.051,0.144,0.002,0.134,0.44,121.99,226080.0,4.0
75%,29.0,2018.0,0.678,0.873,8.0,-5.27,1.0,0.089,0.632,0.606,0.291,0.675,139.93,287200.0,4.0
max,100.0,2023.0,0.993,1.0,11.0,0.0,1.0,0.97,0.996,1.0,1.0,1.0,249.993,6000461.0,5.0


We can now get some more insights from the dataset

In [14]:
dataset[["track_id", "artist_name", "genre"]].nunique().rename("unique_values")

track_id       1144681
artist_name      64017
genre               82
Name: unique_values, dtype: int64

We can see that there are over 1 million unique tracks in the dataset, with over 60,000 unique artists and 82 unique genres

# Data visualization

We can now visualize the data to get some more insights

Let's start by analyzing the trend of the number of tracks released over the years