In [17]:
import pandas as pd 

# Read the raw CSV, using the file's first column as the DataFrame index
df_spotify = pd.read_csv(
    '../data/raw/dataset.csv',
    index_col=0,
)

# Preview the first rows as the cell's rich output
df_spotify.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [18]:

# Dataset dimensions: row count from the index, column count from the columns
nrows = len(df_spotify.index)
ncols = len(df_spotify.columns)
print(f'The dataset has {nrows} rows and {ncols} columns')

The dataset has 114000 rows and 20 columns


In [19]:

# Count rows that are exact repeats of an earlier row (all columns equal;
# the first occurrence of each group is not counted as a duplicate)
rows_before = len(df_spotify)
duplicated_rows = df_spotify.duplicated().sum()

print(f'The dataset has {duplicated_rows} duplicated rows')

# Drop duplicated rows, keeping the first occurrence of each.
# The name is rebound to the returned frame instead of using `inplace=True`,
# so the object loaded and displayed by the earlier cell is not mutated
# behind the reader's back.
df_spotify = df_spotify.drop_duplicates()
print('Duplicated rows were dropped')

# Sanity check: exactly the rows flagged by `duplicated()` were removed
assert len(df_spotify) == rows_before - duplicated_rows, \
    'row count after de-duplication does not match the number of duplicates found'

# Number of rows and columns
nrows, ncols = df_spotify.shape
print(f'The dataset has {nrows} rows and {ncols} columns after dropping duplicated rows')

The dataset has 450 duplicated rows
Duplicated rows were dropped
The dataset has 113550 rows and 20 columns after dropping duplicated rows


## Column Descriptions

| **Column**         | **Description**                                                                                        |
|--------------------|--------------------------------------------------------------------------------------------------------|
| `track_id`         | The unique Spotify ID for each track.                                                                  |
| `artists`          | Names of the artists who performed the track, separated by `;`.                                        |
| `album_name`       | The name of the album in which the track appears.                                                      |
| `track_name`       | The title of the track.                                                                                |
| `popularity`       | A value between 0 and 100, indicating the track's popularity based on recent plays.                    |
| `duration_ms`      | The length of the track in milliseconds.                                                               |
| `explicit`         | Boolean indicating whether the track contains explicit content.                                        |
| `danceability`     | Describes how suitable a track is for dancing (0.0 = least danceable, 1.0 = most danceable).           |
| `energy`           | Represents the intensity and activity of a track (0.0 = low energy, 1.0 = high energy).                |
| `key`              | The musical key of the track mapped using standard Pitch Class notation.                               |
| `loudness`         | Overall loudness of the track in decibels (dB).                                                        |
| `mode`             | Indicates the modality of the track (1 = major, 0 = minor).                                            |
| `speechiness`      | Detects the presence of spoken words in the track.                                                     |
| `acousticness`     | Confidence measure of whether the track is acoustic (0.0 = not acoustic, 1.0 = highly acoustic).       |
| `instrumentalness` | Predicts whether a track contains no vocals (0.0 = contains vocals, 1.0 = instrumental).               |
| `liveness`         | Detects the presence of an audience in the recording (0.0 = studio recording, 1.0 = live performance). |
| `valence`          | Measures the musical positiveness conveyed by a track (0.0 = negative, 1.0 = positive).                |
| `tempo`            | Estimated tempo of the track in beats per minute (BPM).                                                |
| `time_signature`   | Estimated time signature of the track (3 to 7).                                                        |
| `track_genre`      | The specific genre associated with the track.                                                          |


In [20]:

# Build a one-column table listing the pandas dtype of every column
column_dtypes = df_spotify.dtypes
df_datatypes = column_dtypes.to_frame(name='Data Types')
df_datatypes

Unnamed: 0,Data Types
track_id,object
artists,object
album_name,object
track_name,object
popularity,int64
duration_ms,int64
explicit,bool
danceability,float64
energy,float64
key,int64


In [21]:

# Names of the columns stored with the generic `object` dtype
obj_cols = df_spotify.select_dtypes(include='object').columns

# For each object column, collect the set of Python types found among its
# non-null values (a single-element set means the column is homogeneous)
unique_dtypes = {}
for col in obj_cols:
    non_null_values = df_spotify[col].dropna()
    unique_dtypes[col] = {type(value) for value in non_null_values}

# Attach the type sets to the dtype table; the Series aligns on column name,
# so rows for non-object columns are left empty (NaN)
df_datatypes['Unique Object Data Types'] = pd.Series(unique_dtypes)
df_datatypes


Unnamed: 0,Data Types,Unique Object Data Types
track_id,object,{<class 'str'>}
artists,object,{<class 'str'>}
album_name,object,{<class 'str'>}
track_name,object,{<class 'str'>}
popularity,int64,
duration_ms,int64,
explicit,bool,
danceability,float64,
energy,float64,
key,int64,


In [22]:

# Boolean mask of missing cells, computed once and reused below
null_mask = df_spotify.isnull()

# Number of missing values in each row, then how many rows share each count
missing_values_per_row = null_mask.sum(axis=1)
count_per_missing_value = missing_values_per_row.value_counts().sort_index()

# Report the distribution, ordered by number of missing values
for missing, rows in count_per_missing_value.items():
    print(f'{rows} row(s) have {missing} missing values')

# Rows with at least one missing value
total_rows_with_missing_values = null_mask.any(axis=1).sum()
print(f'Total number of rows with missing values: {total_rows_with_missing_values}')

113549 row(s) have 0 missing values
1 row(s) have 3 missing values
Total number of rows with missing values: 1
