In [2]:
!pip install librosa



In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import librosa
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# 2.1 Getting your data 

We want to read the datasets to try to understand which key we can use to merge them together.

First of all we want to fill empty values in the datasets.
The function below takes a dataframe as input, checks each column for missing values, and fills them in place: with an empty string for string columns, or with the mean of the column for numeric columns.

In [4]:
def filler(dataframe):
    """Fill the missing values of ``dataframe`` in place, column by column.

    Text (string/object) columns get an empty string and numeric columns get
    the column mean. Columns of any other dtype are left unchanged.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    for col in dataframe.columns:
        column = dataframe[col]
        if not column.isnull().any():
            continue
        # Recent pandas releases inspect the values of an object column, so a
        # text column that contains NaN is no longer reported as a string
        # dtype. Object columns are therefore matched explicitly as well.
        if is_string_dtype(column) or column.dtype == object:
            dataframe[col] = column.fillna("")
        elif is_numeric_dtype(column):
            dataframe[col] = column.fillna(column.mean())

We take the datasets needed:

In [6]:
# Read the three source tables, one CSV file per dataset.
tracks, echonest, features = map(
    pd.read_csv, ("tracks.csv", "echonest.csv", "features.csv")
)

In [7]:
tracks.head()

Unnamed: 0,track_id,album_comments,album_date_created,album_date_released,album_engineer,album_favorites,album_id,album_information,album_listens,album_producer,...,track_information,track_interest,track_language_code,track_license,track_listens,track_lyricist,track_number,track_publisher,track_tags,track_title
0,2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
1,3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
2,5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
3,10,0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4,6,,47632,,...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
4,20,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [8]:
tracks.shape

(106574, 53)

In [9]:
echonest.head()

Unnamed: 0,track_id,audio_features_acousticness,audio_features_danceability,audio_features_energy,audio_features_instrumentalness,audio_features_liveness,audio_features_speechiness,audio_features_tempo,audio_features_valence,metadata_album_date,...,temporal_features_214,temporal_features_215,temporal_features_216,temporal_features_217,temporal_features_218,temporal_features_219,temporal_features_220,temporal_features_221,temporal_features_222,temporal_features_223
0,2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,,...,-1.992303,6.805694,0.23307,0.19288,0.027455,0.06408,3.67696,3.61288,13.31669,262.929749
1,3,0.374408,0.528643,0.817461,0.001851,0.10588,0.461818,126.957,0.26924,,...,-1.582331,8.889308,0.258464,0.220905,0.081368,0.06413,6.08277,6.01864,16.673548,325.581085
2,5,0.043567,0.745566,0.70147,0.000697,0.373143,0.124595,100.26,0.621661,,...,-2.288358,11.527109,0.256821,0.23782,0.060122,0.06014,5.92649,5.86635,16.013849,356.755737
3,10,0.95167,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.96359,2008-03-11,...,-3.662988,21.508228,0.283352,0.26707,0.125704,0.08082,8.41401,8.33319,21.317064,483.403809
4,134,0.452217,0.513238,0.56041,0.019443,0.096567,0.525519,114.29,0.894072,,...,-1.452696,2.356398,0.234686,0.19955,0.149332,0.0644,11.26707,11.20267,26.45418,751.147705


In [10]:
echonest.shape

(13129, 250)

In [11]:
features.head()

Unnamed: 0,track_id,chroma_cens_kurtosis_01,chroma_cens_kurtosis_02,chroma_cens_kurtosis_03,chroma_cens_kurtosis_04,chroma_cens_kurtosis_05,chroma_cens_kurtosis_06,chroma_cens_kurtosis_07,chroma_cens_kurtosis_08,chroma_cens_kurtosis_09,...,tonnetz_std_04,tonnetz_std_05,tonnetz_std_06,zcr_kurtosis_01,zcr_max_01,zcr_mean_01,zcr_median_01,zcr_min_01,zcr_skew_01,zcr_std_01
0,2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448
1,3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,...,0.063831,0.014212,0.01774,2.824694,0.466309,0.084578,0.063965,0.0,1.716724,0.06933
2,5,0.527563,-0.077654,-0.27961,0.685883,1.93757,0.880839,-0.923192,-0.927232,0.666617,...,0.04073,0.012691,0.014759,6.808415,0.375,0.053114,0.041504,0.0,2.193303,0.044861
3,10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.0,3.542325,0.0408
4,20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.81641,0.043851,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993


In [12]:
features.shape

(106574, 519)

We can notice that a common column is **track_id**, so let's check if there are any missing values:

In [13]:
tracks["track_id"].isnull().any()

False

In [14]:
echonest["track_id"].isnull().any()

False

In [15]:
features["track_id"].isnull().any()

False

Given that there are no missing values in "**track_id**" columns in any dataset, we choose to use it as the key for the merge of the datasets.

We make a copy of each dataset and work on the copies so that the original dataframes are left untouched.

In [16]:
# Work on independent copies so the loaded frames stay unmodified.
tracks_c, echonest_c, features_c = (
    frame.copy() for frame in (tracks, echonest, features)
)

In [17]:
# Fill the missing values of every working copy with filler()
for working_copy in (tracks_c, echonest_c, features_c):
    filler(working_copy)

In [18]:
tracks_c.head()

Unnamed: 0,track_id,album_comments,album_date_created,album_date_released,album_engineer,album_favorites,album_id,album_information,album_listens,album_producer,...,track_information,track_interest,track_language_code,track_license,track_listens,track_lyricist,track_number,track_publisher,track_tags,track_title
0,2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
1,3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
2,5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
3,10,0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4,6,,47632,,...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
4,20,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [19]:
echonest_c.head()

Unnamed: 0,track_id,audio_features_acousticness,audio_features_danceability,audio_features_energy,audio_features_instrumentalness,audio_features_liveness,audio_features_speechiness,audio_features_tempo,audio_features_valence,metadata_album_date,...,temporal_features_214,temporal_features_215,temporal_features_216,temporal_features_217,temporal_features_218,temporal_features_219,temporal_features_220,temporal_features_221,temporal_features_222,temporal_features_223
0,2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,,...,-1.992303,6.805694,0.23307,0.19288,0.027455,0.06408,3.67696,3.61288,13.31669,262.929749
1,3,0.374408,0.528643,0.817461,0.001851,0.10588,0.461818,126.957,0.26924,,...,-1.582331,8.889308,0.258464,0.220905,0.081368,0.06413,6.08277,6.01864,16.673548,325.581085
2,5,0.043567,0.745566,0.70147,0.000697,0.373143,0.124595,100.26,0.621661,,...,-2.288358,11.527109,0.256821,0.23782,0.060122,0.06014,5.92649,5.86635,16.013849,356.755737
3,10,0.95167,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.96359,2008-03-11,...,-3.662988,21.508228,0.283352,0.26707,0.125704,0.08082,8.41401,8.33319,21.317064,483.403809
4,134,0.452217,0.513238,0.56041,0.019443,0.096567,0.525519,114.29,0.894072,,...,-1.452696,2.356398,0.234686,0.19955,0.149332,0.0644,11.26707,11.20267,26.45418,751.147705


In [20]:
features_c.head()

Unnamed: 0,track_id,chroma_cens_kurtosis_01,chroma_cens_kurtosis_02,chroma_cens_kurtosis_03,chroma_cens_kurtosis_04,chroma_cens_kurtosis_05,chroma_cens_kurtosis_06,chroma_cens_kurtosis_07,chroma_cens_kurtosis_08,chroma_cens_kurtosis_09,...,tonnetz_std_04,tonnetz_std_05,tonnetz_std_06,zcr_kurtosis_01,zcr_max_01,zcr_mean_01,zcr_median_01,zcr_min_01,zcr_skew_01,zcr_std_01
0,2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448
1,3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,...,0.063831,0.014212,0.01774,2.824694,0.466309,0.084578,0.063965,0.0,1.716724,0.06933
2,5,0.527563,-0.077654,-0.27961,0.685883,1.93757,0.880839,-0.923192,-0.927232,0.666617,...,0.04073,0.012691,0.014759,6.808415,0.375,0.053114,0.041504,0.0,2.193303,0.044861
3,10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.0,3.542325,0.0408
4,20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.81641,0.043851,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993


Once all the datasets have had their missing values filled, we can merge them using the **track_id** column as the key.

In [21]:
# Inner-join the three tables on track_id. The features/echonest join runs
# first so the intermediate frame only holds tracks present in both tables.
# validate="one_to_one" makes pandas raise a MergeError if track_id is
# duplicated on either side, instead of silently multiplying rows.
audio_features = features_c.merge(
    echonest_c, on="track_id", how="inner", validate="one_to_one"
)
final = tracks_c.merge(
    audio_features, on="track_id", how="inner", validate="one_to_one"
)
print(final.shape)
final.head()

(13129, 820)


Unnamed: 0,track_id,album_comments,album_date_created,album_date_released,album_engineer,album_favorites,album_id,album_information,album_listens,album_producer,...,temporal_features_214,temporal_features_215,temporal_features_216,temporal_features_217,temporal_features_218,temporal_features_219,temporal_features_220,temporal_features_221,temporal_features_222,temporal_features_223
0,2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,-1.992303,6.805694,0.23307,0.19288,0.027455,0.06408,3.67696,3.61288,13.31669,262.929749
1,3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,-1.582331,8.889308,0.258464,0.220905,0.081368,0.06413,6.08277,6.01864,16.673548,325.581085
2,5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,-2.288358,11.527109,0.256821,0.23782,0.060122,0.06014,5.92649,5.86635,16.013849,356.755737
3,10,0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4,6,,47632,,...,-3.662988,21.508228,0.283352,0.26707,0.125704,0.08082,8.41401,8.33319,21.317064,483.403809
4,134,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,-1.452696,2.356398,0.234686,0.19955,0.149332,0.0644,11.26707,11.20267,26.45418,751.147705


As we can see, the merged dataset has about 13k rows (13,129), as required.

## 2.2 Choose your features (variables)!

We notice that the number of features is very big (820 columns!) and we want to reduce the number of variables we have to work with.

So let's apply dimensionality reduction!

### Principal Component Analysis

Principal component analysis (PCA) is a statistical technique that converts high-dimensional data into low-dimensional data by projecting it onto a small number of new variables, the principal components, which capture as much of the information (variance) in the dataset as possible. Each principal component is a linear combination of the original features rather than a single selected feature. The first principal component is the direction along which the data varies the most. The second principal component is the direction of highest remaining variance that is orthogonal to the first, and so on. It is important to mention that the principal components are uncorrelated with each other.

#### Standardizing the Numeric Features

Since PCA is affected by scale, we need to scale the features in our data before applying PCA. We use StandardScaler to standardize the dataset's features onto unit scale (mean = 0 and variance = 1), so that features measured on large scales do not dominate the principal components.

In [24]:
# First we need to choose the numeric columns. The check is on the dtype being
# numeric: testing for "not object" would also pick up datetime, categorical
# or dedicated string dtypes, which the float conversion and scaling steps
# cannot handle.
# NOTE(review): identifier columns (track_id, album_id, artist_id) are numeric
# too and therefore end up in the PCA input; confirm that this is intended.
num_cols = [col for col in final.columns if is_numeric_dtype(final[col])]
num_cols

['track_id',
 'album_comments',
 'album_favorites',
 'album_id',
 'album_listens',
 'album_tracks',
 'artist_comments',
 'artist_favorites',
 'artist_id',
 'artist_latitude',
 'artist_longitude',
 'track_bit_rate',
 'track_comments',
 'track_duration',
 'track_favorites',
 'track_interest',
 'track_listens',
 'track_number',
 'chroma_cens_kurtosis_01',
 'chroma_cens_kurtosis_02',
 'chroma_cens_kurtosis_03',
 'chroma_cens_kurtosis_04',
 'chroma_cens_kurtosis_05',
 'chroma_cens_kurtosis_06',
 'chroma_cens_kurtosis_07',
 'chroma_cens_kurtosis_08',
 'chroma_cens_kurtosis_09',
 'chroma_cens_kurtosis_10',
 'chroma_cens_kurtosis_11',
 'chroma_cens_kurtosis_12',
 'chroma_cens_max_01',
 'chroma_cens_max_02',
 'chroma_cens_max_03',
 'chroma_cens_max_04',
 'chroma_cens_max_05',
 'chroma_cens_max_06',
 'chroma_cens_max_07',
 'chroma_cens_max_08',
 'chroma_cens_max_09',
 'chroma_cens_max_10',
 'chroma_cens_max_11',
 'chroma_cens_max_12',
 'chroma_cens_mean_01',
 'chroma_cens_mean_02',
 'chroma_cens

In [29]:
# Cast every numeric column to float ahead of the StandardScaler step
final = final.astype({col: float for col in num_cols})
final.dtypes

track_id                 float64
album_comments           float64
album_date_created        object
album_date_released       object
album_engineer            object
                          ...   
temporal_features_219    float64
temporal_features_220    float64
temporal_features_221    float64
temporal_features_222    float64
temporal_features_223    float64
Length: 820, dtype: object

In [40]:
# Keep only the columns that were not selected as numeric
is_numeric = final.columns.isin(num_cols)
df_1 = final.loc[:, ~is_numeric]
df_1.dtypes

album_date_created          object
album_date_released         object
album_engineer              object
album_information           object
album_producer              object
album_tags                  object
album_title                 object
album_type                  object
artist_active_year_begin    object
artist_active_year_end      object
artist_associated_labels    object
artist_bio                  object
artist_date_created         object
artist_location             object
artist_members              object
artist_name                 object
artist_related_projects     object
artist_tags                 object
artist_website              object
artist_wikipedia_page       object
set_split                   object
set_subset                  object
track_composer              object
track_date_created          object
track_date_recorded         object
track_genre_top             object
track_genres                object
track_genres_all            object
track_information   

In [31]:
# Pull the numeric columns out as a plain NumPy array
x = final[num_cols].to_numpy()

In [32]:
# Standardize the feature matrix with a StandardScaler fitted on it
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [48]:
# Number of principal components to keep; the column labels below are derived from it.
N_COMPONENTS = 5
# Initialize the PCA class by passing the number of components to the constructor.
# random_state pins the result for the cases in which scikit-learn picks its
# randomized SVD solver, so re-running the notebook gives the same components.
pca = PCA(n_components=N_COMPONENTS, random_state=0)
# fit_transform fits the model on the feature set and returns the requested
# number of principal components.
principalComponents = pca.fit_transform(x)
# Creating Principal data frame. It shares the row index of `final` (the frame
# x was extracted from) so that a column-wise concat lines the rows up.
principalDf = pd.DataFrame(
    data=principalComponents,
    index=final.index,
    columns=["principal component {}".format(i) for i in range(1, N_COMPONENTS + 1)],
)

In [49]:
# Report the share of the total variance captured by each principal component
message = 'Explained variation per principal component: {}'
print(message.format(pca.explained_variance_ratio_))

Explained variation per principal component: [0.10472581 0.07410088 0.0353129  0.02836919 0.02501076]


In [50]:
# Put the principal components side by side with the non-numeric columns
finalDf = pd.concat((principalDf, df_1), axis="columns")

In [52]:
finalDf.head(10)

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,album_date_created,album_date_released,album_engineer,album_information,album_producer,...,track_license,track_lyricist,track_publisher,track_tags,track_title,metadata_album_date,metadata_album_name,metadata_artist_location,metadata_artist_name,metadata_release
0,3.074543,-2.769912,-5.858218,4.197439,3.259071,2008-11-26 01:44:45,2009-01-05 00:00:00,,<p></p>,,...,Attribution-NonCommercial-ShareAlike 3.0 Inter...,,,[],Food,,,"Georgia, US",AWOL,AWOL - A Way Of Life
1,5.77479,-5.364765,-5.397493,-1.474541,-3.646719,2008-11-26 01:44:45,2009-01-05 00:00:00,,<p></p>,,...,Attribution-NonCommercial-ShareAlike 3.0 Inter...,,,[],Electric Ave,,,"Georgia, US",AWOL,AWOL - A Way Of Life
2,2.652435,-4.121103,-4.686471,0.05951,-2.877743,2008-11-26 01:44:45,2009-01-05 00:00:00,,<p></p>,,...,Attribution-NonCommercial-ShareAlike 3.0 Inter...,,,[],This World,,,"Georgia, US",AWOL,AWOL - A Way Of Life
3,9.135772,3.874803,-5.557803,0.749018,-1.304397,2008-11-26 01:45:08,2008-02-06 00:00:00,,,,...,Attribution-NonCommercial-NoDerivatives (aka M...,,,[],Freeway,2008-03-11,Constant Hitmaker,"Philadelphia, PA, US",Kurt Vile,Constant Hitmaker
4,1.307063,-9.400518,-4.458017,0.951766,-4.015965,2008-11-26 01:44:45,2009-01-05 00:00:00,,<p></p>,,...,Attribution-NonCommercial-ShareAlike 3.0 Inter...,,,[],Street Music,,,"Georgia, US",AWOL,AWOL - A Way Of Life
5,4.063851,2.598676,1.049471,1.67879,3.190033,2008-11-26 01:49:57,2009-01-16 00:00:00,,"<p>A full ensamble of strings, drums, electron...",,...,Attribution-Noncommercial-No Derivative Works ...,,,[],CandyAss,,,"Providence, RI, US",Alec K. Redfearn and the Eyesores,Every Man For Himself
6,-6.592735,-3.592278,4.241709,-1.613454,0.856779,2008-11-26 01:49:59,2007-05-22 00:00:00,,<p>Alec K. Redfearn &amp; The Eyesores: Ellen ...,"Alec K. Refearn, Rob Pemberton",...,Attribution-Noncommercial-No Derivative Works ...,,,[],Queen Of The Wires,,,"Providence, RI, US",Alec K. Redfearn and the Eyesores,The Blind Spot
7,-12.18282,2.051396,1.524762,-3.027803,-1.933654,2008-11-26 01:49:57,2009-01-16 00:00:00,,"<p>A full ensamble of strings, drums, electron...",,...,Attribution-Noncommercial-No Derivative Works ...,,,[],Ohio,,,"Providence, RI, US",Alec K. Redfearn and the Eyesores,Every Man For Himself
8,-4.686128,-1.072051,-3.85467,-6.691118,-3.151397,2008-11-26 01:50:03,2005-01-25 00:00:00,,<p>Recorded at Sound Station Seven and at home...,,...,Attribution-Noncommercial-No Derivative Works ...,,,[],Punjabi Watery Grave,2005,The Quiet Room,"Providence, RI, US",Alec K. Redfearn and the Eyesores,The Quiet Room
9,8.312074,3.402502,-5.405963,-1.735372,1.878881,2008-11-26 01:50:07,2009-01-06 00:00:00,,<p><em>A</em>lthough recorded in Providence in...,Tom Buckland,...,Attribution-Noncommercial-No Derivative Works ...,,,[],Wire Up,,,"Providence, RI, US",Amoebic Ensemble,Amoebiasis
