In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from sklearn.decomposition import PCA
from sklearn import preprocessing
import librosa
import scipy.spatial.distance as dist

# 2.1 Getting your data 

We want to read the datasets to try to understand which key we can use to merge them together.

First of all we want to fill empty values in the datasets.
The below function take as an input a dataframe and check if in the columns there are any missing values of numeric or string types and fills this value with an empty string, in case of strings, or with a mean of the values of that column in case of numbers.

In [3]:
def filler(dataframe):
    for col in dataframe.columns:
        if dataframe[col].isnull().any():
            if is_string_dtype(dataframe[col]):
                dataframe[col] = dataframe[col].fillna("")
            elif is_numeric_dtype(dataframe[col]):
                dataframe[col] = dataframe[col].fillna(dataframe[col].mean())

We take the datasets needed:

In [4]:
tracks = pd.read_csv("tracks.csv")
echonest = pd.read_csv("echonest.csv")
features = pd.read_csv("features.csv")

In [5]:
tracks.head()

Unnamed: 0,track_id,album_comments,album_date_created,album_date_released,album_engineer,album_favorites,album_id,album_information,album_listens,album_producer,...,track_information,track_interest,track_language_code,track_license,track_listens,track_lyricist,track_number,track_publisher,track_tags,track_title
0,2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
1,3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
2,5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
3,10,0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4,6,,47632,,...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
4,20,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [16]:
tracks.shape

(106574, 53)

In [9]:
echonest.head()

Unnamed: 0,track_id,audio_features_acousticness,audio_features_danceability,audio_features_energy,audio_features_instrumentalness,audio_features_liveness,audio_features_speechiness,audio_features_tempo,audio_features_valence,metadata_album_date,...,temporal_features_214,temporal_features_215,temporal_features_216,temporal_features_217,temporal_features_218,temporal_features_219,temporal_features_220,temporal_features_221,temporal_features_222,temporal_features_223
0,2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,,...,-1.992303,6.805694,0.23307,0.19288,0.027455,0.06408,3.67696,3.61288,13.31669,262.929749
1,3,0.374408,0.528643,0.817461,0.001851,0.10588,0.461818,126.957,0.26924,,...,-1.582331,8.889308,0.258464,0.220905,0.081368,0.06413,6.08277,6.01864,16.673548,325.581085
2,5,0.043567,0.745566,0.70147,0.000697,0.373143,0.124595,100.26,0.621661,,...,-2.288358,11.527109,0.256821,0.23782,0.060122,0.06014,5.92649,5.86635,16.013849,356.755737
3,10,0.95167,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.96359,2008-03-11,...,-3.662988,21.508228,0.283352,0.26707,0.125704,0.08082,8.41401,8.33319,21.317064,483.403809
4,134,0.452217,0.513238,0.56041,0.019443,0.096567,0.525519,114.29,0.894072,,...,-1.452696,2.356398,0.234686,0.19955,0.149332,0.0644,11.26707,11.20267,26.45418,751.147705


In [10]:
echonest.shape

(13129, 250)

In [11]:
features.head()

Unnamed: 0,track_id,chroma_cens_kurtosis_01,chroma_cens_kurtosis_02,chroma_cens_kurtosis_03,chroma_cens_kurtosis_04,chroma_cens_kurtosis_05,chroma_cens_kurtosis_06,chroma_cens_kurtosis_07,chroma_cens_kurtosis_08,chroma_cens_kurtosis_09,...,tonnetz_std_04,tonnetz_std_05,tonnetz_std_06,zcr_kurtosis_01,zcr_max_01,zcr_mean_01,zcr_median_01,zcr_min_01,zcr_skew_01,zcr_std_01
0,2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448
1,3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,...,0.063831,0.014212,0.01774,2.824694,0.466309,0.084578,0.063965,0.0,1.716724,0.06933
2,5,0.527563,-0.077654,-0.27961,0.685883,1.93757,0.880839,-0.923192,-0.927232,0.666617,...,0.04073,0.012691,0.014759,6.808415,0.375,0.053114,0.041504,0.0,2.193303,0.044861
3,10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.0,3.542325,0.0408
4,20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.81641,0.043851,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993


In [12]:
features.shape

(106574, 519)

(note that: features and tracks have the same number of rows)

We can notice that a common column is **track_id**, so let's check if there are any missing values:

In [27]:
tracks["track_id"].isnull().any()

False

In [9]:
echonest["track_id"].isnull().any()

False

In [10]:
features["track_id"].isnull().any()

False

Given that there are no missing values in "**track_id**" columns in any dataset, we choose to use it as the key for the merge of the datasets.

I build a copy and work on it to don't damage the original files.

In [28]:
tracks_c = tracks.copy()
echonest_c = echonest.copy()
features_c = features.copy()

In [29]:
# First we are gonna fill the datasets using the function filler()
filler(tracks_c)
filler(echonest_c)
filler(features_c)

In [30]:
tracks_c.head()

Unnamed: 0,track_id,album_comments,album_date_created,album_date_released,album_engineer,album_favorites,album_id,album_information,album_listens,album_producer,...,track_information,track_interest,track_language_code,track_license,track_listens,track_lyricist,track_number,track_publisher,track_tags,track_title
0,2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
1,3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
2,5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
3,10,0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4,6,,47632,,...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
4,20,0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [31]:
tracks_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106574 entries, 0 to 106573
Data columns (total 53 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   track_id                  106574 non-null  int64  
 1   album_comments            106574 non-null  int64  
 2   album_date_created        106574 non-null  object 
 3   album_date_released       106574 non-null  object 
 4   album_engineer            106574 non-null  object 
 5   album_favorites           106574 non-null  int64  
 6   album_id                  106574 non-null  int64  
 7   album_information         106574 non-null  object 
 8   album_listens             106574 non-null  int64  
 9   album_producer            106574 non-null  object 
 10  album_tags                106574 non-null  object 
 11  album_title               106574 non-null  object 
 12  album_tracks              106574 non-null  int64  
 13  album_type                106574 non-null  o

We can see that of the 52 total attributes, 35 of them are strings, while the remaining 18 are numbers (float64 or int64)

In [19]:
echonest_c.head()

Unnamed: 0,track_id,audio_features_acousticness,audio_features_danceability,audio_features_energy,audio_features_instrumentalness,audio_features_liveness,audio_features_speechiness,audio_features_tempo,audio_features_valence,metadata_album_date,...,temporal_features_214,temporal_features_215,temporal_features_216,temporal_features_217,temporal_features_218,temporal_features_219,temporal_features_220,temporal_features_221,temporal_features_222,temporal_features_223
0,2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,,...,-1.992303,6.805694,0.23307,0.19288,0.027455,0.06408,3.67696,3.61288,13.31669,262.929749
1,3,0.374408,0.528643,0.817461,0.001851,0.10588,0.461818,126.957,0.26924,,...,-1.582331,8.889308,0.258464,0.220905,0.081368,0.06413,6.08277,6.01864,16.673548,325.581085
2,5,0.043567,0.745566,0.70147,0.000697,0.373143,0.124595,100.26,0.621661,,...,-2.288358,11.527109,0.256821,0.23782,0.060122,0.06014,5.92649,5.86635,16.013849,356.755737
3,10,0.95167,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.96359,2008-03-11,...,-3.662988,21.508228,0.283352,0.26707,0.125704,0.08082,8.41401,8.33319,21.317064,483.403809
4,134,0.452217,0.513238,0.56041,0.019443,0.096567,0.525519,114.29,0.894072,,...,-1.452696,2.356398,0.234686,0.19955,0.149332,0.0644,11.26707,11.20267,26.45418,751.147705


In [21]:
echonest_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13129 entries, 0 to 13128
Columns: 250 entries, track_id to temporal_features_223
dtypes: float64(244), int64(1), object(5)
memory usage: 25.0+ MB


We see, as we did for "tracks", that there are 5 string values over the 250 attributes.

In [20]:
features_c.head()

Unnamed: 0,track_id,chroma_cens_kurtosis_01,chroma_cens_kurtosis_02,chroma_cens_kurtosis_03,chroma_cens_kurtosis_04,chroma_cens_kurtosis_05,chroma_cens_kurtosis_06,chroma_cens_kurtosis_07,chroma_cens_kurtosis_08,chroma_cens_kurtosis_09,...,tonnetz_std_04,tonnetz_std_05,tonnetz_std_06,zcr_kurtosis_01,zcr_max_01,zcr_mean_01,zcr_median_01,zcr_min_01,zcr_skew_01,zcr_std_01
0,2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448
1,3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,...,0.063831,0.014212,0.01774,2.824694,0.466309,0.084578,0.063965,0.0,1.716724,0.06933
2,5,0.527563,-0.077654,-0.27961,0.685883,1.93757,0.880839,-0.923192,-0.927232,0.666617,...,0.04073,0.012691,0.014759,6.808415,0.375,0.053114,0.041504,0.0,2.193303,0.044861
3,10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.0,3.542325,0.0408
4,20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.81641,0.043851,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993


In [22]:
features_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106574 entries, 0 to 106573
Columns: 519 entries, track_id to zcr_std_01
dtypes: float64(518), int64(1)
memory usage: 422.0 MB


# Once we have all the datasets, we want to apply PCA before merging them together, so doing we will have dimensionality reduction without loosing too many useful informations

"features" has no attribute of type string, so they are all numerical.

Once we got all the datasets well filled, we can start merging them using as key the **track_id** column.

In [21]:
#final = tracks_c.merge(features_c.merge(echonest_c, on = "track_id"), on = "track_id")
#print(final.shape)
#final.head()

(13129, 820)


Unnamed: 0,track_id,album_comments,album_date_created,album_date_released,album_engineer,album_favorites,album_id,album_information,album_listens,album_producer,...,temporal_features_214,temporal_features_215,temporal_features_216,temporal_features_217,temporal_features_218,temporal_features_219,temporal_features_220,temporal_features_221,temporal_features_222,temporal_features_223
0,2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,-1.992303,6.805694,0.23307,0.19288,0.027455,0.06408,3.67696,3.61288,13.31669,262.929749
1,3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,-1.582331,8.889308,0.258464,0.220905,0.081368,0.06413,6.08277,6.01864,16.673548,325.581085
2,5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,-2.288358,11.527109,0.256821,0.23782,0.060122,0.06014,5.92649,5.86635,16.013849,356.755737
3,10,0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4,6,,47632,,...,-3.662988,21.508228,0.283352,0.26707,0.125704,0.08082,8.41401,8.33319,21.317064,483.403809
4,134,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,-1.452696,2.356398,0.234686,0.19955,0.149332,0.0644,11.26707,11.20267,26.45418,751.147705


As we can see, the number of rows is almost 13k, as demanded.

## 2.2 Choose your features (variables)!

We want to reduce the dimension of the datasets.
We aim to remove string type attributes given that they can't fit in PCA. 

For "tracks" file we want first to see which columns can be useful given that they are plenty of useful information.

In [39]:
tracks_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106574 entries, 0 to 106573
Data columns (total 53 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   track_id                  106574 non-null  int64  
 1   album_comments            106574 non-null  int64  
 2   album_date_created        106574 non-null  object 
 3   album_date_released       106574 non-null  object 
 4   album_engineer            106574 non-null  object 
 5   album_favorites           106574 non-null  int64  
 6   album_id                  106574 non-null  int64  
 7   album_information         106574 non-null  object 
 8   album_listens             106574 non-null  int64  
 9   album_producer            106574 non-null  object 
 10  album_tags                106574 non-null  object 
 11  album_title               106574 non-null  object 
 12  album_tracks              106574 non-null  int64  
 13  album_type                106574 non-null  o

We decide to keep only useful variables:

In [56]:
tracks_c = tracks_c[['track_id', 'album_id', 'artist_id', 'track_bit_rate', 
                    'track_duration', 'track_listens']]

In [57]:
tracks_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106574 entries, 0 to 106573
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   track_id        106574 non-null  int64
 1   album_id        106574 non-null  int64
 2   artist_id       106574 non-null  int64
 3   track_bit_rate  106574 non-null  int64
 4   track_duration  106574 non-null  int64
 5   track_listens   106574 non-null  int64
dtypes: int64(6)
memory usage: 4.9 MB


As we can see, now there are only 3 string values.

In [58]:
tracks_c

Unnamed: 0,track_id,album_id,artist_id,track_bit_rate,track_duration,track_listens
0,2,1,1,256000,168,1293
1,3,1,1,256000,237,514
2,5,1,1,256000,206,1151
3,10,6,6,192000,161,50135
4,20,4,4,256000,311,361
...,...,...,...,...,...,...
106569,155316,22940,24357,320000,162,102
106570,155317,22940,24357,320000,217,165
106571,155318,22940,24357,320000,404,168
106572,155319,22940,24357,320000,146,294


For the other datasets, we can remove the string values since they don't give us much information.

In [59]:
echonest_c = echonest_c.select_dtypes(exclude = ['object'])
features_c = features_c.select_dtypes(exclude = ['object'])

In [60]:
total = tracks_c.merge(features_c.merge(echonest_c, on = 'track_id'), on = 'track_id')

In [61]:
total.shape

(13129, 768)

We can see that our dataset is formed by almost 13'000 rows, as demanded.

## PCA

In [73]:
st_sc = preprocessing.StandardScaler()
total_st = pd.DataFrame(st_sc.fit_transform(total[total.columns[1:]].values), columns = total.columns[1:])

In [74]:
pca = PCA(n_components = 768)
total_pca = pca.fit_transform(total)

print("The variance is:", sum(pca.explained_variance_ratio_))

The variance is: 0.9999999999999998


## 2.3 K-Means

Let's implement, by scratch, our K-Means algorithm:

We will pass as **Input**:
- **X**: which stands for the data (dataframe)
- **k**: the number of clusters
- **max_iterations**: the max number of allowed iterations before establishing the clusters (set at 100 by default)

We will receive as **Output**:
- **P**: an array holding the class in which each data point belongs to

In [22]:
cdist([[1,2,3,4]], [[1,2,3,4]], 'euclidean')

array([[0.]])

In [30]:
final.shape

(13129, 820)

In [31]:
len(final)

13129

In [59]:
num_cols = [[col for col in final.columns if final[col].dtypes != "O"]]


In [70]:
def euc_dist(X1, X2):
    return dist.euclidean(X1, X2)

In [85]:
def init_centroids(data, k):
    # Number of columns in the dataset
    cols = np.shape(data)[1]
    
    # Matrix of 0s to initialize centroids
    centroids = np.mat(np.zeros((k, cols)))
    
    # Picking centroids at random
    for i in range(cols):
        minim = min(data[:, i])
        range_i = float(max(data[:, i] - minim))
        centroids[:, i] = minim + range_i * np.random.rand(k, 1)
    
    # Returning array of random generated centroids
    return centroids

In [94]:
def kmeans(data, k):
    rows = np.shape(data)[0]
    
    # Hold cluster assignments instance
    cluster_ass = np.mat(np.zeros((rows, 2)))
    
    # Initialize centroids
    centroids = init_centroids(data, k)
    
    # We take the original centroids to check later on
    original_cent = centroids.copy()
    
    changed = True
    num_iter = 0
    
    while changed:
        changed = False
        
        # every row
        for r in range(cols):
            # track minimum distance and the vector's index of the cluster
            min_dist = np.inf
            min_index = -1
            
            # Compute distances
            for j in range(k):
                distance = euc_dist(centroids[j, :], ds[i, :])
                if distance < min_dist:
                    min_dist = distance
                    min_index = j
                
                # Check if cluster assignment of instance has changed
                if cluster_ass[r, 0] != min_index:
                    changed = True
                
                # Assign instance to appropriate cluster
                cluster_ass[r, :] = min_index, min_dist**2
                
                # Update centroid location
            for cent in range(k):
                points = data[np.nonzero(cluster_ass[:, 0].A==cent)[0]]
                centroids[cent, :] = np.mean(points, axis=0)
                    
            # Update the number of iterations
            num_iter += 1
            
        return centroids, cluster_ass, num_iter, original_cent

In [95]:
kmeans(final, 4)

TypeError: '(slice(None, None, None), 0)' is an invalid key