In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KDTree
from sklearn.cluster import KMeans
import postgres_helpers as pgh
import os
import sys

This notebook has three goals:

- Perform Exploratory Data Analysis (EDA) on the Spotify Audio Features dataset to better understand the data.
- Compare different unsupervised learning algorithms.
- Create recommendations for each track and store them as CSV files and in SQLite and PostgreSQL databases.

# EDA

## 1. Data Pre-processing

### Load Dataset

In [7]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """ Downcast the numeric columns of a dataframe to the smallest dtype that
        can hold their observed value range, in order to reduce memory usage.
        Credit to: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

        Only plain NumPy signed-integer, unsigned-integer and floating-point
        columns are touched. Every other column (object/string, bool, datetime,
        categorical, pandas extension dtypes, ...) is left exactly as it is.
        The dataframe is modified in place and is also returned.

        NOTE: float16 keeps only about 3 significant decimal digits, so
        downcasting to it is lossy. Pass ``use_float16=False`` to make
        float32 the smallest floating-point type considered.

        Parameters
        ----------
        df : Pandas DataFrame
        verbose: (True) by default, prints out before and after memory usage
        use_float16: (True) by default, allows float columns to be stored
            as float16 when their value range fits
        Returns
        -------
        df : Reduced Memory Pandas DataFrame
    """
    # Candidate dtypes, ordered from narrowest to widest.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Dispatch on the NumPy dtype kind instead of the dtype's string form,
        # so bool / datetime / string / extension columns are never mistaken
        # for floats (which would corrupt or crash on them).
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # Falls back to float64 when no narrower type covers the range
            # (e.g. the column is all-NaN or contains +/-inf).
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
        else:
            # An empty integer column has NaN as min/max: nothing to decide.
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            # Compare as Python ints so the check is exact for every width.
            lo, hi = int(c_min), int(c_max)
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            target = None
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the extreme values themselves still fit.
                if limits.min <= lo and hi <= limits.max:
                    target = candidate
                    break

        if target is not None and col_type != np.dtype(target):
            df[col] = df[col].astype(target)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Guard the ratio so a zero-byte frame does not produce NaN output.
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Decreased by {:.1f}%'.format(decrease))

    return df


def load_dataset():
    """Load the two Spotify audio-feature CSV files and combine them into one dataframe.

    Both files are read with ``pd.read_csv``, concatenated row-wise with a
    fresh index, and passed through ``reduce_mem_usage``. A short summary of
    the individual and merged shapes is printed.

    Returns
    -------
    df : Pandas DataFrame
        The concatenated, memory-reduced dataframe.

    Raises
    ------
    Exception
        Any error raised while reading, concatenating or downcasting the data
        is reported with a printed message and then re-raised unchanged.
    """
    fpaths = ['../data/SpotifyAudioFeaturesApril2019.csv',
              '../data/SpotifyAudioFeaturesNov2018.csv']
    try:
        df1 = pd.read_csv(fpaths[0])
        df2 = pd.read_csv(fpaths[1])
        df = pd.concat([df1, df2], ignore_index=True)
        df = reduce_mem_usage(df)  # Reduces Memory Usage

    except Exception as e:
        # An f-string is used here because '{e}'.format(e) raises KeyError
        # (named field, positional argument) and would hide the real error.
        print(f'Error Occurred while reading the Spotify dataset: {e}')
        # Bare raise keeps the original exception and its traceback intact.
        raise

    # Check total sum of rows match
    assert df.shape[0] == (df1.shape[0] + df2.shape[0])

    file_names = [fpath.split('/')[-1] for fpath in fpaths]
    print(
        f'''
        -------------------- SHAPE ---------------------
        DF1 {file_names[0]}: {df1.shape}
        DF2 {file_names[1]}: {df2.shape}
        MERGED DF df: {df.shape}
        ------------------------------------------------
        ''')

    return df

# Load both CSV files into one merged dataframe, then preview its first rows.
df = load_dataset()
df.head()

Memory usage of dataframe is 32.04 MB
Memory usage after optimization is: 11.78 MB
Decreased by 63.2%

        -------------------- SHAPE ---------------------
        DF1 SpotifyAudioFeaturesApril2019.csv: (130663, 17)
        DF2 SpotifyAudioFeaturesNov2018.csv: (116372, 17)
        MERGED DF df: (247035, 17)
        ------------------------------------------------
        


Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.005821,0.743164,238373,0.339111,0.0,1,0.081177,-7.679688,1,0.408936,203.875,4,0.117981,15
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.024399,0.846191,214800,0.557129,0.0,8,0.285889,-7.257812,1,0.457031,159.0,4,0.371094,0
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.024994,0.603027,138913,0.723145,0.0,9,0.082397,-5.890625,0,0.04541,114.9375,4,0.38208,56
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.029404,0.799805,125381,0.579102,0.912109,5,0.099426,-12.117188,0,0.070129,123.0,4,0.641113,0
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,3.5e-05,0.783203,124016,0.791992,0.87793,7,0.033203,-10.273438,1,0.066101,120.0625,4,0.928223,0


Let's check the total number of tracks, and then we will remove the duplicate ones.

In [8]:
# Row count of the merged dataframe, before any duplicates are removed.
print(f"Total number of audio tracks in dataset are: {len(df)}.")

Total number of audio tracks in dataset are: 247035.


### Remove Duplicates

In [9]:
# group the entries by artist_name and track_name and check for duplicates
grouped = df.groupby(['artist_name','track_name'], as_index=True).size()
grouped[grouped > 1].count()

116310

There are `116310` duplicate tracks that need to be dropped.

In [10]:
# Keep the first occurrence of each (artist_name, track_name) pair.
df1 = df.drop_duplicates(
    subset=['artist_name', 'track_name'],
    keep='first',
)
print(f"After dropping duplicate tracks, total no. of tracks: {len(df1)}.")

After dropping duplicate tracks, total no. of tracks: 130725.


Let's verify we have no duplicates left:

In [11]:
# Select every row whose (artist_name, track_name) pair still occurs more than
# once and count the non-null entries per column; all zeros means none remain.
df1.loc[
    df1.duplicated(subset=['artist_name', 'track_name'], keep=False)
].count()

artist_name         0
track_id            0
track_name          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
popularity          0
dtype: int64

Now we can be sure that no duplicate tracks exist.

## 2. Spotify Audio Features

### `acousticness`

In [12]:
# Preview the first rows of the de-duplicated dataframe.
df1.head()

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.005821,0.743164,238373,0.339111,0.0,1,0.081177,-7.679688,1,0.408936,203.875,4,0.117981,15
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.024399,0.846191,214800,0.557129,0.0,8,0.285889,-7.257812,1,0.457031,159.0,4,0.371094,0
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.024994,0.603027,138913,0.723145,0.0,9,0.082397,-5.890625,0,0.04541,114.9375,4,0.38208,56
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.029404,0.799805,125381,0.579102,0.912109,5,0.099426,-12.117188,0,0.070129,123.0,4,0.641113,0
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,3.5e-05,0.783203,124016,0.791992,0.87793,7,0.033203,-10.273438,1,0.066101,120.0625,4,0.928223,0
