## Building Music Recommendation System using Spotify Dataset

### Import Libraries

In [101]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer

import os
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

# Session-level settings: AWS region of this notebook instance, the IAM execution
# role handed to SageMaker, and the S3 key prefix used for uploads.
my_region = boto3.session.Session().region_name
role = get_execution_role()
prefix = 'new_kmeans/'

# Resolve the URI of the "latest" built-in XGBoost image for this region.
# NOTE(review): the visible cells only print this URI; training below uses K-Means.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", my_region, "latest")

print("".join([
    "Success - the MySageMakerInstance is in the ",
    my_region,
    " region. You will use the ",
    xgboost_container,
    " container for your SageMaker endpoint.",
]))

Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


### Create bucket on s3

In [102]:
# S3 bucket names must be lowercase.
bucket_name = 'bucket-rs-spotify'
s3 = boto3.resource('s3')
try:
    # Every region except us-east-1 is passed as an explicit LocationConstraint.
    if my_region != 'us-east-1':
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': my_region},
        )
    else:
        s3.create_bucket(Bucket=bucket_name)
    print('S3 bucket created successfully')
except Exception as err:
    # Best effort: report the failure and let the notebook continue.
    print('S3 error: ',err)

S3 bucket created successfully


### Read Data

In [103]:
# Load the Spotify tracks CSV that sits next to the notebook.
try:
    data = pd.read_csv("./data.csv")
except Exception as err:
    # The error is only reported; `data` stays undefined if loading failed.
    print('Data load error: ',err)
else:
    print('Success: Data loaded into dataframe.')

Success: Data loaded into dataframe.


### Upload data to s3 bucket

In [104]:
# `prefix` may carry a trailing slash; strip it so the object key and the
# TrainingInput URI point at the same location (no `//` in the path).
# S3 keys always use '/', so they are built explicitly rather than with os.path.join.
train_prefix = '{}/train'.format(prefix.rstrip('/'))
boto3.Session().resource('s3').Bucket(bucket_name).Object('{}/data.csv'.format(train_prefix)).upload_file('data.csv')
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}'.format(bucket_name, train_prefix), content_type='csv')

### Clustering Songs with K-Means
Use SageMaker's built-in K-Means algorithm to cluster the songs.
The trained K-Means model artifacts are written to the S3 bucket.

In [105]:
# Configure the SageMaker built-in K-Means estimator used to cluster songs.
from sagemaker import KMeans

n_clusters = 20
kmeans = KMeans(
    role=role,
    instance_count=1,
    instance_type="ml.c4.xlarge",
    k=n_clusters,
    # NOTE(review): n_jobs is a scikit-learn style argument; confirm that the
    # SageMaker estimator actually uses it.
    n_jobs=4,
    # Model artifacts are written under the bucket's kmeans/ prefix.
    output_path="s3://{}/kmeans/".format(bucket_name),
)

# The estimator is wrapped in a scikit-learn Pipeline next to a StandardScaler;
# later cells reach it through the pipeline's second step.
song_cluster_pipeline = Pipeline(
    steps=[('scaler', StandardScaler()), ('kmeans', kmeans)],
    verbose=False,
)



In [106]:
# Numeric view of the dataset; `X` is reused later to fit the scikit-learn pipeline.
X = data.select_dtypes(np.number)
number_cols = list(X.columns)

# Drop the text/identifier columns before casting to float32.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# is rejected by pandas >= 2.0.
new_data = data.drop(columns=['artists', 'id', 'name', 'release_date'])
train_data = new_data.values.astype("float32")

# Train the SageMaker K-Means estimator (second pipeline step) on the raw matrix.
# NOTE(review): the pipeline's StandardScaler is not applied here, so the
# features reach SageMaker unscaled — confirm this is intended.
song_cluster_pipeline.steps[1][1].fit(song_cluster_pipeline.steps[1][1].record_set(train_data))


Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2022-02-26 20:08:54 Starting - Starting the training job...
2022-02-26 20:08:59 Starting - Launching requested ML instancesProfilerReport-1645906134: InProgress
.........
2022-02-26 20:10:37 Starting - Preparing the instances for training......
2022-02-26 20:11:52 Downloading - Downloading input data
2022-02-26 20:11:52 Training - Downloading the training image.....[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[02/26/2022 20:12:34 INFO 139985502349120] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', 'eval_metrics': '["msd"]', 'force_dense': 'true', '_disable_wait_to_read': 'false', '_enable_

In [115]:
# Deploy the trained K-Means estimator (the pipeline's 'kmeans' step) to a
# real-time endpoint. Alternative instance type noted by the author: ml.t2 medium
kmeans_song_predictor = song_cluster_pipeline.named_steps['kmeans'].deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


----------!

In [153]:
# Score the training matrix against the endpoint in fixed-size batches instead of
# four copy-pasted slices, so the cell works for any number of rows.
# Presumably the batching keeps each request payload small — confirm the limit.
batch_size = 50000
result = []
for start in range(0, len(train_data), batch_size):
    result.extend(kmeans_song_predictor.predict(train_data[start:start + batch_size]))

# The predictor returns one Record per row; the assigned cluster id lives under
# label["closest_cluster"]. Store that id, not the raw Record object.
data['cluster_label'] = [
    int(record.label["closest_cluster"].float32_tensor.values[0])
    for record in result
]

In [198]:
# NOTE: this import rebinds the name `KMeans` to the scikit-learn class.
from sklearn.cluster import KMeans

# Local scikit-learn pipeline: standardise the features, then run a 20-cluster K-Means.
song_cluster_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=20, verbose=False)),
    ],
    verbose=False,
)

# Fit on the numeric feature frame; the fitted pipeline is the cell's output.
song_cluster_pipeline.fit(X)

Pipeline(steps=[('scaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=20, verbose=False))])

### Build Recommender System
* Based on the analysis and visualizations, it’s clear that similar genres tend to have data points that are located close to each other while similar types of songs are also clustered together.
* This observation makes perfect sense. Similar genres will sound similar and will come from similar time periods while the same can be said for songs within those genres. We can use this idea to build a recommendation system by taking the data points of the songs a user has listened to and recommending songs corresponding to nearby data points.
* Spotipy is a Python client for the Spotify Web API that makes it easy for developers to fetch data and query Spotify’s catalog for songs. Install it with `pip install spotipy`.
* After installing Spotipy, you will need to create an app on the Spotify Developer’s page and save your Client ID and secret key.

In [178]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

# Credentials are read from the environment so that secrets never live in the
# notebook. Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before running
# this cell; a missing variable raises KeyError naming it.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=os.environ["SPOTIPY_CLIENT_ID"],
    client_secret=os.environ["SPOTIPY_CLIENT_SECRET"]))
 
def find_song(name, year):
    """Look up a track on Spotify and return its features as a one-row DataFrame.

    Parameters
    ----------
    name : str
        Track title to search for.
    year : int
        Release year used to narrow the search.

    Returns
    -------
    pandas.DataFrame or None
        One row holding the name, year, explicit flag, duration, popularity and
        every key of the track's audio features. None when the search has no hit
        or when Spotify provides no audio features for the track.
    """
    # NOTE(review): Spotify documents field filters as `track:<name>` without a
    # space after the colon; the query is left unchanged here — confirm.
    results = sp.search(q= 'track: {} year: {}'.format(name,year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]
    audio_features = sp.audio_features(track['id'])[0]
    if audio_features is None:
        # No audio features for this track: treat it as not found instead of
        # failing on `None.items()`.
        return None

    song_data = {
        'name': [name],
        'year': [year],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }
    # Audio-feature scalars are broadcast against the one-element lists above;
    # keys present in both (e.g. duration_ms) take the audio-feature value.
    song_data.update(audio_features)

    return pd.DataFrame(song_data)

In [179]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

# Numeric feature columns that make up a song vector. The helper functions
# below read this module-level list when selecting columns.
number_cols = ['valence', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']


def get_song_data(song, spotify_data):
    """Return the data for one song, preferring the local dataset.

    `song` is a mapping with 'name' and 'year'. The first row of `spotify_data`
    matching both is returned; when there is no such row the lookup is
    delegated to `find_song(name, year)`.
    """
    same_name = spotify_data['name'] == song['name']
    same_year = spotify_data['year'] == song['year']
    matches = spotify_data[same_name & same_year]

    if matches.empty:
        # Not in the local dataset: fall back to the Spotify lookup.
        return find_song(song['name'], song['year'])

    return matches.iloc[0]
        

def get_mean_vector(song_list, spotify_data):
    """Average the numeric feature vectors of the given songs.

    Parameters
    ----------
    song_list : list of dict
        Songs to average, each with 'name' and 'year'.
    spotify_data : pandas.DataFrame
        Local dataset passed through to `get_song_data`.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per column in `number_cols`.

    Raises
    ------
    ValueError
        If none of the songs could be resolved.
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # get_song_data yields a Series for local hits but a one-row DataFrame
        # for Spotify lookups; flatten both to a 1-D float vector so every
        # song contributes a row of the same shape.
        song_vector = np.asarray(song_data[number_cols], dtype=float).reshape(-1)
        song_vectors.append(song_vector)

    if not song_vectors:
        # Averaging nothing would silently produce NaN; fail with a clear message.
        raise ValueError('None of the requested songs could be found.')

    return np.mean(np.vstack(song_vectors), axis=0)


def flatten_dict_list(dict_list):
    """Merge a list of dicts into one mapping of key -> list of values.

    Values are collected in list order. Keys that appear only in some of the
    dictionaries are accepted, and an empty input yields an empty mapping.

    Returns
    -------
    collections.defaultdict
        Maps each key to the list of its values; a missing key reads as [].
    """
    flattened_dict = defaultdict(list)

    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    return flattened_dict


def recommend_songs( song_list, spotify_data, n_songs=10):
    """Recommend the songs closest to the average of the given songs.

    Parameters
    ----------
    song_list : list of dict
        Input songs, each with 'name' and 'year'.
    spotify_data : pandas.DataFrame
        Dataset holding the `number_cols` features plus 'name', 'year', 'artists'.
    n_songs : int, default 10
        Number of recommendations to return.

    Returns
    -------
    list of dict
        Up to `n_songs` records with 'name', 'year' and 'artists', ordered by
        increasing cosine distance to the scaled mean vector of the input songs.
        Songs whose name appears in `song_list` are excluded.
    """
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)

    song_center = get_mean_vector(song_list, spotify_data)
    # Reuse the scaler fitted as the first step of the clustering pipeline.
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')[0]

    # Rank every song, drop the input songs, and only then keep the top n_songs;
    # truncating first would return fewer than n_songs whenever an input song
    # ranks among its own nearest neighbours.
    order = np.argsort(distances)
    is_input_song = spotify_data['name'].isin(song_dict['name']).to_numpy()
    top_positions = order[~is_input_song[order]][:n_songs]

    rec_songs = spotify_data.iloc[top_positions]
    return rec_songs[metadata_cols].to_dict(orient='records')

In [180]:
# Example: recommendations for a handful of input songs.
recommend_songs_data = recommend_songs(
    song_list=[
        {'name': 'Come As You Are', 'year':1991},
        {'name': 'Smells Like Teen Spirit', 'year': 1991},
        {'name': 'Lithium', 'year': 1992},
        {'name': 'All Apologies', 'year': 1993},
        {'name': 'Stay Away', 'year': 1993},
        {'name': 'Billetes Azules (with J Balvin)', 'year': 2020},
    ],
    spotify_data=data,
)

### Upload the result to s3

In [None]:
import csv

# Column order shared by the header and every row. The header must be an ordered
# sequence: a set has no defined order, so its columns could disagree with the rows.
fieldnames = ["name", "year", "artists"]

# newline="" lets the csv module control line endings (avoids blank rows on
# Windows); UTF-8 keeps non-ASCII titles and artist names intact.
with open('recommend_songs_result.csv', "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(fieldnames)
    for line in recommend_songs_data:
        writer.writerow([str(line[field]) for field in fieldnames])

In [None]:
# `prefix` may carry a trailing slash; strip it so the object key and the S3 URI
# agree (no `//` in the path). S3 keys always use '/', so they are built
# explicitly rather than with os.path.join.
result_prefix = '{}/result'.format(prefix.rstrip('/'))
boto3.Session().resource('s3').Bucket(bucket_name).Object('{}/recommend_songs_result.csv'.format(result_prefix)).upload_file('recommend_songs_result.csv')
# NOTE(review): this rebinds `s3_input_train` to the *result* location,
# replacing the training input defined earlier — confirm that is intended.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}'.format(bucket_name, result_prefix), content_type='csv')