# API wrappers

-- create a collection of songs with their audio features - as large as possible!

1. These are the songs that we will cluster. 
2. And, later, when the user inputs a song, we will find the cluster to which the song belongs and recommend a song from the same cluster. 

-- you might want to make sure the collected songs are "curated" in a certain way. Try to find playlists of songs that are diverse, but also that meet certain standards.

An idea for collecting as many songs as possible is 
1. to start with all the songs of a big, diverse playlist and 
2. then go to every artist present in the playlist and grab every song of every album of that artist. 


In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [None]:
# df_songs = pd.read_csv('df_songs.csv', encoding = 'utf8')
# df_songs.drop(columns='Unnamed: 0', inplace=True) 
# df_songs = df_songs.drop_duplicates()

In [None]:
# Source dataset:
# https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks?select=data_by_artist.csv
# Keep the two identifying columns plus the audio features, and rename the
# identifiers to the 'song' / 'artist' names used by the rest of the notebook.
cols = [
    'name', 'artists',
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms',
]
df_songs = (
    pd.read_csv('data.csv', encoding='utf8')[cols]
    .rename(columns={"name": "song", "artists": "artist"})
)

In [None]:
df_songs.columns

# Spotify work

In [None]:
from configSpoti import *
import spotipy
import json
from spotipy.oauth2 import SpotifyClientCredentials


# Initialize SpotiPy with the client credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id= clientid,
                                                           client_secret= clientsecret))

In [None]:
from time import sleep

# get song features for all the songs
def get_songs_features(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Fetch Spotify audio features for every song in ``dataframe``.

    For each row, the value of the ``song`` column is searched on Spotify
    (first hit only) and every key of that hit's audio-features record is
    written into the same row, creating the column when it does not exist.
    Rows without a title, without a search hit, or without audio features
    are left unchanged.

    Args:
        dataframe: Frame with a ``song`` column. It is modified in place.

    Returns:
        The same ``dataframe`` object, for convenience.
    """
    # Pair each index label with its title so the lookup and the write-back
    # use the same label, whatever index the frame carries.
    for i, title in zip(dataframe.index, dataframe['song']):
        print(i)  # progress indicator

        if pd.isna(title):
            continue

        # Search by the song title (not by the row number).
        found = sp.search(q=str(title), limit=1)
        items = found["tracks"]["items"]
        if not items:
            continue

        features = sp.audio_features(items[0]["uri"])
        # The lookup may yield no usable record for a track; skip those.
        if not features or features[0] is None:
            continue

        for key, value in features[0].items():
            dataframe.loc[i, key] = value

    return dataframe




In [None]:
df_songs = get_songs_features(df_songs)

In [None]:
df_songs

In [None]:
df_songs.to_csv('df_songs_featured.csv')

In [None]:
# Keep only the rows that have a song title.
df_songs = df_songs[df_songs['song'].notna()]
df_songs

In [None]:
# Keep only the rows whose 'key' value is present.
df_songs = df_songs[df_songs['key'].notna()]
df_songs

In [None]:
# Renumber the rows after the filtering above. reset_index() is called without
# drop=True, so the previous index is kept as a new 'index' column (a later
# cell drops that column).
df_songs= df_songs.reset_index()
df_songs

# Unsupervised learning intro

-- ultimate goal: to improve the recommendations of artists. 
-- Clustering the songs will allow the recommendation system to limit the scope of the recommendations to only songs that belong to the same cluster - songs with similar audio features.

The experiments you did with the Spotify API and the Billboard web scraping will allow you to create a pipeline such that when the user enters a song, you:

1. Check whether or not the song is in the Billboard Hot 200.
2. Collect the audio features from the Spotify API.
3. After that, you want to send the Spotify audio features of the submitted song to the clustering model, which should return a cluster number.

We want to have as many songs as possible to create the clustering model, so we will add the songs you collected to a bigger dataset available on Kaggle containing 160 thousand songs.

## Collect the audio features from the Spotify API.

In [None]:
# Remove the saved row index and the non-feature fields in a single call.
df_songs.drop(
    columns=['index', 'analysis_url', 'type', 'track_href'],
    inplace=True,
)


In [None]:
# Remove the Spotify identifier columns as well.
df_songs.drop(columns=['id', 'uri'], inplace=True)

In [None]:
df_songs

## Send the Spotify audio features of the submitted song to the clustering model, which should return a cluster number.

In [None]:
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
df_songs

In [None]:
df_songs.describe()

In [None]:
df_songs.dtypes

In [None]:
# Feature frame: every float64 column of df_songs. Kept unscaled; its column
# names are reused when the scaled values are previewed.
X = df_songs.select_dtypes('float64')

In [None]:
# Same float64 selection again; X_prep is overwritten with its standardised
# version in a later cell.
X_prep = df_songs.select_dtypes('float64')

In [None]:
X_prep

In [None]:
# Standardise every feature column. The fitted scaler is not kept, and X_prep
# is rebound to the transformed values.
X_prep = StandardScaler().fit_transform(X_prep)

In [None]:
# Preview the scaled values, labelled with the column names of X.
pd.DataFrame(X_prep, columns=X.columns).head()

In [None]:
# Trial model: K-means with 8 clusters and a fixed random seed.
kmeans = KMeans(n_clusters=8, random_state=1234)
kmeans.fit(X_prep)

In [None]:
# Get the cluster of every row and count how many observations each cluster has.
clusters = kmeans.predict(X_prep)
pd.Series(clusters).value_counts().sort_index()

In [None]:
# Check which cluster each row was assigned to.
# Work on an explicit copy: pd.DataFrame(X) does not copy a DataFrame input,
# so adding the 'cluster' column could also change X, whose column names are
# reused to label the scaled features.
X_df = X.copy()
X_df["cluster"] = clusters
X_df.head()

In [None]:
# Fit K-means with 117 clusters and 30 initialisations, then print the inertia.
# NOTE(review): max_iter=2 allows only two iterations per initialisation —
# confirm this is intended and not left over from experimentation.
# NOTE(review): algorithm="full" may not be accepted by newer scikit-learn
# releases — verify against the installed version.
kmeans = KMeans(n_clusters=117,
#                 init="random",
                n_init=30,  # try with 1, 4, 8, 20, 30, 100...
                max_iter=2,
                tol=0,
                algorithm="full",
                random_state=1234)
kmeans.fit(X_prep)
print(kmeans.inertia_)

In [None]:
#  "elbow method" to choose the best K.
# Fit one K-means model for each k from 2 to 19 and record its inertia,
# which is plotted against k below.
K = range(2, 20)
inertia = []

for k in K:
    kmeans = KMeans(n_clusters=k,
                    random_state=1234)
    kmeans.fit(X_prep)
    inertia.append(kmeans.inertia_)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Plot inertia against k, with one x tick per candidate k.
plt.figure(figsize=(16,8))
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('inertia')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Elbow Method showing the optimal k')

In [None]:
# Silhouette analysis: fit one K-means model for each k from 2 to 19 and
# record the silhouette score of its cluster assignment on X_prep.
K = range(2, 20)
silhouette = []

for k in K:
    kmeans = KMeans(n_clusters=k,
                    random_state=1234)
    kmeans.fit(X_prep)
    silhouette.append(silhouette_score(X_prep, kmeans.predict(X_prep)))


# Plot the silhouette score against k, with one x tick per candidate k.
plt.figure(figsize=(16,8))
plt.plot(K, silhouette, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Silhouette Method showing the optimal k')

# The higher the silhouette score, the greater the separation between the clusters; in other words, the clusters are better defined (look for the highest peak).

In [None]:
# Fit K-means with 17 clusters and 14 initialisations, then print the inertia.
# NOTE(review): max_iter=2 allows only two iterations per initialisation —
# confirm this is intended and not left over from experimentation.
# NOTE(review): algorithm="full" may not be accepted by newer scikit-learn
# releases — verify against the installed version.
kmeans = KMeans(n_clusters=17,
#                 init="random",
                n_init=14,  # try with 1, 4, 8, 20, 30, 100...
                max_iter=2,
                tol=0,
                algorithm="full",
                random_state=1234)
kmeans.fit(X_prep)
print(kmeans.inertia_)

In [None]:
df_songs

# Getting the song

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from configSpoti import *
import spotipy
import json
from spotipy.oauth2 import SpotifyClientCredentials


# Initialize SpotiPy with the client credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=clientid,
                                                           client_secret=clientsecret))
# transformer = StandardScaler().fit(data)
# x_standardized=transformer.transform(data)

# get song features for just one song


def get_song_feature(track):
    """Return a one-row DataFrame with the Spotify audio features of ``track``.

    ``track`` is sent to the Spotify search and only the first hit is used.
    Every key of that hit's audio-features record becomes a column of the
    returned frame (row label ``0``).

    Args:
        track: Search text, typically a song title.

    Returns:
        A DataFrame with a single row holding the audio-features record.

    Raises:
        IndexError: If the search returns no track for ``track``.
        LookupError: If no audio-features record is returned for the hit.
    """
    found = sp.search(q=track, limit=1)
    items = found["tracks"]["items"]
    if not items:
        # Same exception type the bare [0] lookup used to raise, with a
        # message that says what actually went wrong.
        raise IndexError(f"No Spotify track found for {track!r}")

    features = sp.audio_features(items[0]["uri"])
    if not features or features[0] is None:
        raise LookupError(f"No audio features available for {track!r}")

    # Filled cell by cell so the resulting column dtypes stay as before.
    rslt_df = pd.DataFrame()
    for key, value in features[0].items():
        rslt_df.loc[0, key] = value

    return rslt_df


def _print_recommendation(songs, cluster_num, track):
    """Print one random song of cluster ``cluster_num`` whose title is not ``track``."""
    in_cluster = songs['cluster'] == cluster_num
    candidates = songs[in_cluster & (songs['song'] != track)]

    print()
    if candidates.empty:
        # DataFrame.sample() raises on an empty frame; report it instead.
        print('Sorry, there is no other song in that cluster to recommend.')
        return

    pick = candidates.sample().iloc[0]
    # Read the two scalars directly rather than through Series.to_string(),
    # so the spacing around ' by ' does not depend on how pandas pads values.
    print('***\033[1m \033[92m ' + str(pick['song']) +
          ' by ' + str(pick['artist']) + '\033[0m \033[0m ***')


def recommendator():
    """Ask for a song title and print a recommendation from the same cluster.

    Steps:
      1. Standardise the audio-feature columns of the global ``df_songs`` and
         fit a 19-cluster K-means model on them.
      2. Store every song's cluster in ``df_songs['cluster']`` (this modifies
         the global frame).
      3. If the entered title is in ``df_songs['song']`` ("hot"), use the
         cluster of its first match; otherwise ("cold") fetch its audio
         features with ``get_song_feature`` and predict the cluster.
      4. Print one random song from that cluster with a different title.

    The model is refitted on every call. Nothing is returned.
    """
    # One explicit feature list for both training and the cold-song lookup,
    # so the scaler and the model always see the same columns in the same
    # order.
    features = ['danceability', 'energy', 'loudness', 'speechiness',
                'acousticness', 'instrumentalness', 'liveness', 'valence',
                'tempo']

    track = input('Input a song: ').strip()
    if not track:
        print('No song entered.')
        return

    # Standardise, then wrap the values in a DataFrame again so the model is
    # fitted with feature names.
    scaler = StandardScaler()
    X_prep = pd.DataFrame(scaler.fit_transform(df_songs[features]),
                          columns=features)

    # NOTE(review): max_iter=2 allows only two iterations per initialisation,
    # and algorithm="full" may not be accepted by newer scikit-learn
    # releases — confirm both against the intended setup.
    kmeans = KMeans(n_clusters=19,
                    n_init=30,  # try with 1, 4, 8, 20, 30, 100...
                    max_iter=2,
                    tol=0,
                    algorithm="full",
                    random_state=1234)
    kmeans.fit(X_prep)
    df_songs["cluster"] = kmeans.predict(X_prep)

    # Boolean mask instead of a positional index, so the lookup is correct
    # whatever index df_songs carries.
    is_match = (df_songs['song'] == track).to_numpy()

    if is_match.any():
        print()
        print('\033[1;31mYour song is hot! Here\'s a new recommendation:\033[0m')

        # PENDING: offer choices when several songs share the same name;
        # for now the first match is used.
        cluster_num = int(df_songs.loc[is_match, 'cluster'].iloc[0])
    else:
        print()
        print('\033[94mYour song was cold! Here\'s a new recommendation:\033[0m')

        try:
            cold_song = get_song_feature(track)
        except LookupError:
            # Raised when the lookup finds nothing (IndexError is a subclass).
            print()
            print('Sorry, that song could not be found on Spotify.')
            return

        # Scale with the scaler fitted on the database, then predict once.
        X_cold = pd.DataFrame(
            scaler.transform(cold_song[features].astype('float64')),
            columns=features)
        cluster_num = int(kmeans.predict(X_cold)[0])

    _print_recommendation(df_songs, cluster_num, track)

In [None]:
recommendator()