Improvements
- additional artist
- better regex filter
- make the function that filters the Spotify data more readable
- user-input program flow
- modules to increase readability
- silhouette
- write functions for data wrangling

# Scrape Billboard Top 100

## Parse HTML Page

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.billboard.com/charts/hot-100'
# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page, which
# would leave every selector further down empty without any explanation.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
soup.select('span.chart-element__information__song')[1].text

## Extract Data

In [None]:
# Song titles: text of every matching <span>, kept in page order.
song_names = soup.select('span.chart-element__information__song')
song_lst = [song_tag.text for song_tag in song_names]

In [None]:
# Artist names: text of every matching <span>, kept in page order.
artist_names = soup.select('span.chart-element__information__artist')
artist_lst = [artist_tag.text for artist_tag in artist_names]

In [None]:
# Chart positions: text of every matching <span> (kept as strings).
ranking_names = soup.select('span.chart-element__rank__number')
ranking_lst = [rank_tag.text for rank_tag in ranking_names]

## Convert to DataFrame and save

In [None]:
# Combine the three scraped lists into one table, one row per chart entry.
chart_columns = {
    'rank': ranking_lst,
    'artist': artist_lst,
    'song': song_lst,
}
billboard_top = pd.DataFrame(chart_columns)

In [None]:
billboard_top.head()

In [None]:
billboard_top.to_csv('billboard_top.csv')

# User input validation

In [None]:
# Scratch experiment: reads one line and unpacks exactly three
# whitespace-separated tokens; any other token count raises ValueError.
# NOTE(review): a, b and c are not used again in the visible notebook.
a,b,c=input("Enter three inputs ").split()

In [509]:
import re

# Placeholder substituted for rejected input so that the chart lookup
# below simply finds nothing.
INVALID_INPUT = "9999999999xxxx"

artist_name = input('Artist: ')
if not re.match(r".{4,}", artist_name):
    print("Please enter a valid artist")
    artist_name = INVALID_INPUT

song = input('Song: ')
# Validate the song itself (the artist was re-checked here before, so the
# song was never validated). Only blank input is rejected because very
# short titles such as "Up" are legitimate.
if not song.strip():
    print("Please enter a valid song")
    song = INVALID_INPUT

result = str(hot_or_not(artist_name, song))
print(result + ': recommendation process starts...')

Artist: Cardi B
Song: Up
True: recommendation process starts...


In [None]:
def hot_or_not(artist, song):
    '''
    Report whether an artist/song pair appears in the scraped chart.

    Parameters
    ----------
    artist : str
        Text that must occur in the ``artist`` column of ``billboard_top``.
    song : str
        Text that must occur in the ``song`` column of the same row.

    Returns
    -------
    bool
        True if at least one row contains both substrings, else False.
    '''
    # Use the ``artist`` argument (not the notebook-level ``artist_name``)
    # so the function works for whatever the caller passes in.
    # regex=False treats user text literally: characters such as "(" or "."
    # neither raise a regex error nor act as wildcards.
    # na=False makes missing cells count as "no match".
    contains_artist = billboard_top['artist'].str.contains(artist, regex=False, na=False)
    contains_title = billboard_top['song'].str.contains(song, regex=False, na=False)

    # Both conditions have to hold on the same chart row.
    return bool((contains_artist & contains_title).any())

# Creating Dataframe from Spotify Wrapper

In [None]:
import getpass
import os

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm.notebook import tqdm

# Credentials must not live in the notebook. They are taken from the
# environment when available, otherwise requested interactively without
# echoing them to the screen.
# NOTE(review): the id/secret pair that used to be hard-coded here has been
# exposed and should be revoked in the Spotify developer dashboard.
spotify_client_id = os.environ.get("SPOTIPY_CLIENT_ID") or getpass.getpass("Spotify client id: ")
spotify_client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET") or getpass.getpass("Spotify client secret: ")

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=spotify_client_id,
                                                           client_secret=spotify_client_secret))

In [None]:
spotifydf = pd.DataFrame()
artistdf = pd.DataFrame()
track_lst = []
artist_1 = []
artist_2 = []
# One small frame per track, concatenated once at the end. This avoids
# growing a DataFrame row by row (quadratic, and DataFrame.append no
# longer exists in pandas 2.x).
feature_frames = []

try:
    # looping through categories
    for category in tqdm(sp.categories(limit=15)['categories']['items']):

        # fetching the playlists of a category, skipping categories that fail
        try:
            playlists = sp.category_playlists(category_id=category['id'], country=None, limit=15, offset=0)['playlists']['items']
        except Exception as err:
            print('Playlist-ID not found error', err)
            continue

        for playlist in playlists:

            # fetching the tracks of a playlist, skipping playlists that fail
            try:
                playlist_items = sp.playlist_tracks(playlist['id'])['items']
            except Exception as err:
                print('Track-ID not found error', err)
                continue

            for tracks in playlist_items:
                # Collect everything for one track BEFORE storing any of it,
                # so a failure can never leave the feature rows and the name
                # lists with different lengths. A failing track is skipped
                # on its own instead of aborting the rest of the playlist.
                try:
                    ids = tracks['track']['id']

                    # row of audio_features
                    feature_frame = pd.DataFrame(sp.audio_features(ids))
                    if len(feature_frame) != 1:
                        raise ValueError('expected exactly one audio-feature row')

                    # a single lookup supplies the name and all artists
                    track_info = sp.track(ids)
                    track_name = track_info['name']
                    track_artists = track_info['artists']
                    first_artist = track_artists[0]['name']
                    second_artist = track_artists[1]['name'] if len(track_artists) > 1 else ''
                except Exception as err:
                    print('Track-ID not found error', err)
                    continue

                feature_frames.append(feature_frame)
                track_lst.append(track_name)
                artist_1.append(first_artist)
                artist_2.append(second_artist)

except requests.exceptions.ReadTimeout:
    print("ReadTimeout")

finally:
    # Runs on interruption as well, so whatever was collected so far is kept.
    if feature_frames:
        spotifydf = pd.concat(feature_frames)
    spotifydf['track_names'] = track_lst
    spotifydf['artist_1'] = artist_1
    spotifydf['artist_2'] = artist_2

In [None]:
len(spotifydf)

In [None]:
spotifydf.dtypes

In [None]:
spotifydf.head()

In [None]:
# Extracting to CSV
spotifydf.to_csv('spotifydf.csv')

In [None]:
df = spotifydf

# Data preparation

- strip unnecessary columns
- normalize

In [None]:
# stripping dataset of unnecessary columns
# errors='ignore' skips labels that are not present, so the cell can be
# re-run and also works for a download that produced no stray `0` column.
# NOTE(review): column 0 presumably appears when an audio-features lookup
# returned None for a track; confirm.
df = df.drop([0 , 'analysis_url', 'id', 'track_href', 'type', 'uri'], axis = 1, errors = 'ignore')

In [None]:
# assign index
df = df.reset_index(drop=True)

In [None]:
df.head()

In [None]:
df.dtypes

In [None]:
df.info()

In [None]:
#filling Nulls with empty string, can be removed once new dataset was downloaded
df['artist_2']=df['artist_2'].fillna('')

In [None]:
# drop null rows
df = df.dropna(axis = 0)

In [None]:
df.info()

# Applying Model

In [388]:
# drop categorical columns - track_names, artist1, artist2
X_prep = df.select_dtypes(include = ['float64'])

In [389]:
# Fit the scaler on the numeric features and scale them in one step;
# `scaler` stays available for transforming new songs later.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_prep_np = scaler.fit_transform(X_prep)

In [390]:
# Transform to dataframe and attach header
X_prep_df = pd.DataFrame(X_prep_np, columns=X_prep.columns)

In [None]:
# Assessing optimum for number of clusters
elbow_inertia(50)

In [None]:
def elbow_inertia(r, data=None):
    '''
    Plot the k-means elbow graph (inertia against number of clusters).

    Parameters
    ----------
    r : int
        Exclusive upper bound for the cluster counts: models are fitted
        for k = 1, 2, ..., r - 1.
    data : array-like, optional
        Feature matrix to cluster. Defaults to the notebook-level
        ``X_prep_df``.

    Returns
    -------
    None
        The graph is shown with matplotlib.
    '''
    # Imported locally so the function does not depend on a later cell
    # having imported KMeans already.
    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans

    if data is None:
        data = X_prep_df

    K = range(1, r)
    # Fixed random_state keeps the curve reproducible between runs.
    inertias = [
        KMeans(n_clusters=k, random_state=1234).fit(data).inertia_
        for k in K
    ]

    plt.plot(K, inertias, 'bx-')
    plt.xlabel('# Clusters')
    plt.ylabel('Inertia')
    plt.title('Elbow Method')
    plt.show()

In [391]:
# applying Model
from sklearn.cluster import KMeans

#setting clusters
kmeans = KMeans(n_clusters = 15, random_state=1234)
kmeans.fit(X_prep_df)

KMeans(n_clusters=15, random_state=1234)

In [392]:
X_prep_df

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,1.573942,0.293467,0.479716,-0.922321,-0.588607,1.046386,0.506947,-0.439256,-1.282197,-0.070937,0.446439,0.213465,-0.873854
1,-0.908473,0.370987,-0.329351,0.953944,-0.589531,-1.463931,-0.556853,0.725325,0.779911,-0.597633,0.049210,0.213465,0.623145
2,0.524576,-0.535027,-0.287499,0.227760,-0.589598,0.488538,2.151840,0.411726,0.779911,0.675397,2.305948,0.213465,1.014667
3,-0.943451,0.690756,-0.189998,0.915724,-0.586812,0.767462,-0.605278,0.638249,0.779911,-0.526606,0.011855,0.213465,0.538698
4,-0.898711,0.690756,-0.582303,0.488352,-0.589598,-0.069310,-0.461542,0.576425,0.779911,-0.485082,0.390527,0.213465,-0.340308
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11597,-0.644097,0.947541,0.057721,-0.050205,-0.589598,0.767462,-0.938868,0.472306,0.779911,-0.580150,0.050069,0.213465,1.862966
11598,-0.459713,0.748896,1.011934,0.370217,-0.589592,1.604234,0.991191,0.479521,-1.282197,0.391288,-0.914601,0.213465,0.949413
11599,-0.692905,0.923316,2.407483,0.536996,-0.589071,0.488538,-0.723648,0.328755,0.779911,-0.117925,-1.341968,0.213465,1.893674
11600,-0.730053,0.177187,-0.071976,0.276404,-0.589598,-0.906082,1.529241,0.250511,-1.282197,2.390984,1.831913,0.213465,0.899513


- Silhouette visualization

In [393]:
#add clusters to initial spotify dataset
clusters = kmeans.predict(X_prep_df)

In [None]:
df['clusters'] = clusters

In [396]:
df.head(20)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,track_names,artist_1,artist_2,clusters
0,0.937,0.634,245897.0,0.285,0.000359,9.0,0.232,-14.007,0.0,0.0807,129.65,4.0,0.203,Your Power,Billie Eilish,,13
1,0.0215,0.65,191014.0,0.825,2.4e-05,0.0,0.0936,-4.645,1.0,0.0325,118.091,4.0,0.593,Save Your Tears (with Ariana Grande) (Remix),The Weeknd,Ariana Grande,9
2,0.55,0.463,193853.0,0.616,0.0,7.0,0.446,-7.166,1.0,0.149,183.76,4.0,0.695,tystnar i luren,Miriam Bryant,Victor Leksell,6
3,0.0086,0.716,200467.0,0.814,0.00101,8.0,0.0873,-5.345,1.0,0.039,117.004,4.0,0.571,Nån annan nu,Molly Sandén,,0
4,0.0251,0.716,173855.0,0.691,0.0,5.0,0.106,-5.842,1.0,0.0428,128.023,4.0,0.342,Lose my mind,Myra Granberg,,0
5,0.0697,0.532,184615.0,0.754,0.0,1.0,0.061,-6.834,0.0,0.223,90.992,4.0,0.717,VHS - Spotify Studio It’s Hits Recording,Benjamin Ingrosso,Cherrie,2
6,0.0833,0.689,226268.0,0.706,0.0,1.0,0.105,-4.48,0.0,0.0312,106.52,4.0,0.623,Little Bit of Love,Tom Grennan,,2
7,0.455,0.837,149706.0,0.585,0.0,2.0,0.0747,-5.69,1.0,0.0554,94.081,4.0,0.726,Ensam,Norlie & KKV,,9
8,0.169,0.819,183000.0,0.425,0.0,0.0,0.056,-4.81,1.0,0.0495,120.04,4.0,0.829,Instruktionsboken,Miss Li,,9
9,0.321,0.677,198082.0,0.696,0.0,0.0,0.42,-6.181,1.0,0.119,90.03,4.0,0.464,Peaches (feat. Daniel Caesar & Giveon),Justin Bieber,Daniel Caesar,9


# Cluster Prediction for User Input

- put user input into kmeans model

In [None]:
# retrieve acoustics from Spotify database for user input
# if result = False:
    #artist_name
    #song

In [510]:
# find track id based on user input
search_hits = sp.search(q="artist:" + artist_name + " track:" + song, type="track")['tracks']['items']
if not search_hits:
    # Same exception type as indexing the empty list, but with a message
    # that says what actually went wrong (e.g. for rejected user input).
    raise IndexError('No Spotify track found for artist "{}" and song "{}"'.format(artist_name, song))
# the first hit is used
track_id = search_hits[0]['id']

In [511]:
#extract audio features for track id
audio_features = sp.audio_features(track_id)

In [512]:
audio_features

[{'danceability': 0.868,
  'energy': 0.795,
  'key': 11,
  'loudness': -6.044,
  'mode': 1,
  'speechiness': 0.269,
  'acousticness': 0.0012,
  'instrumentalness': 0.0193,
  'liveness': 0.0461,
  'valence': 0.819,
  'tempo': 166.0,
  'type': 'audio_features',
  'id': '1XXimziG1uhM0eDNCZCrUl',
  'uri': 'spotify:track:1XXimziG1uhM0eDNCZCrUl',
  'track_href': 'https://api.spotify.com/v1/tracks/1XXimziG1uhM0eDNCZCrUl',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/1XXimziG1uhM0eDNCZCrUl',
  'duration_ms': 156945,
  'time_signature': 4}]

In [513]:
# build a dataframe from the audio features (one row per returned entry);
# constructed directly because DataFrame.append was removed in pandas 2.0
X_song = pd.DataFrame(audio_features)

In [514]:
X_song

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.868,0.795,11,-6.044,1,0.269,0.0012,0.0193,0.0461,0.819,166.0,audio_features,1XXimziG1uhM0eDNCZCrUl,spotify:track:1XXimziG1uhM0eDNCZCrUl,https://api.spotify.com/v1/tracks/1XXimziG1uhM...,https://api.spotify.com/v1/audio-analysis/1XXi...,156945,4


In [515]:
# drop identifier/URL columns; the result is assigned back because
# drop() returns a new frame and leaves X_song itself untouched.
# errors='ignore' keeps the cell safe to re-run.
X_song = X_song.drop(['id', 'track_href', 'type', 'uri', 'analysis_url'], axis = 1, errors = 'ignore')
X_song

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0.868,0.795,11,-6.044,1,0.269,0.0012,0.0193,0.0461,0.819,166.0,156945,4


In [516]:
# sort columns to initial dataframe
X_song = X_song[X_prep_df.columns]

In [517]:
# normalize user input result
X_song_prep = scaler.transform(X_song)

In [518]:
# Predict
kmeans.predict(X_song_prep)

array([5], dtype=int32)

In [519]:
int(kmeans.predict(X_song_prep))

5

# Recommendation

- filter the Spotify dataset with the predicted cluster
- select a random result
- print out the result for the user

In [520]:
# predict() returns a one-element array for the single input row; take
# that element explicitly instead of calling int() on the whole array,
# which newer NumPy versions deprecate.
song_recommendation = df[df['clusters'] == kmeans.predict(X_song_prep)[0]]

In [521]:
recommendation_lst = song_recommendation.sample(n = 1).values.tolist()

In [522]:
# Pair each value of the sampled row with its column name. Looking values
# up by name avoids splitting the list's text form on commas, which picked
# the wrong fields for titles containing a comma and printed stray quotes.
recommended = dict(zip(song_recommendation.columns, recommendation_lst[0]))
print('Song Name ' + str(recommended['track_names']))
print('Artist 1 ' + str(recommended['artist_1']))
print('Artist 2 ' + str(recommended['artist_2']))

Song Name  'MVP'
Artist 1  'Lil Eazzyy'
Artist 2  ''


In [None]:
# NOTE(review): `results` is not assigned anywhere in the visible notebook.
# This looks like a leftover from inspecting a search response; confirm or remove.
results["tracks"]["items"][0]["name"]