# Prototype

## Business case

Step 1:
- scraping data form wikipedia (https://en.wikipedia.org/wiki/Triple_J_Hottest_100) and billboard (https://www.billboard.com/charts/hot-100)
- Create user input
- check if song is in the list
- if song is in the list, recommend 3 other songs from the list (random)
- if song is not in the list, return no recommendations

Step 2:
- Accept multiple values for '&' and '+' (and)
- add a link to recommended songs on spotify
- Scrape data every week, check for updates, remove songs that are not in the list, add songs that are new
- Split python file web scraping and python file recommendating songs

## Scraping websites

In [1]:
# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from time import sleep
import numpy as np
import random

In [2]:
url = "https://www.billboard.com/charts/hot-100"
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [3]:
soup = BeautifulSoup(response.content, "html.parser")

In [4]:
song_2021 = []
artist_2021 = []
year_2021 = []
songs_2021 = soup.findAll('span',attrs={"class":"chart-element__information"})
for x in songs_2021:
    try:
        song = str(x.find('span', attrs={'class': 'chart-element__information__song text--truncate color--primary'}).text).strip()
        song_2021.append(song)
    except:
        song_2021.append('NA')
    try:
        artist = str(x.find('span', attrs={'class': 'chart-element__information__artist text--truncate color--secondary'}).text).strip()
        artist_2021.append(artist)
    except:
        artist_2021.append('NA')
    year_2021.append('2021')
    
df_2021 = pd.DataFrame({'song':song_2021, 'artist':artist_2021, 'year': year_2021})

In [5]:
df_2021

Unnamed: 0,song,artist,year
0,Drivers License,Olivia Rodrigo,2021
1,34+35,Ariana Grande,2021
2,Calling My Phone,Lil Tjay Featuring 6LACK,2021
3,Blinding Lights,The Weeknd,2021
4,Up,Cardi B,2021
...,...,...,...
95,Almost Maybes,Jordan Davis,2021
96,Back To The Streets,Saweetie Featuring Jhene Aiko,2021
97,Bad Boy,Juice WRLD & Young Thug,2021
98,Opp Stoppa,YBN Nahmir Featuring 21 Savage,2021


In [6]:
url2 = "https://en.wikipedia.org/wiki/Triple_J_Hottest_100,_2020"
response2 = requests.get(url2)
response2.status_code # 200 status code means OK!

200

In [7]:
soup2 = BeautifulSoup(response2.content, "html.parser")
soup2

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Triple J Hottest 100, 2020 - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"YDYl9usft2eIhV-5lQQatgAAAJc","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Triple_J_Hottest_100,_2020","wgTitle":"Triple J Hottest 100, 2020","wgCurRevisionId":1007684741,"wgRevisionId":1007684741,"wgArticleId":65741235,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages which use infobox templates with ignored data cells","Triple J Hottest 100"],"wgPageContentLanguage":"en","wgPageContentModel":"w

In [8]:
link_songs = soup2.select("table.wikitable.sortable > tbody > tr > td")[2].get_text()
link_songs 

'Glass Animals'

In [9]:
song_2020 = []
artist_2020 = []
year_2020 = []

if (response2.status_code == 200):
    wait_time = np.random.randint(1,4)
    sleep(wait_time)
    soup2 = BeautifulSoup(response2.content, "html.parser")
    songs_2020 = soup2.select("table.wikitable.sortable > tbody > tr")
    for i in songs_2020:
        response2 = requests.get(url2)
        try:
            #sub = i.select("td")
            #song_2020.append(sub[1]).get_text()
            song_2020.append(i.select("td")[1].get_text().replace("\n",""))
        except:
            song_2020.append('NA')
        try:
            #sub = i.select("td")
            #artist_2020.append(sub[2]).get_text()
            artist_2020.append(i.select("td")[2].get_text().replace("\n",""))
        except:
            artist_2020.append('NA')
        year_2020.append('2020')
    
df_2020 = pd.DataFrame({'song':song_2020[0:101], 'artist':artist_2020[0:101], 'year': year_2020[0:101]})

In [10]:
df_2020

Unnamed: 0,song,artist,year
0,,,2020
1,Heat Waves,Glass Animals,2020
2,Booster Seat,Spacey Jane,2020
3,The Difference,Flume and Toro y Moi,2020
4,Cherub,Ball Park Music,2020
...,...,...,...
96,Germaphobe,Hockey Dad,2020
97,Audacity,Stormzy featuring Headie One,2020
98,Your Man,Joji,2020
99,Itch,Hockey Dad,2020


In [11]:
df_final = pd.concat([df_2020,df_2021], ignore_index=True)
df_final

Unnamed: 0,song,artist,year
0,,,2020
1,Heat Waves,Glass Animals,2020
2,Booster Seat,Spacey Jane,2020
3,The Difference,Flume and Toro y Moi,2020
4,Cherub,Ball Park Music,2020
...,...,...,...
196,Almost Maybes,Jordan Davis,2021
197,Back To The Streets,Saweetie Featuring Jhene Aiko,2021
198,Bad Boy,Juice WRLD & Young Thug,2021
199,Opp Stoppa,YBN Nahmir Featuring 21 Savage,2021


In [12]:
df_final = df_final[1:]
df_final

Unnamed: 0,song,artist,year
1,Heat Waves,Glass Animals,2020
2,Booster Seat,Spacey Jane,2020
3,The Difference,Flume and Toro y Moi,2020
4,Cherub,Ball Park Music,2020
5,Lost in Yesterday,Tame Impala,2020
...,...,...,...
196,Almost Maybes,Jordan Davis,2021
197,Back To The Streets,Saweetie Featuring Jhene Aiko,2021
198,Bad Boy,Juice WRLD & Young Thug,2021
199,Opp Stoppa,YBN Nahmir Featuring 21 Savage,2021


In [13]:
spec_chars = ["!",'"',"#","%","(",")",
              "*",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–"]
for char in spec_chars:
    df_final['song'] = df_final['song'].str.replace(char, '')

df_final['song']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['song'] = df_final['song'].str.replace(char, '')


1                 Heat Waves
2               Booster Seat
3             The Difference
4                     Cherub
5          Lost in Yesterday
               ...          
196            Almost Maybes
197      Back To The Streets
198                  Bad Boy
199               Opp Stoppa
200    How They Remember You
Name: song, Length: 200, dtype: object

In [14]:
song_dict = {'é': 'e', 'à' : 'a', ' (like a version)' : '', ' (flume remix)' : '', ' (go baby)' : '', ' (okokok)' : ''}
df_final['song'].replace(song_dict, regex=True, inplace=True)
df_final['song']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


1                 Heat Waves
2               Booster Seat
3             The Difference
4                     Cherub
5          Lost in Yesterday
               ...          
196            Almost Maybes
197      Back To The Streets
198                  Bad Boy
199               Opp Stoppa
200    How They Remember You
Name: song, Length: 200, dtype: object

In [15]:
song_lower = map(lambda x:x.lower(),df_final['song'])
songlist = list(song_lower)
songlist

['heat waves',
 'booster seat',
 'the difference',
 'cherub',
 'lost in yesterday',
 'wap',
 'hyperfine',
 'sending me ur loving',
 "i'm good",
 'therefore i am',
 'on our own',
 'get on the beers',
 'rockstar',
 'tombstone',
 'skin',
 'screw loose',
 'is it true',
 'tangerine',
 'you should be sad',
 'addicted to the sunshine',
 'energy',
 'complicated',
 'good news',
 'blue world',
 'reality check please',
 'no plans to make plans',
 'under the thunder',
 'straightfaced',
 'dribble',
 'bagilam bargan',
 'reasons',
 'criminals',
 'breathe deeper',
 'everybody rise',
 'running red lights',
 'forget me too',
 'so done',
 'parasite eve',
 'righteous',
 'come & go',
 'i still dream about you',
 'whats poppin',
 'together',
 'you & i',
 'as long as you care',
 'pretty lady',
 'animals',
 "ain't it different",
 'wishing well',
 'nothing to love about love',
 'your love deja vu',
 'the glow',
 'in your eyes',
 'blue flume remix',
 "i think you're great",
 'on the line',
 'sobercoaster',
 'fl

In [27]:
def get_song_input():
    while True:
        song_title = input("Please enter your favorite song: ").lower()
        if not song_title in list(songlist):
            print("We have no recommendations for you")
            continue
        else:
            #print("You're favorite song is:", song_title, "\n")
            print("We found some awesome songs for you that you probably like:\n", ",  ".join(random.sample(songlist, k=3)))
            break
    return song_title

In [28]:
song_title = get_song_input()
song_title

Please enter your favorite song: skin
You're favorite song is: skin 

We found some awesome songs for you that you probably like:
 you got it,  bagilam bargan,  down to one


'skin'

In [18]:
import sys
sys.path.insert(1, 'C:/Users/ROB3942/ironhack/unit7/')
from config import * # config.py
import requests
import spotipy
import json
from spotipy.oauth2 import SpotifyClientCredentials

In [30]:
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id= my_spotify_clientid,
                                                           client_secret= my_spotify_clientsecret))

In [31]:
chosen_song = sp.search(q=song_title, limit=1) 
chosen_song["tracks"]["items"][0]["uri"]
my_dict = sp.audio_features(skin["tracks"]["items"][0]["uri"])[0] # my_dict is now a dictionary

my_dict
# Change it to a pandas dataframe
#my_dict_new = { key:[my_dict[key]] for key in list(my_dict.keys()) }
#pd.DataFrame(my_dict_new)

{'danceability': 0.557,
 'energy': 0.457,
 'key': 7,
 'loudness': -5.372,
 'mode': 1,
 'speechiness': 0.0335,
 'acousticness': 0.428,
 'instrumentalness': 0,
 'liveness': 0.0567,
 'valence': 0.328,
 'tempo': 105.918,
 'type': 'audio_features',
 'id': '03B2SfXuvDh1m9F4tqrX07',
 'uri': 'spotify:track:03B2SfXuvDh1m9F4tqrX07',
 'track_href': 'https://api.spotify.com/v1/tracks/03B2SfXuvDh1m9F4tqrX07',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/03B2SfXuvDh1m9F4tqrX07',
 'duration_ms': 177500,
 'time_signature': 4}

In [32]:
#Reading data
df = pd.read_csv('../dataset/data.csv')
df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,['Mamie Smith'],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,['Mamie Smith'],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,['Mixe'],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


In [34]:
df = df.set_index("name")

In [35]:
num = df.select_dtypes(np.number)
num

Unnamed: 0_level_0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Keep A Song In Your Soul,0.991000,0.598,168333,0.224,0,0.000522,5,0.3790,-12.628,0,12,0.0936,149.976,0.6340,1920
I Put A Spell On You,0.643000,0.852,150200,0.517,0,0.026400,5,0.0809,-7.261,0,7,0.0534,86.889,0.9500,1920
Golfing Papa,0.993000,0.647,163827,0.186,0,0.000018,0,0.5190,-12.098,1,4,0.1740,97.600,0.6890,1920
True House Music - Xavier Santos & Carlos Gomix Remix,0.000173,0.730,422087,0.798,0,0.801000,2,0.1280,-7.311,1,17,0.0425,127.997,0.0422,1920
Xuniverxe,0.295000,0.704,165224,0.707,1,0.000246,10,0.4020,-6.036,0,2,0.0768,122.076,0.2990,1920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The One,0.009170,0.792,147615,0.866,0,0.000060,6,0.1780,-5.089,0,0,0.0356,125.972,0.1860,2020
A Little More,0.795000,0.429,144720,0.211,0,0.000000,4,0.1960,-11.665,1,0,0.0360,94.710,0.2280,2021
Together,0.806000,0.671,218147,0.589,0,0.920000,4,0.1130,-12.393,0,0,0.0282,108.058,0.7140,2020
champagne problems,0.920000,0.462,244000,0.240,1,0.000000,0,0.1130,-12.077,1,69,0.0377,171.319,0.3200,2021


In [40]:
num.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,year
count,174389.0,174389.0,174389.0,174389.0,174389.0,174389.0,174389.0,174389.0,174389.0,174389.0,174389.0,174389.0,174389.0,174389.0,174389.0
mean,0.499228,0.536758,232810.0,0.482721,0.068135,0.197252,5.205305,0.211123,-11.750865,0.702384,25.693381,0.105729,117.0065,0.524533,1977.061764
std,0.379936,0.176025,148395.8,0.272685,0.251978,0.334574,3.518292,0.180493,5.691591,0.457211,21.87274,0.18226,30.254178,0.264477,26.90795
min,0.0,0.0,4937.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,1920.0
25%,0.0877,0.414,166133.0,0.249,0.0,0.0,2.0,0.0992,-14.908,0.0,1.0,0.0352,93.931,0.311,1955.0
50%,0.517,0.548,205787.0,0.465,0.0,0.000524,5.0,0.138,-10.836,1.0,25.0,0.0455,115.816,0.536,1977.0
75%,0.895,0.669,265720.0,0.711,0.0,0.252,8.0,0.27,-7.499,1.0,42.0,0.0763,135.011,0.743,1999.0
max,0.996,0.988,5338302.0,1.0,1.0,1.0,11.0,1.0,3.855,1.0,100.0,0.971,243.507,1.0,2021.0


In [37]:
from sklearn.preprocessing import StandardScaler # based on the distance, like clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [41]:
df_prep = StandardScaler().fit_transform(num)

In [42]:
pd.DataFrame(df_prep, columns=num.columns).head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,year
0,1.294358,0.347919,-0.434495,-0.948791,-0.270401,-0.588004,-0.058354,0.930106,-0.154111,-1.536239,-0.62605,-0.066549,1.089753,0.413903,-2.120635
1,0.378411,1.790898,-0.556689,0.12571,-0.270401,-0.510657,-0.058354,-0.721489,0.788862,-1.536239,-0.854645,-0.287113,-0.995485,1.608718,-2.120635
2,1.299622,0.626289,-0.46486,-1.088146,-0.270401,-0.589511,-1.479502,1.705763,-0.060991,0.65094,-0.991803,0.37458,-0.64145,0.621861,-2.120635
3,-1.313529,1.097814,1.275491,1.156204,-0.270401,1.804534,-0.911043,-0.460536,0.780077,0.65094,-0.397454,-0.346918,0.363273,-1.823729,-2.120635
4,-0.537536,0.950107,-0.455446,0.822485,3.698207,-0.588829,1.362794,1.057535,1.004092,-1.536239,-1.083241,-0.158725,0.167564,-0.852753,-2.120635


In [49]:
kmeans = KMeans(n_clusters=20, random_state=1234)
kmeans.fit(df_prep)

KMeans(n_clusters=20, random_state=1234)

In [50]:
clusters = kmeans.predict(df_prep)
pd.Series(clusters).value_counts().sort_index()

0     13326
1     13631
2     10261
3     13922
4     11159
5      8156
6      9182
7      4643
8      2571
9     11218
10     5639
11    15161
12     7815
13     4353
14     2561
15    11350
16     7612
17    10200
18    11481
19      148
dtype: int64

In [51]:
X_df = pd.DataFrame(num)
X_df["cluster"] = clusters
X_df.head()

Unnamed: 0_level_0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,year,cluster
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Keep A Song In Your Soul,0.991,0.598,168333,0.224,0,0.000522,5,0.379,-12.628,0,12,0.0936,149.976,0.634,1920,17
I Put A Spell On You,0.643,0.852,150200,0.517,0,0.0264,5,0.0809,-7.261,0,7,0.0534,86.889,0.95,1920,17
Golfing Papa,0.993,0.647,163827,0.186,0,1.8e-05,0,0.519,-12.098,1,4,0.174,97.6,0.689,1920,0
True House Music - Xavier Santos & Carlos Gomix Remix,0.000173,0.73,422087,0.798,0,0.801,2,0.128,-7.311,1,17,0.0425,127.997,0.0422,1920,12
Xuniverxe,0.295,0.704,165224,0.707,1,0.000246,10,0.402,-6.036,0,2,0.0768,122.076,0.299,1920,6


In [52]:
kmeans.inertia_

1096603.9623190837

In [47]:
n_init = [1,4,8,20,30,100]
for i in n_init:
    kmeans = KMeans(n_clusters=8, init="random", n_init=i, max_iter=2, tol=0, algorithm="full", random_state=1234)
    kmeans.fit(df_prep)
    print(kmeans.inertia_)
    clusters2 = kmeans.predict(df_prep)

1637448.4634598221
1593882.5069351266
1542393.7333554165
1542393.733355416
1542393.733355416
1506427.7310653303


In [55]:
cluster_nr =[10,20,40,50,60]
init_nr =[3,5,7,9]
iter_nr = [2,3,4,5]
tol_nr = [0,1,2]
for x in cluster_nr:
    for y in init_nr:
        for z in iter_nr:
            for w in tol_nr:
            
                kmeans = KMeans(n_clusters=x,
                                init="random",
                                n_init=y,  # try with 1, 4, 8, 20, 30, 100...
                                max_iter=z,
                                tol=w,
                                algorithm="full",
                                random_state=1234)
                kmeans.fit(df_prep)
                if kmeans.inertia_ < 860000:
                    print(x,y,z,w)
                    print(kmeans.inertia_)

In [None]:
K = range(2, 20)
inertia = []

for k in K:
    kmeans = KMeans(n_clusters=k,
                    random_state=1234)
    kmeans.fit(df_prep)
    inertia.append(kmeans.inertia_)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,8))
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('inertia')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Elbow Method showing the optimal k')

In [None]:
K = range(2, 20)
silhouette = []

for k in K:
    kmeans = KMeans(n_clusters=k,
                    random_state=1234)
    kmeans.fit(X_prep)
    silhouette.append(silhouette_score(df_prep, kmeans.predict(X_prep)))


plt.figure(figsize=(16,8))
plt.plot(K, silhouette, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Silhouette Method showing the optimal k')