# Scrapping the Billboard for 100 hot songs

Billboard maintains a weekly Top 100 of "hot" songs here: https://www.billboard.com/charts/hot-100.

Scrape the current top 100 songs and their respective artists, and put the information into a pandas dataframe.

In [2]:
from bs4 import BeautifulSoup
import re
import requests
import pandas as pd

In [3]:
url = "https://www.billboard.com/charts/hot-100"

response = requests.get(url)

In [4]:
#check response status code 
response.status_code

200

In [5]:
#parse and store the contents of the url call
soup = BeautifulSoup(response.content,'html.parser')

In [6]:
#prettify the soup 
# soup.prettify()

## Query the soup to get songs and their respective artists

In [7]:
# get song title
soup.select('h3.c-title.a-no-trucate')[0].text

'\nEasy On Me\n'

In [8]:
soup.select('span.c-label.a-font-primary-s')[0].text

'\nAdele\n'

## Creating a dataframe from scraped data

In [9]:
song= []
artist = []

len_song = len(soup.select('h3.c-title.a-no-trucate'))

In [10]:
for i in range(len_song):
    song.append(soup.select('h3.c-title.a-no-trucate')[i].text)
    artist.append(soup.select('span.c-label.a-font-primary-s')[i].text)

In [11]:
hot_songs = pd.DataFrame({'song':song,'artist':artist})

In [12]:
hot_songs.head()

Unnamed: 0,song,artist
0,\nEasy On Me\n,\nAdele\n
1,\nStay\n,\nThe Kid LAROI & Justin Bieber\n
2,\nIndustry Baby\n,\nLil Nas X & Jack Harlow\n
3,\nAll Too Well (Taylor's Version)\n,\nTaylor Swift\n
4,\nOh My God\n,\nAdele\n


## Cleaning / Wrangling steps for the scraped data 

In [13]:
def remove_text_inside_brackets(text, brackets="()[]"):
    count = [0] * (len(brackets) // 2) # count open/close brackets
    saved_chars = []
    for character in text:
        for i, b in enumerate(brackets):
            if character == b: # found bracket
                kind, is_close = divmod(i, 2)
                count[kind] += (-1)**is_close # `+1`: open, `-1`: close
                if count[kind] < 0: # unbalanced bracket
                    count[kind] = 0  # keep it
                else:  # found bracket to remove
                    break
        else: # character is not a [balanced] bracket
            if not any(count): # outside brackets
                saved_chars.append(character)
    return ''.join(saved_chars)

In [14]:
hot_songs['song'] = hot_songs['song'].str.replace('\n','').apply(lambda x: remove_text_inside_brackets(x, brackets="()[]")).str.lower()
hot_songs['artist'] = hot_songs['artist'].str.replace('\n','').str.lower()



In [None]:
hot_songs.tail(100)

# Bringing in spottify data

In [17]:
spotify_data = pd.read_csv('spottify_data.csv')

In [18]:
spotify_data.reset_index(drop=True, inplace=True)

In [20]:
spotify_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18682 entries, 0 to 18681
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   song     18682 non-null  object
 1   uri      18682 non-null  object
 2   artist   18682 non-null  object
 3   cluster  18682 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 583.9+ KB


In [24]:
#some more cleaning and getting song_id, uri doesn't work very well.
spotify_data['song']=spotify_data['song'].apply(lambda x: remove_text_inside_brackets(x, brackets="()[]")).str.strip().str.lower()
spotify_data['artist']= spotify_data['artist'].apply(lambda x: remove_text_inside_brackets(x, brackets="()[]")).str.strip().str.lower()
spotify_data['song_id']= spotify_data['uri'].apply(lambda x: x.rsplit(':')[-1])


In [25]:
spotify_data.head()

Unnamed: 0,song,uri,artist,cluster,song_id
0,johanna,spotify:track:7xOh2WlB3L3J2fh2rtVFY0,geoff zanelli,306,7xOh2WlB3L3J2fh2rtVFY0
1,hong kong,spotify:track:6rztuzIBEueoBWNYyhmKa4,geoff zanelli,81,6rztuzIBEueoBWNYyhmKa4
2,the farmer's daughter,spotify:track:1XSeMdMvsZbkdRatUMd2sp,geoff zanelli,81,1XSeMdMvsZbkdRatUMd2sp
3,the painted lady may be in play,spotify:track:63CpQ7ltku7C1YyBxXgo8A,geoff zanelli,51,63CpQ7ltku7C1YyBxXgo8A
4,spinoza,spotify:track:0AY4PnjCPP4ZeDWX9AST7y,geoff zanelli,81,0AY4PnjCPP4ZeDWX9AST7y


# 2nd prototype

In [26]:
from random import randint

In [27]:
from IPython.display import IFrame
from IPython.core.display import display

In [28]:
# will be used to get the common string between song entered and hot_song list

def matchingString(x,y):
    match=''
    for i in range(0,len(x)):
        for j in range(0,len(y)):
            k=1
            # now applying while condition untill we find a substring match and length of substring is less than length of x and y
            while (i+k <= len(x) and j+k <= len(y) and x[i:i+k]==y[j:j+k]):
                if len(match) <= len(x[i:i+k]):
                    match = x[i:i+k]
                k=k+1
    return match 


In [29]:
def recommender():
    song = input('What is the name of your song? ')
    song = song.lower().strip()
    
    if song == '':
        print('Not a mind reader :) Please give me a hint!')
        song = input('What is the name of your song? ')
    
    
    elif song in list(hot_songs['artist']): 
        response = input('Do you want a hot song by '+ song.title() +'? ')
        if response == 'yes':
            lst_song = list(hot_songs['song'].where(hot_songs['artist'] == song).dropna())
            x = randint(0,len(lst_song)-1)
            print('I recommend this hot song: '+lst_song[x].title()+ ' by '+ song.title())
            try:
                track_id = str((spotify_data.loc[spotify_data.song == lst_song[x], 'song_id'].values[0]))
                display(IFrame(src=f"https://open.spotify.com/embed/track/{track_id}",
                     width = '320', height = '80',allow = 'encreypted-media'))
            except:
                None
            return
            
        else:
            answer = input("Sorry, it is not a hot song :( Do you want to try again?: " )
            if answer == 'yes': 
                recommender()
            else:
                print('Have a good day! See you next time.')
            
    elif song in list(hot_songs['song']):
        answer = input("Do you mean " + song.title() + " by " + hot_songs.loc[hot_songs.song == song, 'artist'].values[0].title() +  "? ")
        if answer == 'yes':
            if song in list(hot_songs['song']):
                x = randint(0,99)
                print("Nice! This is a hot song! You might also like " + hot_songs['song'][x].title() + " by " + hot_songs['artist'][x].title())
                try:
                    track_id = str((spotify_data.loc[spotify_data.song == hot_songs['song'][x], 'song_id'].values[0]))
                    display(IFrame(src=f"https://open.spotify.com/embed/track/{track_id}",
                              width = '320', height = '80',allow = 'encreypted-media'))
                except:
                    None
                return
        else:
            answer = input("Do you want to try again?: " )
            if answer == 'yes': 
                recommender()
            else:
                print('Have a good day! See you next time.')
                
    if song in list(spotify_data['song']):
        answer = input("Do you mean " + song.title() + " by " + spotify_data.loc[spotify_data.song == song, 'artist'].values[0].title() +  "? ")
        if answer == 'yes' and spotify_data.loc[spotify_data.song == song, 'artist'].values[0] in list(hot_songs['artist']):
            lst_song = list(hot_songs['song'].where(hot_songs['artist'] == spotify_data.loc[spotify_data.song == song, 'artist'].values[0]).dropna())
            
            x = randint(0,len(lst_song)-1)
            if lst_song[x] != song:
                print('Not a hot song but you may enjoy this hot song ' + lst_song[x].title()+ ' by ' + hot_songs.loc[hot_songs.song == lst_song[x], 'artist'].values[0].title())   
                try:
                    track_id = str(spotify_data.loc[spotify_data.song ==lst_song[x], 'song_id'].values[0])
                    display(IFrame(src=f"https://open.spotify.com/embed/track/{track_id}",
                            width = '320', height = '80',allow = 'encreypted-media'))
                except:
                    None
            return
        
        
        if answer == 'yes' and (spotify_data.loc[spotify_data.song == song, 'artist'].values[0] not in list(hot_songs['artist'])):
            lst_cluster = list(spotify_data['song'].where(spotify_data['cluster'] == spotify_data.loc[spotify_data.song == song, 'cluster'].values[0]).dropna())
            x = randint(0,len(lst_cluster)-1)
            print('Not a hot song but from my spotify collection you may also enjoy '+ lst_cluster[x].title() + 'by '+ spotify_data.loc[spotify_data.song == lst_cluster[x], 'artist'].values[0].title())
            try:
                track_id = str((spotify_data.loc[spotify_data.song ==lst_cluster[x], 'song_id'].values[0]))
                display(IFrame(src=f"https://open.spotify.com/embed/track/{track_id}",
                          width = '320', height = '80',allow = 'encreypted-media'))
            except:
                None
            return
        else:
            answer = input("Sorry, it is not a hot song :( Do you want to try again? " )
            if answer == 'yes': 
                recommender()
            else:
                print('Have a good day! See you next time.')
    else: 
            common = {}
            for j in range(len(hot_songs['song'])):
                match = matchingString(song, hot_songs['song'][j])
                key = hot_songs['song'][j]
                common[key] = match 
                #print(max(len(match) for match in list(common.items())))
                if len(match) == max(len(match) for match in list(common.values())):
                    guess = [k for k,v in common.items() if v == match]
            else:
                for i in range(len(guess)):
                    answer = input("Do you mean " + guess[i].title()+ " by " + hot_songs['artist'][i].title() +  "? ")
                    if answer.lower() == 'yes':
                        x = randint(0,99)
                        print("Nice! This is a hot song! You might also like " + hot_songs['song'][x].title() + " by " + hot_songs['artist'][x].title())
                        try:
                            track_id = str((spotify_data.loc[spotify_data.song ==hot_songs['song'][x], 'song_id'].values[0]))
                            display(IFrame(src=f"https://open.spotify.com/embed/track/{track_id}",
                                      width = '320', height = '80',allow = 'encreypted-media'))
                        except: 
                            None
                        return
                    elif answer.lower() == 'no':
                        i+=1
                        if i not in range(len(guess)):
                            print("Sorry, I don't have your song :( I am improving please come back later!") 
   

In [30]:
recommender() # if someone enters an artist name, tries to find a hot song by that artist and suggest

What is the name of your song? adele
Do you want a hot song by Adele? yes
I recommend this hot song: My Little Love by Adele


In [33]:
recommender()  # if answer is no, no recommendation and start over option , at last it gives a message and it overs
                

What is the name of your song? adele
Do you want a hot song by Adele? no
Sorry, it is not a hot song :( Do you want to try again?: no
Have a good day! See you next time.
Do you mean Fair Trade by Adele? no
Sorry, I don't have your song :( I am improving please come back later!


In [46]:
recommender()     # if the song is in spotify list, first try to get the artist and suggest a hot song by that artist

What is the name of your song? state of grace
Do you mean State Of Grace by Taylor Swift? yes
Not a hot song but you may enjoy this hot song Message In A Bottle   by Taylor Swift


In [43]:
lst_song = list(hot_songs['song'].where(hot_songs['artist'] == 'taylor swift').dropna())

In [44]:
lst_song

['all too well ', 'message in a bottle  ', 'red ', 'state of grace ']