Improvements
- additional artist
- better regex filter
- making function to filter spotify more readable
- User input programm flow

# Scrape Billboard Top 100

## Parse HTML Page

In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.billboard.com/charts/hot-100'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

In [3]:
soup.select('span.chart-element__information__song')[1].text

'Leave The Door Open'

## Extract Data

In [4]:
#song_names
song_lst = []
song_names = soup.select('span.chart-element__information__song')

for song in song_names:
    song_lst.append(song.text)

In [5]:
#artist_names
artist_lst = []
artist_names = soup.select('span.chart-element__information__artist')

for artist in artist_names:
    artist_lst.append(artist.text)

In [6]:
#ranking
ranking_lst = []
ranking_names = soup.select('span.chart-element__rank__number')

for rank in ranking_names:
    ranking_lst.append(rank.text)

## Convert to DataFrame and save

In [7]:
billboard_top = pd.DataFrame({'rank':ranking_lst, 'artist': artist_lst, 'song': song_lst})

In [8]:
billboard_top.head()

Unnamed: 0,rank,artist,song
0,1,The Weeknd & Ariana Grande,Save Your Tears
1,2,Silk Sonic (Bruno Mars & Anderson .Paak),Leave The Door Open
2,3,Justin Bieber Featuring Daniel Caesar & Giveon,Peaches
3,4,Polo G,Rapstar
4,5,Dua Lipa Featuring DaBaby,Levitating


In [9]:
billboard_top.to_csv('billboard_top.csv')

# User input validation

In [10]:
import re

artist_name = input('Artist: ')
if not re.match(".{4,}", artist_name):
        print ("Please enter a valid artist")
        artist_name = "9999999999xxxx"
        
song = input('Song: ')
if not re.match(".{4,}", artist_name):
        print ("Please enter a valid artist")
        song = "9999999999xxxx"
print(hot_or_not(artist_name, song))

Artist: test
Song: test


NameError: name 'hot_or_not' is not defined

In [None]:
def hot_or_not (artist, song):
    '''
    function determines if artist is in the scraped database
    '''
    contains_artist = billboard_top['artist'].str.contains(artist_name)
    contains_title = billboard_top['song'].str.contains(song)
    results = pd.DataFrame({'contains_artist': contains_artist, 'contains_title': contains_title, "match": contains_artist & contains_title})

    if results['match'].any():
        s = 'Hot'
    else:
        s = 'Not'
    return s

# Creating Dataframe from Spotify Wrapper

In [17]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import getpass
import pandas as pd
from tqdm.notebook import tqdm

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id="7ac25cd2df894d4587f18918f0776908",
                                                           client_secret="947331b0345946ef847d79f23802ee84"))

In [14]:
spotifydf = pd.DataFrame()
artistdf = pd.DataFrame()
track_lst = []
artist_1 = []
artist_2 = []

#looping through categories
try:
    for category in tqdm(sp.categories(limit=15)['categories']['items']):

        #looping through playlists, ignoring ids that do not exist
        try:
            for playlist in sp.category_playlists(category_id=category['id'], country=None, limit=15, offset=0)['playlists']['items']:

                #looping through tracks, ignoring ids that do not exist
                try:
                    for tracks in sp.playlist_tracks(playlist['id'])['items']:
                        ids = tracks['track']['id']

                        #adding rows of audio_features
                        spotifydf = spotifydf.append(sp.audio_features(tracks['track']['id']))

                        #lst of track names
                        track_lst.append(sp.track(ids)['name'])

                        #lst of artist 1
                        artist_1.append(sp.track(ids)['artists'][0]['name'])

                        #lst of artist 2
                        try: artist_2.append(sp.track(ids)['artists'][1]['name'])
                        except: artist_2.append(None)
                except:
                    print('Track-ID not found error')
                    pass
        except:
            print('Playlist-ID not found error')
            pass
                    
except requests.exceptions.ReadTimeout:
    print("ReadTimeout")

finally: 
    spotifydf['track_names'] = track_lst
    spotifydf['artist_1'] = artist_1
    spotifydf['artist_2'] = artist_2

  0%|          | 0/15 [00:00<?, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s][A
  8%|▊         | 1/13 [00:20<04:06, 20.51s/it][A
 15%|█▌        | 2/13 [01:00<05:49, 31.80s/it][A
 23%|██▎       | 3/13 [01:20<04:25, 26.55s/it][A
 31%|███       | 4/13 [01:52<04:19, 28.85s/it][A
 38%|███▊      | 5/13 [02:12<03:23, 25.47s/it][A
 46%|████▌     | 6/13 [02:46<03:17, 28.25s/it][A
 54%|█████▍    | 7/13 [03:06<02:34, 25.69s/it][A
 62%|██████▏   | 8/13 [03:26<01:59, 23.83s/it][A
 69%|██████▉   | 9/13 [03:55<01:42, 25.51s/it][A
 77%|███████▋  | 10/13 [04:15<01:11, 23.87s/it][A
 85%|████████▍ | 11/13 [04:35<00:45, 22.70s/it][A
 92%|█████████▏| 12/13 [04:55<00:21, 21.69s/it][A
100%|██████████| 13/13 [05:15<00:00, 24.25s/it][A
  7%|▋         | 1/15 [05:15<1:13:35, 315.39s/it]
  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [00:40<05:20, 40.09s/it][A
 22%|██▏       | 2/9 [01:20<04:40, 40.13s/it][A
 33%|███▎      | 3/9 [02:01<04:02, 40.44s/it][A
 44%|████▍     | 4/9 [02:41

Track-ID not found error



 22%|██▏       | 2/9 [00:29<01:44, 14.95s/it][A
 33%|███▎      | 3/9 [01:09<02:31, 25.24s/it][A
 44%|████▍     | 4/9 [01:49<02:33, 30.78s/it][A
 56%|█████▌    | 5/9 [02:29<02:16, 34.03s/it][A
 67%|██████▋   | 6/9 [03:02<01:40, 33.50s/it][A
 78%|███████▊  | 7/9 [03:33<01:05, 32.84s/it][A
 89%|████████▉ | 8/9 [04:13<00:35, 35.20s/it][A
100%|██████████| 9/9 [04:33<00:00, 30.44s/it][A
 53%|█████▎    | 8/15 [44:06<39:32, 338.89s/it]
  0%|          | 0/7 [00:00<?, ?it/s][A
 14%|█▍        | 1/7 [00:25<02:31, 25.21s/it][A
 29%|██▊       | 2/7 [01:04<02:48, 33.63s/it][A
 43%|████▎     | 3/7 [01:28<01:56, 29.08s/it][A
 57%|█████▋    | 4/7 [02:08<01:40, 33.40s/it][A
 71%|███████▏  | 5/7 [02:45<01:09, 34.63s/it][A
 86%|████████▌ | 6/7 [02:56<00:26, 26.58s/it][A
100%|██████████| 7/7 [03:35<00:00, 30.85s/it][A
 60%|██████    | 9/15 [47:42<30:03, 300.53s/it]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:41<02:45, 41.43s/it][A
 40%|████      | 2/5 [01:03<01:30, 30

In [15]:
len(spotifydf)

11604

In [16]:
# Extracting to CSV
spotifydf.to_csv('spotifydf.csv')