# Billboard Hot 100

## Import Dependencies 

In [1]:
#Importing dependencies for web scraping
from bs4 import BeautifulSoup
import requests
import pandas as pd


## Web Scraping - Hot 100

In [2]:
def web_scraping():
    """
    Scrapes the Billboard webpage (https://www.billboard.com/charts/hot-100/) to extract information of the hot 100 songs.

    The parsed chart is also saved as
    'Top_100_Scrapped/scrapped_top100(<YYYY-MM-DD>).csv', where the date is
    the day the function is run.

    returns:
    - a pandas DataFrame with the columns "song_ranking", "artist_name" and
      "song_title" (one row per chart row that could be parsed)

    raises:
    - requests.HTTPError if the page answers with an error status
    - requests.Timeout if the page does not answer within the timeout
    """
    import os
    from datetime import datetime

    # URL of the page to be scraped
    hot_url = "https://www.billboard.com/charts/hot-100/"

    # Retrieve the page; stop early on an HTTP error instead of parsing an
    # error page and silently producing an empty chart
    hot_response = requests.get(hot_url, timeout=30)
    hot_response.raise_for_status()

    # Creating BeautifulSoup object; parse with 'html.parser'
    hot_soup = BeautifulSoup(hot_response.text, 'html.parser')

    # Every chart entry lives in one row container
    hot_results = hot_soup.find_all('div', class_='o-chart-results-list-row-container')

    hot_list = []

    for result in hot_results:
        try:
            hot_song = result.find('h3', class_='c-title').text
            hot_ranking_artist = result.find_all('span', class_='c-label')

            hot_ranking = hot_ranking_artist[0].text

            # NOTE(review): rows with exactly 10 labels carry the artist at
            # index 3, all other rows at index 1 - presumably because of
            # extra badge labels; confirm against the current page markup
            if len(hot_ranking_artist) == 10:
                hot_artist = hot_ranking_artist[3].text
            else:
                hot_artist = hot_ranking_artist[1].text
        except (AttributeError, IndexError) as e:
            # Missing title element or too few labels: report and skip the row
            print(e)
            continue

        # strip() removes all surrounding whitespace, however much the markup
        # contains, and never cuts into the actual text
        hot_list.append([hot_ranking.strip(), hot_artist.strip(), hot_song.strip()])

    #date
    scrapped_date = datetime.today().strftime('%Y-%m-%d')

    #saving hot_list in a csv file
    top_100_scrapped_df = pd.DataFrame(hot_list, columns = ["song_ranking", "artist_name", "song_title"])

    # to_csv does not create missing folders
    os.makedirs('Top_100_Scrapped', exist_ok=True)
    top_100_scrapped_df.to_csv(f'Top_100_Scrapped/scrapped_top100({scrapped_date}).csv', index = False)

    return top_100_scrapped_df



In [3]:
# Run the scraper: it downloads the chart, writes the dated CSV file and
# returns the DataFrame that the following cells extend
top_100_scrapped_df = web_scraping()

# Spotify Data Search

## Spotify for Developers - Using the Spotipy Library

In [4]:
"""
Link to spotipy library docs
https://spotipy.readthedocs.io/en/2.19.0/#

"""

#Dependencies
import spotipy as sp
from spotipy.oauth2 import SpotifyClientCredentials
import numpy as np

#Importing Spotify client_ID and secret_code
from keys import client_id, client_secret

In [5]:
# Reserve one empty-string column per Spotify attribute; the search step
# fills them in later
for _spotify_column in (
    'track_spotify_ID',
    'artist_spotify_ID',
    'song_duration[ms]',
    'song_release_date',
    'spotify_popularity',
    'album_name',
    'album_type',
):
    top_100_scrapped_df[_spotify_column] = ""

# Show the main table with the new, still empty, columns
top_100_scrapped_df


Unnamed: 0,song_ranking,artist_name,song_title,track_spotify_ID,artist_spotify_ID,song_duration[ms],song_release_date,spotify_popularity,album_name,album_type
0,1,Adele,Easy On Me,,,,,,,
1,2,The Kid LAROI & Justin Bieber,Stay,,,,,,,
2,3,Mariah Carey,All I Want For Christmas Is You,,,,,,,
3,4,Brenda Lee,Rockin' Around The Christmas Tree,,,,,,,
4,5,Bobby Helms,Jingle Bell Rock,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
95,96,Jonas Brothers,Who's In Your Head,,,,,,,
96,97,"Yung Bleu, Chris Brown & 2 Chainz",Baddest,,,,,,,
97,98,42 Dugg Featuring Future,Maybach,,,,,,,
98,99,Aventura x Bad Bunny,Volvi,,,,,,,


In [6]:
#Authenticating requests using Client Credential Flow
# The package is imported under its own name here because the assignment below
# rebinds `sp` from the spotipy module to the client instance; building the
# client from `sp.Spotify` would fail as soon as this cell is run a second time.
import spotipy

# NOTE: `scope` is not passed to the credentials manager below
scope = "user-library-read"
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

## Search function - Extracting song, artist and album data from Spotify

In [7]:
# function to extract data for any track using search
# Note: the function reads the song title from the 'song_title' entry of the row it receives

def _track_field(track, *keys):
    """Follow `keys` into the nested `track` data; return NaN if any level is missing."""
    value = track
    try:
        for key in keys:
            value = value[key]
    except (KeyError, IndexError, TypeError):
        return np.nan
    return value


def search(table):
    """
    Searches Spotify for the song of one table row and copies the details of
    the first search result into that row.

    params:
    - table: one row of the main table (as yielded by DataFrame.iterrows());
      it must contain 'song_title' and is modified in place

    returns:
    - the same row. When the search has no results a message is printed and
      the row is returned unchanged. Otherwise 'track_spotify_ID',
      'artist_spotify_ID', 'song_duration[ms]', 'song_release_date',
      'spotify_popularity', 'album_name' and 'album_type' are filled in;
      an optional detail missing from the result is stored as NaN.
    """
    #running spotipy search function
    song_search = table['song_title']
    results = sp.search(q='track:' + song_search, type = 'track')
    tracks = results['tracks']['items']

    if len(tracks) == 0:
        print(f'Song details not found for {song_search}')
        return table

    # Only the first search result is used
    best_match = tracks[0]

    #extracting spotify ID for the track **Important to make further API calls**
    # These two are deliberately not guarded: a result without IDs is unusable
    table['track_spotify_ID'] = best_match['id']
    table['artist_spotify_ID'] = best_match['album']['artists'][0]['id']

    # song details
    table['song_duration[ms]'] = _track_field(best_match, 'duration_ms')
    table['song_release_date'] = _track_field(best_match, 'album', 'release_date')
    table['spotify_popularity'] = _track_field(best_match, 'popularity')

    # album details
    table['album_name'] = _track_field(best_match, 'album', 'name')
    table['album_type'] = _track_field(best_match, 'album', 'album_type')

    return table

In [8]:
# Iterating search function for each row of the top_100_scrapped_df
# Each row is passed to search() and whatever it returns is written back to
# the same index; rows without a search result come back unchanged and keep
# their empty strings
for index,row in top_100_scrapped_df.iterrows():
    top_100_scrapped_df.loc[index,:] = search(row)

# creating output csv for main table (index column not written)
top_100_scrapped_df.to_csv('Output_CSV/top_100_scrapped_df.csv', index=False)


Song details not found for Christmas Tree Farm (Old Timey Version)


In [11]:
# displaying top_100_scrapped_df after the search step
# (rows whose search returned nothing still hold empty strings here)
top_100_scrapped_df

Unnamed: 0,song_ranking,artist_name,song_title,track_spotify_ID,artist_spotify_ID,song_duration[ms],song_release_date,spotify_popularity,album_name,album_type
0,1,Adele,Easy On Me,0gplL1WMoJ6iYaPgMCL0gX,4dpARuHxo51G3z768sgnrY,224694,2021-10-14,100,Easy On Me,single
1,2,The Kid LAROI & Justin Bieber,Stay,5HCyWlXZPP0y6Gqq8TgA20,2tIP7SsRs7vjIcLrU85W8J,141805,2021-07-09,98,STAY (with Justin Bieber),single
2,3,Mariah Carey,All I Want For Christmas Is You,0bYg9bo50gSsH3LtXe2SQn,4iHNK0tOyZPYnBU7nGAgpQ,241106,1994-11-01,96,Merry Christmas,album
3,4,Brenda Lee,Rockin' Around The Christmas Tree,2EjXfH91m7f8HiJN1yQg97,4cPHsZM98sKzmV26wlwD2W,126266,1964-10-19,92,Merry Christmas From Brenda Lee,album
4,5,Bobby Helms,Jingle Bell Rock,7vQbuQcyTflfCIOu3Uzzya,38EmEgXkgK51MT2tPY0EoC,130973,1957-12-02,92,Jingle Bell Rock/Captain Santa Claus (And His ...,single
...,...,...,...,...,...,...,...,...,...,...
95,96,Jonas Brothers,Who's In Your Head,5nhW8I46uDE5sc5ouEbzao,7gOdHgIoIKoe4i9Tta6qdD,183536,2021-09-17,76,Who's In Your Head,single
96,97,"Yung Bleu, Chris Brown & 2 Chainz",Baddest,0sr0YMdDTNNr4Lv4VnI3C8,7wlFDEWiM5OoIAt8RSli8b,160000,2021-09-24,65,"Sincerely, Kentrell",album
97,98,42 Dugg Featuring Future,Maybach,2Qt8qG9SWPdtRiaWcPNJRm,45gHcnDnMC15sgx3VL7ROG,199093,2021-05-21,72,Free Dem Boyz,album
98,99,Aventura x Bad Bunny,Volvi,2vmfvSoZBFAt9hhRoEByLi,1qto4hHid1P71emI6Fd8xi,230125,2021-08-03,92,Volví,single


In [28]:
# Rows whose search failed still have "" as track_spotify_ID: convert every
# empty string to NaN, then drop the rows without a track ID
nan_value = float("NaN")
top_100_cleaned_df = (
    top_100_scrapped_df
    .replace("", nan_value)
    .dropna(subset=["track_spotify_ID"])
)

print(f'\nSongs with no search results have been dropped.\nDataFrame contains {len(top_100_cleaned_df)} rows')

top_100_cleaned_df


Songs with no search results have been dropped.
DataFrame contains 99 rows


Unnamed: 0,song_ranking,artist_name,song_title,track_spotify_ID,artist_spotify_ID,song_duration[ms],song_release_date,spotify_popularity,album_name,album_type
0,1,Adele,Easy On Me,0gplL1WMoJ6iYaPgMCL0gX,4dpARuHxo51G3z768sgnrY,224694.0,2021-10-14,100.0,Easy On Me,single
1,2,The Kid LAROI & Justin Bieber,Stay,5HCyWlXZPP0y6Gqq8TgA20,2tIP7SsRs7vjIcLrU85W8J,141805.0,2021-07-09,98.0,STAY (with Justin Bieber),single
2,3,Mariah Carey,All I Want For Christmas Is You,0bYg9bo50gSsH3LtXe2SQn,4iHNK0tOyZPYnBU7nGAgpQ,241106.0,1994-11-01,96.0,Merry Christmas,album
3,4,Brenda Lee,Rockin' Around The Christmas Tree,2EjXfH91m7f8HiJN1yQg97,4cPHsZM98sKzmV26wlwD2W,126266.0,1964-10-19,92.0,Merry Christmas From Brenda Lee,album
4,5,Bobby Helms,Jingle Bell Rock,7vQbuQcyTflfCIOu3Uzzya,38EmEgXkgK51MT2tPY0EoC,130973.0,1957-12-02,92.0,Jingle Bell Rock/Captain Santa Claus (And His ...,single
...,...,...,...,...,...,...,...,...,...,...
95,96,Jonas Brothers,Who's In Your Head,5nhW8I46uDE5sc5ouEbzao,7gOdHgIoIKoe4i9Tta6qdD,183536.0,2021-09-17,76.0,Who's In Your Head,single
96,97,"Yung Bleu, Chris Brown & 2 Chainz",Baddest,0sr0YMdDTNNr4Lv4VnI3C8,7wlFDEWiM5OoIAt8RSli8b,160000.0,2021-09-24,65.0,"Sincerely, Kentrell",album
97,98,42 Dugg Featuring Future,Maybach,2Qt8qG9SWPdtRiaWcPNJRm,45gHcnDnMC15sgx3VL7ROG,199093.0,2021-05-21,72.0,Free Dem Boyz,album
98,99,Aventura x Bad Bunny,Volvi,2vmfvSoZBFAt9hhRoEByLi,1qto4hHid1P71emI6Fd8xi,230125.0,2021-08-03,92.0,Volví,single


## Creating DataFrames - song_df, artist_df, album_df, concert_df

### Inspecting and cleaning the collected data

In [None]:
# ERD Diagram for reference

In [34]:
# adding song_ID
# Each song is unique, hence will have a unique song_ID
# IDs run from 1 to the number of remaining rows, in the current row order
songs_limit = len(top_100_cleaned_df)+1
song_ID = range(1,songs_limit)
top_100_cleaned_df['song_ID'] = song_ID

In [35]:
# Grouping by artist spotify ID
# `artists` holds the number of chart rows per artist_spotify_ID, so its
# length is the number of distinct artist IDs
artists = top_100_cleaned_df.groupby('artist_spotify_ID')['artist_name'].count()

print(f'There are a total of top {len(artists)} artists in the top 100 songs \nEach artist will be assigned a unique artist_ID')

There are a total of top 81 artists in the top 100 songs 
Each artist will be assigned a unique artist_ID


In [43]:
# creating artist_df

# artist_df: artist name and Spotify artist ID of every song row
artist_df = top_100_cleaned_df[['artist_name', 'artist_spotify_ID']]
# artist_grouped_df: same columns, only the first row of each artist_spotify_ID kept
artist_grouped_df = artist_df.drop_duplicates(['artist_spotify_ID'])

# displaying artist_df (the frame before duplicates are dropped)
artist_df

Unnamed: 0,artist_name,artist_spotify_ID
0,Adele,4dpARuHxo51G3z768sgnrY
1,The Kid LAROI & Justin Bieber,2tIP7SsRs7vjIcLrU85W8J
2,Mariah Carey,4iHNK0tOyZPYnBU7nGAgpQ
3,Brenda Lee,4cPHsZM98sKzmV26wlwD2W
4,Bobby Helms,38EmEgXkgK51MT2tPY0EoC
...,...,...
94,Jessie Murph,2yLzlEFtIS0Q9UkyBZdQA7
95,Jonas Brothers,7gOdHgIoIKoe4i9Tta6qdD
96,"Yung Bleu, Chris Brown & 2 Chainz",7wlFDEWiM5OoIAt8RSli8b
97,42 Dugg Featuring Future,45gHcnDnMC15sgx3VL7ROG


In [44]:
# generating csv file for the artist table (de-duplicated frame, index column not written)
artist_grouped_df.to_csv('Output_CSV/artist_df.csv', index = False)

In [36]:
# Grouping by album_name
# `albums` holds the number of chart rows per album name, so its length is
# the number of distinct album names
albums = top_100_cleaned_df.groupby('album_name')['album_name'].count()

print(f'There are a total of top {len(albums)} albums in the top 100 songs \nEach album will be assigned a unique album_ID')

There are a total of top 85 albums in the top 100 songs 
Each album will be assigned a unique album_ID


In [56]:
# creating album_df

# album_df: album name and album type of every song row
album_df = top_100_cleaned_df[['album_name', 'album_type']]
# album_grouped_df: same columns, only the first row of each album_name kept
album_grouped_df = album_df.drop_duplicates(['album_name'])

# displaying album_grouped_df
album_grouped_df

Unnamed: 0,album_name,album_type
0,Easy On Me,single
1,STAY (with Justin Bieber),single
2,Merry Christmas,album
3,Merry Christmas From Brenda Lee,album
4,Jingle Bell Rock/Captain Santa Claus (And His ...,single
...,...,...
94,Always Been You,single
95,Who's In Your Head,single
96,"Sincerely, Kentrell",album
97,Free Dem Boyz,album


In [57]:
# creating output csv for the album table (de-duplicated frame, index column not written)
album_grouped_df.to_csv('Output_CSV/album_df.csv', index=False)

In [55]:
# Build song_df from the song-level columns of the cleaned table
song_df = top_100_cleaned_df[
    [
        'song_ID',
        'song_title',
        'album_name',
        'track_spotify_ID',
        'artist_spotify_ID',
        'song_ranking',
        'spotify_popularity',
        'song_duration[ms]',
        'song_release_date',
    ]
]

# Export song_table as csv, leaving out the index column
song_df.to_csv('Output_CSV/song_df.csv', index=False)

# Show song_table
song_df

Unnamed: 0,song_ID,song_title,album_name,track_spotify_ID,artist_spotify_ID,song_ranking,spotify_popularity,song_duration[ms],song_release_date
0,1,Easy On Me,Easy On Me,0gplL1WMoJ6iYaPgMCL0gX,4dpARuHxo51G3z768sgnrY,1,100.0,224694.0,2021-10-14
1,2,Stay,STAY (with Justin Bieber),5HCyWlXZPP0y6Gqq8TgA20,2tIP7SsRs7vjIcLrU85W8J,2,98.0,141805.0,2021-07-09
2,3,All I Want For Christmas Is You,Merry Christmas,0bYg9bo50gSsH3LtXe2SQn,4iHNK0tOyZPYnBU7nGAgpQ,3,96.0,241106.0,1994-11-01
3,4,Rockin' Around The Christmas Tree,Merry Christmas From Brenda Lee,2EjXfH91m7f8HiJN1yQg97,4cPHsZM98sKzmV26wlwD2W,4,92.0,126266.0,1964-10-19
4,5,Jingle Bell Rock,Jingle Bell Rock/Captain Santa Claus (And His ...,7vQbuQcyTflfCIOu3Uzzya,38EmEgXkgK51MT2tPY0EoC,5,92.0,130973.0,1957-12-02
...,...,...,...,...,...,...,...,...,...
95,95,Who's In Your Head,Who's In Your Head,5nhW8I46uDE5sc5ouEbzao,7gOdHgIoIKoe4i9Tta6qdD,96,76.0,183536.0,2021-09-17
96,96,Baddest,"Sincerely, Kentrell",0sr0YMdDTNNr4Lv4VnI3C8,7wlFDEWiM5OoIAt8RSli8b,97,65.0,160000.0,2021-09-24
97,97,Maybach,Free Dem Boyz,2Qt8qG9SWPdtRiaWcPNJRm,45gHcnDnMC15sgx3VL7ROG,98,72.0,199093.0,2021-05-21
98,98,Volvi,Volví,2vmfvSoZBFAt9hhRoEByLi,1qto4hHid1P71emI6Fd8xi,99,92.0,230125.0,2021-08-03


# Concert Data

In [58]:
import json
from api_keys import BIT_api
import pprint
import os

In [None]:
def event_api(artists):
    """
    Requests the events of each artist from the Bandsintown REST API and
    collects the first event of every response.

    params:
    - artists: iterable of artist names; a name that appears several times is
      requested only once

    returns:
    - a pandas DataFrame with one row per distinct input name and the columns
      'input_artist', 'api_artist', 'venue', 'country', 'location', 'datetime'
      ('datetime' holds the first 10 characters of the event's datetime).
      When the request fails or the response holds no usable event, every
      column except 'input_artist' is set to 'Not Found'.
    """
    from urllib.parse import quote

    url_base = "https://rest.bandsintown.com/artists"
    not_found = 'Not Found'

    event_list = []

    # dict.fromkeys removes duplicates like set() does, but keeps the input
    # order so the rows come out in the same order on every run
    for artist in dict.fromkeys(artists):

        # Quote the name so characters such as '/', '?' or '#' cannot change
        # the structure of the URL
        artist_path = quote(artist.lower(), safe='')
        url_query = f'{url_base}/{artist_path}/events'

        try:
            response = requests.get(url_query, params={'app_id': BIT_api}, timeout=30)

            # Decoding happens inside the try block: one artist with a
            # non-JSON or unexpected reply must not abort the whole run
            first_event = response.json()[0]

            event_list.append([
                artist,
                first_event['artist']['name'],
                first_event['venue']['name'],
                first_event['venue']['country'],
                first_event['venue']['location'],
                first_event['datetime'][0:10],
            ])

        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):

            event_list.append([artist, not_found, not_found, not_found, not_found, not_found])

    event_df = pd.DataFrame(event_list, columns = ['input_artist', 'api_artist', 'venue', 'country', 'location', 'datetime'])
    print(event_df)

    # Also print the table without the artist name reported by the API
    final_df = event_df.drop(['api_artist'], axis=1)
    print(final_df)

    return event_df

# test = event_api(['ADELE', 'justin bieber', 'ADELE', 'pablo crespo'])

In [None]:
# event_df
# NOTE(review): no assignment to `event_df` is visible before this cell (the
# sample event_api(...) call is commented out), so it presumably has to be
# created with `event_df = event_api(...)` first - confirm.


# generating csv file for concert_table
# NOTE(review): unlike the other exports in this notebook this call does not
# pass index=False - confirm that the index column is wanted in the file.
event_df.to_csv('Output_CSV/concert_df.csv')

# displaying concert table (event_df)
event_df