# Base website URL to scrape from: 

## https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2020

**Import necessary libraries for scraping websites**

In [1]:
import requests
import bs4

In [2]:
# Only Billboard Year-End pages from 1959-Present follow this format. So we'll start with these years
base_url = "https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{}"

In [8]:
# Every page after the last official page will contain text stating that there is no article created yet
res = requests.get(base_url.format('no_article_page')) # make up a url that doesn't contain an article
soup = bs4.BeautifulSoup(res.text, 'lxml')

soup.select('b')

[<b>Billboard Year-End Hot 100 singles of no article page</b>,
 <b>Wikipedia does not have an article with this exact name.</b>,
 <b>Other reasons this message may be displayed:</b>,
 <b><a href="/wiki/Case_sensitivity" title="Case sensitivity">case sensitive</a></b>,
 <b>deletion log</b>]

In [9]:
# Extract relevant information for a single page
res = requests.get(base_url.format('2005'))
soup = bs4.BeautifulSoup(res.text, 'lxml')

In [87]:
# grab table containing the songs
wikitable = soup.select('.wikitable')[0]

In [109]:
# grab td tags containing information for each songs
td = wikitable.select('td')

In [144]:
td[1]

<td>"<a class="mw-redirect" href="/wiki/We_Belong_Together_(Mariah_Carey_song)" title="We Belong Together (Mariah Carey song)">We Belong Together</a>"</td>

In [142]:
rank_list = []

# rank data start at the 0th td tag, and are on every 3 td tags afterwards
for rank in td[0::3]:
    rank_list.append(rank.text) # add each rank to list
    
print(rank_list)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100']


In [151]:
title_list = []

# song title data start at the 1st td tag, and are on every 3 td tags afterwards
for title in td[1::3]:
    # append each song title to list, while also stripping double quotes
    title_list.append(title.text[1:-1])

print(title_list)

['We Belong Together', 'Hollaback Girl', 'Let Me Love You', 'Since U Been Gone', '1, 2 Step', 'Gold Digger', 'Boulevard of Broken Dreams', 'Candy Shop', "Don't Cha", 'Behind These Hazel Eyes', 'Disco Inferno', 'You and Me', "Don't Phunk with My Heart", 'Lose Control', 'Shake It Off', 'Mr. Brightside', 'Just a Lil Bit', 'Pon de Replay', 'How We Do', 'Beverly Hills', 'Oh', 'Lonely No More', "Drop It Like It's Hot", 'Hate It or Love It', 'Lovers & Friends', 'Soldier', 'Breakaway', 'Switch', 'Let Me Hold You', 'Like You', 'Rich Girl', 'My Humps', 'Obsession (No Es Amor)', 'Caught Up', 'Listen to Your Heart', 'Scars', 'Feel Good Inc.', 'Let Me Go', 'Holiday', "Sugar, We're Goin Down", 'Grind with Me', 'Run It!', 'Photograph', 'Lonely', 'Collide', 'Wake Me Up When September Ends', 'Slow Down', 'Over and Over', 'Some Cut', "Get It Poppin'", 'Play', 'Soul Survivor', "Pimpin' All Over the World", 'My Boo', 'Sugar (Gimme Some)', 'Karma', 'Speed of Sound', "I Don't Want to Be", 'Mockingbird', 'La

In [152]:
artist_list = []

for artist in td[2::3]:
    print(artist.text)

Mariah Carey

Gwen Stefani

Mario

Kelly Clarkson

Ciara featuring Missy Elliott

Kanye West featuring Jamie Foxx

Green Day

50 Cent featuring Olivia

The Pussycat Dolls featuring Busta Rhymes

Kelly Clarkson

50 Cent

Lifehouse

The Black Eyed Peas

Missy Elliott featuring Ciara and Fatman Scoop

Mariah Carey

The Killers

50 Cent

Rihanna

The Game featuring 50 Cent

Weezer

Ciara featuring Ludacris

Rob Thomas

Snoop Dogg featuring Pharrell

The Game featuring 50 Cent

Lil Jon featuring Usher and Ludacris

Destiny's Child featuring T.I. and Lil Wayne

Kelly Clarkson

Will Smith

Bow Wow featuring Omarion

Bow Wow featuring Ciara

Gwen Stefani featuring Eve

The Black Eyed Peas

Frankie J featuring Baby Bash

Usher

DHT featuring Edmee

Papa Roach

Gorillaz

3 Doors Down

Green Day

Fall Out Boy

Pretty Ricky

Chris Brown featuring Juelz Santana

Nickelback

Akon

Howie Day

Green Day

Bobby Valentino

Nelly featuring Tim McGraw

Trillville featuring Cutty

Fat Joe featuring Nelly



In [111]:
td

[<td>1</td>,
 <td>"<a class="mw-redirect" href="/wiki/We_Belong_Together_(Mariah_Carey_song)" title="We Belong Together (Mariah Carey song)">We Belong Together</a>"</td>,
 <td><a href="/wiki/Mariah_Carey" title="Mariah Carey">Mariah Carey</a>
 </td>,
 <td>2</td>,
 <td>"<a href="/wiki/Hollaback_Girl" title="Hollaback Girl">Hollaback Girl</a>"</td>,
 <td><a href="/wiki/Gwen_Stefani" title="Gwen Stefani">Gwen Stefani</a>
 </td>,
 <td>3</td>,
 <td>"<a href="/wiki/Let_Me_Love_You_(Mario_song)" title="Let Me Love You (Mario song)">Let Me Love You</a>"</td>,
 <td><a class="mw-redirect" href="/wiki/Mario_(entertainer)" title="Mario (entertainer)">Mario</a>
 </td>,
 <td>4</td>,
 <td>"<a href="/wiki/Since_U_Been_Gone" title="Since U Been Gone">Since U Been Gone</a>"</td>,
 <td><a href="/wiki/Kelly_Clarkson" title="Kelly Clarkson">Kelly Clarkson</a>
 </td>,
 <td>5</td>,
 <td>"<a href="/wiki/1,_2_Step" title="1, 2 Step">1, 2 Step</a>"</td>,
 <td><a href="/wiki/Ciara" title="Ciara">Ciara</a> featur

In [7]:
# first year to start with
i = 1959

while True:
    '''
    Extract number, song title, and artist
    '''

[<b>Billboard Year-End Hot 100 singles of no article page</b>,
 <b>Wikipedia does not have an article with this exact name.</b>,
 <b>Other reasons this message may be displayed:</b>,
 <b><a href="/wiki/Case_sensitivity" title="Case sensitivity">case sensitive</a></b>,
 <b>deletion log</b>]