# Scrape Billboard Top 100 charts from 1958-2012 to fill in missing data if a song was popular or not from original Million Song database.  

## 1958 is the earliest they have on record, even though the MSDatabase goes back to 1922

In [34]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import pickle
import string
from dateutil import parser
import time

### A function to create list of Billboard Hot 100 urls that catalogue each song going back to 1958, up until 2012 

In [35]:
def get_soup(start_url):
    response = requests.get(start_url)
    page = response.text
    return BeautifulSoup(page, "lxml")

In [36]:
from datetime import date, datetime, timedelta

def perdelta(start, end, delta):
    curr = start
    while curr < end:
        yield curr
        curr += delta

urls = []
for suffix in perdelta(date(1958, 8, 9), date(2012, 12, 31), timedelta(days=7)):
    prefix = 'http://www.billboard.com/charts/hot-100/'
    link = prefix + str(suffix)
    urls.append(link)


In [37]:
urls[:3]

['http://www.billboard.com/charts/hot-100/1958-08-09',
 'http://www.billboard.com/charts/hot-100/1958-08-16',
 'http://www.billboard.com/charts/hot-100/1958-08-23']

In [38]:
#gets artist name from page
def get_artist(soup):
    artist = soup.find_all(class_="chart-list-item__artist")
    artists = []
    for item in artist:
        item = item.text.strip()
        artists.append(item)
    return artists

In [39]:
#gets song title from page
def get_track(soup):
    track = soup.find_all(class_="chart-list-item__title-text")
    tracks = []
    for item in track:
        item = item.text.strip()
        tracks.append(item)
    return tracks

### Get artist and track title from each URL. Append to list of dataframes, merge the dataframes and save to file

In [40]:
%%time
frames = []
billboard_dict = {}
for url in urls:
    soup = get_soup(url)
    artist = get_artist(soup)
    track = get_track(soup)
    dates = [parser.parse(url.split('/')[5])]*len(artist)
    df = pd.DataFrame({'artist': artist, 'track': track, 'publish_date': dates})
    time.sleep(2)
    frames.append(df)

CPU times: user 9min 54s, sys: 7.94 s, total: 10min 2s
Wall time: 2h 52min 18s


In [41]:
df_merge = pd.concat(frames).reset_index(drop=True)

In [42]:
with open('./data/billboard_rawdf.pkl', 'wb') as picklefile:
    pickle.dump(df_merge, picklefile)

In [43]:
with open('./data/billboard_rawdf.pkl', 'rb') as picklefile:
    df_bb = pickle.load(picklefile, encoding="utf-8")

In [44]:
#delete any duplicated songs (i.e. popular for several weeks)
df2 = df_merge.drop_duplicates(['track'], keep='first').reset_index(drop=True)

In [45]:
with open('./data/billboard_unique.pkl', 'wb') as picklefile:
    pickle.dump(df2, picklefile)

In [46]:
#cleanup of df with the way it was encoded
def text_clean(x):
    try:
        x = x.decode('utf-8')
    except:
        None
    return x

df2 = df2.applymap(text_clean)

In [47]:
df2.head()

Unnamed: 0,artist,track,publish_date
0,Domenico Modugno,Nel Blu Dipinto Di Blu (Volaré),1958-08-09
1,Perez Prado And His Orchestra,Patricia,1958-08-09
2,Bobby Darin,Splish Splash,1958-08-09
3,Kalin Twins,When,1958-08-09
4,Jack Scott,My True Love,1958-08-09


In [None]:
#export to file
df2.to_csv("./Data/billboard.csv", encoding="utf-8", index = False)

In [50]:
df2.shape

(21176, 3)