In [1]:
#Importing required headers
import os
import pickle
import re
import string
from datetime import date, datetime, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser

In [2]:
#Return parsed page as BeautifulSoup object.
def get_soup(start_url, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup object.

    Parameters
    ----------
    start_url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Number of seconds to wait for the
        server before giving up (default 30). Without a timeout a stalled
        connection would block the caller indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the "lxml" parser.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    response = requests.get(start_url, timeout=timeout)
    #The HTTP status is not checked: an error page is parsed like any other page.
    return BeautifulSoup(response.text, "lxml")

### A function to create a list of Billboard Hot 100 urls that catalogue each song going back to 1958

In [5]:
#Functions.

#Gets dates as regular intervals of 'delta' from start date to end date.
def perdelta(start, end, delta):
    """Yield values from ``start`` up to, but not including, ``end`` in steps of ``delta``.

    Parameters
    ----------
    start : first value to yield (e.g. a ``datetime.date``).
    end : exclusive upper bound; iteration stops once the current value is
        no longer smaller than ``end``.
    delta : step added to the current value after each yield
        (e.g. a ``datetime.timedelta``).

    Yields
    ------
    ``start``, ``start + delta``, ``start + 2 * delta``, ... while smaller than ``end``.

    Raises
    ------
    ValueError
        When the range is non-empty but ``delta`` does not move ``start``
        forward (zero or negative step); the loop would otherwise never
        finish. Raised on first iteration because this is a generator.
    """
    #Guard against a step that can never reach ``end``.
    if start < end and not start + delta > start:
        raise ValueError("delta must advance start towards end, got %r" % (delta,))
    curr = start
    while curr < end:
        yield curr
        curr += delta

#Array to store all urls that need to be scraped.
urls = []

#Every chart url is this prefix followed by the chart date.
prefix = 'http://www.billboard.com/charts/hot-100/'
chart_start, chart_stop, chart_step = date(1958, 8, 9), date(2018, 10, 31), timedelta(days=7)

#Walk the weekly dates produced by perdelta and record one url per date.
for suffix in perdelta(chart_start, chart_stop, chart_step):
    link = '{}{}'.format(prefix, suffix)
    urls.append(link)

#Look at array.
print(urls[:3])

['http://www.billboard.com/charts/hot-100/1958-08-09', 'http://www.billboard.com/charts/hot-100/1958-08-16', 'http://www.billboard.com/charts/hot-100/1958-08-23']


In [6]:
#Function to extract artist name out of BeautifulSoup object.
def get_artist(soup):
    """Collect the artist names from a parsed chart page.

    Every element with the "chart-row__artist" class contributes its text,
    stripped of surrounding whitespace, in the order ``find_all`` returns them.

    Returns a list of strings (empty when no such element exists).
    """
    return [node.text.strip() for node in soup.find_all(class_="chart-row__artist")]

In [7]:
#Function to extract track name out of BeautifulSoup object.
def get_track(soup):
    """Collect the track titles from a parsed chart page.

    Looks up every element with the "chart-row__song" class and returns the
    whitespace-stripped text of each one as a list, keeping the order in
    which ``find_all`` returned them.
    """
    rows = soup.find_all(class_="chart-row__song")
    return [row.text.strip() for row in rows]


### Get artist and track title from each url. Append to list of dataframes, merge the dataframes and write to a flat file

In [None]:
%%time
#Creating our dataset.
#Create the output folder before scraping so a missing directory cannot
#discard the finished scrape when the pickle is written at the end.
os.makedirs('dataframes', exist_ok=True)

#One dataframe per weekly chart page.
frames = []
#NOTE(review): billboard_dict is never used in the visible cells; kept in case another cell reads it.
billboard_dict = {}
for url in urls:
    soup = get_soup(url)
    artist = get_artist(soup)
    track = get_track(soup)
    #The chart date is the last path segment of the url; repeat it once per chart row.
    dates = [parser.parse(url.rsplit('/', 1)[-1])]*len(artist)
    #pandas raises ValueError here if artist and track differ in length.
    df = pd.DataFrame({'artist': artist, 'track': track, 'publish_date': dates})
    frames.append(df)

#Converting to dataframe.
df_merge = pd.concat(frames).reset_index(drop=True)
#Keep the first (earliest scraped) row for every track title.
#NOTE(review): de-duplicating on 'track' alone also drops different songs that
#share a title -- confirm this is intended, otherwise use ['artist', 'track'].
df2 = df_merge.drop_duplicates(['track'], keep='first').reset_index(drop=True)

with open('dataframes/billboard_unique.pkl', 'wb') as picklefile:
    pickle.dump(df2, picklefile)
    
def text_clean(x):
    """Return ``x`` decoded from UTF-8 when possible, otherwise ``x`` unchanged.

    Parameters
    ----------
    x : any value; typically one cell of a dataframe.

    Returns
    -------
    The result of ``x.decode('utf-8')`` when ``x`` has a ``decode`` method and
    holds valid UTF-8; in every other handled case the original ``x``.

    Only the failures expected from decoding are swallowed, so interrupts and
    unrelated errors are no longer silently hidden.
    """
    try:
        return x.decode('utf-8')
    except (AttributeError, UnicodeError):
        #No decode method (e.g. already a str) or not valid UTF-8: keep the value as it is.
        return x

#Run text_clean over every cell of df2. The pickle above was written before
#this step, so it holds the values as they were before cleaning.
#NOTE(review): DataFrame.applymap is deprecated in newer pandas releases in
#favour of DataFrame.map -- confirm against the pandas version in use.
df2 = df2.applymap(text_clean)

In [17]:
#Preview the first rows of df2.
df2.head()

Unnamed: 0,artist,publish_date,track
0,Ricky Nelson,1958-08-09,Poor Little Fool
1,Domenico Modugno,1958-08-09,Nel Blu Dipinto Di Blu (Volaré)
2,Perez Prado And His Orchestra,1958-08-09,Patricia
3,Bobby Darin,1958-08-09,Splish Splash
4,Kalin Twins,1958-08-09,When


In [19]:
#Store the data in a .csv file, UTF-8 encoded and without the index column.
df2.to_csv("../HotSongsBillBoard.csv", encoding="utf-8", index = False)