Using scraping techniques to list all songs featured on Billboard from the year of inception and performing analysis of the 

In [None]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

We will use BeautifulSoup to extract the wikitable of the data 

In [2]:
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests

In [26]:
# here we access the webpage and download the content using requests
year2018=requests.get("http://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1970")
year2018

<Response [200]>

In [27]:
# We'll just reuse the request object that was previously created to create a BeautifulSoup element.
soup = BeautifulSoup(year2018.text, "html.parser")


From the soup we will extract the wikitable with following attributes

In [28]:
# In this line we are looking for a single "table" element with a class of wikitable;
# and then looking for all the "tr" elements on that table (notice the find vs find_all calls).
rows = soup.find("table", attrs={"class": "wikitable"}).find_all("tr")[1:]
rows

[<tr>
 <td>1</td>
 <td>"<a class="mw-redirect" href="/wiki/Bridge_Over_Troubled_Water_(song)" title="Bridge Over Troubled Water (song)">Bridge Over Troubled Water</a>"</td>
 <td><a href="/wiki/Simon_%26_Garfunkel" title="Simon &amp; Garfunkel">Simon &amp; Garfunkel</a>
 </td></tr>, <tr>
 <td>2</td>
 <td>"<a href="/wiki/(They_Long_to_Be)_Close_to_You" title="(They Long to Be) Close to You">(They Long to Be) Close to You</a>"</td>
 <td><a href="/wiki/The_Carpenters" title="The Carpenters">The Carpenters</a>
 </td></tr>, <tr>
 <td>3</td>
 <td>"<a href="/wiki/American_Woman" title="American Woman">American Woman</a>"</td>
 <td><a href="/wiki/The_Guess_Who" title="The Guess Who">The Guess Who</a>
 </td></tr>, <tr>
 <td>4</td>
 <td>"<a href="/wiki/Raindrops_Keep_Fallin%27_on_My_Head" title="Raindrops Keep Fallin' on My Head">Raindrops Keep Fallin' on My Head</a>"</td>
 <td><a class="mw-redirect" href="/wiki/B.J._Thomas" title="B.J. Thomas">B.J. Thomas</a>
 </td></tr>, <tr>
 <td>5</td>
 <td>"

In [29]:
# We then define a function whose job it is to act on
# each column's element in each row in the table.
def function(element):
    Rank = int(element[0].get_text())
    Title = element[1].get_text()
    Band_singer = element[2].get_text()
    Link_artist = element[2].find("a").get("href")
    return [Rank, Title, Band_singer, Link_artist]


In [30]:
#Now we will create a dictionary of the songs linked through the keys
# Next we'll create a list of names that will be used as dictionary keys.
keys = ["Rank", "Title", "Band_singer", "Link_artist"]

# We now use the cleaner function to process each "td" element on a given row.
# It gives us a bunch of band information
# The zip function creates a list of pairs; which the dict function then uses
# to create a dictionary, using the first element of the pair as the key and the second as
# the value; and finally, the list comprehension iterates over each row element, and puts
# the result of each iteration on a list, which is then bound to the songs variable.
songs = [dict(zip(keys, function(row.find_all("td")))) for row in rows]
songs[1:4]
songs

[{'Band_singer': 'Simon & Garfunkel\n',
  'Link_artist': '/wiki/Simon_%26_Garfunkel',
  'Rank': 1,
  'Title': '"Bridge Over Troubled Water"'},
 {'Band_singer': 'The Carpenters\n',
  'Link_artist': '/wiki/The_Carpenters',
  'Rank': 2,
  'Title': '"(They Long to Be) Close to You"'},
 {'Band_singer': 'The Guess Who\n',
  'Link_artist': '/wiki/The_Guess_Who',
  'Rank': 3,
  'Title': '"American Woman"'},
 {'Band_singer': 'B.J. Thomas\n',
  'Link_artist': '/wiki/B.J._Thomas',
  'Rank': 4,
  'Title': '"Raindrops Keep Fallin\' on My Head"'},
 {'Band_singer': 'Edwin Starr\n',
  'Link_artist': '/wiki/Edwin_Starr',
  'Rank': 5,
  'Title': '"War"'},
 {'Band_singer': 'Diana Ross\n',
  'Link_artist': '/wiki/Diana_Ross',
  'Rank': 6,
  'Title': '"Ain\'t No Mountain High Enough"'},
 {'Band_singer': 'The Jackson 5\n',
  'Link_artist': '/wiki/The_Jackson_5',
  'Rank': 7,
  'Title': '"I\'ll Be There"'},
 {'Band_singer': 'Rare Earth\n',
  'Link_artist': '/wiki/Rare_Earth_(band)',
  'Rank': 8,
  'Title': '

Now we will extract the data from inception of awards i.e. year 1950 till 2018

In [24]:
years=range(1950, 2018)
print(years)
yearstext={}
for y in years:
    print(y)
    yreq=requests.get("http://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_%i" % y)
    yearstext[y]=yreq.text

range(1950, 2018)
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
