In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# url that we are scraping
url = "http://www.basketball-reference.com/draft/NBA_2014.html"

# this is the html from the given url
html = urlopen(url)

In [3]:
soup = BeautifulSoup(html, "lxml")

In [4]:
type(soup)

bs4.BeautifulSoup

In [5]:
column_headers = [th.getText() for th in 
                  soup.findAll('tr', limit=2)[1].findAll('th')]

column_headers # our column headers

['Rk',
 'Pk',
 'Tm',
 'Player',
 'College',
 'Yrs',
 'G',
 'MP',
 'PTS',
 'TRB',
 'AST',
 'FG%',
 '3P%',
 'FT%',
 'MP',
 'PTS',
 'TRB',
 'AST',
 'WS',
 'WS/48',
 'BPM',
 'VORP']

In [6]:
data_rows = soup.findAll('tr')[2:] # skip the first 2 header rows
type(data_rows) # now we have a list of table rows

list

The difference between extracting the play data and extracting the column headers is that the player data is in a 2-dimensional format (or a matrix), so we have to construct a 2-dimenstional list. We can do this using a nested list comprehense

In [7]:
player_data = [[td.getText() for td in data_rows[i].findAll(['td','th'])]
               for i in range(len(data_rows))]

In [8]:
player_data_02 = [] # create an empty list to hold all the data

for i in range(len(data_rows)): # for each table row
    player_row = [] # create an emply list for each pick/player
    
    # for each table data element from each table row
    for td in data_rows[i].findAll('td'):
        # get the text content and append to the player_row
        player_row.append(td.getText())
        
    # then append each pick/player to the player_data matrix
    player_data_02.append(player_row)

In [9]:
player_data == player_data_02

False

In [10]:
df = pd.DataFrame(player_data, columns=column_headers)



In [11]:
df.head() # lets see the 1st 5 rows of our DataFrame by default

Unnamed: 0,Rk,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P%,FT%,MP.1,PTS.1,TRB.1,AST,WS,WS/48,BPM,VORP
0,1,1,CLE,Andrew Wiggins,University of Kansas,4,250,9033,5097,1017,...,0.331,0.755,36.1,20.4,4.1,2.1,10.5,0.056,-2.4,-0.8
1,2,2,MIL,Jabari Parker,Duke University,3,152,4874,2403,847,...,0.341,0.748,32.1,15.8,5.6,2.1,9.0,0.088,-1.3,0.9
2,3,3,PHI,Joel Embiid,University of Kansas,2,35,894,707,285,...,0.342,0.792,25.5,20.2,8.1,2.1,2.1,0.114,2.9,1.1
3,4,4,ORL,Aaron Gordon,University of Arizona,4,207,5029,2036,1104,...,0.297,0.702,24.3,9.8,5.3,1.5,10.6,0.101,0.0,2.6
4,5,5,UTA,Dante Exum,,2,148,3045,805,263,...,0.308,0.743,20.6,5.4,1.8,2.1,1.1,0.017,-3.3,-1.0
