In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:
# Create url templates for each kind of stats
per_g_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}\
_per_game.html"
adv_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}\
_advanced.html"
tot_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}\
_totals.html"
per_36m_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}_\
per_minute.html"
per_100p_url_template = "https://www.basketball-reference.com/leagues/NBA_{year}_\
per_poss.html"

# Put all the URL templates into a list
url_template_list = [per_g_url_template, adv_url_template, tot_url_template, 
                     per_36m_url_template, per_100p_url_template]

In [3]:
# Create empty lists to store data before appending to Dataframe
column_headers = []
player_data = []
# Create empty DataFrame for following functions to fill
df = pd.DataFrame()

In [4]:
# Empty DataFrames for each set of pages
df_adv = pd.DataFrame()
df_per_g = pd.DataFrame()
df_tot = pd.DataFrame()
df_per_36m = pd.DataFrame()
df_per_100p = pd.DataFrame

In [5]:
# Get column headers from each page
# Assigns a new list of column headers each time this is called
def get_column_headers(soup):
    headers = []
    for th in soup.find('tr').findAll('th'):
        #print th.getText()
        headers.append(th.getText())
    #print headers # this line was for a bug check
    # Assign global variable to headers gathered by function
    return headers    
    #column_headers = [th.getText() for th in soup.find('tr').findAll('th')]

In [6]:
# old function that's a mess
def get_player_data(soup):
    temp_player_data = []
    for i in range(len(soup.findAll('tr')[1:])):
        # temp list to store player data
        player_row = []
        
        # Loop through 'td' tags to extract player data
        for td in soup.findAll('tr')[1:][i].findAll('td'):
            player_row.append(td.getText())
        
        # Append data to a list    
        temp_player_data.append(player_row)
        
        # Replace global variable with gathered player data
    print temp_player_data
    player_data = temp_player_data

In [7]:
# Get player data from each page
# Replaces global variable at the end
#def get_player_data(soup):
    # list comprehension to grab player data and replace global list each time
#    player_data = [[td.getText() for td in soup.findAll('tr')[1:][i].findAll('td')]
#                  for i in range(len(soup.findAll('tr')[1:]))]

In [8]:
# Function to get player data from each page
def get_player_data(soup):
    # Temporary list within function to store data
    temp_player_data = []
    
    data_rows = soup.findAll('tr')[1:] # skip first row
    for i in range(len(data_rows)): # loop through each table row
        player_row = [] # empty list for each player row
        for td in data_rows[i].findAll('td'):
            player_row.append(td.getText()) # append separate data points
        temp_player_data.append(player_row) # append player row data
    return temp_player_data

In [9]:
def scrape_page(url):
    r = requests.get(url) # get the url
    soup = BeautifulSoup(r.text, 'html.parser') # Create BS object
    
    # call function to get column headers
    column_headers = get_column_headers(soup)
    
    # call function to get player data
    player_data = get_player_data(soup)
    
    # input data to DataFrame
    # Skip first value of column headers, 'Rk'
    df = pd.DataFrame(player_data, columns = column_headers[1:])
    
    return df

In [10]:
def get_season(input_year):
    first_yr = input_year - 1
    season = str(first_yr) + "-" + str(year)[2:]
    return season

In [29]:
print get_season(2016)

2015-16


In [11]:
def gen_cleaning(df):
    # Drop columns with no data
    df.dropna(axis = 1, how = "all", inplace = True)
    
    # Drop rows with no data
    df.dropna(axis = 0, how = "all", inplace = True)
    
    # Remove duplicates player inputs; ie. players who were traded
    # I only kept the TOT per game season values
    df.drop_duplicates(["Player"], keep = "first", inplace = True)
    
    # Change % symbol to _perc
    df.columns = df.columns.str.replace('%', '_perc')
    
    return df

In [27]:
def scrape_pages(url_template, start_year, end_year, output_df):
    for year in range(start_year, end_year):
        url = url_template.format(year = year) # grab URL per year
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html5lib') # Create soup item
        
        # Check to grab column headers
        if len(column_headers) == 0: # only append column headers once
            columns = get_column_headers(soup)
        else:
            continue
            
        # grab player data for each year
        player_data = get_player_data(soup)
        
        # Create temporary DataFrame first for each year
        year_df = pd.DataFrame(player_data, columns = columns[1:])
        year_df.insert(0, 'Season', get_season(year)) # insert season year column
        print year_df.tail(n=5)
        
        # Append to big DataFrame for detailed cleaning
        #output_df = output_df.append(year_df, ignore_index = True)

In [13]:
for url_page in url_template_list:
    column_headers = [] # clear column headers each time so function will work
    pass

In [14]:
# This bunch of code is just for me to check things as I go

#url = "https://www.basketball-reference.com/leagues/NBA_2017_advanced.html"
#r = requests.get(url)
#soup = BeautifulSoup(r.text, 'html.parser')
#column_headers = get_column_headers(soup)
#player_data = get_player_data(soup)
#print column_headers
#print player_data

In [15]:
df = scrape_page("https://www.basketball-reference.com/leagues/NBA_2017_advanced.html")

In [16]:
df.head(n=5)

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,Alex Abrines,SG,23,OKC,68,1055,10.1,0.56,0.724,0.144,...,,1.2,0.9,2.1,0.096,,-0.3,-2.2,-2.5,-0.1
1,Quincy Acy,PF,26,TOT,38,558,11.8,0.565,0.529,0.353,...,,0.5,0.5,0.9,0.082,,-1.8,-1.2,-3.0,-0.1
2,Quincy Acy,PF,26,DAL,6,48,-1.4,0.355,0.412,0.176,...,,-0.2,0.0,-0.1,-0.133,,-10.1,-6.0,-16.2,-0.2
3,Quincy Acy,PF,26,BRK,32,510,13.1,0.587,0.542,0.373,...,,0.6,0.5,1.1,0.102,,-1.1,-0.7,-1.8,0.0
4,Steven Adams,C,23,OKC,80,2389,16.5,0.589,0.002,0.392,...,,3.3,3.1,6.5,0.13,,-0.7,1.2,0.6,1.5


In [17]:
# Check DataFrame column names
list(df)

[u'Player',
 u'Pos',
 u'Age',
 u'Tm',
 u'G',
 u'MP',
 u'PER',
 u'TS%',
 u'3PAr',
 u'FTr',
 u'ORB%',
 u'DRB%',
 u'TRB%',
 u'AST%',
 u'STL%',
 u'BLK%',
 u'TOV%',
 u'USG%',
 u'\xa0',
 u'OWS',
 u'DWS',
 u'WS',
 u'WS/48',
 u'\xa0',
 u'OBPM',
 u'DBPM',
 u'BPM',
 u'VORP']

In [18]:
df = gen_cleaning(df)

In [19]:
len(df)

486

In [20]:
df.head(n=8)

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS_perc,3PAr,FTr,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,Alex Abrines,SG,23,OKC,68,1055,10.1,0.56,0.724,0.144,...,,1.2,0.9,2.1,0.096,,-0.3,-2.2,-2.5,-0.1
1,Quincy Acy,PF,26,TOT,38,558,11.8,0.565,0.529,0.353,...,,0.5,0.5,0.9,0.082,,-1.8,-1.2,-3.0,-0.1
4,Steven Adams,C,23,OKC,80,2389,16.5,0.589,0.002,0.392,...,,3.3,3.1,6.5,0.13,,-0.7,1.2,0.6,1.5
5,Arron Afflalo,SG,31,SAC,61,1580,8.9,0.559,0.36,0.221,...,,1.2,0.2,1.4,0.043,,-1.4,-2.1,-3.5,-0.6
6,Alexis Ajinca,C,28,NOP,39,584,12.9,0.529,0.022,0.225,...,,0.0,0.9,1.0,0.08,,-5.1,1.0,-4.1,-0.3
7,Cole Aldrich,C,28,MIN,62,531,12.7,0.549,0.0,0.256,...,,0.6,0.7,1.3,0.116,,-2.0,2.6,0.6,0.4
8,LaMarcus Aldridge,PF,31,SAS,72,2335,18.6,0.532,0.053,0.258,...,,3.5,3.7,7.2,0.149,,-0.3,1.3,1.0,1.8
9,Lavoy Allen,PF,27,IND,61,871,11.6,0.485,0.006,0.196,...,,0.9,0.8,1.7,0.093,,-1.5,1.3,-0.3,0.4


In [21]:
for year in range(1977, 2017): # for each year
    # Year starts after NBA-ABA merger
    # URLs for each type of page
    per_g_url = per_g_url_template.format(year=year)
    adv_url = adv_url_template.format(year=year)
    tot_url = tot_url_template.format(year=year)
    per_36m_url = per_36m_url_template.format(year=year)
    per_100p_url = per_100p_url_template.format(year=year)

In [22]:
column_headers = []

In [23]:
len(column_headers)

0

In [24]:
len(player_data)

0

In [28]:
scrape_pages(adv_url_template, 2015, 2016, df_adv)

      Season          Player Pos Age   Tm   G    MP   PER   TS%  3PAr ...     \
670  2014-16  Thaddeus Young  PF  26  TOT  76  2434  15.7  .507  .119 ...      
671  2014-16  Thaddeus Young  PF  26  MIN  48  1605  15.0  .491  .101 ...      
672  2014-16  Thaddeus Young  PF  26  BRK  28   829  17.1  .539  .153 ...      
673  2014-16     Cody Zeller   C  22  CHO  62  1487  14.1  .530  .003 ...      
674  2014-16    Tyler Zeller   C  25  BOS  82  1731  18.9  .594  .000 ...      

     OWS  DWS   WS WS/48    OBPM  DBPM   BPM VORP  
670  1.6  1.4  3.1  .061     0.2  -0.3  -0.1  1.2  
671  0.8  0.6  1.4  .043     0.1  -0.4  -0.4  0.7  
672  0.8  0.8  1.6  .095     0.4   0.0   0.3  0.5  
673  1.6  2.2  3.8  .123    -2.1   2.5   0.4  0.9  
674  4.5  2.0  6.5  .179     0.4   0.4   0.9  1.2  

[5 rows x 29 columns]
