In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import pandas as pd
import re

In [2]:
import sys
# insert at 1, 0 is the script path (or '' in REPL)
# NOTE(review): this is a machine-specific absolute path, so the project import
# below only resolves where the repository is checked out at this exact location.
sys.path.insert(1, '/Users/shri/GitRepositories/nba_mvp')
# Project-local scraping helper; called further down to build df_advanced_2021.
from src.data.raw_dataset import scrape_first_table

In [3]:
def url_to_soup(url):
    '''
    Fetch a URL and parse the response into a bs4 object.

    Args:
        url (str): address of the page to download

    Returns:
        bs4 object: the downloaded page parsed with the 'lxml' parser
    '''
    # Parse inside the context manager so the HTTP response is closed as soon
    # as its content has been consumed, instead of being left open.
    with urlopen(url) as html:
        return BeautifulSoup(html, 'lxml')
    
def get_row_data(soup, start):
    '''
    Collect the cell text of every table row from position `start` onwards.

    Args:
        soup (bs4 object): parsed page that is searched for <tr> elements
        start (int): index of the first <tr> element to include

    Returns:
        list of lists: one inner list per row holding the text of its <td>
        cells; a row without any <td> cell yields an empty list
    '''
    table = []
    for tr in soup.findAll('tr')[start:]:
        cells = tr.findAll('td')
        table.append([cell.getText() for cell in cells])
    return table
    
def get_seasons(corpus):
    '''
    Takes a text corpus, returns a list of seasons found in the form yyyy-yy

    Args:
        corpus (str): text to search

    Returns:
        list of str: every non-overlapping 'yyyy-yy' match, in order of appearance
    '''
    # Exactly four digits, a hyphen, exactly two digits. The lookarounds keep
    # the pattern from matching inside a longer digit run, so hyphenated
    # numbers such as '110-98' or '30-12' are not reported as seasons.
    return re.findall(r'(?<!\d)\d{4}-\d{2}(?!\d)', corpus)

def get_table_headers(soup, l, index):
    '''
    Read the column labels from one of the first `l` table rows.

    Args:
        soup (bs4 object): parsed page that is searched for <tr> elements
        l (int): maximum number of <tr> elements to fetch (passed as `limit`)
        index (int): position, within the fetched rows, of the header row

    Returns:
        headers : list of strings which represent column labels
    '''
    candidate_rows = soup.findAll('tr', limit=l)
    header_row = candidate_rows[index]
    headers = []
    for th in header_row.findAll('th'):
        headers.append(th.getText())
    return headers

In [4]:
# Let pandas show up to 40 columns before it truncates the display of wide frames.
pd.set_option("display.max_columns", 40)

In [5]:
# Page that holds the table scraped by this cell.
url = 'https://www.basketball-reference.com/leagues/NBA_2021_advanced.html'

# The web scrape happens here; the two helpers below only read the parsed page.
soup = url_to_soup(url)
headers = get_table_headers(soup, 2, 0)  # <th> labels of the first <tr>
rows_data = get_row_data(soup, 1)        # <td> text of every <tr> after the first

# headers[1:] leaves out the first label so the labels line up with the <td> cells
# (presumably the first column is rendered as <th> in the data rows -- TODO confirm).
df_advanced_2021 = pd.DataFrame(data=rows_data, columns=headers[1:])

In [7]:
# Rebuild df_advanced_2021 with the project helper; this replaces the frame that
# the previous cell assembled by hand.
# NOTE(review): presumably this downloads `url` a second time -- confirm against scrape_first_table.
df_advanced_2021 = scrape_first_table(url,headers_start=1)

In [17]:
# Keep only the rows whose team is not 'TOT'
# (presumably the combined line for players who appeared for several teams -- TODO confirm).
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice of the original and trigger
# SettingWithCopyWarning.
df_advanced_2021 = df_advanced_2021.loc[df_advanced_2021['Tm'] != 'TOT'].copy()

In [33]:
# Remove the rows in which every column is missing. Reassigning (rather than
# inplace=True) keeps the cell safe to re-run and avoids mutating a frame that
# was itself produced by filtering another one.
df_advanced_2021 = df_advanced_2021.dropna(how='all')

In [53]:
# Season length (in games) that GAMES_REMAINING is computed against further down.
TOTAL_GAMES_2021 = 72

In [43]:
# Show the column labels of the scraped table.
df_advanced_2021.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr',
       'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', ' ',
       'OWS', 'DWS', 'WS', 'WS/48', ' ', 'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [44]:
# Columns converted to numeric dtypes. 'Player', 'Pos', 'Tm' and the two
# blank-named (' ') columns are deliberately not listed. The list is defined
# once so both sides of the assignment cannot drift apart.
NUMERIC_COLUMNS = [
    'Age', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
]
df_advanced_2021[NUMERIC_COLUMNS] = df_advanced_2021[NUMERIC_COLUMNS].apply(pd.to_numeric)

In [59]:
# Largest number of games any single player has appeared in so far.
# .max() returns that value; .argmax() returns the integer *position* of the
# row holding it, which is not a game count.
GAMES_PLAYED = df_advanced_2021['G'].max()

In [61]:
# Games still to be played: season length minus the games already played.
GAMES_REMAINING = TOTAL_GAMES_2021 - GAMES_PLAYED

In [63]:
# Display the computed number of remaining games.
GAMES_REMAINING

42

In [55]:
# Show every row, ordered by games played (most first).
# display() is needed here: Jupyter only auto-renders the last top-level
# expression of a cell, and an expression nested inside a `with` block is not
# one, so without it the cell produces no output.
with pd.option_context('display.max_rows', None):
    display(df_advanced_2021.sort_values('G', ascending=False))

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,Unnamed: 19,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP
520,Ivica Zubac,C,23,LAC,30,586,20.5,0.722,0.007,0.5,13.4,24.4,19.1,7.7,0.9,3.7,14.8,15.3,,2.0,0.8,2.8,0.23,,0.8,-0.3,0.5,0.4
407,Julius Randle,PF,26,NYK,30,1101,20.7,0.589,0.267,0.379,4.0,26.6,15.5,25.8,1.0,0.5,14.2,27.6,,1.9,1.7,3.6,0.159,,3.3,0.8,4.2,1.7
32,RJ Barrett,SG,20,NYK,30,1012,13.1,0.504,0.233,0.275,4.1,15.0,9.6,14.7,1.1,0.6,10.4,23.9,,0.0,1.2,1.2,0.058,,-1.6,-0.6,-2.2,0.0
384,Elfrid Payton,PG,26,NYK,30,831,11.9,0.484,0.158,0.161,4.9,9.5,7.3,20.9,1.1,0.7,13.1,23.2,,-0.4,0.9,0.5,0.028,,-2.3,-1.1,-3.4,-0.3
179,Jeff Green,PF,34,BRK,30,780,12.2,0.647,0.534,0.218,2.1,13.3,8.1,7.2,1.2,0.8,9.6,13.9,,1.2,0.4,1.6,0.098,,-0.4,-1.0,-1.4,0.1
198,Joe Harris,SF,29,BRK,30,928,14.9,0.688,0.631,0.073,2.3,9.9,6.3,8.3,0.8,0.6,7.2,16.3,,2.1,0.3,2.4,0.123,,2.1,-1.7,0.5,0.6
292,Timothé Luwawu-Cabarrot,SF,25,BRK,30,590,9.8,0.538,0.637,0.124,3.4,9.9,6.8,8.4,1.5,0.7,9.8,16.5,,0.2,0.3,0.5,0.042,,-1.7,-1.6,-3.3,-0.2
270,Kyle Kuzma,SF,25,LAL,29,724,13.1,0.546,0.482,0.094,8.2,18.4,13.4,6.6,0.6,2.2,11.4,19.5,,0.3,1.2,1.5,0.098,,0.0,-0.9,-0.9,0.2
240,LeBron James,PG,36,LAL,29,1006,24.2,0.6,0.355,0.318,2.0,23.6,13.2,38.4,1.4,1.3,14.8,31.4,,2.3,1.9,4.2,0.2,,5.8,1.3,7.2,2.3
228,Serge Ibaka,C,31,LAC,29,701,18.2,0.593,0.287,0.164,9.1,21.5,15.5,10.6,0.5,4.6,11.7,20.5,,1.2,0.9,2.1,0.143,,1.1,-0.9,0.2,0.4


In [None]:
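# TODO: save the cleaned frame, e.g. df_advanced_2021.to_csv('../data/interim/<file name>.csv')
# (the path originally written here, '..data/interim', was missing the '/' after '..' and had no file name)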