# Scraping Team Season Results

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import time
import re
import pandas as pd

In [2]:
# Basketball Reference abbreviation of the single franchise used by the prototype scraping cell.
team_id = 'PHI'

In [39]:
# Load the MVP vote table; its Tm column is the source of the team ids to scrape.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which suggests the CSV
# was written with its index -- consider index_col=0 if that column is not wanted.
df_votes = pd.read_csv('../data/raw/mvp-votes-2020-1981')

In some years a player who received votes played for more than one team during the same season. When a multi-team season occurs, Basketball Reference (BR) often uses 'TOT' as the team name. Since these rows account for a very small fraction of the vote share and are a rare occurrence, I'm comfortable removing them from the dataset; keeping them would complicate incorporating team W/L%.

In [30]:
# Show the vote rows whose team is the multi-team placeholder 'TOT'.
df_votes[df_votes['Tm'].isin(['TOT'])]

Unnamed: 0,Year,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
144,2010,Stephen Jackson,31,TOT,0.0,1.0,1230,0.001,81,38.6,20.6,5.0,3.7,1.6,0.5,0.423,0.328,0.779,5.0,0.077
151,2009,Chauncey Billups,32,TOT,0.0,33.0,1210,0.027,79,35.3,17.7,3.0,6.4,1.2,0.2,0.418,0.408,0.913,10.1,0.174
215,2005,Vince Carter,28,TOT,0.0,3.0,1270,0.002,77,36.7,24.5,5.2,4.2,1.4,0.6,0.452,0.406,0.798,9.4,0.159
389,1995,Clyde Drexler,32,TOT,0.0,3.0,1050,0.003,76,35.9,21.8,6.3,4.8,1.8,0.6,0.461,0.36,0.824,11.7,0.206
406,1994,Dominique Wilkins,34,TOT,0.0,1.0,1010,0.001,74,35.6,26.0,6.5,2.3,1.2,0.4,0.44,0.288,0.847,8.4,0.153


In [40]:
# Drop the multi-team ('TOT') rows so every remaining vote belongs to exactly one team.
df_votes = df_votes.loc[df_votes['Tm'].ne('TOT')]

In [41]:
# Distinct team abbreviations that appear in the filtered vote table.
teams = df_votes['Tm'].unique()

In [42]:
# Display the distinct team abbreviations taken from df_votes.Tm.
teams

array(['MIL', 'LAL', 'HOU', 'DAL', 'LAC', 'OKC', 'POR', 'DEN', 'TOR',
       'MIA', 'BOS', 'GSW', 'PHI', 'UTA', 'CLE', 'NOP', 'SAS', 'MIN',
       'IND', 'WAS', 'MEM', 'CHI', 'CHA', 'NYK', 'PHO', 'ORL', 'ATL',
       'NOH', 'DET', 'SEA', 'SAC', 'NJN', 'CHH', 'WSB', 'SDC', 'KCK'],
      dtype=object)

In [31]:
# NOTE(review): the saved output of this cell still lists 'TOT' and its execution count (31)
# precedes the cell that filters 'TOT' out (40), so the output is stale. On a top-to-bottom
# run this duplicates the `teams` display; consider deleting the cell.
df_votes.Tm.unique()

array(['MIL', 'LAL', 'HOU', 'DAL', 'LAC', 'OKC', 'POR', 'DEN', 'TOR',
       'MIA', 'BOS', 'GSW', 'PHI', 'UTA', 'CLE', 'NOP', 'SAS', 'MIN',
       'IND', 'WAS', 'MEM', 'CHI', 'CHA', 'NYK', 'PHO', 'ORL', 'ATL',
       'NOH', 'TOT', 'DET', 'SEA', 'SAC', 'NJN', 'CHH', 'WSB', 'SDC',
       'KCK'], dtype=object)

In [27]:
# NOTE(review): df_teams is only created by the scraping loop further down in this notebook,
# and this cell's execution count (27) shows it was run out of order. On a fresh
# Restart & Run All this line would presumably raise NameError -- move or delete the cell.
df_teams.Team.unique()

array(['MIL', 'LAL', 'HOU', 'DAL', 'LAC', 'OKC', 'POR', 'DEN', 'TOR',
       'MIA', 'BOS', 'GSW', 'PHI', 'UTA', 'CLE'], dtype=object)

In [43]:
# Maps a team abbreviation found in df_votes.Tm to the id that should be used
# instead when building the Basketball Reference team URL. Abbreviations that
# are not listed here are used unchanged.
renamed_teams_map = dict(
    NOP='NOH',
    SEA='OKC',
    CHH='CHA',
    WSB='WAS',
    SDC='LAC',
    KCK='SAC',
)

In [47]:
# Translate every vote-table abbreviation to its scrape id (unmapped ones pass through)
# and de-duplicate. Note that, despite the name, the result is a set.
renamed_teams_list = {renamed_teams_map.get(abbr, abbr) for abbr in df_votes['Tm'].unique()}

In [48]:
# Display the de-duplicated ids that will be scraped (a set, despite the name).
renamed_teams_list 

{'ATL',
 'BOS',
 'CHA',
 'CHI',
 'CLE',
 'DAL',
 'DEN',
 'DET',
 'GSW',
 'HOU',
 'IND',
 'LAC',
 'LAL',
 'MEM',
 'MIA',
 'MIL',
 'MIN',
 'NJN',
 'NOH',
 'NYK',
 'OKC',
 'ORL',
 'PHI',
 'PHO',
 'POR',
 'SAC',
 'SAS',
 'TOR',
 'UTA',
 'WAS'}

In [7]:
# Prototype: scrape the franchise index page of a single team (team_id).
url = f'https://www.basketball-reference.com/teams/{team_id}/'

# The context manager closes the HTTP response once the document is parsed.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'lxml')

# Column labels are the <th> cells of the first table row.
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]

# Every row after the header row: the season label is read from the row's <th>,
# the remaining values from its <td> cells.
rows = soup.findAll('tr')[1:]
rows_data = [
    [td.getText() for td in row.findAll('td')]
    for row in rows
]
seasons = [row.find('th').getText() for row in rows]

#re-inserting seasons at the beginning of each row
for season, row in zip(seasons, rows_data):
    row.insert(0, season)

# Keep everything down to and including the 1980-81 season. This has to run
# AFTER the season labels are inserted: before that, row[0] is the first <td>
# value, so the comparison could never match (and an empty row would raise
# IndexError).
for i, row in enumerate(rows_data):
    if row[0] == '1980-81':
        rows_data = rows_data[:i + 1]
        break

In [8]:
# Build the single-team frame from the scraped rows, labelled with the scraped header row.
df_team = pd.DataFrame(rows_data, columns=headers)

In [9]:
# Tag every row with the team id. Item assignment is used instead of attribute
# assignment because `df.Team = ...` only sets a column when one named 'Team'
# already exists; otherwise it silently sets a plain attribute on the object.
df_team['Team'] = team_id

In [11]:
import sys
from pathlib import Path

# Make the project's `src` package importable without a machine-specific
# absolute path: walk up from the working directory to the first folder that
# contains a `src/` directory, falling back to the parent directory.
_search_dirs = [Path.cwd(), *Path.cwd().parents]
_repo_root = next((d for d in _search_dirs if (d / 'src').is_dir()), Path.cwd().parent)

# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, str(_repo_root))


In [12]:
# Debug check: show the module search path to confirm the project folder was inserted at index 1.
sys.path

['/Users/gazelle/miniconda3/envs/nba-mvp/lib_pypy/__extensions__',
 '/Users/gazelle/GitRepositories/nba_mvp',
 '/Users/gazelle/miniconda3/envs/nba-mvp/lib_pypy',
 '/Users/gazelle/miniconda3/envs/nba-mvp/lib-python/3',
 '/Users/gazelle/miniconda3/envs/nba-mvp/lib-python/3/lib-tk',
 '/Users/gazelle/miniconda3/envs/nba-mvp/lib-python/3/plat-darwin',
 '/Users/gazelle/miniconda3/envs/nba-mvp/lib-python/3/plat-mac',
 '/Users/gazelle/miniconda3/envs/nba-mvp/lib-python/3/plat-mac/lib-scriptpackages',
 '',
 '/Users/gazelle/.local/lib/python3.7/site-packages',
 '/Users/gazelle/miniconda3/envs/nba-mvp/site-packages',
 '/Users/gazelle/miniconda3/envs/nba-mvp/site-packages/IPython/extensions',
 '/Users/gazelle/.ipython']

In [13]:
from src.data.raw_dataset import scrape_team_index_pages

In [17]:
# NOTE(review): as saved, this call raised "IndexError: list index out of range" (see the
# output of this cell), so df_team is not reassigned here. It is also given `teams`
# (the raw vote-table abbreviations) rather than `renamed_teams_list` -- confirm which
# ids scrape_team_index_pages expects, then fix or remove this cell so the notebook
# runs top to bottom.
df_team = scrape_team_index_pages(teams)

IndexError: list index out of range

In [22]:
def url_to_soup(url):
    '''
    Download a page and parse it into a bs4 object.

    Args:
        url (str): address of the page to fetch

    Returns:
        BeautifulSoup object built from the response with the 'lxml' parser
    '''
    # The context manager closes the HTTP response once parsing has finished,
    # instead of leaving the connection open until garbage collection.
    with urlopen(url) as html:
        return BeautifulSoup(html, 'lxml')

def get_row_data(soup, start):
    '''
    Collect the text of every <td> cell, row by row.

    Args:
        soup (bs4 object): parsed page whose <tr> rows are read
        start (int): index of the first <tr> to include; earlier rows are skipped

    Returns:
        list of lists of strings, one inner list per <tr> from `start` onward
        (a row without any <td> cell yields an empty list)
    '''
    table = []
    for tr in soup.findAll('tr')[start:]:
        cells = []
        for td in tr.findAll('td'):
            cells.append(td.getText())
        table.append(cells)
    return table

def get_table_headers(soup, l, index):
    '''
    Read the column labels from one of the first <tr> rows of a page.

    Args:
        soup (bs4 object): parsed page to search
        l (int): maximum number of <tr> rows to fetch (passed as `limit`)
        index (int): position, within the fetched rows, of the header row

    Returns:
        headers : list of strings which represent column labels
    '''
    first_rows = soup.findAll('tr', limit=l)
    header_row = first_rows[index]
    return [th.getText() for th in header_row.findAll('th')]

In [49]:
# Scrape the franchise index page of every id in renamed_teams_list and stack
# the per-team season tables (cut off at 1980-81) into df_teams.
CRAWL_DELAY_SECONDS = 3  # minimum spacing between the start of consecutive requests

team_ids = list(renamed_teams_list)
team_frames = []  # one frame per team, concatenated once after the loop

for team_idx, team in enumerate(team_ids):
    t0 = time.time() #crawl delay initializer

    url = f'https://www.basketball-reference.com/teams/{team}/'

    soup = url_to_soup(url)
    headers = get_table_headers(soup, 2, 0)
    rows_data = get_row_data(soup, 1)
    seasons = [
        row.find('th').getText()
        for row in soup.findAll('tr')[1:]
    ]

    # Put the season label (taken from each row's <th>) in front of its <td> values.
    for season, row in zip(seasons, rows_data):
        row.insert(0, season)

    # Keep rows down to and including 1980-81. A separate index name is used so
    # the team counter of the outer loop is not overwritten.
    for row_idx, row in enumerate(rows_data):
        if row[0] == '1980-81':
            rows_data = rows_data[:row_idx + 1]
            break

    df_team = pd.DataFrame(rows_data, columns=headers)
    # Item assignment always writes the column; `df_team.Team = ...` only does so
    # when a 'Team' column already exists.
    df_team['Team'] = team
    team_frames.append(df_team)

    # Pace the crawl: wait out the remainder of the delay, except after the last
    # team. The count comes from the collection actually being iterated.
    if team_idx < len(team_ids) - 1:
        elapsed = time.time() - t0
        time.sleep(max(0, CRAWL_DELAY_SECONDS - elapsed))

# A single concat avoids re-copying the growing frame on every iteration and
# does not rely on DataFrame.append, which newer pandas versions removed.
df_teams = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

In [52]:
# Sanity check: number of distinct ids scraped vs. number of distinct
# team abbreviations in the vote table.
len(df_teams.Team.unique())  
len(df_votes.Tm.unique())

30

36

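This makes sense: 6 of the 36 team abbreviations in the vote data (NOP, SEA, CHH, WSB, SDC, KCK) were mapped onto ids that are already in the list, which leaves 30 distinct teams to scrape.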

In [53]:
# Persist the combined team-season table next to the other raw data; index=False keeps
# the row index out of the file.
df_teams.to_csv("../data/raw/team-seasons-2020-1981",index=False)