In [1]:
# Import relevant libraries
from splinter import Browser
from bs4 import BeautifulSoup
import os
import pandas as pd

In [2]:
DATA_DIR = "data"
STANDINGS_DIR = os.path.join(DATA_DIR, "standings")
SCORES_DIR = os.path.join(DATA_DIR, "scores")

In [3]:
seasons = list(range(2021,2022))
seasons

[2021]

In [4]:
# Set up the browser
executable_path = {'executable_path':"C:\Program Files (x86)\msedgedriver.exe"}
browser = Browser('edge', **executable_path)

In [5]:
for season in seasons:
    # Visit the website for scraping
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    browser.visit(url)

    # Create a BeautifulSoup object
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    filter_div = soup.find('div', class_='filter')
    links = filter_div.find_all('a')
    urls = [link.get("href") for link in links]

In [6]:
table = soup.find('table', id='schedule')

links = table.find_all("a")
hrefs = [link.get('href') for link in links]
box_scores = [link for link in hrefs if link and "boxscore" in link and ".html" in link]
box_scores = [f"https://www.basketball-reference.com{score}" for score in box_scores]

In [7]:
box_score = box_scores[0]
browser.visit(box_score)

html=browser.html
soup = BeautifulSoup(html, 'html.parser')

In [190]:
score_table = soup.find('table', id='line_score')
tbody = score_table.find('tbody')
rows = tbody.find_all('tr')

line_score = []

for row in rows:
    # Get team name
    team = row.find('th', class_='center').text
    columns = row.find_all('td')
    
    # Create line score dictionary
    if(columns !=[]):
        total = columns[4].text
    
    # Create dictionary for dataframe later
    line_score_dict = { "team": team,
                        "total": total
    }
    
    # Add dictionary to array
    line_score.append(line_score_dict)
    
# Create Data frame
score_df = pd.DataFrame(line_score)
score_df

Unnamed: 0,team,total
0,GSW,99
1,BRK,125


In [237]:
def read_season_info(soup):
    nav = soup.find('div', id='bottom_nav_container')
    hrefs = [a["href"] for a in nav.find_all("a")]
    season = hrefs[1].split()[0].split('_')[0].split('/')[-1]
    return season

In [238]:
base_cols = None

# for box_score in box_scores
teams = [score["team"] for score in line_score]
soup = BeautifulSoup(html, 'html.parser')


summaries = []
for team in teams:
    # Convert html table into pandas dataframe
    basic = pd.read_html(str(soup), attrs={"id": f"box-{team}-game-basic"}, index_col=0)[0]
    advanced = pd.read_html(str(soup), attrs={"id": f"box-{team}-game-advanced"}, index_col=0)[0]

    # Remove the row that contains heards within the dataframe
    advanced = advanced.drop('Reserves')
    basic = basic.drop('Reserves')
    
    # Get headers for basic and advanced stats
    advanced_columns = []
    basic_columns = []
    for i in range(len(advanced.columns)):
        advanced_columns.append(advanced.columns[i][1])

    for i in range(len(basic.columns)):
        basic_columns.append(basic.columns[i][1])

    advanced.columns = advanced_columns
    basic.columns = basic_columns
    
    # Totals and Maxes
    totals = pd.concat([basic.iloc[-1, :], advanced.iloc[-1, :]])
    totals.index = totals.index.str.lower()

    maxes = pd.concat([basic.iloc[:-1, :].max(), advanced.iloc[:-1, :].max()])
    maxes.index = maxes.index.str.lower() + "_max"
    
    summary = pd.concat([totals, maxes])
    
    if base_cols is None:
        base_cols = list(summary.index.drop_duplicates(keep="first"))
        base_cols = [b for b in base_cols if "bpm" not in b]
    
    summary = summary[base_cols]
    
    summaries.append(summary)

summary = pd.concat(summaries, axis=1).T
game = pd.concat([summary, score_df], axis=1)
game["home"] = [0, 1]

game_opp = game.iloc[::-1].reset_index()
game_opp.columns += "_opp"

full_game = pd.concat([game, game_opp], axis=1)

full_game["season"] = read_season_info(soup)

  maxes = pd.concat([basic.iloc[:-1, :].max(), advanced.iloc[:-1, :].max()])
  maxes = pd.concat([basic.iloc[:-1, :].max(), advanced.iloc[:-1, :].max()])


In [239]:
full_game

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season
0,240,240,37,99,0.374,10,33,0.303,15,23,...,7.3,7.4,9.2,6.4,82,98,BRK,125,1,2021
1,240,240,42,92,0.457,15,35,0.429,26,32,...,3.7,6.6,50.0,8.3,99,120,GSW,99,0,2021
