In [362]:
# Import relevant libraries
import os
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser

In [363]:
# DATA_DIR = "data"
# STANDINGS_DIR = os.path.join(DATA_DIR, "standings")
# SCORES_DIR = os.path.join(DATA_DIR, "scores")

In [364]:
# Season identifiers to scrape; range() excludes its stop value, so this is just 2021
seasons = [year for year in range(2021, 2022)]
seasons

[2021]

In [365]:
# Set up the browser
# The driver location can be overridden with the EDGE_DRIVER_PATH environment
# variable; otherwise the original Windows install path is used. The default is
# a raw string so the backslashes are taken literally instead of relying on
# invalid escape sequences (\P, \m), which Python has deprecated.
executable_path = {
    'executable_path': os.environ.get(
        "EDGE_DRIVER_PATH", r"C:\Program Files (x86)\msedgedriver.exe"
    )
}
browser = Browser('edge', **executable_path)

In [366]:
def read_season_info(soup):
    """Derive a season label from the page's bottom navigation links.

    Looks up the ``div`` with id ``bottom_nav_container``, collects the
    ``href`` of every anchor inside it, and works on the second one: its
    first whitespace-delimited token is cut at the first underscore, and the
    part after the last ``/`` of what remains is returned.

    Args:
        soup: Parsed page exposing ``find`` (BeautifulSoup-style).

    Returns:
        str: The extracted label. What it contains depends entirely on how
        the second navigation link is laid out on the page.
    """
    bottom_nav = soup.find('div', id='bottom_nav_container')
    link_targets = [anchor["href"] for anchor in bottom_nav.find_all("a")]
    first_token = link_targets[1].split()[0]
    # Keep only what precedes the first underscore ...
    head, _, _ = first_token.partition('_')
    # ... and then only the final path component of that.
    return head.rpartition('/')[2]

In [367]:
def _read_box_table(page_html, team, kind):
    """Load one team's box-score table from the page as a numeric DataFrame.

    Args:
        page_html: HTML text of the box-score page.
        team: Team label as it appears in the table id (``box-<team>-game-<kind>``).
        kind: ``"basic"`` or ``"advanced"``.

    Returns:
        DataFrame indexed by the table's first column, every value coerced to
        a number (non-numeric cells become NaN), with flat column names.
    """
    # Wrap the markup in StringIO: passing literal HTML text to read_html is deprecated.
    table = pd.read_html(
        StringIO(page_html), attrs={"id": f"box-{team}-game-{kind}"}, index_col=0
    )[0]

    # Convert all columns with numbers to numeric
    table = table.apply(pd.to_numeric, errors="coerce")

    # Remove the row that repeats the headers within the table body
    table = table.drop('Reserves')

    # Column labels are tuples; keep only their second element as the column name
    table.columns = [column[1] for column in table.columns]
    return table


# Games from every season accumulate here, so extending `seasons` does not
# discard earlier seasons when the combined frame is built afterwards.
games = []

for season in seasons:
    # Visit the season schedule page
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    browser.visit(url)

    # Create a BeautifulSoup object (parsed once; the page is not reloaded below)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # NOTE(review): these filter links are collected but never visited, so only
    # the schedule table on this landing page is scraped -- confirm whether each
    # linked page should be scraped as well.
    filter_div = soup.find('div', class_='filter')
    links = filter_div.find_all('a') if filter_div is not None else []
    urls = [link.get("href") for link in links]

    table = soup.find('table', id='schedule')

    if table is None:
        # Without a schedule there is nothing to scrape for this season; make
        # sure no box-score list from a previous season is reused.
        print("Table with id 'schedule' not found.")
        box_scores = []
        continue

    links = table.find_all("a")
    hrefs = [link.get('href') for link in links]
    box_scores = [link for link in hrefs if link and "boxscore" in link and ".html" in link]
    box_scores = [f"https://www.basketball-reference.com{score}" for score in box_scores]

    base_cols = None
    season_game_count = 0

    for box_score in box_scores:
        browser.visit(box_score)
        time.sleep(2)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        score_table = soup.find('table', id='line_score')
        tbody = score_table.find('tbody') if score_table is not None else None
        if tbody is None:
            print(f"Line score not found, skipping {box_score}")
            continue

        line_score = []

        for row in tbody.find_all('tr'):
            team_cell = row.find('th', class_='center')
            columns = row.find_all('td')

            # Rows without a team header or without score cells carry no result
            if team_cell is None or not columns:
                continue

            # The final score is the LAST cell: overtime games have extra
            # period columns, so a fixed position would pick up an overtime
            # period instead of the total.
            line_score.append({
                "team": team_cell.text,
                "total": columns[-1].text.strip(),
            })

        if len(line_score) != 2:
            # The home/away flags below assume exactly two teams
            print(f"Expected 2 teams in line score, found {len(line_score)}; skipping {box_score}")
            continue

        # Create Data frame; totals must be numeric so the win comparison below
        # is arithmetic rather than alphabetical ("99" > "101" as text).
        score_df = pd.DataFrame(line_score)
        score_df["total"] = pd.to_numeric(score_df["total"], errors="coerce")

        teams = [score["team"] for score in line_score]

        page_html = str(soup)
        summaries = []
        for team in teams:
            basic = _read_box_table(page_html, team, "basic")
            advanced = _read_box_table(page_html, team, "advanced")

            # Totals (last row) and per-column maxes over all earlier rows
            totals = pd.concat([basic.iloc[-1, :], advanced.iloc[-1, :]])
            totals.index = totals.index.str.lower()

            maxes = pd.concat([basic.iloc[:-1, :].max(), advanced.iloc[:-1, :].max()])
            maxes.index = maxes.index.str.lower() + "_max"

            summary = pd.concat([totals, maxes])

            # Fix the common columns from the first team seen this season
            if base_cols is None:
                base_cols = list(summary.index.drop_duplicates(keep="first"))
                base_cols = [b for b in base_cols if "bpm" not in b]

            summary = summary[base_cols]
            summaries.append(summary)

        summary = pd.concat(summaries, axis=1).T
        game = pd.concat([summary, score_df], axis=1)
        game["home"] = [0, 1]

        # Same two rows in reverse order, so each team lines up with its opponent
        game_opp = game.iloc[::-1].reset_index()
        game_opp.columns += "_opp"

        # Merge both home and away team data together
        full_game = pd.concat([game, game_opp], axis=1)

        # Add the season the game was played in
        full_game["season"] = read_season_info(soup)

        # Add date to dataframe (first 8 characters of the box-score file name)
        full_game["date"] = box_score.split('/')[-1][:8]
        full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")

        # Specify who won the game
        full_game["won"] = full_game["total"] > full_game["total_opp"]

        games.append(full_game)
        season_game_count += 1

        if season_game_count % 100 == 0:
            print(f"{season_game_count} / {len(box_scores)}")

        time.sleep(2)

In [370]:
# Stack every per-game frame row-wise into one table, renumbering the rows
games_df = pd.concat(objs=games, axis=0, ignore_index=True)

In [371]:
# Write the combined games table to a CSV file in the current working directory
games_df.to_csv(path_or_buf="nba_games.csv")