## Collect Historic NBA Game Data
This web scraper collects historic NBA game data and stores it as a CSV file  
Seasons currently scraped: 2018 - 2023  
Website used to collect data : https://www.basketball-reference.com/

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time
import datetime
import os

In [3]:
# finds the box score ids of all games listed on a season's monthly schedule page
def get_game_links(year, month):
    """Return the box score ids found on one monthly schedule page.

    Args:
        year: Season value inserted into the schedule URL (e.g. 2018).
        month: Month name inserted into the schedule URL (the caller in this
            file passes lowercase names such as 'october').

    Returns:
        A list of strings, one per box score link, where each string is the
        link's href with '/boxscores/' and '.html' removed.

    Raises:
        requests.HTTPError: If the schedule page responds with an error status.
    """
    SEASON_URL = f'https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html'

    # extracts raw html from season schedule page
    request = requests.get(SEASON_URL)

    # need for rate limiting (applied whether or not the request succeeded)
    time.sleep(5)

    # fail loudly instead of parsing an error page as an empty schedule
    request.raise_for_status()
    soup = BeautifulSoup(request.content, 'html.parser')

    # finds and extracts game links from html
    game_links = []
    for cell in soup.find_all('td', attrs={'data-stat' : 'box_score_text'}):
        anchor = cell.find('a')

        # a cell without a link has nothing to scrape
        # (presumably a game that has not been played yet)
        if anchor is None or not anchor.get('href'):
            continue

        game_links.append(anchor['href'].replace('/boxscores/', '').replace('.html', ''))

    return game_links


In [4]:
# determines which teams played in the game and how many points were scored
def get_score_line(soup):
    """Extract the score line tokens from a box score page.

    Reads the page's 'og:description' meta tag, keeps the text before the
    first period, strips the characters '(', ')', 'v' and 's', and splits
    the remainder on whitespace.

    Args:
        soup: Parsed box score page (BeautifulSoup object).

    Returns:
        A list of string tokens. get_game_data unpacks it as
        [away_team, away_score, home_team, home_score], which assumes the
        description starts with text shaped like 'GSW (99) vs BRK (125).'.

    Raises:
        ValueError: If the page has no 'og:description' meta tag with content.
    """
    meta = soup.find('meta', {'property' : 'og:description'})

    # read the attribute directly instead of slicing the serialized tag, so the
    # result does not depend on the order the tag's attributes are written in
    description = meta.get('content') if meta is not None else None
    if not description:
        raise ValueError("box score page has no og:description content to parse")

    # only the first sentence holds the score line
    score_line = description.partition('.')[0]

    # drop the parentheses around the scores and the letters of the 'vs' separator
    score_line = score_line.translate(str.maketrans('', '', '()vs'))

    return score_line.split()

In [5]:
# pulls one team's totals row out of the box score html
def get_team_stat_line(soup, team):
    """Build a one-row DataFrame from the footer of a team's basic box score.

    Args:
        soup: Parsed box score page (BeautifulSoup object).
        team: Team code used in the table id 'box-{team}-game-basic'.

    Returns:
        A single-row DataFrame whose columns are the upper-cased 'data-stat'
        names of the footer cells and whose values are the cells' text, with
        the PLUS_MINUS, MP and PTS columns removed.
    """
    box_score = soup.find("table", attrs={"id" : f"box-{team}-game-basic"})
    total_cells = box_score.find("tfoot").find_all('td')

    # one label and one text value per footer cell
    labels = [str.upper(cell["data-stat"]) for cell in total_cells]
    values = [cell.text for cell in total_cells]

    # the values form a single column, so transpose to get one row per team
    stat_line = pd.DataFrame(values, index=labels).T

    # removes the unneeded columns (the original notes they are n/a for a team)
    return stat_line.drop(columns=["PLUS_MINUS", "MP", "PTS"])

In [6]:
# combine team stats with opponent team stats 
# team stats measure offensive capability of a team 
# opponent stats measure defensive capability of a team
def get_opp_team_stats(team_stats, opp_team_stats):
    """Place a team's stats and its opponent's stats side by side.

    Args:
        team_stats: DataFrame holding the team's own stats.
        opp_team_stats: DataFrame holding the opponent's stats. It is not
            modified; a relabelled copy is used instead.

    Returns:
        A DataFrame with the columns of team_stats followed by the columns of
        opp_team_stats, each opponent column name suffixed with '_OPP'.
    """
    # add_suffix returns a new frame, so the caller's DataFrame keeps its labels
    opp_labelled = opp_team_stats.add_suffix('_OPP')
    total_team_stats = pd.concat([team_stats, opp_labelled], axis=1)

    return total_team_stats

In [7]:
def get_date(game_link):
    """Return the calendar date encoded in the first eight characters of a game id.

    Args:
        game_link: Game id string beginning with the date as YYYYMMDD.

    Returns:
        A datetime.date built from those year, month and day digits.
    """
    stamp = game_link[:8]

    return datetime.date(int(stamp[:4]), int(stamp[4:6]), int(stamp[6:]))

In [16]:
def get_game_data(game, season):
    """Scrape one box score page into a two-row DataFrame (away row, then home row).

    Args:
        game: Game id inserted into the box score URL; its first eight
            characters are read as the game date.
        season: Value stored in the SEASON column of both rows.

    Returns:
        A DataFrame with one row per team. Each row holds the score line
        columns (SEASON, DATE, HOME/AWAY, TEAM, PTS, TEAM_OPP, PTS_OPP,
        WIN/LOSS), followed by that team's stats and the opponent's stats
        suffixed with '_OPP'.

    Raises:
        requests.HTTPError: If the box score page responds with an error status.
    """
    URL = f'https://www.basketball-reference.com/boxscores/{game}.html'

    # extract raw html from game box score page
    request = requests.get(URL)

    # need for rate limiting (applied whether or not the request succeeded)
    time.sleep(5)

    # surface a failed or throttled response here rather than as a parse error below
    request.raise_for_status()
    soup = BeautifulSoup(request.content, 'html.parser')

    # find score line data from webpage and url
    away_team, away_score, home_team, home_score = get_score_line(soup)
    date = get_date(game)

    away_score = int(away_score)
    home_score = int(home_score)

    # create a list for score line data
    score_line_stats = ["SEASON", "DATE", "HOME/AWAY", "TEAM", "PTS", "TEAM_OPP", "PTS_OPP", "WIN/LOSS"]
    away_score_line = [season, date, "AWAY", away_team, away_score, home_team, home_score, "WIN" if away_score > home_score else "LOSS"]
    home_score_line = [season, date, "HOME", home_team, home_score, away_team, away_score, "WIN" if away_score < home_score else "LOSS"]

    # each list becomes a single column, so transpose to get one row per team
    away_score_line = pd.DataFrame(away_score_line, index=score_line_stats).T
    home_score_line = pd.DataFrame(home_score_line, index=score_line_stats).T

    # collects both teams offensive stats from webpage
    away_team_stats = get_team_stat_line(soup, away_team)
    home_team_stats = get_team_stat_line(soup, home_team)

    # adds opponents team's stats to team stats (to measure the defensive skills of a team)
    # a copy of the opponent frame is passed so its relabelling cannot touch the original
    away_data = get_opp_team_stats(away_team_stats, home_team_stats.copy(True))
    home_data = get_opp_team_stats(home_team_stats, away_team_stats.copy(True))

    # all relevant data into one data frame
    away_data = pd.concat([away_score_line, away_data], axis=1)
    home_data = pd.concat([home_score_line, home_data], axis=1)
    game_data = pd.concat([away_data, home_data], axis=0)

    # prints out progress message 
    print(f"{date}: {away_team} vs {home_team}")

    return game_data


In [None]:
# retrieves all the game data given a season and month
def get_monthly_games_data(season, month):
    """Scrape every game of one month and save the result as a csv file.

    The file is written to 'Data/{season}/', which is created if it does not
    exist yet. The file name is '{month} {season}.csv' passed through
    str.capitalize().

    Args:
        season: Season value passed to the schedule lookup and stored with
            every game row.
        month: Month name passed to the schedule lookup and used in the
            file name.

    Returns:
        A DataFrame with the rows of every game found for that month.

    Raises:
        ValueError: Raised by pd.concat when no games were found.
    """
    games = get_game_links(season, month)

    monthly_data = [get_game_data(game, season) for game in games]
    monthly_data = pd.concat(monthly_data, axis=0)

    file_name = f'{month} {season}.csv'
    season_dir = f"Data/{season}"

    # to_csv does not create missing folders, so make sure the target exists
    os.makedirs(season_dir, exist_ok=True)
    file_path = f"{season_dir}/{file_name.capitalize()}"

    monthly_data.to_csv(file_path)

    return monthly_data

In [None]:
# scrapes and saves each listed month of the 2018 season, printing every monthly frame
# NOTE(review): the list stops at 'may' - confirm whether 'june' is also needed for this season
months = [
    'october',
    'november',
    'december',
    'january',
    'february',
    'march',
    'april',
    'may',
]
for month in months:
    print(get_monthly_games_data(2018, month))

In [3]:
# combines multiple csv files into one csv
def combine_files(csv_list, file_name):
    """Stack several csv files into one DataFrame and write it to file_name.

    Each input is read with its first column as the index. The stacked result
    gets a fresh 0..n-1 index before it is written. When file_name is a path,
    its parent folder is created if it does not exist yet.

    Args:
        csv_list: Paths of the csv files to combine, in output order.
        file_name: Destination passed to DataFrame.to_csv.

    Returns:
        The combined DataFrame.

    Raises:
        ValueError: Raised by pd.concat when csv_list is empty.
    """
    frames = [pd.read_csv(file, index_col=0) for file in csv_list]
    combined = pd.concat(frames, axis=0).reset_index(drop=True)

    # to_csv does not create missing folders; only paths have a folder to create
    if isinstance(file_name, (str, os.PathLike)):
        parent = os.path.dirname(file_name)
        if parent:
            os.makedirs(parent, exist_ok=True)

    combined.to_csv(file_name)

    return combined

In [10]:
# merges the monthly csv files of one season into a single season csv
season = 2021
months = [
    'December',
    'January',
    'February',
    'March',
    "April",
    'May',
    'June',
    'July',
]

# one monthly file per listed month, in the order given above
files = [
    f"Data/{season}/{month} {season}.csv"
    for month in months
]
combine_files(files, f'Data/Season/{season} Season Data.csv')

Unnamed: 0,SEASON,DATE,HOME/AWAY,TEAM,PTS,TEAM_OPP,PTS_OPP,WIN/LOSS,FG,FGA,...,FTA_OPP,FT_PCT_OPP,ORB_OPP,DRB_OPP,TRB_OPP,AST_OPP,STL_OPP,BLK_OPP,TOV_OPP,PF_OPP
0,2021,2020-12-22,AWAY,GSW,99,BRK,125,LOSS,37,99,...,32,0.813,13,44,57,24,11,7,20,22
1,2021,2020-12-22,HOME,BRK,125,GSW,99,WIN,42,92,...,23,0.652,13,34,47,26,6,6,18,24
2,2021,2020-12-22,AWAY,LAC,116,LAL,109,WIN,44,93,...,31,0.774,8,37,45,22,4,2,19,20
3,2021,2020-12-22,HOME,LAL,109,LAC,116,LOSS,38,81,...,19,0.737,11,29,40,22,10,3,16,29
4,2021,2020-12-23,AWAY,CHO,114,CLE,121,LOSS,45,90,...,20,0.750,10,40,50,34,12,3,18,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2337,2021,2021-07-14,HOME,MIL,109,PHO,103,WIN,39,97,...,19,0.842,5,35,40,18,3,9,17,24
2338,2021,2021-07-17,AWAY,MIL,123,PHO,119,WIN,50,87,...,11,0.909,8,27,35,23,9,5,8,20
2339,2021,2021-07-17,HOME,PHO,119,MIL,123,LOSS,48,87,...,17,0.529,11,26,37,26,7,1,11,17
2340,2021,2021-07-20,AWAY,PHO,98,MIL,105,LOSS,38,86,...,29,0.862,11,42,53,20,10,6,18,17


In [12]:
# merges the season csv files for 2018 through 2023 into one NBA games data set
files = [
    f'Data/Season/{season} Season Data.csv'
    for season in range(2018, 2024)
]
combine_files(files, f'Data/Historic NBA Game Data.csv')

Unnamed: 0,SEASON,DATE,HOME/AWAY,TEAM,PTS,TEAM_OPP,PTS_OPP,WIN/LOSS,FG,FGA,...,FTA_OPP,FT_PCT_OPP,ORB_OPP,DRB_OPP,TRB_OPP,AST_OPP,STL_OPP,BLK_OPP,TOV_OPP,PF_OPP
0,2018,2017-10-17,AWAY,BOS,99,CLE,102,LOSS,36,88,...,25,0.840,9,41,50,19,3,4,17,25
1,2018,2017-10-17,HOME,CLE,102,BOS,99,WIN,38,83,...,25,0.760,9,37,46,24,11,4,10,24
2,2018,2017-10-17,AWAY,HOU,122,GSW,121,WIN,47,97,...,21,0.905,6,35,41,34,5,9,17,25
3,2018,2017-10-17,HOME,GSW,121,HOU,122,LOSS,43,80,...,19,0.684,10,33,43,28,9,5,12,16
4,2018,2017-10-18,AWAY,CHO,90,DET,102,LOSS,29,73,...,12,0.917,9,38,47,24,14,3,8,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15157,2023,2023-06-07,HOME,MIA,94,DEN,109,LOSS,34,92,...,27,0.815,13,45,58,28,3,5,13,18
15158,2023,2023-06-09,AWAY,DEN,108,MIA,95,WIN,39,79,...,20,0.850,8,29,37,23,2,3,14,19
15159,2023,2023-06-09,HOME,MIA,95,DEN,108,LOSS,35,78,...,21,0.762,5,29,34,26,11,7,6,18
15160,2023,2023-06-12,AWAY,MIA,89,DEN,94,LOSS,33,96,...,23,0.565,11,46,57,21,6,7,14,13
