In [8]:
import pandas as pd
from utils.boxscore import fetch_boxscore_data

In [9]:
"""
Load the games table that was produced earlier by the "Game Scrape" script.
Its rows are what the URLs for the data collection below are built from.
"""
games = pd.read_csv(filepath_or_buffer="games.csv")
games.head(n=5)

Unnamed: 0,season_start_year,away_team,points_away,home_team,points_home,attendance,notes,start_et,datetime,is_regular,game_id
0,2019,New Orleans Pelicans,122,Toronto Raptors,130,20787.0,,Scotiabank Arena,2019-10-22,1,a9c04542-2216-482b-84c9-2387dc3b89bc
1,2019,Los Angeles Lakers,102,Los Angeles Clippers,112,19068.0,,STAPLES Center,2019-10-22,1,b7284f7b-4f14-4213-bceb-8bcdd834e4b7
2,2019,Chicago Bulls,125,Charlotte Hornets,126,15424.0,,Spectrum Center,2019-10-23,1,328e874b-3ccb-426f-8296-4dc767d33bce
3,2019,Detroit Pistons,119,Indiana Pacers,110,17923.0,,Bankers Life Fieldhouse,2019-10-23,1,f940296c-9946-440c-8b6c-21012826b4e7
4,2019,Denver Nuggets,108,Portland Trail Blazers,100,19991.0,,Moda Center,2019-10-23,1,2f15fb2f-e4bc-4cbb-adf7-9aafa6329747


In [10]:
"""
Add the abbreviation of the home team to `games`; it is needed for creating the URLs
(the box-score URLs end in the home team's abbreviation).
"""

# Full team name -> abbreviation used in the URLs.
teamAbbreviation = {
    'Dallas Mavericks': 'DAL', 'Phoenix Suns': 'PHO', 'Boston Celtics': 'BOS',
    'Portland Trail Blazers': 'POR', 'New Jersey Nets': 'NJN', 'Toronto Raptors': 'TOR',
    'Los Angeles Lakers': 'LAL', 'Utah Jazz': 'UTA', 'Philadelphia 76ers': 'PHI',
    'New York Knicks': 'NYK', 'Minnesota Timberwolves': 'MIN', 'Orlando Magic': 'ORL',
    'San Antonio Spurs': 'SAS', 'Sacramento Kings': 'SAC', 'Atlanta Hawks': 'ATL',
    'Seattle SuperSonics': 'SEA', 'Washington Bullets': 'WSB', 'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC', 'Miami Heat': 'MIA', 'Milwaukee Bucks': 'MIL',
    'Charlotte Hornets': 'CHO', 'Cleveland Cavaliers': 'CLE', 'Houston Rockets': 'HOU',
    'Denver Nuggets': 'DEN', 'Vancouver Grizzlies': 'VAN', 'Golden State Warriors': 'GSW',
    'Chicago Bulls': 'CHI', 'Detroit Pistons': 'DET', 'Washington Wizards': 'WAS',
    'Memphis Grizzlies': 'MEM', 'New Orleans Hornets': 'NOH', 'Charlotte Bobcats': 'CHA',
    'New Orleans/Oklahoma City Hornets': 'NOK', 'Oklahoma City Thunder': 'OKC',
    'Brooklyn Nets': 'BRK', 'New Orleans Pelicans': 'NOP',
}

# Vectorised lookup instead of a per-row lambda; names missing from the dict come back as NaN.
home_abbreviation = games['home_team'].map(teamAbbreviation)

# Still fail with a KeyError on unmapped teams, but report all of them at once
# instead of only the first one encountered.
unmapped_rows = home_abbreviation.isna()
if unmapped_rows.any():
    unmapped_teams = games.loc[unmapped_rows, 'home_team'].unique().tolist()
    raise KeyError(f"No abbreviation defined for home team(s): {unmapped_teams}")

# 'Charlotte Hornets' home games in seasons starting before 2014 use 'CHH' rather than 'CHO'.
# NOTE(review): presumably the code of the earlier Hornets franchise — confirm against the site.
uses_old_hornets_code = (
    (games['season_start_year'] < 2014) & (games['home_team'] == 'Charlotte Hornets')
)
games['abbreviation'] = home_abbreviation.mask(uses_old_hornets_code, 'CHH')

In [11]:
# Collect the box-score rows for the games in `games` using the project helper.
# NOTE(review): the meaning of start_index/chunk_size is not visible from this notebook.
# The log printed below starts at index 4000, so this presumably processes one slice of
# `games` per run — confirm against utils.boxscore.fetch_boxscore_data.
df = fetch_boxscore_data(games, start_index=4000, chunk_size=4000)

4000: Fetching data from https://www.basketball-reference.com/boxscores/202212310OKC.html
4001: Fetching data from https://www.basketball-reference.com/boxscores/202212310MIN.html
4002: Fetching data from https://www.basketball-reference.com/boxscores/202212310MEM.html
4003: Fetching data from https://www.basketball-reference.com/boxscores/202212310HOU.html
4004: Fetching data from https://www.basketball-reference.com/boxscores/202212310SAS.html
4005: Fetching data from https://www.basketball-reference.com/boxscores/202212310CHI.html
4006: Fetching data from https://www.basketball-reference.com/boxscores/202212310IND.html
4007: Fetching data from https://www.basketball-reference.com/boxscores/202212310CHO.html
4008: Fetching data from https://www.basketball-reference.com/boxscores/202301010MEM.html
4009: Fetching data from https://www.basketball-reference.com/boxscores/202301010DEN.html
4010: Fetching data from https://www.basketball-reference.com/boxscores/202301010MIL.html
4011: Fetc

In [12]:
# Flag starters (1) versus reserves (0).
#
# The rows are assumed to come in repeating sections of the form
#   starters..., 'Reserves', reserves..., 'Team Totals'
# (inferred from the two marker values handled here — TODO confirm against fetch_boxscore_data).
#
# The previous version initialised the column to 0 and only ever wrote 1 to the first row and
# to the 'Team Totals' rows, so every starter except the very first row stayed 0. It also
# relied on the index being 0..n-1. Here the marker rows are turned into switch points and the
# state is forward-filled, which works positionally for any index.

# 'Reserves' switches the state to 0, 'Team Totals' switches it back to 1 for the rows
# that follow; every other row is NaN for now and inherits the state above it.
section_flags = df['player_name'].map({'Reserves': 0, 'Team Totals': 1})

# The first section has no preceding 'Team Totals' row, so seed it as a starter row.
if len(section_flags) > 0 and pd.isna(section_flags.iloc[0]):
    section_flags.iloc[0] = 1

# After the fill, marker rows keep the values they had before ('Team Totals' = 1,
# 'Reserves' = 0); they are removed in the next cell anyway.
df['is_starter'] = section_flags.ffill().astype(int)

In [13]:
# Keep only real player rows: discard the 'Reserves' and 'Team Totals' marker rows
df = df.loc[~df['player_name'].isin(['Reserves', 'Team Totals'])]

In [14]:
# Write the player rows to boxscore.csv, leaving out the DataFrame index.
# NOTE(review): the output name is fixed while the fetch call above uses an explicit
# start_index, so if the data is collected over several runs each run may overwrite the
# previous file — confirm how the chunks are meant to be combined.
df.to_csv("boxscore.csv", index=False)