In [1]:
import pandas as pd
import uuid
from utils.boxscore import fetch_boxscore_data

In [2]:
"""
I use a DataFrame that I created earlier with the "Game Scrape" script to build the URLs needed to collect the
box-score data.
"""
# Load the game list written by the earlier scrape.
games = pd.read_csv("games-backup.csv")

# Give every game a brand-new random UUID string, replacing whatever the CSV held.
# Iterating the existing column (rather than a range) keeps the KeyError if it is missing.
# NOTE(review): the ids are regenerated on each run, so they will not match ids
# produced by another run or batch -- confirm this is intended.
fresh_game_ids = [str(uuid.uuid4()) for _ in games['game_id']]
games['game_id'] = fresh_game_ids

games.head()

Unnamed: 0,season_start_year,away_team,points_away,home_team,points_home,attendance,notes,start_et,datetime,is_regular,game_id
0,2020,Golden State Warriors,99,Brooklyn Nets,125,0.0,,7:00p,2020-12-22,1,2804fa0d-7ebc-4547-8607-58985c3e69eb
1,2020,Los Angeles Clippers,116,Los Angeles Lakers,109,0.0,,10:00p,2020-12-22,1,79b28319-0709-4a37-9d55-e7bfd3aaeb1c
2,2020,New York Knicks,107,Indiana Pacers,121,0.0,,7:00p,2020-12-23,1,69161d63-0fdc-4615-9fc1-cced00707a4c
3,2020,Dallas Mavericks,102,Phoenix Suns,106,0.0,,10:30p,2020-12-23,1,ac2fe1f6-0f69-43b7-b95f-d4caec5d9c28
4,2020,Utah Jazz,120,Portland Trail Blazers,100,0.0,,10:00p,2020-12-23,1,ec96e40d-e9af-4dc2-a05d-70a0a8aa1f7c


In [3]:
"""
I add the abbreviation for the home team because I need it for creating the URLs.
"""

# Full team name -> three-letter code used in the box-score URLs.
teamAbbreviation = {'Dallas Mavericks':'DAL', 'Phoenix Suns':'PHO', 'Boston Celtics':'BOS',
       'Portland Trail Blazers':'POR', 'New Jersey Nets':'NJN', 'Toronto Raptors':'TOR',
       'Los Angeles Lakers':'LAL', 'Utah Jazz':'UTA', 'Philadelphia 76ers':'PHI',
       'New York Knicks':'NYK', 'Minnesota Timberwolves':'MIN', 'Orlando Magic':'ORL',
       'San Antonio Spurs':'SAS', 'Sacramento Kings':'SAC', 'Atlanta Hawks':'ATL',
       'Seattle SuperSonics':'SEA', 'Washington Bullets':'WSB', 'Indiana Pacers':'IND',
       'Los Angeles Clippers':'LAC', 'Miami Heat':'MIA', 'Milwaukee Bucks':'MIL',
       'Charlotte Hornets':'CHO', 'Cleveland Cavaliers':'CLE', 'Houston Rockets':'HOU',
       'Denver Nuggets':'DEN', 'Vancouver Grizzlies':'VAN', 'Golden State Warriors':'GSW',
       'Chicago Bulls':'CHI', 'Detroit Pistons':'DET', 'Washington Wizards':'WAS',
       'Memphis Grizzlies':'MEM', 'New Orleans Hornets':'NOH', 'Charlotte Bobcats':'CHA',
       'New Orleans/Oklahoma City Hornets':'NOK', 'Oklahoma City Thunder':'OKC',
       'Brooklyn Nets':'BRK', 'New Orleans Pelicans':'NOP'}

# Translate all home teams in one vectorised lookup. Series.map yields NaN for
# names missing from the dictionary, so check explicitly and keep failing with
# a KeyError (as a plain dict lookup would), but report every offender at once.
home_abbreviation = games['home_team'].map(teamAbbreviation)
unmapped_rows = home_abbreviation.isna()
if unmapped_rows.any():
    unknown_teams = sorted(games.loc[unmapped_rows, 'home_team'].astype(str).unique())
    raise KeyError(f"No abbreviation defined for home team(s): {unknown_teams}")
games['abbreviation'] = home_abbreviation

# 'Charlotte Hornets' games from seasons starting before 2014 use the code
# 'CHH' instead of 'CHO'. A boolean mask avoids a slow row-wise apply and also
# works when `games` has no rows.
early_hornets_rows = (games['season_start_year'] < 2014) & (games['home_team'] == 'Charlotte Hornets')
games.loc[early_hornets_rows, 'abbreviation'] = 'CHH'

In [4]:
# Download box scores for this batch, beginning at index 1701 of `games`.
# NOTE(review): presumably earlier games were fetched in a previous batch -- confirm.
batch_start_index = 1701
df = fetch_boxscore_data(games, start_index=batch_start_index)


1701: Fetching data from https://www.basketball-reference.com/boxscores/202201020OKC.html
1702: Fetching data from https://www.basketball-reference.com/boxscores/202201020LAL.html
1703: Fetching data from https://www.basketball-reference.com/boxscores/202201030DAL.html
1704: Fetching data from https://www.basketball-reference.com/boxscores/202201030LAC.html
1705: Fetching data from https://www.basketball-reference.com/boxscores/202201030POR.html
1706: Fetching data from https://www.basketball-reference.com/boxscores/202201030NOP.html
1707: Fetching data from https://www.basketball-reference.com/boxscores/202201030GSW.html
1708: Fetching data from https://www.basketball-reference.com/boxscores/202201030CHI.html
1709: Fetching data from https://www.basketball-reference.com/boxscores/202201030BRK.html
1710: Fetching data from https://www.basketball-reference.com/boxscores/202201030MIL.html
1711: Fetching data from https://www.basketball-reference.com/boxscores/202201030WAS.html
1712: Fetc

In [5]:
# Flag starters (1) versus reserves (0).
#
# Assumed layout of `df` (TODO confirm against fetch_boxscore_data): each
# team's table is a run of rows -- the starters, a 'Reserves' marker row, the
# reserves, and finally a 'Team Totals' marker row -- repeated for every table.
#
# A player is a starter when no 'Reserves' marker has been seen yet inside the
# current block. Initialising the column to 0 and only setting the first row
# and the 'Team Totals' rows to 1 would leave every other starter at 0.

# Work positionally so the result does not depend on df's index labels.
player_names = df['player_name'].reset_index(drop=True)
reserves_marker = player_names.eq('Reserves')
totals_marker = player_names.eq('Team Totals')

# A new block begins on the row right after each 'Team Totals' marker, so the
# shifted running count of those markers serves as a per-block id.
block_id = totals_marker.astype(int).shift(fill_value=0).cumsum()

# True from a block's 'Reserves' marker through the end of that block.
reached_reserves = reserves_marker.astype(int).groupby(block_id).cumsum().gt(0)

# Rows before the marker are starters. The value on the marker rows themselves
# carries no meaning; those rows are removed in the following cell.
df['is_starter'] = (~reached_reserves).astype(int).to_numpy()

In [6]:
# Keep only real player rows: drop the 'Reserves' and 'Team Totals' marker rows
marker_labels = ['Reserves', 'Team Totals']
df = df[~df['player_name'].isin(marker_labels)]

In [7]:
# Write this batch of box-score rows to disk without the DataFrame index.
output_csv = "boxscore_1701-2200.csv"
df.to_csv(output_csv, index=False)