In [1]:
import pandas as pd

In [2]:
"""
I use a dataframe that I created earlier with the script "Game Scrape" to build the URLs needed to collect the
box score data for each game.
"""
# Read the saved game list and preview its first rows.
games = pd.read_csv(filepath_or_buffer="games.csv")
games.head(n=5)

Unnamed: 0,seasonStartYear,awayTeam,pointsAway,homeTeam,pointsHome,attendance,notes,startET,datetime,isRegular,game_id
0,1996,Sacramento Kings,85,Houston Rockets,96,16285.0,,,1996-11-01,1,1
1,1996,Los Angeles Clippers,97,Golden State Warriors,85,15593.0,,,1996-11-01,1,2
2,1996,Portland Trail Blazers,114,Vancouver Grizzlies,85,19193.0,,,1996-11-01,1,3
3,1996,Seattle SuperSonics,91,Utah Jazz,99,19911.0,,,1996-11-01,1,4
4,1996,New York Knicks,107,Toronto Raptors,99,28457.0,,,1996-11-01,1,5


In [4]:
"""
I add the abbreviation for the home team because I need it for creating the URLs.
"""

# Team name -> abbreviation used in the box-score URLs.
# Note: 'Charlotte Hornets' maps to 'CHO' here; the seasons before 2014 are
# corrected to 'CHH' separately by fixHornets.
teamAbbreviation = {
    'Dallas Mavericks': 'DAL',
    'Phoenix Suns': 'PHO',
    'Boston Celtics': 'BOS',
    'Portland Trail Blazers': 'POR',
    'New Jersey Nets': 'NJN',
    'Toronto Raptors': 'TOR',
    'Los Angeles Lakers': 'LAL',
    'Utah Jazz': 'UTA',
    'Philadelphia 76ers': 'PHI',
    'New York Knicks': 'NYK',
    'Minnesota Timberwolves': 'MIN',
    'Orlando Magic': 'ORL',
    'San Antonio Spurs': 'SAS',
    'Sacramento Kings': 'SAC',
    'Atlanta Hawks': 'ATL',
    'Seattle SuperSonics': 'SEA',
    'Washington Bullets': 'WSB',
    'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Charlotte Hornets': 'CHO',
    'Cleveland Cavaliers': 'CLE',
    'Houston Rockets': 'HOU',
    'Denver Nuggets': 'DEN',
    'Vancouver Grizzlies': 'VAN',
    'Golden State Warriors': 'GSW',
    'Chicago Bulls': 'CHI',
    'Detroit Pistons': 'DET',
    'Washington Wizards': 'WAS',
    'Memphis Grizzlies': 'MEM',
    'New Orleans Hornets': 'NOH',
    'Charlotte Bobcats': 'CHA',
    'New Orleans/Oklahoma City Hornets': 'NOK',
    'Oklahoma City Thunder': 'OKC',
    'Brooklyn Nets': 'BRK',
    'New Orleans Pelicans': 'NOP',
}

# Look up the URL abbreviation of every game's home team; a team name that is
# missing from teamAbbreviation raises KeyError.
games['abbreviation'] = [teamAbbreviation[team] for team in games['homeTeam']]

def fixHornets(row):
    """Return the URL abbreviation for the home team of one game row.

    Rows whose home team is 'Charlotte Hornets' in a season starting before
    2014 get 'CHH'; every other row keeps the value already stored in
    ``row.abbreviation``.

    Parameters
    ----------
    row : object exposing ``seasonStartYear``, ``homeTeam`` and
        ``abbreviation`` attributes (e.g. a row passed by DataFrame.apply).

    Returns
    -------
    The abbreviation to use for that row.
    """
    is_pre_2014_hornets = row.seasonStartYear < 2014 and row.homeTeam == 'Charlotte Hornets'
    return 'CHH' if is_pre_2014_hornets else row.abbreviation
# Recompute the abbreviation row by row so the Hornets special case in
# fixHornets is applied.
games['abbreviation'] = games.apply(func=fixHornets, axis='columns')

In [111]:
# Scrape the box score page of every game in `games` and stack the full-game
# basic box score table of each team into one frame, `df`.

# Empty frame placed first in the final concat so that these columns lead the
# combined result, in this order.
boxscore_template = pd.DataFrame(columns=['game_id', 'teamName', 'playerName', 'MP',
                                          'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB',
                                          'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-'])

# Per-team frames are collected in a list and concatenated once after the
# loop; calling pd.concat inside the loop re-copies every row gathered so far
# on each iteration. If a request fails midway, the frames gathered up to that
# point are still available in `team_frames`.
team_frames = [boxscore_template]
for _, row in games.iterrows():
    # row.datetime is split on '-' into the three date parts the URL needs.
    year, month, day = row.datetime.split('-')
    url = "https://www.basketball-reference.com/boxscores/{}{}{}0{}.html".format(year, month, day, row.abbreviation)
    # NOTE(review): requests are issued back to back with no delay or retry;
    # confirm this is acceptable for the target site before a full run.
    tables = pd.read_html(url)
    # The first qualifying table is labelled with the away team, the next
    # one(s) with the home team.
    away = True
    for table in tables:
        # Only tables carrying a 'Basic Box Score Stats' header group are
        # wanted; anything else (e.g. the advanced tables) is skipped instead
        # of failing on the column lookup below.
        if 'Basic Box Score Stats' not in table.columns.get_level_values(0):
            continue
        basic = table['Basic Box Score Stats']
        # MP value of the table's last row.
        last_row_mp = basic['MP'].iloc[-1:]
        if last_row_mp.empty or last_row_mp.isna().all():
            continue
        # Keep only tables whose last-row MP is at least 240 — presumably the
        # full-game team total (5 players x 48 minutes); TODO confirm.
        if int(last_row_mp.iloc[0]) < 240:
            continue
        # Work on a copy so the column assignments below do not write into a
        # slice of `table`.
        teamStats = basic.copy()
        teamStats['playerName'] = table['Unnamed: 0_level_0']
        teamStats['game_id'] = row.game_id
        if away:
            teamStats['teamName'] = row.awayTeam
            away = False
        else:
            teamStats['teamName'] = row.homeTeam
        team_frames.append(teamStats)

# Each table keeps its own row index (no ignore_index), exactly as when the
# frames were concatenated one at a time.
df = pd.concat(team_frames)
    

In [110]:
# Persist the scraped rows as-is, without writing the row index.
df.to_csv(path_or_buf="boxscore_uncleaned.csv", index=False)

In [135]:
"""
Create the isStarter flag: 1 for starters, 0 for reserves.
"""
# Build the isStarter flag by walking the rows in order:
#   * a 'Reserves' row switches the flag to 0,
#   * a 'Team Totals' row, or a row with index label 0, switches it to 1,
#   * every other row inherits the flag of the row before it.
# The marker rows themselves receive the flag they set.
starter_flags = []
# Default for rows that precede any marker; without it an ordinary first row
# whose index label is not 0 would have no previous flag to inherit.
current_flag = 1
for idx, name in zip(df.index, df['playerName']):
    if name == 'Reserves':
        current_flag = 0
    elif name == 'Team Totals' or idx == 0:
        current_flag = 1
    starter_flags.append(current_flag)

df['isStarter'] = starter_flags

In [143]:
# Remove the rows whose playerName is 'Reserves' or 'Team Totals' (they do not
# describe a player), then drop the 'FG%', '3P%' and 'FT%' columns.
marker_rows = df['playerName'].isin(['Reserves', 'Team Totals'])
df = df[~marker_rows].drop(columns=['FG%', '3P%', 'FT%'])

In [152]:
# Save the cleaned player rows, without writing the row index.
df.to_csv(path_or_buf="boxscore.csv", index=False)
