# ML-5333 Project 2 - MLB Data Collection

In [2]:
import pandas as pd
import numpy as np
import baseball_scraper
import pybaseball
import statsapi
import mlbgame

Get masterfile for statsapi team name, code, etc.

In [3]:
with open('data\\master\\team_names.txt', 'r') as file:
    team_list = [x.replace('\n','') for x in file.readlines()]
team_df = []
for team in team_list:
    team_df.append(statsapi.lookup_team(team)[0])
team_df = pd.DataFrame.from_dict(team_df)
team_df.columns = ['statsapi_id','name','team_code','file_code','team_name','loc_name','short_name']
team_df = team_df.sort_values(by=['statsapi_id'])
team_df = team_df.reset_index(drop=True)
# team_df.to_csv('data\\master\\statsapi_teams.csv', index=False)

Function to get schedule dataframe for team in specified season

In [6]:
def get_team_game_list(team_code, season):

    date_dict = {
        2016: {'start': '04/03/2016', 'end': '10/02/2016'},
        2017: {'start': '04/02/2017', 'end': '10/01/2017'},
        2018: {'start': '03/28/2018', 'end': '09/30/2018'},
        2019: {'start': '03/27/2019', 'end': '09/29/2019'},
        2020: {'start': '07/23/2020', 'end': '09/27/2020'},
        2021: {'start': '04/01/2021', 'end': '10/03/2021'}
    }

    game_list = statsapi.schedule(start_date=date_dict[season]['start'],end_date=date_dict[season]['end'],team=team_code)

    schedule_df_list = []
    for game in game_list:
        if game['status'] in ['Final', 'Completed Early: Rain']:
            if team_code == game['home_id']:
                team_status = 'Home'
                opp_status = ('away_name', 'away_id')
                if game['home_score'] > game['away_score']:
                    game_result = 'W'
                else:
                    game_result = 'L'
            else:
                team_status = 'Away'
                opp_status = ('home_name', 'home_id')
                if game['away_score'] > game['home_score']:
                    game_result = 'W'
                else:
                    game_result = 'L'
            schedule_df_list.append([game['game_id'],team_status,game[opp_status[0]],game[opp_status[1]],game_result])
    col_list = ['game_id','home/away','opp_name','opp_code','result']
    return pd.DataFrame(schedule_df_list, columns=col_list)

Get schedule dictionary for 2016-2021 seasons and save

In [7]:
schedule_dict = {}
season_list = [2016,2017,2018,2019,2020,2021]
for season in season_list:
    schedule_dict.update({season: {}})
    for team in team_df['statsapi_id']:
        schedule_dict[season].update({team: get_team_game_list(team, season)})
for season in season_list:
    for team in team_df['statsapi_id']:
        schedule_dict[season][team].to_csv(f'data\\statsapi_season_data\\{season}\\{team}.csv', index=False)

Method to convert IP string to number

In [None]:
def inning_converter(innings_pitched):
    ip_string = str(innings_pitched)

    if ip_string[len(ip_string)-2:len(ip_string)] == '.1':
        ip = int(innings_pitched) + 0.333333
    elif ip_string[len(ip_string)-2:len(ip_string)] == '.2':
        ip = int(innings_pitched) + 0.666667
    else:
        ip = innings_pitched

    return ip

Get batter tables for all games 2016-2021

In [21]:
statsapi_master = pd.read_csv('data\\master\\statsapi_teams.csv')
for season in [2019,2020,2021]:
    for team in statsapi_master['statsapi_id']:
        batting_df_list = []
        team_schedule = pd.read_csv(f'data\\statsapi_season_data\\{season}\\{team}.csv')
        for game in team_schedule['game_id']:
            boxscore = statsapi.boxscore_data(game)

            # determine if team is home or away
            if team == boxscore['teamInfo']['home']['id']:
                team_status = 'home'
                opp_status = 'away'
            else:
                team_status = 'away'
                opp_status = 'home'

            # get opponent starting pitcher stats
            opp_sp = boxscore[f'{opp_status}Pitchers'][1]
            opp_sp_id = opp_sp['personId']
            opp_sp_hits = boxscore[opp_status]['players'][f'ID{opp_sp_id}']['seasonStats']['pitching']['hits']
            opp_sp_ab = boxscore[opp_status]['players'][f'ID{opp_sp_id}']['seasonStats']['pitching']['atBats']
            opp_sp_name = boxscore[opp_status]['players'][f'ID{opp_sp_id}']['person']['fullName']
            if opp_sp_ab > 0:
                opp_sp_info = [opp_sp_id,opp_sp_name,float(opp_sp['era']),round(opp_sp_hits/opp_sp_ab, 3)]
            else:
                opp_sp_info = [opp_sp_id,opp_sp_name,float(opp_sp['era']),0]

            # get team batting order and stats
            batting_order = boxscore[team_status]['battingOrder']
            for batter in batting_order:
                batter_name = boxscore[team_status]['players'][f'ID{batter}']['person']['fullName']
                batter_spot = int(boxscore[team_status]['players'][f'ID{batter}']['battingOrder'][0])
                game_info = boxscore[team_status]['players'][f'ID{batter}']['stats']['batting']
                season_avg = float(boxscore[team_status]['players'][f'ID{batter}']['seasonStats']['batting']['avg'])

                batter_stats = [batter,batter_name,game_info['hits'],game_info['atBats']+game_info['baseOnBalls'],batter_spot,season_avg]
                batter_stats =  batter_stats + opp_sp_info
                batting_df_list.append(batter_stats)

        # tranform to dataframe and save
        col_list = ['batterID','batterName','Hits','AB','battingSpot','Avg','pitcherID','pitcherName','pitcherERA','pitcherAvg']
        batting_df = pd.DataFrame(batting_df_list, columns=col_list)
        batting_df.to_csv(f'data\\statsapi_game_data\\{season}\\{team}.csv', index=False)
        print(season, team)

2019 108
2019 109
2019 110
2019 111
2019 112
2019 113
2019 114
2019 115
2019 116
2019 117
2019 118
2019 119
2019 120
2019 121
2019 133
2019 134
2019 135
2019 136
2019 137
2019 138
2019 139
2019 140
2019 141
2019 142
2019 143
2019 144
2019 145
2019 146
2019 147
2019 158
2020 108
2020 109
2020 110
2020 111
2020 112
2020 113
2020 114
2020 115
2020 116
2020 117
2020 118
2020 119
2020 120
2020 121


ValueError: could not convert string to float: '-.--'