In [1]:
import pandas as pd
import requests
import json
import time
import numpy as np

# Lift pandas' column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)

In [21]:
# Load the game index (keyed by game id) and keep only games whose status is
# "closed" -- i.e. games completed as of mid-way through August 23rd.
game_list = (
    pd.read_csv('../raw_data/games.csv', index_col="id")
    .loc[lambda games: games["status"] == "closed"]
)

# Ids of the retained games; the parsing cell iterates over these.
g_list = game_list.index

In [22]:
# Column names (in order) for the at-bat DataFrame built from the JSON files.
columns = [
    # at-bat / game identification
    'id', 'game_id', 'inning', 'side',
    # matchup
    'hitter_id', 'hitter_hand', 'pitcher_id', 'pitcher_hand',
    # free-text play description
    'description',
    # weather recorded for the half-inning
    'temp_f', 'weather_condition', 'humidity', 'wind_speed_mph',
]

In [23]:
# Create an empty DataFrame that carries only the column labels from `columns`;
# the next code cell concatenates the parsed at-bat rows onto it.
at_bat_df = pd.DataFrame(columns=columns)

In [25]:
#Parsing through JSON to pull the key data into a dataframe

columns = ['id', 'game_id', 'inning', 'side', 'hitter_id', 'hitter_hand', 'pitcher_id', 'pitcher_hand', 'description', 'temp_f', 'weather_condition', 'humidity', 'wind_speed_mph']


def _weather_fields(half):
    """Return (temp_f, condition, humidity, wind_speed_mph) for one half-inning.

    Each value falls back to np.nan when the half has no "weather" entry, the
    entry has no "current_conditions", or the individual field is absent.
    """
    conditions = (half.get("weather") or {}).get("current_conditions") or {}
    wind = conditions.get("wind") or {}
    return (
        conditions.get("temp_f", np.nan),
        conditions.get("condition", np.nan),
        conditions.get("humidity", np.nan),
        wind.get("speed_mph", np.nan),
    )


def _at_bat_rows(game):
    """Collect one tuple per usable at-bat in `game`, ordered like `columns`.

    Events without an "at_bat" entry, or whose at-bat lacks "pitcher_id" or
    "description", are skipped.
    """
    rows = []
    # The first element of "innings" (index 0) is deliberately not parsed.
    for inning in game["innings"][1:]:
        # Only the first two halves are read; slicing tolerates a missing half.
        for half in inning["halfs"][:2]:
            # Weather is stored per half-inning, so look it up once per half
            # instead of once per event.
            weather = _weather_fields(half)

            for event in half["events"]:
                if "at_bat" not in event:
                    continue

                at_bat = event["at_bat"]
                if "pitcher_id" not in at_bat or "description" not in at_bat:
                    continue

                rows.append((
                    at_bat["id"],
                    game["id"],
                    inning["number"],
                    half["half"],
                    at_bat["hitter_id"],
                    at_bat["hitter_hand"],
                    at_bat["pitcher_id"],
                    at_bat["pitcher_hand"],
                    at_bat["description"],
                    *weather,
                ))
    return rows


# Accumulate plain tuples and build the frame once at the end: calling
# pd.concat for every at-bat copies the whole frame each time (quadratic cost).
at_bat_rows = []
for g in g_list:
    with open(f'../data/plate_app_data/{g}.json') as user_file:
        api_c = json.load(user_file)
    at_bat_rows.extend(_at_bat_rows(api_c["game"]))

# Built from scratch on every run, so re-executing this cell is idempotent.
at_bat_df = pd.DataFrame(at_bat_rows, columns=columns).set_index("id")

In [2]:
# Reading the .csv version of the raw at-bat data.
# NOTE(review): this rebinds `at_bat_df`, replacing any frame built by the JSON
# parsing cell; the file is not written anywhere in this notebook -- confirm
# where it comes from.
at_bat_df = pd.read_csv("../raw_data/all_ab_raw_data.csv")


In [7]:
#Mapping descriptions to outcome codes
search_substrings = ["walks", "walked", "hit by pitch", "singles", "doubles", "triples", "homers", "strikes out"]
mapping = ["walk", "walk", "HBP", "1B", "2B", "3B", "HR", "SO"]

# Every row starts with the fallback label "IPO"; rows whose description
# contains one of the substrings are then overwritten with the matching code.
play_outcome = np.full(len(at_bat_df), "IPO", dtype=object)

for substring, value in zip(search_substrings, mapping):
    # Case-insensitive literal match. na=False keeps the mask strictly boolean,
    # so rows with a missing description simply never match (and stay "IPO")
    # instead of making the boolean indexing fail on NaN.
    mask = at_bat_df['description'].str.contains(substring, case=False, regex=False, na=False)
    # Later substrings win when a description matches more than one of them.
    play_outcome[mask.to_numpy()] = value

at_bat_df['play_outcome'] = play_outcome

In [13]:
# Translate each outcome label into its multi-class target:
#   1 -> walk, HBP, 1B;  2 -> 2B;  3 -> 3B;  4 -> HR;  0 -> SO, IPO
outcome_mapping = dict([
    ("walk", 1),
    ("HBP", 1),
    ("1B", 1),
    ("2B", 2),
    ("3B", 3),
    ("HR", 4),
    ("SO", 0),
    ("IPO", 0),
])

# Look every label up through the dict's item access, so a label that is not
# in the table raises KeyError rather than silently becoming NaN.
at_bat_df["mc_target"] = at_bat_df["play_outcome"].map(outcome_mapping.__getitem__)

In [15]:
# Collapse the multi-class target into the binary model target:
# 0 stays 0, every other value becomes 1.
is_nonzero = at_bat_df["mc_target"].ne(0)
at_bat_df["y_target"] = is_nonzero.astype("int64")

In [25]:
# Write the final raw data set, including the target columns, to .csv so it can
# be shared and reused. index=True also writes the frame's index as a column.
at_bat_df.to_csv('../data/all_ab_raw_data_w_target.csv', index=True)