# Merging tables to create the dataset for feature engineering

In [87]:
import pandas as pd
import requests
import json
import time
import numpy as np

# Show every column when a DataFrame is displayed (no "..." truncation of wide frames).
pd.options.display.max_columns = None

In [88]:
# Data import
# Lookup tables that get joined onto the main table in the cells below.
games, players, stadiums, teams = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../raw_data/games_w_venue.csv",
        "../raw_data/players.csv",
        "../raw_data/stadiums.csv",
        "../raw_data/teams.csv",
    )
)
# Main table (with targets) and the extra parsed columns that are merged in last.
data, added_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../raw_data/all_ab_raw_data_w_target.csv",
        "../raw_data/all_ab_raw_data_add_columns.csv",
    )
)

In [89]:
# Baseline (rows, columns) of the main table before any merge; every merge cell
# below displays the shape again so it can be compared against this one.
data.shape

(143088, 16)

In [90]:
# Merging data and games
# The games table keys its rows on "id"; expose it as "game_id" so it lines up
# with the key already present in `data`.
games = games.rename(columns={"id": "game_id"})
data = pd.merge(data, games, on="game_id", how="left")
data.shape

(143088, 35)

In [91]:
# Merging data and hitters
# Keep one row per player id (first occurrence wins) and namespace every column
# as hitter_*, which turns "id" into the join key "hitter_id".
hitters = players.drop_duplicates(subset="id", keep="first").add_prefix("hitter_")
data = pd.merge(data, hitters, on="hitter_id", how="left")
data.shape

(143088, 45)

In [92]:
# Merging data and pitchers
# Same player table as for hitters: one row per player id (first occurrence wins),
# columns namespaced as pitcher_* so "id" becomes the join key "pitcher_id".
pitchers = players.drop_duplicates(subset="id", keep="first").add_prefix("pitcher_")
data = pd.merge(data, pitchers, on="pitcher_id", how="left")
data.shape

(143088, 55)

In [93]:
# Merging data and home team
# Prefixing the teams table yields the key "home_team_id"; the matching column in
# `data` is called "home_team", so rename it before joining.
home_team = teams.add_prefix("home_team_")
data = (
    data
    .rename(columns={"home_team": "home_team_id"})
    .merge(home_team, how="left", on="home_team_id")
)
data.shape

(143088, 58)

In [94]:
# Merging data and away team
# Mirror of the home-team join: prefix the teams table as away_team_* and rename
# the "away_team" column in `data` to the resulting key "away_team_id".
away_team = teams.add_prefix("away_team_")
data = (
    data
    .rename(columns={"away_team": "away_team_id"})
    .merge(away_team, how="left", on="away_team_id")
)
data.shape

(143088, 61)

In [95]:
# Merging data and stadium
# Stadium columns are namespaced as stadium_*; the game's "venue_id" is renamed to
# "stadium_id" so it matches the prefixed stadium key.
venue = stadiums.add_prefix("stadium_")
data = pd.merge(
    data.rename(columns={"venue_id": "stadium_id"}),
    venue,
    on="stadium_id",
    how="left",
)
data.shape

(143088, 75)

In [96]:
# Merging data and away address (stadium attached to the away team's abbreviation)
# Build the lookup on a fresh frame: `assign` returns a new DataFrame, so the shared
# `stadiums` table does not silently gain an "abbr" column (the previous
# `away_stadium = stadiums` was only an alias, so the assignment mutated `stadiums`).
# NOTE(review): `teams.abbr` is aligned to `stadiums` by row index, i.e. this assumes
# row i of the teams table belongs to row i of the stadiums table; confirm against
# the raw files, since nothing here enforces that pairing.
away_stadium = (
    stadiums
    .assign(abbr=teams.abbr)
    .add_prefix("away_stadium_")
    # The abbreviation is the join key, so give it the name it has in `data`.
    .rename(columns={"away_stadium_abbr": "away_team_abbr"})
)
data = data.merge(away_stadium, how="left", on="away_team_abbr")
data.shape

(143088, 90)

In [98]:
# Drop the leftover "Unnamed: 0" column (presumably an index saved into the CSV).
# errors="ignore" keeps this cell re-runnable: without it, running the cell a second
# time raises KeyError because the column is already gone.
added_data = added_data.drop(columns='Unnamed: 0', errors='ignore')

In [99]:
# Merging data and added parsing columns from JSON files
# Both frames share the key column "id", so a single `on` is enough.
data = pd.merge(data, added_data, on="id", how="left")
data.shape

(143088, 98)

In [100]:
# Preview the first three rows of the fully merged table to check the joined columns.
data.head(3)

Unnamed: 0.1,id,game_id,inning,side,hitter_id,hitter_hand,pitcher_id,pitcher_hand,description,temp_f,weather_condition,humidity,wind_speed_mph,play_outcome,mc_target,y_target,Unnamed: 0,status,coverage,game_number,day_night,scheduled,home_team_id,away_team_id,attendance,duration,double_header,entry_mode,reference,venue,home,away,broadcast,rescheduled,stadium_id,hitter_status,hitter_position,hitter_primary_position,hitter_first_name,hitter_last_name,hitter_preferred_name,hitter_jersey_number,hitter_depth,hitter_team_id,hitter_team_nickname,pitcher_status,pitcher_position,pitcher_primary_position,pitcher_first_name,pitcher_last_name,pitcher_preferred_name,pitcher_jersey_number,pitcher_depth,pitcher_team_id,pitcher_team_nickname,home_team_name,home_team_market,home_team_abbr,away_team_name,away_team_market,away_team_abbr,stadium_name,stadium_market,stadium_capacity,stadium_surface,stadium_address,stadium_city,stadium_state,stadium_zip,stadium_country,stadium_field_orientation,stadium_stadium_type,stadium_time_zone,stadium_lat,stadium_lon,away_stadium_id,away_stadium_name,away_stadium_market,away_stadium_capacity,away_stadium_surface,away_stadium_address,away_stadium_city,away_stadium_state,away_stadium_zip,away_stadium_country,away_stadium_field_orientation,away_stadium_stadium_type,away_stadium_time_zone,away_stadium_lat,away_stadium_lon,at_bat_end_time,pitch_type_code,pitch_type_des,pitch_speed_mph,pitch_count_at_bat,pitcher_pitch_count_at_bat_start,outs_at_start,output_code
0,e7acb70b-affd-4fe8-9dcb-8a71da23612b,00846785-f968-4867-8896-cb4d5f7a3e63,1,T,6ea2efc3-345a-4e30-8b95-744a60a3e7a5,R,738eb5e8-2ab8-4428-928a-81a31b7228de,R,Nico Hoerner flies out to left field to Luis R...,70.0,Partly cloudy,53.0,4.0,IPO,0,0,0,closed,full,1,N,2023-06-08T01:38:00+00:00,4f735188-37c8-473d-ae32-1f7e34ccf892,55714da8-fcaf-4574-8443-59bfb511a524,28817.0,2:31,False,STOMP,717855,"{'name': 'Angel Stadium', 'market': 'Los Angel...","{'name': 'Angels', 'market': 'Los Angeles', 'a...","{'name': 'Cubs', 'market': 'Chicago', 'abbr': ...",{'network': 'MLB Network'},,60732da9-ad03-4feb-9a36-aee3e98c7a2b,A,IF,2B,Nicholas,Hoerner,Nico,2.0,2.0,55714da8-fcaf-4574-8443-59bfb511a524,Cubs,A,P,RP,Jaime,Barría,Jaime,51.0,6.0,4f735188-37c8-473d-ae32-1f7e34ccf892,Angels,Angels,Los Angeles,LAA,Cubs,Chicago,CHC,Angel Stadium,Los Angeles,45517.0,grass,2000 Gene Autry Way,Anaheim,CA,92806,USA,NE,outdoor,US/Pacific,33.799662,-117.883438,53f8eb0d-a361-4a7a-930b-2f8735ea0698,Wrigley Field,Chicago,41363.0,grass,1060 West Addison Street,Chicago,IL,60613,USA,NE,outdoor,US/Central,41.947447,-87.656054,2023-06-08T01:41:45+00:00,FF,Four-Seam Fastball,93.3,9.0,0.0,0.0,oFO
1,0bbf479e-52f5-4bd6-a0f0-445e172ea7fc,00846785-f968-4867-8896-cb4d5f7a3e63,1,T,5773d9a2-dfc1-458a-acac-67528d6618b3,R,738eb5e8-2ab8-4428-928a-81a31b7228de,R,Dansby Swanson pops out to Jared Walsh.,70.0,Partly cloudy,53.0,4.0,IPO,0,0,0,closed,full,1,N,2023-06-08T01:38:00+00:00,4f735188-37c8-473d-ae32-1f7e34ccf892,55714da8-fcaf-4574-8443-59bfb511a524,28817.0,2:31,False,STOMP,717855,"{'name': 'Angel Stadium', 'market': 'Los Angel...","{'name': 'Angels', 'market': 'Los Angeles', 'a...","{'name': 'Cubs', 'market': 'Chicago', 'abbr': ...",{'network': 'MLB Network'},,60732da9-ad03-4feb-9a36-aee3e98c7a2b,A,IF,SS,Dansby,Swanson,Dansby,7.0,1.0,55714da8-fcaf-4574-8443-59bfb511a524,Cubs,A,P,RP,Jaime,Barría,Jaime,51.0,6.0,4f735188-37c8-473d-ae32-1f7e34ccf892,Angels,Angels,Los Angeles,LAA,Cubs,Chicago,CHC,Angel Stadium,Los Angeles,45517.0,grass,2000 Gene Autry Way,Anaheim,CA,92806,USA,NE,outdoor,US/Pacific,33.799662,-117.883438,53f8eb0d-a361-4a7a-930b-2f8735ea0698,Wrigley Field,Chicago,41363.0,grass,1060 West Addison Street,Chicago,IL,60613,USA,NE,outdoor,US/Central,41.947447,-87.656054,2023-06-08T01:42:22+00:00,SL,Slider,85.4,1.0,9.0,1.0,oPO
2,56a4f487-1e79-4383-bea2-0327559a6423,00846785-f968-4867-8896-cb4d5f7a3e63,1,T,d8968ebd-8227-44b6-b442-77d088c9f98a,L,738eb5e8-2ab8-4428-928a-81a31b7228de,R,Ian Happ walks.,70.0,Partly cloudy,53.0,4.0,walk,1,1,0,closed,full,1,N,2023-06-08T01:38:00+00:00,4f735188-37c8-473d-ae32-1f7e34ccf892,55714da8-fcaf-4574-8443-59bfb511a524,28817.0,2:31,False,STOMP,717855,"{'name': 'Angel Stadium', 'market': 'Los Angel...","{'name': 'Angels', 'market': 'Los Angeles', 'a...","{'name': 'Cubs', 'market': 'Chicago', 'abbr': ...",{'network': 'MLB Network'},,60732da9-ad03-4feb-9a36-aee3e98c7a2b,A,OF,LF,Ian,Happ,Ian,8.0,1.0,55714da8-fcaf-4574-8443-59bfb511a524,Cubs,A,P,RP,Jaime,Barría,Jaime,51.0,6.0,4f735188-37c8-473d-ae32-1f7e34ccf892,Angels,Angels,Los Angeles,LAA,Cubs,Chicago,CHC,Angel Stadium,Los Angeles,45517.0,grass,2000 Gene Autry Way,Anaheim,CA,92806,USA,NE,outdoor,US/Pacific,33.799662,-117.883438,53f8eb0d-a361-4a7a-930b-2f8735ea0698,Wrigley Field,Chicago,41363.0,grass,1060 West Addison Street,Chicago,IL,60613,USA,NE,outdoor,US/Central,41.947447,-87.656054,2023-06-08T01:43:58+00:00,FF,Four-Seam Fastball,94.4,6.0,10.0,2.0,bB


In [101]:
# Summary statistics of the merged table.
# NOTE(review): in the output, wind_speed_mph peaks at 22369 and
# pitcher_pitch_count_at_bat_start goes down to -6; both look like invalid values
# that should be handled before modelling.
data.describe()

Unnamed: 0.1,inning,temp_f,humidity,wind_speed_mph,mc_target,y_target,Unnamed: 0,game_number,attendance,reference,hitter_jersey_number,hitter_depth,pitcher_jersey_number,pitcher_depth,stadium_capacity,stadium_lat,stadium_lon,away_stadium_capacity,away_stadium_lat,away_stadium_lon,pitch_speed_mph,pitch_count_at_bat,pitcher_pitch_count_at_bat_start,outs_at_start
count,143088.0,143013.0,143013.0,143013.0,143088.0,143088.0,143088.0,143088.0,142641.0,143088.0,111803.0,111803.0,95095.0,95095.0,142698.0,142698.0,142698.0,143013.0,143013.0,143013.0,142424.0,143042.0,143042.0,141428.0
mean,4.958732,73.285184,54.075483,11.183962,0.466384,0.318986,945.036083,1.014732,29305.09988,717835.516032,21.444925,1.619518,44.552258,3.049887,42387.621004,38.127296,-92.610975,42391.641389,38.219927,-92.418223,89.056988,3.88948,29.433083,0.978625
std,2.583891,12.717081,19.632913,221.291345,0.860947,0.466085,545.990683,0.120479,11307.751334,546.710633,17.989597,0.964939,18.770786,1.723717,5475.053405,5.018305,16.405014,5489.398744,5.033629,16.425165,6.106822,1.897205,27.071353,0.815291
min,1.0,34.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0,716887.0,0.0,1.0,1.0,1.0,25025.0,25.778057,-122.390621,25025.0,25.778057,-122.390621,33.7,0.0,-6.0,0.0
25%,3.0,65.0,40.0,4.0,0.0,0.0,475.0,1.0,20286.0,717363.0,8.0,1.0,32.0,2.0,40000.0,33.890672,-104.993349,40000.0,33.890672,-104.993349,84.6,2.0,7.0,0.0
50%,5.0,74.0,56.0,8.0,0.0,0.0,942.0,1.0,30578.0,717834.0,18.0,1.0,45.0,3.0,41376.0,39.097736,-87.656054,41700.0,39.283787,-87.634833,89.8,4.0,20.0,1.0
75%,7.0,82.0,69.0,12.0,1.0,1.0,1416.0,1.0,38605.0,718309.0,28.0,2.0,57.0,4.0,45971.0,41.830066,-80.006409,45971.0,41.830066,-80.006409,93.9,5.0,50.0,2.0
max,14.0,117.0,100.0,22369.0,4.0,1.0,2430.0,2.0,55565.0,718782.0,99.0,8.0,99.0,8.0,56000.0,47.589904,-71.098782,56000.0,47.589904,-71.098782,104.8,16.0,117.0,3.0


In [102]:
# Number of rows that are exact duplicates of an earlier row (all columns compared).
data.duplicated().sum()

0

In [103]:
# Columns dropped from the merged table, listed in the order they appear in it
# and grouped by the table they came from.
columns_to_remove_fp = [
    # main table
    "description",
    "play_outcome",
    "mc_target",
    # games
    "Unnamed: 0",
    "status",
    "coverage",
    "game_number",
    "duration",
    "double_header",
    "entry_mode",
    "reference",
    "venue",
    "home",
    "away",
    "broadcast",
    "rescheduled",
    # hitter
    "hitter_status",
    "hitter_position",
    "hitter_first_name",
    "hitter_last_name",
    "hitter_preferred_name",
    "hitter_jersey_number",
    "hitter_depth",
    "hitter_team_id",
    "hitter_team_nickname",
    # pitcher
    "pitcher_status",
    "pitcher_position",
    "pitcher_first_name",
    "pitcher_last_name",
    "pitcher_preferred_name",
    "pitcher_jersey_number",
    "pitcher_depth",
    "pitcher_team_id",
    "pitcher_team_nickname",
    # home team
    "home_team_name",
    "home_team_market",
    "home_team_abbr",
    # away team
    "away_team_name",
    "away_team_market",
    "away_team_abbr",
    # stadium of the game
    "stadium_name",
    "stadium_market",
    "stadium_surface",
    "stadium_address",
    "stadium_city",
    "stadium_state",
    "stadium_zip",
    "stadium_country",
    "stadium_field_orientation",
    "stadium_time_zone",
    # away team's stadium
    "away_stadium_id",
    "away_stadium_name",
    "away_stadium_market",
    "away_stadium_surface",
    "away_stadium_address",
    "away_stadium_city",
    "away_stadium_state",
    "away_stadium_zip",
    "away_stadium_country",
    "away_stadium_field_orientation",
    "away_stadium_stadium_type",
    "away_stadium_time_zone",
]

In [104]:
# Remove every column named in columns_to_remove_fp from the merged table.
data = data.drop(columns_to_remove_fp, axis="columns")

In [108]:
# Re-check the summary statistics after the unused columns were dropped.
data.describe()

Unnamed: 0,inning,temp_f,humidity,wind_speed_mph,y_target,attendance,stadium_capacity,stadium_lat,stadium_lon,away_stadium_capacity,away_stadium_lat,away_stadium_lon,pitch_speed_mph,pitch_count_at_bat,pitcher_pitch_count_at_bat_start,outs_at_start
count,143088.0,143013.0,143013.0,143013.0,143088.0,142641.0,142698.0,142698.0,142698.0,143013.0,143013.0,143013.0,142424.0,143042.0,143042.0,141428.0
mean,4.958732,73.285184,54.075483,11.183962,0.318986,29305.09988,42387.621004,38.127296,-92.610975,42391.641389,38.219927,-92.418223,89.056988,3.88948,29.433083,0.978625
std,2.583891,12.717081,19.632913,221.291345,0.466085,11307.751334,5475.053405,5.018305,16.405014,5489.398744,5.033629,16.425165,6.106822,1.897205,27.071353,0.815291
min,1.0,34.0,4.0,1.0,0.0,0.0,25025.0,25.778057,-122.390621,25025.0,25.778057,-122.390621,33.7,0.0,-6.0,0.0
25%,3.0,65.0,40.0,4.0,0.0,20286.0,40000.0,33.890672,-104.993349,40000.0,33.890672,-104.993349,84.6,2.0,7.0,0.0
50%,5.0,74.0,56.0,8.0,0.0,30578.0,41376.0,39.097736,-87.656054,41700.0,39.283787,-87.634833,89.8,4.0,20.0,1.0
75%,7.0,82.0,69.0,12.0,1.0,38605.0,45971.0,41.830066,-80.006409,45971.0,41.830066,-80.006409,93.9,5.0,50.0,2.0
max,14.0,117.0,100.0,22369.0,1.0,55565.0,56000.0,47.589904,-71.098782,56000.0,47.589904,-71.098782,104.8,16.0,117.0,3.0


In [109]:
# Persist the merged and trimmed table as the input for feature engineering.
# index=True also writes the row index as an unnamed first CSV column; when the file
# is read back with pd.read_csv it shows up as "Unnamed: 0" unless index_col=0 is used.
data.to_csv('../raw_data/final_raw_data.csv', index=True)