# Merging tables to create the dataset for feature engineering

In [37]:
import pandas as pd
import numpy as np
import datetime as dt

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [38]:
# Data import
# Load the four raw tables that are merged below.
# games and players use their "Unnamed: 0" column as the index;
# teams and data (the table carrying the target columns) keep the default integer index.
games = pd.read_csv("../raw_data/games_w_venue.csv", index_col="Unnamed: 0")
players = pd.read_csv("../raw_data/players.csv", index_col="Unnamed: 0")
teams = pd.read_csv("../raw_data/teams.csv")
data = pd.read_csv("../raw_data/all_ab_raw_data_w_target.csv")

In [39]:
# (rows, columns) of the main table right after loading; the row count is the baseline for the merges below.
data.shape

(142307, 19)

In [40]:
# Classifying last pitch as ball or strike from its location zone:
# zones below 10 -> 0, everything else -> 1.
# NOTE(review): presumably zones 1-9 are the strike zone, so 0 = strike and 1 = ball — confirm.
# Vectorised comparison instead of a per-row Python lambda. It is written as
# `~(zone < 10)` rather than `zone >= 10` so that missing (NaN) zones still map
# to 1, exactly as the lambda did.
data["pitch_class"] = (~data["pitch_location_zone"].lt(10)).astype("int64")
# The raw zone is no longer needed once the binary class exists.
data = data.drop(columns="pitch_location_zone")

In [41]:
# Merging data and games
# Rename the games key so it matches the key column in `data`, then left-join so
# every existing row of `data` is kept even when its game is absent from `games`.
games = games.rename(columns={"id": "game_id"})
rows_before_merge = len(data)
data = data.merge(games, how="left", on="game_id")
# A left join silently duplicates rows when the right table repeats a key;
# fail loudly instead of carrying inflated data into feature engineering.
assert len(data) == rows_before_merge, "games merge changed the number of rows in data"
data.shape

(142307, 37)

In [42]:
# Merging data and hitters
# Keep the first row per player id, prefix every player column with "hitter_"
# (so players' id becomes hitter_id), then left-join on hitter_id.
hitters = players.drop_duplicates(subset="id", keep="first").add_prefix("hitter_")
data = pd.merge(data, hitters, how="left", on="hitter_id")

In [43]:
# Merging data and pitchers
# Keep the first row per player id, prefix every player column with "pitcher_"
# (so players' id becomes pitcher_id), then left-join on pitcher_id.
pitchers = players.drop_duplicates(subset="id", keep="first").add_prefix("pitcher_")
data = pd.merge(data, pitchers, how="left", on="pitcher_id")
data.shape

(142307, 47)

In [44]:
# Merging data and home team
# Prefix the team columns with "home_team_" (teams' id becomes home_team_id) and
# rename data's home_team column to the same key before the left join.
home_team = teams.add_prefix("home_team_")
data = data.rename(columns={"home_team": "home_team_id"})
rows_before_merge = len(data)
data = data.merge(home_team, how="left", on="home_team_id")
# Guard against silent row duplication if teams ever contains a repeated id.
assert len(data) == rows_before_merge, "home team merge changed the number of rows in data"
data.shape

(142307, 50)

In [45]:
# Merging data and away team
# Prefix the team columns with "away_team_" (teams' id becomes away_team_id) and
# rename data's away_team column to the same key before the left join.
away_team = teams.add_prefix("away_team_")
data = data.rename(columns={"away_team": "away_team_id"})
rows_before_merge = len(data)
data = data.merge(away_team, how="left", on="away_team_id")
# Guard against silent row duplication if teams ever contains a repeated id.
assert len(data) == rows_before_merge, "away team merge changed the number of rows in data"
data.shape

(142307, 53)

In [46]:
# List all columns of the merged frame.
data.columns

Index(['id', 'game_id', 'hitter_id', 'hitter_hand', 'pitcher_id',
       'pitcher_hand', 'description', 'temp_f', 'humidity', 'at_bat_end_time',
       'pitch_speed_mph', 'pitch_count_at_bat',
       'pitcher_pitch_count_at_bat_start', 'outs_at_start', 'play_outcome',
       'mc_target', 'y_target', 'pitch_type_cat', 'pitch_class', 'status',
       'coverage', 'game_number', 'day_night', 'scheduled', 'home_team_id',
       'away_team_id', 'attendance', 'duration', 'double_header', 'entry_mode',
       'reference', 'venue', 'home', 'away', 'broadcast', 'rescheduled',
       'venue_id', 'hitter_player_name', 'hitter_team_id', 'hitter_team_name',
       'hitter_position', 'hitter_primary_position', 'pitcher_player_name',
       'pitcher_team_id', 'pitcher_team_name', 'pitcher_position',
       'pitcher_primary_position', 'home_team_name', 'home_team_market',
       'home_team_abbr', 'away_team_name', 'away_team_market',
       'away_team_abbr'],
      dtype='object')

In [47]:
# Same column listing as the previous cell; nothing modifies `data` in between.
data.columns

Index(['id', 'game_id', 'hitter_id', 'hitter_hand', 'pitcher_id',
       'pitcher_hand', 'description', 'temp_f', 'humidity', 'at_bat_end_time',
       'pitch_speed_mph', 'pitch_count_at_bat',
       'pitcher_pitch_count_at_bat_start', 'outs_at_start', 'play_outcome',
       'mc_target', 'y_target', 'pitch_type_cat', 'pitch_class', 'status',
       'coverage', 'game_number', 'day_night', 'scheduled', 'home_team_id',
       'away_team_id', 'attendance', 'duration', 'double_header', 'entry_mode',
       'reference', 'venue', 'home', 'away', 'broadcast', 'rescheduled',
       'venue_id', 'hitter_player_name', 'hitter_team_id', 'hitter_team_name',
       'hitter_position', 'hitter_primary_position', 'pitcher_player_name',
       'pitcher_team_id', 'pitcher_team_name', 'pitcher_position',
       'pitcher_primary_position', 'home_team_name', 'home_team_market',
       'home_team_abbr', 'away_team_name', 'away_team_market',
       'away_team_abbr'],
      dtype='object')

In [48]:
# Columns dropped from the merged frame in the next step, grouped by the table they came from.
columns_to_remove_fp = [
    # at-bat / game level
    'description', 'scheduled', 'status', 'coverage', 'game_number',
    'duration', 'double_header', 'entry_mode', 'reference',
    'venue', 'home', 'away', 'broadcast', 'rescheduled',
    # player level
    'hitter_team_id', 'hitter_team_name',
    'pitcher_position', 'pitcher_team_id', 'pitcher_team_name',
    # team level
    'home_team_name', 'home_team_market', 'home_team_abbr',
    'away_team_name', 'away_team_market', 'away_team_abbr',
]

In [49]:
# Discard every column named in columns_to_remove_fp.
data = data.drop(labels=columns_to_remove_fp, axis="columns")

In [50]:
# Cleaning up data points
# Vectorised pandas methods replace the per-row Python lambdas; results are
# identical, including missing values passing through unchanged.
# outs_at_start: a recorded value of 3 is rewritten to 2.
# NOTE(review): presumably 3 outs at the start of an at-bat is a recording artefact — confirm.
data['outs_at_start'] = data['outs_at_start'].replace(3, 2)
# pitcher_pitch_count_at_bat_start: negative counts are floored at 0.
data['pitcher_pitch_count_at_bat_start'] = data['pitcher_pitch_count_at_bat_start'].clip(lower=0)

In [51]:
# Converting columns to the correct dtype: parse at_bat_end_time into pandas datetimes.
data["at_bat_end_time"] = pd.to_datetime(data["at_bat_end_time"])

In [52]:
# Check dtypes and non-null counts after the column drop and the datetime conversion.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142307 entries, 0 to 142306
Data columns (total 28 columns):
 #   Column                            Non-Null Count   Dtype              
---  ------                            --------------   -----              
 0   id                                142307 non-null  object             
 1   game_id                           142307 non-null  object             
 2   hitter_id                         142307 non-null  object             
 3   hitter_hand                       142307 non-null  object             
 4   pitcher_id                        142307 non-null  object             
 5   pitcher_hand                      142307 non-null  object             
 6   temp_f                            142307 non-null  float64            
 7   humidity                          142307 non-null  float64            
 8   at_bat_end_time                   142307 non-null  datetime64[ns, UTC]
 9   pitch_speed_mph                   142307 non-nul

In [53]:
# Order rows by at_bat_end_time (ascending is the default) and renumber the index from 0.
data = data.sort_values(by="at_bat_end_time", ignore_index=True)

In [54]:
# Inspect the frame again after sorting by at_bat_end_time.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142307 entries, 0 to 142306
Data columns (total 28 columns):
 #   Column                            Non-Null Count   Dtype              
---  ------                            --------------   -----              
 0   id                                142307 non-null  object             
 1   game_id                           142307 non-null  object             
 2   hitter_id                         142307 non-null  object             
 3   hitter_hand                       142307 non-null  object             
 4   pitcher_id                        142307 non-null  object             
 5   pitcher_hand                      142307 non-null  object             
 6   temp_f                            142307 non-null  float64            
 7   humidity                          142307 non-null  float64            
 8   at_bat_end_time                   142307 non-null  datetime64[ns, UTC]
 9   pitch_speed_mph                   142307 non-nul

In [55]:
# Persist the merged, cleaned and time-sorted frame.
# index=True also writes the index as an extra first column with an empty header.
data.to_csv('../raw_data/final_raw_data.csv', index=True)