# Features engineering

In [90]:
# Libraries and parameters
import pandas as pd
pd.set_option('display.max_columns', None)
import math

# Functions


In [91]:
# Import data
data = pd.read_csv("../raw_data/final_raw_data.csv", index_col="Unnamed: 0")

In [92]:
# #Creating hitter stats up to at bat
data['hitter_previous_stats_szn'] = data.groupby("hitter_id")["y_target"].cumsum() / (data.groupby('hitter_id').cumcount() + 1)
data['rolling_1ab'] = data.groupby("hitter_id")["y_target"].transform(lambda x: x.shift().rolling(1, min_periods=1).mean())

In [93]:
# Calculate stats of pitcher before he pitchs
data['pitcher_previous_stats_szn'] = data.groupby("pitcher_id")["y_target"].cumsum() / (data.groupby('pitcher_id').cumcount() + 1)
data['rolling_1pitch'] = data.groupby("pitcher_id")["y_target"].transform(lambda x: x.shift().rolling(1, min_periods=1).mean())

In [94]:
#drop 0.004343% of data (first ten at bats for each player)
data = data.dropna(subset=["rolling_1ab", "rolling_1pitch"])

In [95]:
#Create handed matchups
data["handed_matchup"] = data["hitter_hand"] + data["pitcher_hand"]
data.head()

Unnamed: 0,id,game_id,inning,side,hitter_id,hitter_hand,pitcher_id,pitcher_hand,temp_f,weather_condition,humidity,wind_speed_mph,at_bat_end_time,pitch_location_zone,pitch_speed_mph,pitch_count_at_bat,pitcher_pitch_count_at_bat_start,outs_at_start,y_target,day_night,home_team_id,away_team_id,attendance,stadium_id,hitter_player_name,hitter_position,hitter_primary_position,hitter_zone_1,hitter_zone_2,hitter_zone_3,hitter_zone_4,hitter_zone_5,hitter_zone_6,hitter_zone_7,hitter_zone_8,hitter_zone_9,hitter_zone_10,hitter_zone_11,hitter_zone_12,hitter_zone_13,hitter_zone_14,pitcher_player_name,pitcher_primary_position,pitcher_zone_1,pitcher_zone_2,pitcher_zone_3,pitcher_zone_4,pitcher_zone_5,pitcher_zone_6,pitcher_zone_7,pitcher_zone_8,pitcher_zone_9,pitcher_zone_10,pitcher_zone_11,pitcher_zone_12,pitcher_zone_13,pitcher_zone_14,stadium_capacity,stadium_stadium_type,stadium_lat,stadium_lon,away_stadium_lat,away_stadium_lon,hitter_previous_stats_szn,rolling_1ab,pitcher_previous_stats_szn,rolling_1pitch,handed_matchup
20,c6204c5e-4037-40dd-a2d0-cd21fcad30b0,f3f0ae8e-cb65-4e96-9530-c6f868738f09,3,T,041632a9-afb2-4ec3-b1de-9b0bbe33ab64,L,eebc991a-23ea-4f1c-ba3b-37ff21ee1603,R,40.0,Sunny,30.0,2.0,2023-03-30 17:48:48+00:00,1.0,97.7,5.0,44.0,1.0,0,D,a09ec676-f887-43dc-bbb3-cf4bbaee9a18,a7723160-10b7-4277-a309-d8dd95a8ae65,46172.0,706e9828-6687-4ac8-a409-3fb972e8bae9,"Wade Jr., LaMonte",IF,1B,0.3125,0.227273,0.454545,0.268293,0.375,0.205882,0.342105,0.333333,0.272727,0.0,0.638298,0.416667,0.452381,0.40625,"Cole, Gerrit",SP,0.15,0.173913,0.25,0.135135,0.28,0.333333,0.15625,0.333333,0.226415,0.0,0.446154,0.365854,0.190476,0.296296,47309.0,outdoor,40.828819,-73.926569,37.77842,-122.390621,0.5,1.0,0.2,0.0,LR
23,d46669f3-7ab1-4af2-af9a-56e2599a8dee,f3f0ae8e-cb65-4e96-9530-c6f868738f09,3,T,e5bdeb0e-38fc-4d30-8127-43d0d5b2864d,L,eebc991a-23ea-4f1c-ba3b-37ff21ee1603,R,40.0,Sunny,30.0,2.0,2023-03-30 17:50:34+00:00,6.0,97.5,5.0,49.0,2.0,0,D,a09ec676-f887-43dc-bbb3-cf4bbaee9a18,a7723160-10b7-4277-a309-d8dd95a8ae65,46172.0,706e9828-6687-4ac8-a409-3fb972e8bae9,"Conforto, Michael",OF,RF,0.222222,0.269231,0.294118,0.204545,0.27907,0.272727,0.24,0.341463,0.26087,0.0,0.577778,0.391304,0.347826,0.5,"Cole, Gerrit",SP,0.15,0.173913,0.25,0.135135,0.28,0.333333,0.15625,0.333333,0.226415,0.0,0.446154,0.365854,0.190476,0.296296,47309.0,outdoor,40.828819,-73.926569,37.77842,-122.390621,0.0,0.0,0.181818,0.0,LR
28,8b5f48bd-f0fb-49c5-b3c6-d26fe601721e,f3f0ae8e-cb65-4e96-9530-c6f868738f09,3,B,d1c07b01-a86d-4c17-ac38-8217e364c2cf,R,72248e3e-3b0f-4523-9ff6-833aa33f3b32,R,40.0,Sunny,30.0,2.0,2023-03-30 17:56:55+00:00,3.0,93.2,4.0,37.0,0.0,0,D,a09ec676-f887-43dc-bbb3-cf4bbaee9a18,a7723160-10b7-4277-a309-d8dd95a8ae65,46172.0,706e9828-6687-4ac8-a409-3fb972e8bae9,"LeMahieu, David",IF,3B,0.307692,0.26087,0.5,0.292683,0.26087,0.214286,0.409091,0.285714,0.212121,0.0,0.324324,0.5,0.414634,0.288462,"Webb, Logan",SP,0.285714,0.347826,0.117647,0.289474,0.273973,0.340426,0.383333,0.304762,0.211268,0.0,0.318182,0.384615,0.233333,0.246032,47309.0,outdoor,40.828819,-73.926569,37.77842,-122.390621,0.0,0.0,0.3,1.0,RR
30,4b61e033-2a49-4ecf-94ea-2f18fdeb8f49,f3f0ae8e-cb65-4e96-9530-c6f868738f09,3,B,86f7390e-61bd-4556-8325-a6705c7f693b,R,72248e3e-3b0f-4523-9ff6-833aa33f3b32,R,40.0,Sunny,30.0,2.0,2023-03-30 17:58:42+00:00,9.0,84.7,4.0,41.0,1.0,0,D,a09ec676-f887-43dc-bbb3-cf4bbaee9a18,a7723160-10b7-4277-a309-d8dd95a8ae65,46172.0,706e9828-6687-4ac8-a409-3fb972e8bae9,"Judge, Aaron",OF,RF,0.333333,0.428571,0.333333,0.4,0.468085,0.15,0.411765,0.258065,0.290323,0.0,0.5,0.5,0.214286,0.614035,"Webb, Logan",SP,0.285714,0.347826,0.117647,0.289474,0.273973,0.340426,0.383333,0.304762,0.211268,0.0,0.318182,0.384615,0.233333,0.246032,47309.0,outdoor,40.828819,-73.926569,37.77842,-122.390621,0.5,1.0,0.272727,0.0,RR
32,4eb10c2f-5c1f-42e9-9d71-9df84cbf813b,f3f0ae8e-cb65-4e96-9530-c6f868738f09,3,B,75cb4b6c-a087-4b77-90e3-7473284fa8ad,L,72248e3e-3b0f-4523-9ff6-833aa33f3b32,R,40.0,Sunny,30.0,2.0,2023-03-30 18:01:13+00:00,3.0,91.5,6.0,45.0,2.0,0,D,a09ec676-f887-43dc-bbb3-cf4bbaee9a18,a7723160-10b7-4277-a309-d8dd95a8ae65,46172.0,706e9828-6687-4ac8-a409-3fb972e8bae9,"Rizzo, Anthony",IF,1B,0.125,0.5,0.090909,0.352941,0.314815,0.384615,0.25,0.314286,0.181818,0.0,0.34375,0.533333,0.328947,0.34,"Webb, Logan",SP,0.285714,0.347826,0.117647,0.289474,0.273973,0.340426,0.383333,0.304762,0.211268,0.0,0.318182,0.384615,0.233333,0.246032,47309.0,outdoor,40.828819,-73.926569,37.77842,-122.390621,0.5,1.0,0.25,0.0,LR


In [96]:
data.isnull().sum()

id                            0
game_id                       0
inning                        0
side                          0
hitter_id                     0
                             ..
hitter_previous_stats_szn     0
rolling_1ab                   0
pitcher_previous_stats_szn    0
rolling_1pitch                0
handed_matchup                0
Length: 68, dtype: int64

In [97]:
# data[data["hitter_id"] == "a8efa694-2d64-4ad3-bf39-7d4f9006be7f"].to_csv('../../archive/test_rolling_ab.csv')
# data[data["pitcher_id"] == "e27962b7-39b4-43b5-9e6f-0fe27dcf2ead"].to_csv('../../archive/test_rolling_pitch.csv')

In [98]:
data.to_csv("../preproc_data/data_with_new_features.csv")

In [99]:
data.head()

Unnamed: 0,id,game_id,inning,side,hitter_id,hitter_hand,pitcher_id,pitcher_hand,temp_f,weather_condition,humidity,wind_speed_mph,at_bat_end_time,pitch_location_zone,pitch_speed_mph,pitch_count_at_bat,pitcher_pitch_count_at_bat_start,outs_at_start,y_target,day_night,home_team_id,away_team_id,attendance,stadium_id,hitter_player_name,hitter_position,hitter_primary_position,hitter_zone_1,hitter_zone_2,hitter_zone_3,hitter_zone_4,hitter_zone_5,hitter_zone_6,hitter_zone_7,hitter_zone_8,hitter_zone_9,hitter_zone_10,hitter_zone_11,hitter_zone_12,hitter_zone_13,hitter_zone_14,pitcher_player_name,pitcher_primary_position,pitcher_zone_1,pitcher_zone_2,pitcher_zone_3,pitcher_zone_4,pitcher_zone_5,pitcher_zone_6,pitcher_zone_7,pitcher_zone_8,pitcher_zone_9,pitcher_zone_10,pitcher_zone_11,pitcher_zone_12,pitcher_zone_13,pitcher_zone_14,stadium_capacity,stadium_stadium_type,stadium_lat,stadium_lon,away_stadium_lat,away_stadium_lon,hitter_previous_stats_szn,rolling_1ab,pitcher_previous_stats_szn,rolling_1pitch,handed_matchup
20,c6204c5e-4037-40dd-a2d0-cd21fcad30b0,f3f0ae8e-cb65-4e96-9530-c6f868738f09,3,T,041632a9-afb2-4ec3-b1de-9b0bbe33ab64,L,eebc991a-23ea-4f1c-ba3b-37ff21ee1603,R,40.0,Sunny,30.0,2.0,2023-03-30 17:48:48+00:00,1.0,97.7,5.0,44.0,1.0,0,D,a09ec676-f887-43dc-bbb3-cf4bbaee9a18,a7723160-10b7-4277-a309-d8dd95a8ae65,46172.0,706e9828-6687-4ac8-a409-3fb972e8bae9,"Wade Jr., LaMonte",IF,1B,0.3125,0.227273,0.454545,0.268293,0.375,0.205882,0.342105,0.333333,0.272727,0.0,0.638298,0.416667,0.452381,0.40625,"Cole, Gerrit",SP,0.15,0.173913,0.25,0.135135,0.28,0.333333,0.15625,0.333333,0.226415,0.0,0.446154,0.365854,0.190476,0.296296,47309.0,outdoor,40.828819,-73.926569,37.77842,-122.390621,0.5,1.0,0.2,0.0,LR
23,d46669f3-7ab1-4af2-af9a-56e2599a8dee,f3f0ae8e-cb65-4e96-9530-c6f868738f09,3,T,e5bdeb0e-38fc-4d30-8127-43d0d5b2864d,L,eebc991a-23ea-4f1c-ba3b-37ff21ee1603,R,40.0,Sunny,30.0,2.0,2023-03-30 17:50:34+00:00,6.0,97.5,5.0,49.0,2.0,0,D,a09ec676-f887-43dc-bbb3-cf4bbaee9a18,a7723160-10b7-4277-a309-d8dd95a8ae65,46172.0,706e9828-6687-4ac8-a409-3fb972e8bae9,"Conforto, Michael",OF,RF,0.222222,0.269231,0.294118,0.204545,0.27907,0.272727,0.24,0.341463,0.26087,0.0,0.577778,0.391304,0.347826,0.5,"Cole, Gerrit",SP,0.15,0.173913,0.25,0.135135,0.28,0.333333,0.15625,0.333333,0.226415,0.0,0.446154,0.365854,0.190476,0.296296,47309.0,outdoor,40.828819,-73.926569,37.77842,-122.390621,0.0,0.0,0.181818,0.0,LR
28,8b5f48bd-f0fb-49c5-b3c6-d26fe601721e,f3f0ae8e-cb65-4e96-9530-c6f868738f09,3,B,d1c07b01-a86d-4c17-ac38-8217e364c2cf,R,72248e3e-3b0f-4523-9ff6-833aa33f3b32,R,40.0,Sunny,30.0,2.0,2023-03-30 17:56:55+00:00,3.0,93.2,4.0,37.0,0.0,0,D,a09ec676-f887-43dc-bbb3-cf4bbaee9a18,a7723160-10b7-4277-a309-d8dd95a8ae65,46172.0,706e9828-6687-4ac8-a409-3fb972e8bae9,"LeMahieu, David",IF,3B,0.307692,0.26087,0.5,0.292683,0.26087,0.214286,0.409091,0.285714,0.212121,0.0,0.324324,0.5,0.414634,0.288462,"Webb, Logan",SP,0.285714,0.347826,0.117647,0.289474,0.273973,0.340426,0.383333,0.304762,0.211268,0.0,0.318182,0.384615,0.233333,0.246032,47309.0,outdoor,40.828819,-73.926569,37.77842,-122.390621,0.0,0.0,0.3,1.0,RR
30,4b61e033-2a49-4ecf-94ea-2f18fdeb8f49,f3f0ae8e-cb65-4e96-9530-c6f868738f09,3,B,86f7390e-61bd-4556-8325-a6705c7f693b,R,72248e3e-3b0f-4523-9ff6-833aa33f3b32,R,40.0,Sunny,30.0,2.0,2023-03-30 17:58:42+00:00,9.0,84.7,4.0,41.0,1.0,0,D,a09ec676-f887-43dc-bbb3-cf4bbaee9a18,a7723160-10b7-4277-a309-d8dd95a8ae65,46172.0,706e9828-6687-4ac8-a409-3fb972e8bae9,"Judge, Aaron",OF,RF,0.333333,0.428571,0.333333,0.4,0.468085,0.15,0.411765,0.258065,0.290323,0.0,0.5,0.5,0.214286,0.614035,"Webb, Logan",SP,0.285714,0.347826,0.117647,0.289474,0.273973,0.340426,0.383333,0.304762,0.211268,0.0,0.318182,0.384615,0.233333,0.246032,47309.0,outdoor,40.828819,-73.926569,37.77842,-122.390621,0.5,1.0,0.272727,0.0,RR
32,4eb10c2f-5c1f-42e9-9d71-9df84cbf813b,f3f0ae8e-cb65-4e96-9530-c6f868738f09,3,B,75cb4b6c-a087-4b77-90e3-7473284fa8ad,L,72248e3e-3b0f-4523-9ff6-833aa33f3b32,R,40.0,Sunny,30.0,2.0,2023-03-30 18:01:13+00:00,3.0,91.5,6.0,45.0,2.0,0,D,a09ec676-f887-43dc-bbb3-cf4bbaee9a18,a7723160-10b7-4277-a309-d8dd95a8ae65,46172.0,706e9828-6687-4ac8-a409-3fb972e8bae9,"Rizzo, Anthony",IF,1B,0.125,0.5,0.090909,0.352941,0.314815,0.384615,0.25,0.314286,0.181818,0.0,0.34375,0.533333,0.328947,0.34,"Webb, Logan",SP,0.285714,0.347826,0.117647,0.289474,0.273973,0.340426,0.383333,0.304762,0.211268,0.0,0.318182,0.384615,0.233333,0.246032,47309.0,outdoor,40.828819,-73.926569,37.77842,-122.390621,0.5,1.0,0.25,0.0,LR
