# Preprocessing

In [42]:
# Libraries and parameters
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import KNNImputer, SimpleImputer


# Functions

In [43]:
# Load the feature table produced upstream; the first CSV column is used as the row index
data = pd.read_csv(
    "../preproc_data/data_with_new_features_prev_stats.csv",
    index_col=0,
)
data.iloc[:2]

Unnamed: 0,id,game_id,inning,side,hitter_id,hitter_hand,pitcher_id,pitcher_hand,temp_f,weather_condition,humidity,wind_speed_mph,at_bat_end_time,pitch_location_zone,pitch_speed_mph,pitch_count_at_bat,pitcher_pitch_count_at_bat_start,outs_at_start,y_target,day_night,home_team_id,away_team_id,attendance,stadium_id,hitter_player_name,hitter_position,hitter_primary_position,pitcher_player_name,pitcher_primary_position,stadium_capacity,stadium_stadium_type,stadium_lat,stadium_lon,away_stadium_lat,away_stadium_lon,distance,hitter_previous_stats_szn,rolling_1ab,rolling_3ab,rolling_5ab,rolling_10ab,pitcher_previous_stats_szn,rolling_1pitch,rolling_3pitch,rolling_5pitch,rolling_10pitch
20,c6204c5e-4037-40dd-a2d0-cd21fcad30b0,f3f0ae8e-cb65-4e96-9530-c6f868738f09,3,T,041632a9-afb2-4ec3-b1de-9b0bbe33ab64,L,eebc991a-23ea-4f1c-ba3b-37ff21ee1603,R,40.0,Sunny,30.0,2.0,2023-03-30 17:48:48+00:00,1.0,97.7,5.0,44.0,1.0,0,D,a09ec676-f887-43dc-bbb3-cf4bbaee9a18,a7723160-10b7-4277-a309-d8dd95a8ae65,46172.0,706e9828-6687-4ac8-a409-3fb972e8bae9,"Wade Jr., LaMonte",IF,1B,"Cole, Gerrit",SP,47309.0,outdoor,40.828819,-73.926569,37.77842,-122.390621,4130.533296,0.5,1.0,1.0,1.0,1.0,0.2,0.0,0.0,0.2,0.222222
23,d46669f3-7ab1-4af2-af9a-56e2599a8dee,f3f0ae8e-cb65-4e96-9530-c6f868738f09,3,T,e5bdeb0e-38fc-4d30-8127-43d0d5b2864d,L,eebc991a-23ea-4f1c-ba3b-37ff21ee1603,R,40.0,Sunny,30.0,2.0,2023-03-30 17:50:34+00:00,6.0,97.5,5.0,49.0,2.0,0,D,a09ec676-f887-43dc-bbb3-cf4bbaee9a18,a7723160-10b7-4277-a309-d8dd95a8ae65,46172.0,706e9828-6687-4ac8-a409-3fb972e8bae9,"Conforto, Michael",OF,RF,"Cole, Gerrit",SP,47309.0,outdoor,40.828819,-73.926569,37.77842,-122.390621,4130.533296,0.0,0.0,0.0,0.0,0.0,0.181818,0.0,0.0,0.2,0.2


In [44]:
# Drop identifier, coordinate, timestamp, player-name and primary-position columns
unused_columns = [
    # identifiers
    "id", "game_id", "hitter_id", "pitcher_id", "home_team_id", "away_team_id", "stadium_id",
    # raw stadium coordinates
    "stadium_lat", "stadium_lon", "away_stadium_lat", "away_stadium_lon",
    # timestamp, names and primary positions
    "at_bat_end_time", "pitcher_primary_position", "hitter_player_name", "pitcher_player_name",
    "hitter_primary_position",
]
data = data.drop(columns=unused_columns)
data.iloc[:2]

Unnamed: 0,inning,side,hitter_hand,pitcher_hand,temp_f,weather_condition,humidity,wind_speed_mph,pitch_location_zone,pitch_speed_mph,pitch_count_at_bat,pitcher_pitch_count_at_bat_start,outs_at_start,y_target,day_night,attendance,hitter_position,stadium_capacity,stadium_stadium_type,distance,hitter_previous_stats_szn,rolling_1ab,rolling_3ab,rolling_5ab,rolling_10ab,pitcher_previous_stats_szn,rolling_1pitch,rolling_3pitch,rolling_5pitch,rolling_10pitch
20,3,T,L,R,40.0,Sunny,30.0,2.0,1.0,97.7,5.0,44.0,1.0,0,D,46172.0,IF,47309.0,outdoor,4130.533296,0.5,1.0,1.0,1.0,1.0,0.2,0.0,0.0,0.2,0.222222
23,3,T,L,R,40.0,Sunny,30.0,2.0,6.0,97.5,5.0,49.0,2.0,0,D,46172.0,OF,47309.0,outdoor,4130.533296,0.0,0.0,0.0,0.0,0.0,0.181818,0.0,0.0,0.2,0.2


In [45]:
# Express attendance as a fraction of stadium capacity, then drop the raw capacity column.
# Column-wise division is vectorized; the previous row-wise `apply(axis=1)` ran a Python
# lambda once per row for the same arithmetic.
data["attendance"] = data["attendance"] / data["stadium_capacity"]
data = data.drop(columns=["stadium_capacity"])
data["attendance"]

20        0.975967
23        0.975967
28        0.975967
30        0.975967
32        0.975967
            ...   
142302    0.392957
142303    0.392957
142304    0.392957
142305    0.392957
142306    0.392957
Name: attendance, Length: 140931, dtype: float64

In [46]:
# Class balance of the target, with missing values counted as their own bucket
data["y_target"].value_counts(dropna=False)

y_target
0    96255
1    44676
Name: count, dtype: int64

In [47]:
# Collapse the raw weather labels into the coarse categories listed in the lookup file
# (its `weather_cat` column, keyed by `weather_condition`)
weather_lookup = pd.read_csv(
    "../raw_data/Classeur2.csv",
    sep=";",
    index_col="weather_condition",
)
categories = weather_lookup["weather_cat"].to_dict()

data["weather_condition"] = data["weather_condition"].replace(categories)
data["weather_condition"].value_counts(dropna=False)

weather_condition
good_weather    129998
mild_weather      9123
bad_weather       1810
Name: count, dtype: int64

In [48]:
# Separate the target column from the feature matrix
y = data["y_target"]
X = data.drop(columns="y_target")

In [49]:
# Hold out 30% of the rows for testing.
# A fixed seed makes the split (and the CSV files exported at the end of the notebook)
# identical across kernel restarts; without it every run produces different train/test sets.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=SEED
)

In [50]:
# Quick look at the first two rows of the cleaned table
data.iloc[:2]

Unnamed: 0,inning,side,hitter_hand,pitcher_hand,temp_f,weather_condition,humidity,wind_speed_mph,pitch_location_zone,pitch_speed_mph,pitch_count_at_bat,pitcher_pitch_count_at_bat_start,outs_at_start,y_target,day_night,attendance,hitter_position,stadium_stadium_type,distance,hitter_previous_stats_szn,rolling_1ab,rolling_3ab,rolling_5ab,rolling_10ab,pitcher_previous_stats_szn,rolling_1pitch,rolling_3pitch,rolling_5pitch,rolling_10pitch
20,3,T,L,R,40.0,good_weather,30.0,2.0,1.0,97.7,5.0,44.0,1.0,0,D,0.975967,IF,outdoor,4130.533296,0.5,1.0,1.0,1.0,1.0,0.2,0.0,0.0,0.2,0.222222
23,3,T,L,R,40.0,good_weather,30.0,2.0,6.0,97.5,5.0,49.0,2.0,0,D,0.975967,OF,outdoor,4130.533296,0.0,0.0,0.0,0.0,0.0,0.181818,0.0,0.0,0.2,0.2


In [51]:
# Frequency of each hitter position
data["hitter_position"].value_counts()

hitter_position
IF    67595
OF    49185
C     16668
DH     6954
P       529
Name: count, dtype: int64

In [52]:
# --- Numeric features: every numeric-dtype column is imputed, then min-max scaled ---
num_col = make_column_selector(dtype_include=np.number)

# NOTE(review): mode imputation is applied to continuous columns too; confirm this is
# intentional (median / KNNImputer are the usual choices) before changing it.
num_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    MinMaxScaler()
)

# --- Two-level categoricals: imputed with the mode, then encoded as a single ordinal column ---
bin_cat_col = ["side", "pitcher_hand", "day_night"]
bin_cat_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder()
)

# --- Multi-level categoricals: imputed with the mode, then one-hot encoded ---
cat_col = ["hitter_hand", "weather_condition", "stadium_stadium_type", "hitter_position",]
cat_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(
        # Dense output: the next cell wraps the transformed array in a pd.DataFrame,
        # which needs a regular ndarray rather than a scipy sparse matrix.
        sparse_output=False,
        drop="if_binary",
        # A category absent from the training split must not make transform() raise
        # on the test split; it is encoded as an all-zero row for that feature instead.
        handle_unknown="ignore",
    )
)

# Route each column group to its transformer; columns not listed are dropped
# (ColumnTransformer's default `remainder`).
preproc_transformer = make_column_transformer(
    (num_transformer, num_col),
    (bin_cat_transformer, bin_cat_col),
    (cat_transformer, cat_col)
)

preproc = make_pipeline(preproc_transformer)
preproc

In [53]:
# Fit on the training split only, then apply the fitted pipeline to both splits
preproc.fit(X_train)
feature_names = preproc.get_feature_names_out()

train_array = preproc.transform(X_train)
X_train_preproc = pd.DataFrame(train_array, index=X_train.index, columns=feature_names)

test_array = preproc.transform(X_test)
X_test_preproc = pd.DataFrame(test_array, index=X_test.index, columns=feature_names)

In [54]:
# Display the preprocessed training features (column names carry the pipeline-step prefix)
X_train_preproc

Unnamed: 0,pipeline-1__inning,pipeline-1__temp_f,pipeline-1__humidity,pipeline-1__wind_speed_mph,pipeline-1__pitch_location_zone,pipeline-1__pitch_speed_mph,pipeline-1__pitch_count_at_bat,pipeline-1__pitcher_pitch_count_at_bat_start,pipeline-1__outs_at_start,pipeline-1__attendance,pipeline-1__distance,pipeline-1__hitter_previous_stats_szn,pipeline-1__rolling_1ab,pipeline-1__rolling_3ab,pipeline-1__rolling_5ab,pipeline-1__rolling_10ab,pipeline-1__pitcher_previous_stats_szn,pipeline-1__rolling_1pitch,pipeline-1__rolling_3pitch,pipeline-1__rolling_5pitch,pipeline-1__rolling_10pitch,pipeline-2__side,pipeline-2__pitcher_hand,pipeline-2__day_night,pipeline-3__hitter_hand_B,pipeline-3__hitter_hand_L,pipeline-3__hitter_hand_R,pipeline-3__weather_condition_bad_weather,pipeline-3__weather_condition_good_weather,pipeline-3__weather_condition_mild_weather,pipeline-3__stadium_stadium_type_indoor,pipeline-3__stadium_stadium_type_outdoor,pipeline-3__stadium_stadium_type_retractable,pipeline-3__hitter_position_C,pipeline-3__hitter_position_DH,pipeline-3__hitter_position_IF,pipeline-3__hitter_position_OF,pipeline-3__hitter_position_P
138006,0.153846,0.493976,0.500000,0.020408,0.307692,0.945637,0.266667,0.295652,0.5,0.640282,0.854632,0.404059,0.0,0.333333,0.4,0.400000,0.293495,0.0,0.000000,0.20,0.40,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
10981,0.076923,0.433735,0.177083,0.163265,0.615385,0.646638,0.266667,0.147826,0.5,0.503817,0.632884,0.333333,0.0,0.666667,0.4,0.500000,0.419355,0.0,0.000000,0.20,0.30,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4130,0.384615,0.554217,0.729167,0.163265,1.000000,0.804006,0.400000,0.565217,0.0,0.185007,0.553150,0.600000,1.0,0.333333,0.6,0.555556,0.388889,0.0,0.000000,0.00,0.30,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
77717,0.307692,0.385542,0.593750,0.244898,0.923077,0.758226,0.066667,0.660870,0.5,0.706145,0.220447,0.349398,0.0,0.000000,0.0,0.200000,0.261261,0.0,0.333333,0.40,0.20,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
14096,0.538462,0.566265,0.635417,0.306122,0.461538,0.682403,0.133333,0.260870,1.0,0.658795,0.437614,0.269231,0.0,0.333333,0.2,0.300000,0.320755,0.0,0.333333,0.20,0.20,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54352,0.230769,0.578313,0.281250,0.142857,0.923077,0.859800,0.400000,0.539130,0.5,0.272371,0.232580,0.406250,0.0,0.000000,0.0,0.300000,0.245690,1.0,1.000000,0.60,0.30,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
111930,0.000000,0.554217,0.500000,0.122449,0.923077,0.686695,0.133333,0.034783,0.5,0.719472,0.095205,0.356902,1.0,0.666667,0.8,0.500000,0.328358,0.0,0.333333,0.40,0.30,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
133830,0.076923,0.542169,0.500000,0.142857,0.846154,0.854077,0.266667,0.330435,0.5,0.608824,0.633196,0.222222,0.0,0.333333,0.2,0.100000,0.298872,1.0,0.333333,0.60,0.50,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
76284,0.307692,0.650602,0.572917,0.061224,0.923077,0.580830,0.466667,0.434783,0.5,0.494513,0.438982,0.336245,0.0,0.333333,0.4,0.400000,0.339623,0.0,0.000000,0.00,0.10,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [55]:
# Persist the preprocessed features and the matching targets for the modelling notebooks
for frame, path in (
    (X_train_preproc, "../preproc_data/X_train_preproc.csv"),
    (X_test_preproc, "../preproc_data/X_test_preproc.csv"),
    (y_train, "../preproc_data/y_train.csv"),
    (y_test, "../preproc_data/y_test.csv"),
):
    frame.to_csv(path)