# Preprocessing

In [275]:
# Libraries and prameters
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import KNNImputer, SimpleImputer


# Functions

In [276]:
# Import data
data = pd.read_csv("../preproc_data/data_with_new_features.csv")

In [277]:
data.stadium_stadium_type.unique()

array(['outdoor', 'indoor', 'retractable', nan], dtype=object)

In [278]:
# Remove last columns
data = data.drop(columns=["id", "game_id", "hitter_id", "pitcher_id", "home_team_id", "away_team_id",
                   "stadium_id", "stadium_lat", "stadium_lon", "away_stadium_lat", "away_stadium_lon",
                   "at_bat_end_time", "Unnamed: 0.1", "Unnamed: 0", "pitcher_primary_position",
                   "hitter_primary_position", "stadium_capacity"])
data.shape

(143088, 19)

In [279]:
data.y_target.value_counts(dropna=False)

y_target
0    97445
1    45643
Name: count, dtype: int64

In [280]:
# Convert weather_condition to 3 categories
categories = pd.read_csv("../raw_data/Classeur2.csv",sep=";", index_col="weather_condition")
categories = categories.to_dict()["weather_cat"]

data.weather_condition = data.weather_condition.replace(categories)

In [281]:
X = data.drop(columns=["y_target"])
y = data.y_target

In [282]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [283]:
#Adding imputer to handle null values
imputer = KNNImputer(n_neighbors=5)

In [284]:
num_col = ["inning", "temp_f", "humidity", "wind_speed_mph", "attendance", "pitch_speed_mph",
           "pitch_count_at_bat", "pitcher_pitch_count_at_bat_start", "outs_at_start", "distance",
           "hitter_previous_stats", "pitcher_previous_stats"]

num_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    MinMaxScaler()
)


bin_cat_col = ["side", "pitcher_hand", "day_night"]
bin_cat_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder()
)

cat_col = ["hitter_hand", "weather_condition", "stadium_stadium_type"]
cat_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(sparse_output=True, drop="if_binary")
)

preproc_transformer = make_column_transformer(
    (num_transformer, num_col),
    (bin_cat_transformer, bin_cat_col),
    (cat_transformer, cat_col)
)

preproc = make_pipeline(preproc_transformer)
preproc

In [285]:
preproc.fit(X_train)

X_train_preproc = preproc.transform(X_train)
X_train_preproc = pd.DataFrame(X_train_preproc, columns=preproc.get_feature_names_out())

X_test_preproc = preproc.transform(X_test)
X_test_preproc = pd.DataFrame(X_test_preproc, columns=preproc.get_feature_names_out())

In [286]:
X_train_preproc

Unnamed: 0,pipeline-1__inning,pipeline-1__temp_f,pipeline-1__humidity,pipeline-1__wind_speed_mph,pipeline-1__attendance,pipeline-1__pitch_speed_mph,pipeline-1__pitch_count_at_bat,pipeline-1__pitcher_pitch_count_at_bat_start,pipeline-1__outs_at_start,pipeline-1__distance,pipeline-1__hitter_previous_stats,pipeline-1__pitcher_previous_stats,pipeline-2__side,pipeline-2__pitcher_hand,pipeline-2__day_night,pipeline-3__hitter_hand_B,pipeline-3__hitter_hand_L,pipeline-3__hitter_hand_R,pipeline-3__weather_condition_bad_weather,pipeline-3__weather_condition_good_weather,pipeline-3__weather_condition_mild_weather,pipeline-3__stadium_stadium_type_indoor,pipeline-3__stadium_stadium_type_outdoor,pipeline-3__stadium_stadium_type_retractable
0,0.230769,0.385542,0.291667,0.367347,0.182651,0.814346,0.3125,0.119658,0.5,0.241557,0.320988,0.571429,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.461538,0.433735,0.437500,0.448980,0.742194,0.791842,0.1875,0.085470,0.5,0.488926,0.323529,0.388889,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.230769,0.530120,0.302083,0.265306,0.786592,0.703235,0.1875,0.452991,0.0,0.000000,0.280000,0.297414,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.230769,0.746988,0.250000,0.224490,0.341996,0.846695,0.1875,0.470085,1.0,0.316375,0.328947,0.270802,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.615385,0.433735,0.291667,0.102041,0.642347,0.845288,0.1250,0.042735,0.5,0.598164,0.292683,0.235294,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114465,0.076923,0.409639,0.666667,0.020408,0.455575,0.699015,0.3750,0.128205,0.5,0.944609,0.267857,0.306452,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
114466,0.076923,0.650602,0.083333,0.020408,0.425520,0.774965,0.3125,0.162393,1.0,0.212874,0.257310,0.342466,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
114467,0.384615,0.445783,0.447917,0.285714,0.744605,0.720113,0.3125,0.068376,0.0,0.207905,0.329480,0.292135,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
114468,0.538462,0.457831,0.593750,0.204082,0.213768,0.824191,0.2500,0.837607,0.5,0.551768,0.281046,0.318966,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [287]:
X_train_preproc.to_csv("../preproc_data/X_train_preproc.csv")
X_test_preproc.to_csv("../preproc_data/X_test_preproc.csv")

y_train.to_csv("../preproc_data/y_train.csv")
y_test.to_csv("../preproc_data/y_test.csv")