# Preprocessing

In [1]:
# Libraries and parameters
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import KNNImputer, SimpleImputer


# Functions

In [2]:
# Load the dataset with the engineered features from the preproc_data directory.
data = pd.read_csv(
    "../preproc_data/data_with_new_features.csv"
)

In [3]:
# Inspect the distinct stadium types (missing values show up as nan).
data["stadium_stadium_type"].unique()

array(['outdoor', 'indoor', 'retractable', nan], dtype=object)

In [4]:
# Discard identifiers, coordinates, the timestamp, leftover index columns and the
# other fields that are not carried into the feature matrix.
columns_to_drop = [
    "id",
    "game_id",
    "hitter_id",
    "pitcher_id",
    "home_team_id",
    "away_team_id",
    "stadium_id",
    "stadium_lat",
    "stadium_lon",
    "away_stadium_lat",
    "away_stadium_lon",
    "at_bat_end_time",
    "Unnamed: 0.1",
    "Unnamed: 0",
    "pitcher_primary_position",
    "hitter_primary_position",
    "stadium_capacity",
]
data = data.drop(columns=columns_to_drop)

# Sanity check on the remaining (rows, columns).
data.shape

(143088, 19)

In [5]:
# Class balance of the target; dropna=False also counts missing labels, if any.
data["y_target"].value_counts(dropna=False)

y_target
0    97445
1    45643
Name: count, dtype: int64

In [6]:
# Collapse the raw weather_condition labels into the coarser weather_cat groups
# listed in the lookup file (semicolon-separated, indexed by weather_condition).
weather_lookup = pd.read_csv(
    "../raw_data/Classeur2.csv",
    sep=";",
    index_col="weather_condition",
)

# Mapping {raw weather_condition -> weather_cat}.
categories = weather_lookup["weather_cat"].to_dict()

# Labels that are absent from the mapping are left unchanged by replace().
data["weather_condition"] = data["weather_condition"].replace(categories)

In [7]:
# Separate the target column from the predictors.
y = data["y_target"]
X = data.drop(columns="y_target")

In [8]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so that re-running the notebook regenerates the same
# split (and therefore the same exported CSV files). stratify=y keeps the share of each
# target class approximately equal in the train and test sets, which matters because
# the two classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

In [9]:
# Imputer intended to handle null values (k-nearest-neighbours, k=5).
# NOTE(review): `imputer` is not referenced by any transformer visible in this
# notebook (they all use SimpleImputer) — confirm whether it is still needed.
imputer = KNNImputer(n_neighbors=5)

In [10]:
# --- Numeric features: impute missing values, then min-max scale ---------------
num_col = ["inning", "temp_f", "humidity", "wind_speed_mph", "attendance", "pitch_speed_mph",
           "pitch_count_at_bat", "pitcher_pitch_count_at_bat_start", "outs_at_start", "distance",
           "hitter_previous_stats", "pitcher_previous_stats"]

# NOTE(review): "most_frequent" fills gaps with the mode, which is unusual for
# continuous measurements such as temp_f or pitch_speed_mph; "median" may be a better
# fit. Left unchanged here so that the exported values stay the same.
num_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    MinMaxScaler()
)


# --- Categorical features encoded as a single ordinal column -------------------
bin_cat_col = ["side", "pitcher_hand", "day_night"]
bin_cat_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder()
)

# --- Categorical features expanded into one indicator column per category ------
cat_col = ["hitter_hand", "weather_condition", "stadium_stadium_type"]
cat_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    # Dense output on purpose: with a sparse encoder the ColumnTransformer returns a
    # scipy sparse matrix whenever the overall density drops below its
    # sparse_threshold, and the transformed data is wrapped in a pandas DataFrame
    # afterwards, which expects a regular array. Forcing dense output makes the
    # result type independent of the data.
    OneHotEncoder(sparse_output=False, drop="if_binary")
)

# Route each column group through its own transformer. The automatically generated
# step names (pipeline-1, pipeline-2, pipeline-3) become the prefixes of the output
# feature names.
preproc_transformer = make_column_transformer(
    (num_transformer, num_col),
    (bin_cat_transformer, bin_cat_col),
    (cat_transformer, cat_col)
)

preproc = make_pipeline(preproc_transformer)
preproc

In [13]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformation to both splits.
preproc.fit(X_train)

# Output column names are the same for both splits, so compute them once.
feature_names = preproc.get_feature_names_out()

# Keep the original row index so that features can be matched with y_train / y_test.
X_train_preproc = pd.DataFrame(
    preproc.transform(X_train),
    columns=feature_names,
    index=X_train.index,
)
X_test_preproc = pd.DataFrame(
    preproc.transform(X_test),
    columns=feature_names,
    index=X_test.index,
)

In [14]:
# Preview the preprocessed training features.
X_train_preproc

Unnamed: 0,pipeline-1__inning,pipeline-1__temp_f,pipeline-1__humidity,pipeline-1__wind_speed_mph,pipeline-1__attendance,pipeline-1__pitch_speed_mph,pipeline-1__pitch_count_at_bat,pipeline-1__pitcher_pitch_count_at_bat_start,pipeline-1__outs_at_start,pipeline-1__distance,pipeline-1__hitter_previous_stats,pipeline-1__pitcher_previous_stats,pipeline-2__side,pipeline-2__pitcher_hand,pipeline-2__day_night,pipeline-3__hitter_hand_B,pipeline-3__hitter_hand_L,pipeline-3__hitter_hand_R,pipeline-3__weather_condition_bad_weather,pipeline-3__weather_condition_good_weather,pipeline-3__weather_condition_mild_weather,pipeline-3__stadium_stadium_type_indoor,pipeline-3__stadium_stadium_type_outdoor,pipeline-3__stadium_stadium_type_retractable
65291,0.000000,0.373494,0.666667,0.163265,0.678557,0.709450,0.133333,0.000000,0.0,0.124910,0.373541,0.314815,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
46558,0.538462,0.361446,0.229167,0.326531,0.518492,0.854725,0.133333,0.000000,1.0,0.122569,0.299401,0.306818,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
86539,0.307692,0.638554,0.427083,0.163265,0.782831,0.833568,0.333333,0.700855,0.0,0.129734,0.344262,0.371875,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
100350,0.230769,0.433735,0.593750,0.061224,0.939692,0.750353,0.200000,0.094017,0.5,0.007857,0.318644,0.348189,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
20265,0.384615,0.168675,0.510417,0.020408,0.202106,0.720733,0.066667,0.000000,0.0,0.031087,0.242424,0.300000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63814,0.000000,0.626506,0.406250,0.204082,0.302133,0.696756,0.200000,0.102564,1.0,0.229175,0.347826,0.290441,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
133565,0.230769,0.530120,0.656250,0.142857,0.440835,0.729196,0.333333,0.358974,1.0,0.028660,0.423077,0.327496,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
108620,0.076923,0.590361,0.593750,0.061224,0.607451,0.698166,0.133333,0.282051,0.5,0.253342,0.288462,0.256000,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
23843,0.076923,0.313253,0.656250,0.061224,0.697058,0.605078,0.333333,0.205128,0.5,0.629534,0.189655,0.416667,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [15]:
# Persist the preprocessed features and the targets. The row index is written as the
# first column of every file (to_csv default).
export_targets = [
    (X_train_preproc, "../preproc_data/X_train_preproc.csv"),
    (X_test_preproc, "../preproc_data/X_test_preproc.csv"),
    (y_train, "../preproc_data/y_train.csv"),
    (y_test, "../preproc_data/y_test.csv"),
]

for table, destination in export_targets:
    table.to_csv(destination)