In [149]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
import optuna

from xgboost import XGBClassifier, XGBRegressor

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [94]:
data_dir = os.path.join(os.path.expanduser('~'), 'git_repos', 'TK5', 'Data', 'nwds-xstrikes')

In [95]:
os.listdir(data_dir)

['test.csv', 'sample_solution.csv', 'train.csv']

In [96]:
df_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
df_sample = pd.read_csv(os.path.join(data_dir, 'sample_solution.csv'))
df_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))

In [97]:
df_train['is_lefty'] = 0
df_train.loc[df_train['p_throws'] == 'L', 'is_lefty'] = 1
df_test['is_lefty'] = 0
df_test.loc[df_test['p_throws'] == 'L', 'is_lefty'] = 1

In [98]:
df_test.head()

Unnamed: 0,uid,sz_top,sz_bot,pitch_type,release_pos_x,release_pos_y,release_pos_z,stand,p_throws,inning,inning_topbot,outs_when_up,balls,strikes,if_fielding_alignment,of_fielding_alignment,on_3b,on_2b,on_1b,release_speed,spin_axis,release_spin_rate,pfx_x,pfx_z,plate_x,plate_z,is_lefty
0,122428,3.06,1.55,SI,-1.9,54.31,6.59,L,R,1,Top,2,1,1,Infield shift,Standard,False,False,False,93.8,202,2333,-1.1,1.05,0.99,2.19,0
1,291855,3.29,1.56,FC,2.68,53.84,5.67,R,L,4,Top,0,0,1,Standard,Strategic,False,False,False,88.5,153,2068,-0.14,0.8,0.16,3.06,1
2,225539,3.62,1.69,CH,-1.17,54.73,6.94,L,R,4,Bot,0,1,0,Standard,Strategic,False,False,False,78.5,221,1609,-0.73,0.62,-0.05,2.45,0
3,1410,3.42,1.71,FF,-1.41,54.33,5.98,R,R,6,Top,2,0,0,Standard,Standard,False,False,False,94.0,220,2265,-0.69,1.33,1.3,2.24,0
4,256048,3.14,1.42,FF,3.77,53.53,3.4,R,L,7,Bot,0,0,2,Standard,Standard,False,False,True,90.8,100,2158,1.56,0.6,-0.18,3.76,1


## Feature Engineering

In [99]:
# create Attack zones like NW
for df in [df_train, df_test]:
    df["attack_zone"] = "waste"
    df.loc[(df_train['plate_x'].between(-0.558, 0.558)) & (df['plate_z'].between(1.833, 3.166)), "attack_zone"] = "heart"
    df.loc[(df_train['plate_x'].between(-1.108, 1.108)) & (df['plate_z'].between(1.166, 3.833) & (~df['attack_zone'].isin(['heart']))), "attack_zone"] = "shadow"
    df.loc[(df_train['plate_x'].between(-1.666, 1.666)) & (df['plate_z'].between(0.5, 4.5) & (~df['attack_zone'].isin(['heart', 'shadow']))), "attack_zone"] = "chase"

In [100]:
df_train.loc[:, ['attack_zone', 'is_strike']].groupby('attack_zone').mean().sort_values(by=('is_strike'), ascending=False)

Unnamed: 0_level_0,is_strike
attack_zone,Unnamed: 1_level_1
heart,0.995639
shadow,0.480563
chase,0.004127
waste,7.8e-05


In [101]:
df_train.loc[:, ['pitch_type', 'is_strike']].groupby('pitch_type').mean().sort_values(by=('is_strike'), ascending=False)

Unnamed: 0_level_0,is_strike
pitch_type,Unnamed: 1_level_1
SI,0.4017
CU,0.366531
KC,0.353332
FF,0.342433
FC,0.325355
SL,0.312941
FA,0.26839
EP,0.25
CS,0.243243
CH,0.216823


In [102]:
# Make codes for pitch types and attack zones
for df in [df_train, df_test]:
    df['pitch_type_code'] = df['pitch_type'].astype('category').cat.codes
    df['attack_zone_code'] = df['attack_zone'].astype('category').cat.codes
    df['stand_code'] = df['stand'].astype('category').cat.codes

In [103]:
for df in [df_train, df_test]:
    df['on_base'] = 0
    df.loc[(df['on_1b'] == 1) | (df['on_2b'] == 1) | (df['on_3b'] == 1), 'on_base'] = 1

In [163]:
for df in [df_train, df_test]:
    df['inside_zone'] = 0
    df.loc[(df['plate_z'].between(df['sz_bot'], df['sz_top'])), 'inside_zone'] = 1
    #df.loc[(df['plate_x'].between(-0.558, 0.558)) & (df['plate_z'].between(1.833, 3.166)), 'inside_zone'] = 1

In [161]:
df[['sz_bot', 'sz_top', 'plate_x', 'inside_zone']].describe()

Unnamed: 0,sz_bot,sz_top,plate_x,inside_zone
count,20000.0,20000.0,20000.0,20000.0
mean,1.597673,3.377599,0.059817,0.05565
std,0.117979,0.19929,0.996508,0.22925
min,0.92,2.52,-4.44,0.0
25%,1.52,3.24,-0.7,0.0
50%,1.6,3.38,0.08,0.0
75%,1.68,3.51,0.81,0.0
max,2.02,4.18,4.13,1.0


In [164]:
df_train.loc[:, ['inside_zone', 'is_strike']].groupby('inside_zone').mean().sort_values(by=('is_strike'), ascending=False)

Unnamed: 0_level_0,is_strike
inside_zone,Unnamed: 1_level_1
1,0.546191
0,0.075914


## Base Model

In [165]:
id_feats = ['uid', 'pitch_type']
feats = [
    'release_speed', 
    'release_spin_rate', 
    'release_pos_x',
    'release_pos_y', 
    'release_pos_z', 
    'balls', 
    'strikes',
    'outs_when_up', 
    'plate_x', 
    'plate_z', 
    'is_lefty', 
#    'pitch_type_code', 
#    'attack_zone_code', 
    'inning',
    'on_base',
    'stand_code',
    'inside_zone'
    ]
target = "is_strike"

In [105]:
df_train.stand.unique()

array(['R', 'L'], dtype=object)

In [106]:
df_xgb = pd.DataFrame()
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, val_index in kf.split(df_train):
    train = df_train.iloc[train_index].copy()
    val = df_train.iloc[val_index].copy()
    
    xgb = XGBClassifier(n_estimators=1000, max_depth=3, learning_rate=0.1, random_state=42)
    xgb.fit(train[feats], train[target])
    
    dfs_pred = {}
    y_pred = xgb.predict_proba(val[feats])
    dfs_pred[target] = pd.Series(y_pred[:, 1], index=val.index)
    dfs_pred['uid'] = val['uid']
    df_pred = pd.concat(dfs_pred, axis=1)
    df_xgb = pd.concat([df_xgb, df_pred], axis=0)

In [107]:
df_train['is_strike_xgb'] = df_xgb['is_strike']

In [108]:
df_train[['uid', 'is_strike', 'is_strike_xgb']].head(20)

Unnamed: 0,uid,is_strike,is_strike_xgb
0,0,1,0.9975632
1,1,1,0.9359161
2,2,1,0.9964484
3,3,1,0.6407064
4,4,0,0.001325827
5,5,1,0.8974493
6,6,0,0.0003132289
7,7,1,0.8385496
8,8,1,0.9967873
9,9,1,0.8803086


In [109]:
log_loss(df_train['is_strike'], df_train['is_strike_xgb'])

0.15714354441509296

In [70]:
xgb = XGBClassifier(n_estimators=1000, max_depth=3, learning_rate=0.1, random_state=42)
xgb.fit(df_train[feats], df_train[target])

dfs_pred = {}
y_pred = xgb.predict_proba(df_test[feats])
dfs_pred[target] = pd.Series(y_pred[:, 1], index=df_test.index)
dfs_pred['uid'] = df_test['uid']
df_pred = pd.concat(dfs_pred, axis=1)

In [76]:
df_pred[['uid', 'is_strike']].to_csv('preds/pred_xgb.csv', index=False)

In [110]:
def tune(trial):
    train_x, test_x, train_y, test_y = train_test_split(df_train[feats], df_train[target], test_size=0.2, random_state=42)

    params = {
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        }
    
    model = XGBClassifier(**params)
    model.fit(train_x, train_y, verbose=False)

    preds = model.predict(test_x)
    loss = log_loss(test_y, preds)
    return loss

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(tune, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [103]:
params = study.best_trial.params

model = XGBClassifier(**params)
model.fit(df_train[feats], df_train[target], verbose=False)

preds = model.predict(df_test[feats])
dfs_pred = {}
dfs_pred['uid'] = df_test['uid']
dfs_pred[target] = pd.Series(preds, index=df_test.index)
df_pred = pd.concat(dfs_pred, axis=1)

In [104]:
df_pred[['uid', 'is_strike']].to_csv('preds/pred_xgb_optuna.csv', index=False)

## Catboost go Brrrrr

In [166]:
def tune_catboost(trial):
    train_x, test_x, train_y, test_y = train_test_split(df_train[feats], df_train[target], test_size=0.2, random_state=42)

    params = {
        "iterations" : trial.suggest_int("iterations", 100, 1000),
        "learning_rate" : trial.suggest_float("learning_rate", 1e-3, 1.0),
        "random_seed" : 42,
        'logging_level': 'Silent',
    }
    
    model = CatBoostRegressor(**params)
    model.fit(train_x, train_y, verbose=False)

    preds = model.predict(test_x)
    loss = log_loss(test_y, preds)
    return loss

In [168]:
study_cat = optuna.create_study(direction='minimize')
study_cat.optimize(tune_catboost, n_trials=100)
print('Number of finished trials:', len(study_cat.trials))
print('Best trial:', study_cat.best_trial.params)

[32m[I 2023-03-07 15:07:23,737][0m A new study created in memory with name: no-name-4583e9ce-c6a9-45c0-af05-37277658224a[0m
[32m[I 2023-03-07 15:07:36,097][0m Trial 0 finished with value: 0.21137177080083708 and parameters: {'iterations': 897, 'learning_rate': 0.6281540075153639}. Best is trial 0 with value: 0.21137177080083708.[0m
[32m[I 2023-03-07 15:07:49,298][0m Trial 1 finished with value: 0.22061965504032782 and parameters: {'iterations': 988, 'learning_rate': 0.5628502794210359}. Best is trial 0 with value: 0.21137177080083708.[0m
[32m[I 2023-03-07 15:07:52,938][0m Trial 2 finished with value: 0.1731221078447244 and parameters: {'iterations': 276, 'learning_rate': 0.308325400032247}. Best is trial 2 with value: 0.1731221078447244.[0m
[32m[I 2023-03-07 15:07:54,710][0m Trial 3 finished with value: 0.16913338712272305 and parameters: {'iterations': 132, 'learning_rate': 0.1088853206305721}. Best is trial 3 with value: 0.16913338712272305.[0m
[32m[I 2023-03-07 15:07

Number of finished trials: 100
Best trial: {'iterations': 976, 'learning_rate': 0.03902668232964486}


In [169]:
params = study_cat.best_trial.params
model = CatBoostRegressor(**params)
model.fit(df_train[feats], df_train[target], verbose=False)

preds = model.predict(df_test[feats])
dfs_pred = {}
dfs_pred['uid'] = df_test['uid']
dfs_pred[target] = pd.Series(preds, index=df_test.index)
df_pred = pd.concat(dfs_pred, axis=1)

In [170]:
df_pred[['uid', 'is_strike']].to_csv('preds/pred_cat_optuna_5.csv', index=False)

In [171]:
df_inspect = df_test.merge(df_pred, on='uid', how='left')

In [172]:
df_inspect.loc[:, ['attack_zone', 'is_strike']].groupby('attack_zone').mean().sort_values(by=('is_strike'), ascending=False)

Unnamed: 0_level_0,is_strike
attack_zone,Unnamed: 1_level_1
heart,0.563257
shadow,0.392187
chase,0.206504
waste,0.185106
