In [89]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
import optuna

from xgboost import XGBRegressor, XGBClassifier

import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
data_dir = os.path.join(os.path.expanduser('~'), 'git_repos', 'TK5', 'Data', 'nwds-xstrikes')

In [3]:
os.listdir(data_dir)

['test.csv', 'sample_solution.csv', 'train.csv']

In [4]:
df_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
df_sample = pd.read_csv(os.path.join(data_dir, 'sample_solution.csv'))
df_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))

In [5]:
df_train['is_lefty'] = 0
df_train.loc[df_train['p_throws'] == 'L', 'is_lefty'] = 1
df_test['is_lefty'] = 0
df_test.loc[df_test['p_throws'] == 'L', 'is_lefty'] = 1

In [6]:
df_test.head()

Unnamed: 0,uid,sz_top,sz_bot,pitch_type,release_pos_x,release_pos_y,release_pos_z,stand,p_throws,inning,inning_topbot,outs_when_up,balls,strikes,if_fielding_alignment,of_fielding_alignment,on_3b,on_2b,on_1b,release_speed,spin_axis,release_spin_rate,pfx_x,pfx_z,plate_x,plate_z,is_lefty
0,122428,3.06,1.55,SI,-1.9,54.31,6.59,L,R,1,Top,2,1,1,Infield shift,Standard,False,False,False,93.8,202,2333,-1.1,1.05,0.99,2.19,0
1,291855,3.29,1.56,FC,2.68,53.84,5.67,R,L,4,Top,0,0,1,Standard,Strategic,False,False,False,88.5,153,2068,-0.14,0.8,0.16,3.06,1
2,225539,3.62,1.69,CH,-1.17,54.73,6.94,L,R,4,Bot,0,1,0,Standard,Strategic,False,False,False,78.5,221,1609,-0.73,0.62,-0.05,2.45,0
3,1410,3.42,1.71,FF,-1.41,54.33,5.98,R,R,6,Top,2,0,0,Standard,Standard,False,False,False,94.0,220,2265,-0.69,1.33,1.3,2.24,0
4,256048,3.14,1.42,FF,3.77,53.53,3.4,R,L,7,Bot,0,0,2,Standard,Standard,False,False,True,90.8,100,2158,1.56,0.6,-0.18,3.76,1


## Base Model

In [7]:
df_train.columns

Index(['uid', 'sz_top', 'sz_bot', 'pitch_type', 'release_pos_x',
       'release_pos_y', 'release_pos_z', 'stand', 'p_throws', 'inning',
       'inning_topbot', 'outs_when_up', 'balls', 'strikes',
       'if_fielding_alignment', 'of_fielding_alignment', 'on_3b', 'on_2b',
       'on_1b', 'release_speed', 'spin_axis', 'release_spin_rate', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'is_strike', 'is_lefty'],
      dtype='object')

In [110]:
id_feats = ['uid', 'pitch_type']
feats = ['release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z', 'release_pos_x',
       'release_pos_y', 'release_pos_z', 'balls', 'strikes', 'plate_x', 'plate_z', 'is_lefty']
target = "is_strike"

In [59]:
df_xgb = pd.DataFrame()
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, val_index in kf.split(df_train):
    train = df_train.iloc[train_index].copy()
    val = df_train.iloc[val_index].copy()
    
    xgb = XGBClassifier(n_estimators=1000, max_depth=3, learning_rate=0.1, random_state=42)
    xgb.fit(train[feats], train[target])
    
    dfs_pred = {}
    y_pred = xgb.predict_proba(val[feats])
    dfs_pred[target] = pd.Series(y_pred[:, 1], index=val.index)
    dfs_pred['uid'] = val['uid']
    df_pred = pd.concat(dfs_pred, axis=1)
    df_xgb = pd.concat([df_xgb, df_pred], axis=0)

In [60]:
df_xgb.head(20)

Unnamed: 0,is_strike,uid
0,0.386028,0
6,0.281567,6
11,0.379605,11
12,0.177834,13
16,0.371208,17
22,0.300899,23
24,0.31273,25
26,0.318394,27
30,0.346544,31
33,0.310577,34


In [61]:
df_train['is_strike_xgb'] = df_xgb['is_strike']

In [64]:
df_train[['uid', 'is_strike', 'is_strike_xgb']].head(20)

Unnamed: 0,uid,is_strike,is_strike_xgb
0,0,1,0.386028
1,1,1,0.364724
2,2,1,0.347803
3,3,1,0.35942
4,4,0,0.331893
5,5,1,0.288775
6,6,0,0.281567
7,7,1,0.30526
8,8,1,0.281813
9,9,1,0.307256


In [65]:
log_loss(df_train['is_strike'], df_train['is_strike_xgb'])

0.629669937669223

In [70]:
xgb = XGBClassifier(n_estimators=1000, max_depth=3, learning_rate=0.1, random_state=42)
xgb.fit(df_train[feats], df_train[target])

dfs_pred = {}
y_pred = xgb.predict_proba(df_test[feats])
dfs_pred[target] = pd.Series(y_pred[:, 1], index=df_test.index)
dfs_pred['uid'] = df_test['uid']
df_pred = pd.concat(dfs_pred, axis=1)

In [76]:
df_pred[['uid', 'is_strike']].to_csv('preds/pred_xgb.csv', index=False)

In [101]:
def tune(trial):
    train_x, test_x, train_y, test_y = train_test_split(df_train[feats], df_train[target], test_size=0.2, random_state=42)

    params = {
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        }
    
    model = XGBRegressor(**params)
    model.fit(train_x, train_y, verbose=False)

    preds = model.predict(test_x)
    loss = log_loss(test_y, preds)
    return loss

In [123]:
study = optuna.create_study(direction='minimize')
study.optimize(tune, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2023-03-04 17:06:16,540][0m A new study created in memory with name: no-name-10f6609e-37dd-451d-943a-0fe84d5b864d[0m
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
[33m[W 2023-03-04 17:06:22,804][0m Trial 0 failed with parameters: {'lambda': 3.8995304475556787, 'alpha': 2.643963943231714, 'colsample_bytree': 0.8, 'subsample': 0.8, 'learning_rate': 0.1, 'min_child_weight': 192} because of the following error: The value nan is not acceptable..[0m
[33m[W 2023-03-04 17:06:22,805][0m Trial 0 failed with value nan.[0m
[32m[I 2023-03-04 17:06:27,750][0m Trial 1 finished with value: 0.5215944081419538 and parameters: {'lambda': 1.6182442814090163, 'alpha': 7.433857773790648, 'colsample_bytree': 0.3, 'subsample': 0.9, 'learning_rate': 0.01, 'min_child_weight': 179}. Best is trial 1 with value: 0.5215944081419538.[0m
[33m[W 2023-03-04 17:06:30,493][0m Trial 2 failed with parameters: {'lambda': 1.9308103

KeyboardInterrupt: 

In [103]:
params = study.best_trial.params

model = XGBRegressor(**params)
model.fit(df_train[feats], df_train[target], verbose=False)

preds = model.predict(df_test[feats])
dfs_pred = {}
dfs_pred['uid'] = df_test['uid']
dfs_pred[target] = pd.Series(preds, index=df_test.index)
df_pred = pd.concat(dfs_pred, axis=1)

In [104]:
df_pred[['uid', 'is_strike']].to_csv('preds/pred_xgb_optuna.csv', index=False)

## Catboost go Brrrrr

In [134]:
def tune_catboost(trial):
    train_x, test_x, train_y, test_y = train_test_split(df_train[feats], df_train[target], test_size=0.2, random_state=42)

    params = {
        "iterations" : trial.suggest_int("iterations", 100, 1000),
        "learning_rate" : trial.suggest_float("learning_rate", 1e-3, 1.0),
        "random_seed" : 42,
        'logging_level': 'Silent',
    }
    
    model = CatBoostRegressor(**params)
    model.fit(train_x, train_y, verbose=False)

    preds = model.predict(test_x)
    loss = log_loss(test_y, preds)
    return loss

In [139]:
study_cat = optuna.create_study(direction='minimize')
study_cat.optimize(tune_catboost, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2023-03-04 17:23:56,286][0m A new study created in memory with name: no-name-a4ee9c39-fa43-4308-823b-40375f6a175b[0m
[32m[I 2023-03-04 17:24:14,914][0m Trial 0 finished with value: 0.24186796341868794 and parameters: {'iterations': 972, 'learning_rate': 0.7388031219020379}. Best is trial 0 with value: 0.24186796341868794.[0m
[32m[I 2023-03-04 17:24:21,348][0m Trial 1 finished with value: 0.1789001466517139 and parameters: {'iterations': 313, 'learning_rate': 0.3391204100579025}. Best is trial 1 with value: 0.1789001466517139.[0m
[32m[I 2023-03-04 17:24:35,073][0m Trial 2 finished with value: 0.18664743156901764 and parameters: {'iterations': 972, 'learning_rate': 0.2431452716258222}. Best is trial 1 with value: 0.1789001466517139.[0m
[32m[I 2023-03-04 17:24:39,586][0m Trial 3 finished with value: 0.22799931096859952 and parameters: {'iterations': 301, 'learning_rate': 0.8995287618555415}. Best is trial 1 with value: 0.1789001466517139.[0m
[32m[I 2023-03-04 17:24:

Number of finished trials: 3
Best trial: {'lambda': 1.6182442814090163, 'alpha': 7.433857773790648, 'colsample_bytree': 0.3, 'subsample': 0.9, 'learning_rate': 0.01, 'min_child_weight': 179}


In [140]:
params = study_cat.best_trial.params
model = CatBoostRegressor(**params)
model.fit(df_train[feats], df_train[target], verbose=False)

preds = model.predict(df_test[feats])
dfs_pred = {}
dfs_pred['uid'] = df_test['uid']
dfs_pred[target] = pd.Series(preds, index=df_test.index)
df_pred = pd.concat(dfs_pred, axis=1)

In [141]:
df_pred[['uid', 'is_strike']].to_csv('preds/pred_cat_optuna.csv', index=False)