In [1]:
import pandas as pd
import numpy as np
import sys

import xgboost as xgb
sys.path.append('../code/')

from preprocessing import *
from config import *
from utils import *

# Load data

In [175]:
regular = pd.read_csv('../dataset/Regular_Season_Batter.csv', index_col=0)
submit = pd.read_csv('../dataset/submission.csv', index_col=0)
print('regular shape: ',regular.shape)
print('submit shape: ',submit.shape)

regular shape:  (2454, 28)
submit shape:  (220, 1)


In [176]:
regular.loc[:,'team'] = regular.team.apply(lambda x: '넥센' if x == '히어로즈' else x)
regular.team.unique()

array(['LG', '한화', 'NC', 'OB', '삼성', '롯데', 'KT', '현대', '우리', '넥센', '두산',
       'SK', 'KIA', '해태', '쌍방울'], dtype=object)

In [177]:
regular = regular[regular.year > 2001]

In [178]:
regular.shape

(2359, 28)

In [179]:
team_move = pd.read_csv('../dataset/batter_team_move.csv')
team_move.BATTER = team_move.BATTER.apply(lambda x: x.split('(')[0] if '(' in x else x)
team_move.DATE = team_move.DATE.apply(lambda x: x.replace('-','')).astype(int)
print('team_move: ',team_move.shape)

team_move:  (725, 5)


# 사전에 필요한 정보

In [180]:
pre_info = ['batter_name','year','team','height/weight','year_born','position','career','starting_salary']

# regular에서 2019 출전 선수들의 최근 정보 가져오기

In [181]:
names2019 = submit.batter_name.unique()
regular_names = regular.batter_name.unique()

In [182]:
set(names2019) - set(regular_names)

set()

In [183]:
regular2019 = regular[regular.batter_name.isin(names2019)]
regular2019.shape

(1532, 28)

In [184]:
regular2019 = regular2019.groupby('batter_name')['year'].max().reset_index()
print('regular2019_recent: ',regular2019.shape)
print('가장 최근 기록이 몇년도일까?: ',regular2019.year.unique())

regular2019_recent:  (220, 2)
가장 최근 기록이 몇년도일까?:  [2018]


In [185]:
submit.loc[:,'year'] = 2019
regular2019_recent = pd.merge(regular2019, regular[pre_info], on=['batter_name','year'], how='inner')
regular2019_recent = pd.merge(submit, regular2019_recent.drop('year',axis=1), on='batter_name', how='inner')

In [186]:
team171819 = team_move[team_move.BATTER.isin(names2019)]
team171819 = team171819.sort_values(by='DATE', ascending=False)
team171819_recent = team171819.groupby('BATTER')['DATE'].max().reset_index()
team171819 = pd.merge(team171819_recent, team171819,  on=['BATTER','DATE'], how='inner')
team171819 = team171819.iloc[team171819[['BATTER','DATE']].drop_duplicates(keep='first').index]                                                    
print('team171819 shape: ',team171819.shape)

team171819 shape:  (99, 5)


In [187]:
def moving_team(name, old_team):
    news_ = team171819.BATTER.unique()
    if name in news_:
        new_team = team171819[team171819.BATTER==name].TEAM.iloc[0]
        if new_team != old_team:
            if new_team == '키움':
                return '넥센'
            return new_team
        else:
            return old_team
    else:
        return old_team

In [188]:
regular2019_recent.loc[:,'team'] = regular2019_recent.apply(lambda x: moving_team(x.batter_name, x.team), axis=1)

# Concate 2019 data

In [189]:
add_col = ['avg', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'TB', 'RBI', 'SB', 'CS', 'BB', 'HBP', 'SO', 'GDP', 'SLG', 'OBP', 'E','OPS']
for c in add_col:
    regular2019_recent.loc[:,c] = np.nan

In [190]:
new_regular = pd.concat([regular, regular2019_recent[regular.columns]], axis=0)

In [191]:
new_regular.shape

(2579, 28)

# preprocessing

In [192]:
con = Config()

In [193]:
cat_features = con.cat_features
lag_features = con.lag_features
total_features = con.total_features
features = con.features
inter_features = con.international
names = regular.batter_name.unique()
print('features: ',features)
print('cat_features: ',cat_features)
print('lag_featuers: ',lag_features)
print('total_features: ',total_features)
print('inter_features: ',inter_features)

features:  ['year', 'batter_name', 'height/weight', 'year_born', 'starting_salary', 'position', 'career', 'SLG', 'OBP', 'OPS', 'AB']
cat_features:  ['team']
lag_featuers:  ['H', 'HBP', '2B', 'HR', 'SO', 'R', 'RBI', 'AB', 'BB', 'avg', 'SLG', 'OBP', 'E', 'OPS']
total_features:  ['H', 'G', 'HBP', 'GDP', '2B', 'HR', 'SO', 'R', 'RBI', 'AB', 'BB']
inter_features:  ['Selma(고)', '쿠바 Ciego de Avila Maximo Gomez Baez(대)', '캐나다 A.B Lucas Secondary(고)', '필라델피아', '히로시마', '일본 아세아대', '샌프란시스코', '미국 윌리캐넌초', '미국 쿠퍼고', '미국 페퍼다인대', '미네소타', '볼티모어', '미국 Catawba(대)', '미국 Creighton(대)', '미국 Diamond Bar(고)', '미국 Fort Loramie(고)', '미국 Kentucky(대)', '미국 Las Vegas(대)', '미국 Smithfield', '미국 Texas at Arlington(대)', '미국 Toledo(대)', '미국 Wabash Valley(대)', '미국 레이노사고', '미국 볼주립대', '미국 위스콘신 라크로스대', '도미니카', '도미니카 Elias Rodriguez(고)', '도미니카 산토도밍고고', '도미니카 알레한드로 바쓰고', '도미니카 엘세이보고', '네덜란드 Voorben Praktyk(고)']


In [194]:
data = new_regular.copy()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2579 entries, 0 to 219
Data columns (total 28 columns):
batter_name        2579 non-null object
year               2579 non-null int64
team               2579 non-null object
avg                2334 non-null float64
G                  2359 non-null float64
AB                 2359 non-null float64
R                  2359 non-null float64
H                  2359 non-null float64
2B                 2359 non-null float64
3B                 2359 non-null float64
HR                 2359 non-null float64
TB                 2359 non-null float64
RBI                2359 non-null float64
SB                 2359 non-null float64
CS                 2359 non-null float64
BB                 2359 non-null float64
HBP                2359 non-null float64
SO                 2359 non-null float64
GDP                2359 non-null float64
SLG                2334 non-null float64
OBP                2336 non-null float64
E                  2359 non-null floa

In [195]:
data.loc[:,'OPS'] = data['OPS'].fillna(0)
data.loc[:,'OBP'] = data['OBP'].fillna(0)
data.loc[:,'SLG'] = data['SLG'].fillna(0)

data = BABIP_and_avg(data,names)
lag_features.extend(['TB', 'MH', 'PA', 'RISP', 'BABIP'])
total_features.extend(['TB',  'SF', 'MH', 'PA'])
print('## BABIP and BABIP_AVG')

data = diff_BABIP(data)
lag_features.append('BABIP_diff')
print('## diff BABIP')

data = corrected_OPS(data)
lag_features.append('corrected_OPS')
print('## corrected OPS')

data.loc[:,'1B'] = data['H'] - data['2B'] - data['3B'] - data['HR']
lag_features.append('1B')
print('## Add 1B')

for f in tqdm(range(len(lag_features))):
    data.loc[:,'lag_1_' + lag_features[f]] = data.apply(lambda x: lag_n(data, x['batter_name'], x['year'], lag_features[f]), axis=1)
print('## lag_n')

for f in tqdm(range(len(total_features))):
    data.loc[:,'total_' + total_features[f]] = data.apply(lambda x: get_total(data, x['batter_name'], x['year'], total_features[f]), axis=1)
print('## total')

data = split_position(data)
cat_features.extend(['position1'])
print('## split_position')
data = fill_position1(data)
print('## fill position')
data = height_weight_fillna(data)
print('## fill na height/weight')
data = weight_height_(data)
print('## height/weight')
data = age_(data)
print('## age')
data = starting_salary_(data)
print('## starting_salary')
data = cum_season_(data, names)
print('## cum_season')

data = OPS_up_down(data, names)
print('## OPS_up_down')
data = OBP_up_down(data, names)
print('## OBP_up_down')
data = SLG_up_down(data, names)
print('## SLG_up_down')
data = grad_status_(data)
print('## grad_status')
data = career_count(data)
print('## career_count')
data = from_inter_(data, inter_features)
print('## from inter')

data = categorical_variables(data, cat_features)
print('## dummy variables')

  new_data.loc[(new_data.batter_name==name) & (new_data.year==y),'BABIP_AVG'] = (total['H'] - total['HR']) / (total['AB'] - total['SO'] - total['HR'] + total['SF'])


## BABIP and BABIP_AVG
## diff BABIP
## corrected OPS
## Add 1B


100%|██████████████████████████████████| 22/22 [01:07<00:00,  3.08s/it]


## lag_n


100%|██████████████████████████████████| 15/15 [00:49<00:00,  3.33s/it]


## total
## split_position
## fill position
## fill na height/weight
## height/weight
## age
## starting_salary
## cum_season
## OPS_up_down
## OBP_up_down
## SLG_up_down
## grad_status
## career_count
## from inter
## dummy variables


In [224]:
c1 = ['year', 'height', 'weight', 'age', 'starting_salary', 'cum_season', 'grad_status', 'career_count', 'from_inter']
c2 = list()
c3 = list()
c4 = ['team_KT','team_LG','team_NC','team_SK','team_넥센','team_두산','team_롯데','team_삼성','team_우리','team_한화','team_현대','position1_내야수','position1_외야수','position1_포수']
c5 = ['OPS_up','OPS_down','OBP_up','OBP_down','SLG_up','SLG_down']
for f in lag_features:
    c2.append('lag_1_{}'.format(f))
for f in total_features:
    c3.append('total_{}'.format(f))

In [197]:
target = 'OPS'

In [225]:
c = c1 + c2 + c3 + c4 + c5

# split data

In [226]:
use_data = data[c]

In [227]:
use_data.shape

(2579, 66)

In [228]:
x_train = data[data.year<2017][c]
y_train = data[data.year<2017][['OPS','AB']]
x_valid = data[data.year==2017][c]
y_valid = data[data.year==2017][['OPS','AB']]
x_test = data[data.year==2018][c]
y_test = data[data.year==2018][['OPS','AB']]
x_submit = data[data.year==2019][c]
y_submit = data[data.year==2019]['OPS']

In [229]:
x_train = x_train.drop('year',axis=1)
x_valid = x_valid.drop('year',axis=1)
x_test = x_test.drop('year',axis=1)
x_submit = x_submit.drop('year',axis=1)

# XGboost

In [230]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1915 entries, 1 to 2356
Data columns (total 65 columns):
height                 1915 non-null float64
weight                 1915 non-null float64
age                    1915 non-null int64
starting_salary        1915 non-null float64
cum_season             1915 non-null float64
grad_status            1915 non-null int64
career_count           1915 non-null int64
from_inter             1915 non-null int64
lag_1_H                1442 non-null float64
lag_1_HBP              1442 non-null float64
lag_1_2B               1442 non-null float64
lag_1_HR               1442 non-null float64
lag_1_SO               1442 non-null float64
lag_1_R                1442 non-null float64
lag_1_RBI              1442 non-null float64
lag_1_AB               1442 non-null float64
lag_1_BB               1442 non-null float64
lag_1_avg              1431 non-null float64
lag_1_SLG              1442 non-null float64
lag_1_OBP              1442 non-null float64
l

In [233]:
params = {
    "learning_rate": 0.1,
    "n_estimators": 10000,
    "max_depth": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "colsample_bylevel": 0.8,
    "alpha": 0,
    "lambda": 1,
    "objective": "gpu:reg:linear",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "eval_metric":"rmse"
}

In [236]:
model = xgb.XGBRegressor(**params)

model.fit(x_train, y_train[target],
          eval_set=[(x_train, y_train[target]), (x_valid, y_valid[target])],
          early_stopping_rounds=50,
          sample_weight=y_train['AB'],
          verbose=50)
prob = model.predict(x_test, ntree_limit=model.best_iteration)
e = wrmse(y_test[target], prob, y_test['AB'])
e2 = rmse(y_test[target], prob)
print('test WRMSE: {0:.6f}'.format(e))
print('test RMSE: {0:.6f}'.format(e2))

[0]	validation_0-rmse:0.29138	validation_1-rmse:0.273738
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 50 rounds.
[50]	validation_0-rmse:0.245961	validation_1-rmse:0.213616
Stopping. Best iteration:
[14]	validation_0-rmse:0.251347	validation_1-rmse:0.209879

test WRMSE: 0.130431
test RMSE: 0.219539


In [237]:
x_2017 = pd.concat([x_train, x_valid], axis=0)
y_2017 = pd.concat([y_train, y_valid], axis=0)

In [238]:
print(x_2017.shape)
print(y_2017.shape)

(2132, 65)
(2132, 2)


In [239]:
model = xgb.XGBRegressor(**params)

model.fit(x_2017, y_2017[target],
          eval_set=[(x_2017, y_2017[target]), (x_test, y_test[target])],
          early_stopping_rounds=500,
          sample_weight=y_2017['AB'],
          verbose=50)
prob = model.predict(x_submit, ntree_limit=model.best_iteration)

[0]	validation_0-rmse:0.289671	validation_1-rmse:0.284927
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 500 rounds.
[50]	validation_0-rmse:0.242728	validation_1-rmse:0.218443
[100]	validation_0-rmse:0.232236	validation_1-rmse:0.214178
[150]	validation_0-rmse:0.224038	validation_1-rmse:0.211689
[200]	validation_0-rmse:0.216936	validation_1-rmse:0.208658
[250]	validation_0-rmse:0.210304	validation_1-rmse:0.208185
[300]	validation_0-rmse:0.205445	validation_1-rmse:0.207608
[350]	validation_0-rmse:0.201095	validation_1-rmse:0.207594
[400]	validation_0-rmse:0.197037	validation_1-rmse:0.209579
[450]	validation_0-rmse:0.193164	validation_1-rmse:0.208968
[500]	validation_0-rmse:0.189911	validation_1-rmse:0.20862
[550]	validation_0-rmse:0.186419	validation_1-rmse:0.209237
[600]	validation_0-rmse:0.182856	validation_1-rmse:0.209419
[650]	validation_0-rmse:0.179699	validation_1-rmse:0.209326
[700

In [209]:
data[data.year==2019].shape

(220, 106)

In [222]:
for c in data.columns:
    print(c)

batter_name
year
team
avg
G
AB
R
H
2B
3B
HR
TB
RBI
SB
CS
BB
HBP
SO
GDP
SLG
OBP
E
height/weight
year_born
position
career
starting_salary
OPS
MH
SF
IBB
SAC
PH-BA
PA
RISP
BABIP
BABIP_AVG
BABIP_diff
corrected_OPS
1B
lag_1_H
lag_1_HBP
lag_1_2B
lag_1_HR
lag_1_SO
lag_1_R
lag_1_RBI
lag_1_AB
lag_1_BB
lag_1_avg
lag_1_SLG
lag_1_OBP
lag_1_E
lag_1_OPS
lag_1_TB
lag_1_MH
lag_1_PA
lag_1_RISP
lag_1_BABIP
lag_1_BABIP_diff
lag_1_corrected_OPS
lag_1_1B
total_H
total_G
total_HBP
total_GDP
total_2B
total_HR
total_SO
total_R
total_RBI
total_AB
total_BB
total_TB
total_SF
total_MH
total_PA
position1
position2
height
weight
age
cum_season
OPS_up
OPS_down
OBP_up
OBP_down
SLG_up
SLG_down
grad_status
career_count
from_inter
team_KT
team_LG
team_NC
team_SK
team_넥센
team_두산
team_롯데
team_삼성
team_우리
team_한화
team_현대
position1_내야수
position1_외야수
position1_포수


In [219]:
x_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2132 entries, 1 to 2357
Data columns (total 58 columns):
height                 2132 non-null float64
weight                 2132 non-null float64
age                    2132 non-null int64
cum_season             2132 non-null float64
grad_status            2132 non-null int64
career_count           2132 non-null int64
from_inter             2132 non-null int64
lag_1_H                1624 non-null float64
lag_1_HBP              1624 non-null float64
lag_1_2B               1624 non-null float64
lag_1_HR               1624 non-null float64
lag_1_SO               1624 non-null float64
lag_1_R                1624 non-null float64
lag_1_RBI              1624 non-null float64
lag_1_AB               1624 non-null float64
lag_1_BB               1624 non-null float64
lag_1_avg              1613 non-null float64
lag_1_SLG              1624 non-null float64
lag_1_OBP              1624 non-null float64
lag_1_E                1624 non-null float64
l

In [210]:

prob.shape

(220,)

In [211]:
submit.loc[:,'OPS'] = prob

In [216]:
submit.drop('year',axis=1).reset_index().to_csv('../submit1.csv',index=False)