In [5]:
import pandas as pd
import numpy as np
import pybaseball

# Data Load

In [6]:
# Read the player roster from CSV; the trailing bare expression renders the
# frame as this cell's output.
players_csv_path = 'new_data/players_df.csv'
players_df = pd.read_csv(players_csv_path)
players_df

Unnamed: 0,kor_name,eng_name,team,release,end_dt
0,더스틴 니퍼트,dustin nippert,두산,0,2010-12-31
1,페르난도,fernando alexis nieve,두산,0,2010-12-31
2,트래비스,travis jarrod blackley,기아,0,2010-12-31
3,리즈,radhames corey liz,엘지,0,2010-12-31
4,주키치,benjamin daniel jukich,엘지,0,2010-12-31
...,...,...,...,...,...
146,찰리 반즈,charlie barnes,롯데,0,2021-12-31
147,션 놀린,"sean nolin, sean patrick nolin",기아,0,2021-12-31
148,로니 윌리엄스,ronnie ellis williams,시즌중 방출,1,2021-12-31
149,펠릭스 페냐,félix peña,한화,0,2021-12-31


# id를 추출하기 위한 전처리

In [7]:
# Manual spelling overrides: map a Korean display name to the hyphenated
# English name that should be stored in `eng_name` before the id lookup.
eng_name_overrides = {
    '조조 레이예스': 'jo-jo reyes',
    '왕웨이중': 'wei-chung wang',
}
for kor_name, eng_name in eng_name_overrides.items():
    is_target_player = players_df['kor_name'] == kor_name
    players_df.loc[is_target_player, 'eng_name'] = eng_name

## pybaseball 라이브러리에 검색이 되지 않는 선수 확인

In [8]:
# Sanity check: confirm the hyphenated spelling resolves in the lookup table.
# playerid_lookup takes the family name first, then the given name.
pybaseball.playerid_lookup(last='wang', first='wei-chung')

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,wang,wei-chung,623913,wangw001,wangwe01,14382,2014.0,2019.0


In [9]:
# Collect every English name that does not resolve to exactly one row in the
# pybaseball lookup table, printing the name and the number of matches found.
tmp_lst = []
for eng_name in players_df['eng_name']:
    # Naive parse: first space-separated token is the given name, last token
    # is the family name (comma-formatted names are not handled here).
    name_tokens = eng_name.split(' ')
    # Look the player up once and reuse the count; the lookup was previously
    # repeated for every unmatched name just to print its length.
    match_count = len(pybaseball.playerid_lookup(name_tokens[-1], name_tokens[0]))
    if match_count != 1:
        tmp_lst.append(eng_name)
        print(eng_name, ':', match_count)

benjamin daniel jukich : 0
oneli m. pérez garcia : 0
julio cesar depaula : 0
james eugene magrane : 0
douglas alan "doug" mathis : 0
christopher brandon bootcheck : 0
mario jose santiago : 0
david thomas bush : 0
horacio ramírez : 0
andrew william "andy" van hekken : 0
rick,van den hurk : 0
derek james hankins : 0
charles j. "charlie" shirek : 0
cory s. riordan : 0
travis banwart : 0
dennis sean "d. j." houlton jr. : 0
zachary robert stewart : 0
phillip andrew irwin : 0
andrew frank sisco : 0
sugar ray,marimon : 0
josh smith : 3
tyler eppler : 0
ronnie ellis williams : 0


# id 추출

In [10]:
def get_name_first_last(x):
    """Split an English player name into the two arguments for the id lookup.

    NOTE: despite the function name, the returned tuple is ordered
    ``(family_name, given_name)``. Callers pass the two values positionally
    to ``pybaseball.playerid_lookup``, which takes the family name first, so
    the order must not be changed.

    Two input formats are recognised:

    * ``"given,family"`` (e.g. ``"rick,van den hurk"``) - split on the comma so
      multi-word given or family names survive intact.
    * ``"given [middle ...] family"`` - split on whitespace; the first token is
      the given name and the last token is the family name.

    Surrounding and repeated whitespace is ignored. An empty or blank string
    yields ``('', '')``.

    Parameters
    ----------
    x : str
        English name as stored in the ``eng_name`` column.

    Returns
    -------
    tuple[str, str]
        ``(family_name, given_name)``.
    """
    if ',' in x:
        # Strip each side so "rick , van den hurk" behaves like "rick,van den hurk".
        parts = [part.strip() for part in x.split(',')]
    else:
        # split() without an argument collapses runs of whitespace, so leading,
        # trailing or doubled spaces no longer produce empty-string tokens.
        parts = x.split()
    if not parts:
        return '', ''
    return parts[-1], parts[0]

In [11]:
def _lookup_field(x, column):
    """Return `column` from the last lookup match for name `x`, or NaN if none.

    The name is parsed with ``get_name_first_last`` and the two resulting
    values are passed, in order, to ``pybaseball.playerid_lookup``. When the
    lookup returns several rows the last one is used (same choice as before).
    """
    lookup_args = get_name_first_last(x)
    matches = pybaseball.playerid_lookup(*lookup_args)
    if len(matches) == 0:
        return np.nan
    return matches.iloc[-1][column]


def get_start_dt(x):
    """Return ``'<first MLB year>-01-01'`` for the player named `x`, or NaN.

    NaN is returned when the name has no lookup match, and also when the
    matched row has no ``mlb_played_first`` value (previously ``int(NaN)``
    raised ``ValueError`` in that case).
    """
    first_year = _lookup_field(x, 'mlb_played_first')
    if pd.isna(first_year):
        return np.nan
    return str(int(first_year)) + '-01-01'


def get_savant_id(x):
    """Return the ``key_mlbam`` id for the player named `x`, or NaN if unmatched."""
    return _lookup_field(x, 'key_mlbam')


def get_fangraphs_id(x):
    """Return the ``key_fangraphs`` id for the player named `x`, or NaN if unmatched."""
    return _lookup_field(x, 'key_fangraphs')


def get_player_id(x):
    """Return the ``key_bbref`` id for the player named `x`, or NaN if unmatched."""
    return _lookup_field(x, 'key_bbref')

# Derive each lookup-based column from the English name. The mapping order
# fixes the order in which the new columns are appended to the frame.
id_column_builders = {
    'start_dt': get_start_dt,
    'savant_id': get_savant_id,
    'fangraphs_id': get_fangraphs_id,
    'player_id': get_player_id,
}
for column_name, builder in id_column_builders.items():
    players_df[column_name] = players_df['eng_name'].apply(builder)

players_df

Unnamed: 0,kor_name,eng_name,team,release,end_dt,start_dt,savant_id,fangraphs_id,player_id
0,더스틴 니퍼트,dustin nippert,두산,0,2010-12-31,2005-01-01,430962.0,4426.0,nippedu01
1,페르난도,fernando alexis nieve,두산,0,2010-12-31,2006-01-01,430588.0,2173.0,nievefe01
2,트래비스,travis jarrod blackley,기아,0,2010-12-31,2004-01-01,429715.0,3234.0,blacktr01
3,리즈,radhames corey liz,엘지,0,2010-12-31,2007-01-01,467785.0,5843.0,lizra01
4,주키치,benjamin daniel jukich,엘지,0,2010-12-31,,,,
...,...,...,...,...,...,...,...,...,...
146,찰리 반즈,charlie barnes,롯데,0,2021-12-31,2021-01-01,656212.0,19865.0,barnech01
147,션 놀린,"sean nolin, sean patrick nolin",기아,0,2021-12-31,,,,
148,로니 윌리엄스,ronnie ellis williams,시즌중 방출,1,2021-12-31,,,,
149,펠릭스 페냐,félix peña,한화,0,2021-12-31,2016-01-01,570240.0,13403.0,penafe01


In [12]:
# Remove players without MLB experience: keep only rows whose `savant_id`
# was resolved, then renumber the index from 0.
has_savant_id = players_df['savant_id'].notna()
players_df = players_df[has_savant_id].reset_index(drop=True)
players_df

Unnamed: 0,kor_name,eng_name,team,release,end_dt,start_dt,savant_id,fangraphs_id,player_id
0,더스틴 니퍼트,dustin nippert,두산,0,2010-12-31,2005-01-01,430962.0,4426.0,nippedu01
1,페르난도,fernando alexis nieve,두산,0,2010-12-31,2006-01-01,430588.0,2173.0,nievefe01
2,트래비스,travis jarrod blackley,기아,0,2010-12-31,2004-01-01,429715.0,3234.0,blacktr01
3,리즈,radhames corey liz,엘지,0,2010-12-31,2007-01-01,467785.0,5843.0,lizra01
4,바티스타,denny m. bautista,한화,0,2010-12-31,2004-01-01,429714.0,1947.0,bautide01
...,...,...,...,...,...,...,...,...,...
119,숀 모리만도,shawn morimando,SSG,0,2021-12-31,2016-01-01,596049.0,13606.0,morimsh01
120,글렌 스파크맨,glenn sparkman,롯데,0,2021-12-31,2017-01-01,642098.0,15200.0,sparkgl01
121,찰리 반즈,charlie barnes,롯데,0,2021-12-31,2021-01-01,656212.0,19865.0,barnech01
122,펠릭스 페냐,félix peña,한화,0,2021-12-31,2016-01-01,570240.0,13403.0,penafe01


## 추출 및 병합

In [86]:
# Download pitch-level Statcast data for every player in `players_df` and
# merge the per-player frames into a single frame `df`.
player_frames = []
failed_players = []  # kor_name of every player whose download raised
for _, player in players_df.iterrows():
    try:
        player_df = pybaseball.statcast_pitcher(
            player['start_dt'],
            player['end_dt'],
            # The id column is float because unmatched players were stored as
            # NaN; pass the id as an integer.
            int(player['savant_id']),
        )
    except Exception:
        # Report and skip this player. Previously execution fell through to
        # the concat below and re-appended the previous player's frame
        # (or hit a NameError if the very first download failed).
        print(player['kor_name'], ' 에러')
        failed_players.append(player['kor_name'])
        continue
    player_frames.append(player_df)

# Concatenate once at the end instead of growing the frame inside the loop.
df = pd.concat(player_frames) if player_frames else pd.DataFrame()

Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
메릴 켈리  에러
Gathering Player Data


  df = pd.read_csv(io.StringIO(data.text))


Gathering Player Data


  df = pd.read_csv(io.StringIO(data.text))


Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data


  df = pd.read_csv(io.StringIO(data.text))


Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data


  df = pd.read_csv(io.StringIO(data.text))


Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data


  df = pd.read_csv(io.StringIO(data.text))
  df = pd.read_csv(io.StringIO(data.text))


Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data


In [87]:
# Show the merged Statcast frame as this cell's output.
df

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
0,FF,2010-10-09,93.0,-1.88,6.90,"Nippert, Dustin",425834,430962,field_out,hit_into_play,...,2,6,2,6,2,,,,0.001,-0.099
1,FF,2010-10-09,95.2,-1.88,6.89,"Nippert, Dustin",150289,430962,home_run,hit_into_play,...,2,6,2,6,2,,,,-0.064,1.811
2,FF,2010-10-09,94.3,-1.87,7.01,"Nippert, Dustin",150289,430962,,foul,...,2,4,2,4,2,,,,0.000,-0.037
3,FF,2010-10-09,94.0,-1.77,6.91,"Nippert, Dustin",150289,430962,,blocked_ball,...,2,4,2,4,2,,,,0.000,0.085
4,FF,2010-10-09,92.8,-1.74,6.88,"Nippert, Dustin",150289,430962,,ball,...,2,4,2,4,2,,,,0.000,0.024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1853,,2018-02-28,,,,"Ramírez, Yefry",502054,606162,field_out,hit_into_play,...,0,0,0,0,0,,,,0.037,
1854,,2018-02-28,,,,"Ramírez, Yefry",594824,606162,field_out,hit_into_play,...,0,0,0,0,0,,,,0.039,
1855,,2018-02-28,,,,"Ramírez, Yefry",594824,606162,,ball,...,0,0,0,0,0,,,,-0.019,
1856,,2018-02-28,,,,"Ramírez, Yefry",664056,606162,single,hit_into_play,...,0,0,0,0,0,,,,-0.028,


# 파일로 저장

In [88]:
import pickle
import gzip

# Pickle the merged Statcast frame into a gzip-compressed file.
savant_pickle_path = 'new_data/savant_df_zip.pickle'
with gzip.open(savant_pickle_path, mode='wb') as pickle_file:
    pickle.dump(df, pickle_file)

In [17]:
# Exclude the one pitcher filtered out by name below and write the remaining
# roster to CSV without the index column.
keep_mask = players_df['kor_name'].ne('메릴 켈리')
players_df_ver2 = players_df[keep_mask]
players_df_ver2.to_csv('new_data/players_df_ver2.csv', index=False)