In [1]:
import pandas as pd
from pybaseball import pitching_stats
from pybaseball import cache

# Turn on pybaseball's cache (presumably so repeated queries are not re-downloaded).
cache.enable()

# Retrieve one row per player per season for 2022 (start and end season are both 2022).
# NOTE(review): no `qual` argument is passed here. pybaseball presumably falls back to
# FanGraphs' qualified-pitcher threshold in that case, which would explain why only
# 45 rows (all with SV == 0) come back in the cells below — confirm against the
# pybaseball docs and consider passing `qual=0` to include relievers.
pitching_data_2022 = pitching_stats(2022, 2022)

In [2]:
# Inspect the columns of pitching_data_2022 and shortlist the ones that might
# be related to save situations.
print(pitching_data_2022.columns)

# Column labels are expected to be strings; fail loudly otherwise, because the
# case-insensitive match below relies on str.lower().
assert all(isinstance(col, str) for col in pitching_data_2022.columns)

# Deliberately loose heuristic: keep every column whose name contains an "s"
# (in any case), so that candidates such as `SV` and `BS` are not missed.
save_situation_related = [
    col for col in pitching_data_2022.columns if 's' in col.lower()
]

print(save_situation_related)


Index(['IDfg', 'Season', 'Name', 'Team', 'Age', 'W', 'L', 'WAR', 'ERA', 'G',
       ...
       'LA', 'Barrels', 'Barrel%', 'maxEV', 'HardHit', 'HardHit%', 'Events',
       'CStr%', 'CSW%', 'xERA'],
      dtype='object', length=334)
['Season', 'GS', 'ShO', 'SV', 'BS', 'SO', 'Balls', 'Strikes', 'Pitches', 'RS', 'Starting', 'Start-IP', 'Dollars', 'Pulls', 'SL%', 'SLv', 'SF%', 'SFv', 'wSL', 'wSF', 'wSL/C', 'wSF/C', 'O-Swing%', 'Z-Swing%', 'Swing%', 'F-Strike%', 'SwStr%', 'SD', 'SIERA', 'RS/9', 'FA% (sc)', 'FT% (sc)', 'FC% (sc)', 'FS% (sc)', 'FO% (sc)', 'SI% (sc)', 'SL% (sc)', 'CU% (sc)', 'KC% (sc)', 'EP% (sc)', 'CH% (sc)', 'SC% (sc)', 'KN% (sc)', 'UN% (sc)', 'vFA (sc)', 'vFT (sc)', 'vFC (sc)', 'vFS (sc)', 'vFO (sc)', 'vSI (sc)', 'vSL (sc)', 'vCU (sc)', 'vKC (sc)', 'vEP (sc)', 'vCH (sc)', 'vSC (sc)', 'vKN (sc)', 'FA-X (sc)', 'FT-X (sc)', 'FC-X (sc)', 'FS-X (sc)', 'FO-X (sc)', 'SI-X (sc)', 'SL-X (sc)', 'CU-X (sc)', 'KC-X (sc)', 'EP-X (sc)', 'CH-X (sc)', 'SC-X (sc)', 'KN-X (sc)', 'FA-Z (sc)',

## 在 pitching_data_2022 的欄位中，我們找到了 `SV` 還有 `BS`



In [3]:
# Row counts for the 2022 FanGraphs frame: total rows, pitchers without a
# save, and pitchers with fewer than 10 wins.
print('pitching_data_2022 rows:', len(pitching_data_2022))

# Number of pitchers with no saves in the season
print(
    'without save:',
    len(pitching_data_2022.loc[pitching_data_2022['SV'].eq(0)]),
)

# Number of pitchers with fewer than 10 wins in the season
print(
    'less than 10 wins:',
    len(pitching_data_2022.loc[pitching_data_2022['W'].lt(10)]),
)



pitching_data_2022 rows: 45
without save: 45
less than 10 wins: 6


### `SV` 欄位全部是 0，可能 query 的方式有誤，造成只有先發投手的資料

推測原因：`pitching_stats` 沒有指定 `qual` 時，可能只回傳達到規定投球局數的投手（所以 2022 年只有 45 筆）；可以試試 `qual=0` 來驗證。

In [4]:
# Try several ways of calling pitching_stats and check, for each result,
# how the `SV` values are distributed.

# Variant 1: a two-season range (2020 through 2021)
pitching_data_2020_2021 = pitching_stats(2020, 2021)
print(pitching_data_2020_2021.loc[:, 'SV'].value_counts())

# Variant 2: 2022 again, this time passing only the start season
pitching_data_2022_different_arg = pitching_stats(2022)
print(pitching_data_2022_different_arg.loc[:, 'SV'].value_counts())

# Variant 3: an older four-season range (2004 through 2007)
pitching_data_2004_2007 = pitching_stats(2004, 2007)
print(pitching_data_2004_2007.loc[:, 'SV'].value_counts())


SV
0    79
Name: count, dtype: int64
SV
0    45
Name: count, dtype: int64
SV
0    341
1      4
5      1
Name: count, dtype: int64


[pybaseball 官方文件](https://github.com/jldbc/pybaseball/blob/master/docs/pitching_stats.md)說:

> The pitching_stats function returns season-level pitching data from FanGraphs.

看來需要再驗證了。

In [5]:
# Show `Name` and `SV` from the 2004-2007 query, highest `SV` first.
(
    pitching_data_2004_2007
    .loc[:, ['Name', 'SV']]
    .sort_values(by='SV', ascending=False)
)

Unnamed: 0,Name,SV
267,Miguel Batista,5
227,Brett Tomko,1
86,Chien-Ming Wang,1
283,Nate Robertson,1
343,Joel Pineiro,1
...,...,...
65,Jon Garland,0
195,Aaron Cook,0
114,Scott Kazmir,0
236,Cory Lidle,0


https://www.baseball-reference.com/players/b/batismi01.shtml

根據 Baseball Reference，Miguel Batista 在這段期間單季最多有 31 次救援成功，

但上面 2004–2007 的查詢結果裡，他最多只有 5 次救援成功。

不合理！

In [6]:
# Query 2004-2007 again with ind=0 to get aggregate (not per-season) player
# statistics, then show `Name` and `SV` with the highest `SV` first.
pitching_data_2004_2007_aggregated = pitching_stats(2004, 2007, ind=0)
(
    pitching_data_2004_2007_aggregated
    .loc[:, ['Name', 'SV']]
    .sort_values(by='SV', ascending=False)
)

Unnamed: 0,Name,SV
0,Mariano Rivera,160
3,Chad Cordero,127
6,Brad Lidge,122
35,David Weathers,60
47,Braden Looper,57
...,...,...
76,Paul Byrd,0
92,David Wells,0
135,Jeff Weaver,0
54,Aaron Cook,0


以 Mariano Rivera 為例：

https://www.baseball-reference.com/players/r/riverma01.shtml#2004-2007-sum:pitching_standard

這邊就跟 Baseball Reference 的資料對起來了。

### 但是：

In [7]:
# Same aggregate query (ind=0) for 2022 only, then show the first five rows
# of `Name` and `SV` after sorting by `SV` in descending order.
pitching_data_2022_aggregated = pitching_stats(2022, None, ind=0)
(
    pitching_data_2022_aggregated
    .loc[:, ['Name', 'SV']]
    .sort_values(by='SV', ascending=False)
    .head(5)
)

Unnamed: 0,Name,SV
22,Aaron Nola,0
1,Julio Urias,0
37,Corey Kluber,0
20,Kyle Wright,0
23,Miles Mikolas,0


### 決定不用 FanGraphs 的資料了，改用 Baseball Reference 的資料

In [8]:
# Make sure the Baseball Reference 2022 pitching CSV exists before it is read
# further down. The download runs only when the file is missing, so re-running
# the notebook does not query Baseball Reference again.
from pathlib import Path

if not Path('data-2022-pitching-bref.csv').exists():
    # Imported lazily: only needed when the CSV has to be (re)created.
    from pybaseball import pitching_stats_bref

    # Retrieve data on the 2022 season and save it without the index column.
    pitching_stats_bref(2022).to_csv('data-2022-pitching-bref.csv', index=False)


## 來看 pitching_stats_bref

In [9]:
# Load the 2022 pitching data from the local CSV file (named after
# pitching_stats_bref, i.e. the Baseball Reference source discussed above).
data_2022_pitching_bref = pd.read_csv('data-2022-pitching-bref.csv')


In [10]:
# Inspect which columns data_2022_pitching_bref contains
data_2022_pitching_bref.columns

Index(['Name', 'Age', '#days', 'Lev', 'Tm', 'G', 'GS', 'W', 'L', 'SV', 'IP',
       'H', 'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B', '3B', 'IBB',
       'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit', 'Str', 'StL', 'StS',
       'GB/FB', 'LD', 'PU', 'WHIP', 'BAbip', 'SO9', 'SO/W', 'mlbID'],
      dtype='object')

In [11]:
# First five rows after ordering by `SV`, highest first
(
    data_2022_pitching_bref
    .sort_values('SV', ascending=False)
    .head(5)
)

Unnamed: 0,Name,Age,#days,Lev,Tm,G,GS,W,L,SV,...,StL,StS,GB/FB,LD,PU,WHIP,BAbip,SO9,SO/W,mlbID
148,Emmanuel Clase,24,208,Maj-AL,Cleveland,77,0,3.0,4.0,42.0,...,0.14,0.17,0.64,0.19,0.03,0.729,0.224,9.5,7.7,661403
375,Kenley Jansen,34,208,Maj-NL,Atlanta,65,0,5.0,2.0,41.0,...,0.18,0.12,0.3,0.19,0.12,1.047,0.259,12.0,3.86,445276
336,Liam Hendriks,33,209,Maj-AL,Chicago,58,0,4.0,4.0,37.0,...,0.15,0.18,0.36,0.24,0.11,1.041,0.303,13.3,5.31,521230
320,Josh Hader,28,209,Maj-NL,"Milwaukee,San Diego",56,0,2.0,5.0,36.0,...,0.16,0.16,0.31,0.3,0.07,1.28,0.333,14.6,3.86,623352
660,Jordan Romano,29,207,Maj-AL,Toronto,63,0,5.0,4.0,36.0,...,0.16,0.15,0.45,0.2,0.06,1.016,0.258,10.3,3.48,605447


### 用 `SV` 來找資料，看到熟悉的名字了

### 以下為瀏覽 statcast 的內容，確認有無可用的欄位

In [12]:
# Make sure the 2022 Statcast CSV exists before it is read further down.
# The query covers 2022-01-01 through 2022-12-31 only. It runs only when the
# file is missing, so re-running the notebook does not repeat the download
# (presumably a long one — a full season of pitch-level data).
from pathlib import Path

if not Path("statcast-data-2022.csv").exists():
    # Imported lazily: only needed when the CSV has to be (re)created.
    from pybaseball import statcast

    # Fetch the 2022 data and save it to a CSV file without the index column.
    statcast(start_dt="2022-01-01", end_dt="2022-12-31").to_csv(
        "statcast-data-2022.csv", index=False
    )

In [13]:
import pandas as pd

# Load the locally saved 2022 Statcast data from its CSV file
data_from_csv_statcast_2022 = pd.read_csv("statcast-data-2022.csv")

# Preview the first rows to see what the data looks like
data_from_csv_statcast_2022.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
0,SL,2022-11-05,89.2,-0.06,6.14,"Pressly, Ryan",592206,519151,field_out,hit_into_play,...,4,1,4,1,4,Standard,Standard,137.0,0.011,-0.219
1,FF,2022-11-05,93.9,-0.18,5.94,"Pressly, Ryan",547180,519151,field_out,hit_into_play,...,4,1,4,1,4,Infield shift,Standard,210.0,0.022,-0.3
2,FF,2022-11-05,93.0,-0.09,5.97,"Pressly, Ryan",592663,519151,single,hit_into_play,...,4,1,4,1,4,Standard,Standard,212.0,-0.02,0.262
3,SL,2022-11-05,88.1,-0.15,6.03,"Pressly, Ryan",656555,519151,field_out,hit_into_play,...,4,1,4,1,4,Infield shift,Standard,120.0,0.018,-0.158
4,SL,2022-11-05,89.0,-0.25,6.06,"Pressly, Ryan",656555,519151,,foul,...,4,1,4,1,4,Infield shift,Standard,121.0,0.0,-0.054


In [14]:
# Inspect which columns data_from_csv_statcast_2022 contains
data_from_csv_statcast_2022.columns

Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
       'estima

### 請 GPT 幫忙整理欄位的意義

原先使用 GPT3.5，但瀏覽過去，發現他會把類似 fielder_2 翻譯成二壘手，所以改用 GPT 4

1. pitch_type：球種
2. game_date：比賽日期
3. release_speed：出手球速
4. release_pos_x：出手點 x 座標
5. release_pos_z：出手點 z 座標
6. player_name：球員名稱
7. batter：打者
8. pitcher：投手
9. events：事件
10. description：描述
11. spin_dir：旋轉方向（已廢棄）
12. spin_rate_deprecated：旋轉速率（已廢棄）
13. break_angle_deprecated：轉折角度（已廢棄）
14. break_length_deprecated：轉折長度（已廢棄）
15. zone：區域
16. des：詳細描述
17. game_type：比賽類型
18. stand：打者站立方向
19. p_throws：投手投球方向
20. home_team：主場隊伍
21. away_team：客場隊伍
22. type：投球結果類型（B＝壞球、S＝好球、X＝打進場內）
23. hit_location：擊球位置
24. bb_type：擊球類型
25. balls：壞球數
26. strikes：好球數
27. game_year：比賽年份
28. pfx_x：橫向運動
29. pfx_z：縱向運動
30. plate_x：球通過本壘板時的水平位置（x 座標）
31. plate_z：球通過本壘板時的高度（z 座標）
32. on_3b：三壘上的跑者
33. on_2b：二壘上的跑者
34. on_1b：一壘上的跑者
35. outs_when_up：出局數
36. inning：局數
37. inning_topbot：局數上半或下半
38. hc_x：擊球點 x 座標
39. hc_y：擊球點 y 座標
40. tfs_deprecated：投球時間（已廢棄）
41. tfs_zulu_deprecated：投球時間（已廢棄）
42. fielder_2：捕手
43. umpire：裁判
44. sv_id：投球識別碼
45. vx0：x 方向初速度
46. vy0：y 方向初速度
47. vz0：z 方向初速度
48. ax：x 方向加速度
49. ay：y 方向加速度
50. az：z 方向加速度
51. sz_top：好球區上邊界
52. sz_bot：好球區下邊界
53. hit_distance_sc：擊球距離
54. launch_speed：擊球速度
55. launch_angle：擊球角度
56. effective_speed：有效速度
57. release_spin_rate：出手時的轉速
58. release_extension：出手延伸距離
59. game_pk：比賽唯一識別碼
60. pitcher.1：投手（重複）
61. fielder_2.1：捕手（重複）
62. fielder_3：一壘手
63. fielder_4：二壘手
64. fielder_5：三壘手
65. fielder_6：游擊手
66. fielder_7：左外野手
67. fielder_8：中外野手
68. fielder_9：右外野手
69. release_pos_y：出手點 y 座標
70. estimated_ba_using_speedangle：利用擊球初速與仰角估計的打擊率
71. estimated_woba_using_speedangle：利用擊球初速與仰角估計的加權上壘率（wOBA）
72. woba_value：加權上壘率（wOBA）值
73. woba_denom：加權上壘率（wOBA）分母
74. babip_value：場內球安打率（BABIP）值
75. iso_value：純長打率（ISO）值
76. launch_speed_angle：擊球速度角度
77. at_bat_number：該場比賽中的打席序號
78. pitch_number：該打席中的投球序號
79. pitch_name：投球名稱
80. home_score：主場得分
81. away_score：客場得分
82. bat_score：打擊方得分
83. fld_score：防守方得分
84. post_away_score：該球投出後的客場隊伍得分
85. post_home_score：該球投出後的主場隊伍得分
86. post_bat_score：該球投出後的打擊方得分
87. post_fld_score：該球投出後的防守方得分
88. if_fielding_alignment：內野防守位置
89. of_fielding_alignment：外野防守位置
90. spin_axis：旋轉軸
91. delta_home_win_exp：主場勝率變化
92. delta_run_exp：得分期望值變化


### 暫時用不到這些數據

In [15]:
# List the distinct values that occur in the `events` column
unique_events = data_from_csv_statcast_2022.loc[:, 'events'].unique()
print(unique_events)


['field_out' 'single' nan 'strikeout' 'walk' 'home_run' 'force_out'
 'hit_by_pitch' 'grounded_into_double_play' 'fielders_choice_out' 'double'
 'strikeout_double_play' 'sac_fly' 'field_error' 'sac_bunt_double_play'
 'fielders_choice' 'sac_bunt' 'double_play' 'caught_stealing_2b' 'triple'
 'other_out' 'pickoff_3b' 'catcher_interf' 'caught_stealing_3b'
 'pickoff_caught_stealing_2b' 'triple_play' 'caught_stealing_home'
 'sac_fly_double_play' 'wild_pitch' 'pickoff_1b'
 'pickoff_caught_stealing_home' 'pickoff_caught_stealing_3b'
 'game_advisory' 'pickoff_2b']


### 暫時用不到這些 events