In [1]:
import pandas as pd
from pybaseball import pitching_stats
from pybaseball import cache

# Turn on pybaseball's cache for subsequent queries.
cache.enable()

# Retrieve FanGraphs pitching stats for 2022 (start and end season are both 2022).
# NOTE(review): `qual` is left at its default; per the pybaseball docs that appears
# to restrict the result to FanGraphs-qualified pitchers — confirm, since it would
# explain why relievers (and their saves) are missing further down.
pitching_data_2022 = pitching_stats(2022, 2022)

In [2]:
# Inspect the columns of pitching_data_2022 and shortlist any that might
# relate to save situations.

print(pitching_data_2022.columns)

# Crude first-pass filter: keep every column whose name contains an "s"
# (case-insensitive). It over-matches heavily, but it is enough to spot
# candidates such as `SV` and `BS`. `str(col)` keeps the scan working even if
# a column label is not a string.
save_situation_related = [
    col for col in pitching_data_2022.columns
    if 's' in str(col).lower()
]

print(save_situation_related)


Index(['IDfg', 'Season', 'Name', 'Team', 'Age', 'W', 'L', 'WAR', 'ERA', 'G',
       ...
       'LA', 'Barrels', 'Barrel%', 'maxEV', 'HardHit', 'HardHit%', 'Events',
       'CStr%', 'CSW%', 'xERA'],
      dtype='object', length=334)
['Season', 'GS', 'ShO', 'SV', 'BS', 'SO', 'Balls', 'Strikes', 'Pitches', 'RS', 'Starting', 'Start-IP', 'Dollars', 'Pulls', 'SL%', 'SLv', 'SF%', 'SFv', 'wSL', 'wSF', 'wSL/C', 'wSF/C', 'O-Swing%', 'Z-Swing%', 'Swing%', 'F-Strike%', 'SwStr%', 'SD', 'SIERA', 'RS/9', 'FA% (sc)', 'FT% (sc)', 'FC% (sc)', 'FS% (sc)', 'FO% (sc)', 'SI% (sc)', 'SL% (sc)', 'CU% (sc)', 'KC% (sc)', 'EP% (sc)', 'CH% (sc)', 'SC% (sc)', 'KN% (sc)', 'UN% (sc)', 'vFA (sc)', 'vFT (sc)', 'vFC (sc)', 'vFS (sc)', 'vFO (sc)', 'vSI (sc)', 'vSL (sc)', 'vCU (sc)', 'vKC (sc)', 'vEP (sc)', 'vCH (sc)', 'vSC (sc)', 'vKN (sc)', 'FA-X (sc)', 'FT-X (sc)', 'FC-X (sc)', 'FS-X (sc)', 'FO-X (sc)', 'SI-X (sc)', 'SL-X (sc)', 'CU-X (sc)', 'KC-X (sc)', 'EP-X (sc)', 'CH-X (sc)', 'SC-X (sc)', 'KN-X (sc)', 'FA-Z (sc)',

## 在 pitching_data_2022 的欄位中，我們找到了 `SV` 還有 `BS`



In [3]:
# Row counts for pitching_data_2022: overall, with zero saves, and with
# fewer than 10 wins.
total_rows = len(pitching_data_2022)
print('pitching_data_2022 rows:', total_rows)

# Pitchers with no saves in the season.
no_save_mask = pitching_data_2022['SV'] == 0
print('without save:', len(pitching_data_2022[no_save_mask]))

# Pitchers with fewer than 10 wins in the season.
under_ten_wins_mask = pitching_data_2022['W'] < 10
print('less than 10 wins:', len(pitching_data_2022[under_ten_wins_mask]))



pitching_data_2022 rows: 45
without save: 45
less than 10 wins: 6


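### `SV` 欄位全部為 0，可能是 query 的方式有誤，造成只抓到先發投手的資料

（推測原因：`pitching_stats` 的 `qual` 參數未指定時，預設只回傳達到 FanGraphs 規定門檻的投手，後援投手多半未達標；可用 `qual=0` 再查一次來驗證。）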

In [4]:
# Try a few other query shapes to check whether `SV` is ever populated.

# Two seasons: 2020 through 2021.
pitching_data_2020_2021 = pitching_stats(2020, 2021)
sv_counts_2020_2021 = pitching_data_2020_2021['SV'].value_counts()
print(sv_counts_2020_2021)

# 2022 again, this time passing only the start season.
pitching_data_2022_different_arg = pitching_stats(2022)
sv_counts_2022_different_arg = pitching_data_2022_different_arg['SV'].value_counts()
print(sv_counts_2022_different_arg)

# An older four-season window: 2004 through 2007.
pitching_data_2004_2007 = pitching_stats(2004, 2007)
sv_counts_2004_2007 = pitching_data_2004_2007['SV'].value_counts()
print(sv_counts_2004_2007)


SV
0    79
Name: count, dtype: int64
SV
0    45
Name: count, dtype: int64
SV
0    341
1      4
5      1
Name: count, dtype: int64


[pybaseball 官方文件](https://github.com/jldbc/pybaseball/blob/master/docs/pitching_stats.md)說:

> The pitching_stats function returns season-level pitching data from FanGraphs.

看來需要再驗證了。

In [5]:
# Show only `Name` and `SV` from pitching_data_2004_2007, highest save totals first.
pitching_data_2004_2007[['Name', 'SV']].sort_values('SV', ascending=False)

Unnamed: 0,Name,SV
267,Miguel Batista,5
227,Brett Tomko,1
86,Chien-Ming Wang,1
283,Nate Robertson,1
343,Joel Pineiro,1
...,...,...
65,Jon Garland,0
195,Aaron Cook,0
114,Scott Kazmir,0
236,Cory Lidle,0


https://www.baseball-reference.com/players/b/batismi01.shtml

依 Baseball Reference，Miguel Batista 在這段期間的單季救援成功最多有 31 次，

但 `pitching_stats(2004, 2007)` 回傳的資料中，他的 `SV` 最多只有 5 次。

兩者對不起來，不合理！

In [6]:
# Retrieve aggregate player statistics for 2004 through 2007, then show
# `Name` and `SV` with the highest save totals first.
# NOTE(review): `ind=0` is understood to request one combined row per player
# instead of one row per season — confirm against the pybaseball docs.
pitching_data_2004_2007_aggregated = pitching_stats(2004, 2007, ind=0)
pitching_data_2004_2007_aggregated[['Name', 'SV']].sort_values('SV', ascending=False)

Unnamed: 0,Name,SV
0,Mariano Rivera,160
3,Chad Cordero,127
6,Brad Lidge,122
35,David Weathers,60
47,Braden Looper,57
...,...,...
76,Paul Byrd,0
92,David Wells,0
135,Jeff Weaver,0
54,Aaron Cook,0


以 Mariano Rivera 為例：

https://www.baseball-reference.com/players/r/riverma01.shtml#2004-2007-sum:pitching_standard

這邊就跟 Baseball Reference 的資料對起來了。

### 但是：

In [7]:
# Same aggregate-style query (`ind=0`) restricted to 2022 only; the end season
# is passed explicitly as None. Show the five rows with the most saves.
pitching_data_2022_aggregated = pitching_stats(2022, None, ind=0)
pitching_data_2022_aggregated[['Name', 'SV']].sort_values('SV', ascending=False).head()

Unnamed: 0,Name,SV
22,Aaron Nola,0
1,Julio Urias,0
37,Corey Kluber,0
20,Kyle Wright,0
23,Miles Mikolas,0


### 決定不用 FanGraphs 的資料了，改用 Baseball Reference 的資料

In [8]:
# TODO: move this one-time download into a stand-alone script.
# The commented-out code below fetches the 2022 Baseball Reference pitching
# table and writes it to 'data-2022-pitching-bref.csv'.

# from pybaseball import pitching_stats_bref

# retrieve data on the 2022 season
# data_2022_pitching_bref = pitching_stats_bref(2022)
# data_2022_pitching_bref.head()

# data_2022_pitching_bref.to_csv('data-2022-pitching-bref.csv', index=False)


## 來看 pitching_stats_bref

In [9]:
# Load the 2022 Baseball Reference pitching table from the local CSV file.
data_2022_pitching_bref = pd.read_csv('data-2022-pitching-bref.csv')


In [10]:
# Inspect the column names of data_2022_pitching_bref.
data_2022_pitching_bref.columns

Index(['Name', 'Age', '#days', 'Lev', 'Tm', 'G', 'GS', 'W', 'L', 'SV', 'IP',
       'H', 'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B', '3B', 'IBB',
       'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit', 'Str', 'StL', 'StS',
       'GB/FB', 'LD', 'PU', 'WHIP', 'BAbip', 'SO9', 'SO/W', 'mlbID'],
      dtype='object')

In [11]:
# First five rows of data_2022_pitching_bref ordered by `SV`, descending.
data_2022_pitching_bref.sort_values(by=['SV'], ascending=False).head()

Unnamed: 0,Name,Age,#days,Lev,Tm,G,GS,W,L,SV,...,StL,StS,GB/FB,LD,PU,WHIP,BAbip,SO9,SO/W,mlbID
148,Emmanuel Clase,24,208,Maj-AL,Cleveland,77,0,3.0,4.0,42.0,...,0.14,0.17,0.64,0.19,0.03,0.729,0.224,9.5,7.7,661403
375,Kenley Jansen,34,208,Maj-NL,Atlanta,65,0,5.0,2.0,41.0,...,0.18,0.12,0.3,0.19,0.12,1.047,0.259,12.0,3.86,445276
336,Liam Hendriks,33,209,Maj-AL,Chicago,58,0,4.0,4.0,37.0,...,0.15,0.18,0.36,0.24,0.11,1.041,0.303,13.3,5.31,521230
320,Josh Hader,28,209,Maj-NL,"Milwaukee,San Diego",56,0,2.0,5.0,36.0,...,0.16,0.16,0.31,0.3,0.07,1.28,0.333,14.6,3.86,623352
660,Jordan Romano,29,207,Maj-AL,Toronto,63,0,5.0,4.0,36.0,...,0.16,0.15,0.45,0.2,0.06,1.016,0.258,10.3,3.48,605447


### 用 `SV` 來找資料，看到熟悉的名字了

### 以下為瀏覽 statcast 的內容，確認有無可用的欄位

In [12]:
# Download Statcast data and store it as a CSV file.
# NOTE(review): the original note said "1990 to 2022", but the date range in the
# commented-out call below only covers 2022-01-01 through 2022-12-31.
# TODO: stand alone for download only

# from pybaseball import statcast

# # Data for calendar year 2022
# data = statcast(start_dt="2022-01-01", end_dt="2022-12-31")

# # Save the data to a CSV file
# data.to_csv("statcast-data-2022.csv", index=False)

# # Read the data from the CSV file
# data_from_csv = pd.read_csv("statcast-data-2022.csv")

# data_from_csv.head()

In [13]:
import pandas as pd

# Load the 2022 Statcast export from the local CSV file.
data_from_csv_statcast_2022 = pd.read_csv("statcast-data-2022.csv")

# Preview the first rows.
data_from_csv_statcast_2022.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
0,SL,2022-11-05,89.2,-0.06,6.14,"Pressly, Ryan",592206,519151,field_out,hit_into_play,...,4,1,4,1,4,Standard,Standard,137.0,0.011,-0.219
1,FF,2022-11-05,93.9,-0.18,5.94,"Pressly, Ryan",547180,519151,field_out,hit_into_play,...,4,1,4,1,4,Infield shift,Standard,210.0,0.022,-0.3
2,FF,2022-11-05,93.0,-0.09,5.97,"Pressly, Ryan",592663,519151,single,hit_into_play,...,4,1,4,1,4,Standard,Standard,212.0,-0.02,0.262
3,SL,2022-11-05,88.1,-0.15,6.03,"Pressly, Ryan",656555,519151,field_out,hit_into_play,...,4,1,4,1,4,Infield shift,Standard,120.0,0.018,-0.158
4,SL,2022-11-05,89.0,-0.25,6.06,"Pressly, Ryan",656555,519151,,foul,...,4,1,4,1,4,Infield shift,Standard,121.0,0.0,-0.054


In [14]:
# Inspect the column names of data_from_csv_statcast_2022.
data_from_csv_statcast_2022.columns

Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
       'estima

### 請 GPT 幫忙整理欄位的意義

原先使用 GPT3.5，但瀏覽過去，發現他會把類似 fielder_2 翻譯成二壘手，所以改用 GPT 4

1. pitch_type：球種
2. game_date：比賽日期
3. release_speed：出手球速
4. release_pos_x：出手點 x 座標
5. release_pos_z：出手點 z 座標
6. player_name：球員名稱
7. batter：打者
8. pitcher：投手
9. events：事件
10. description：描述
11. spin_dir：旋轉方向（已廢棄）
12. spin_rate_deprecated：旋轉速率（已廢棄）
13. break_angle_deprecated：轉折角度（已廢棄）
14. break_length_deprecated：轉折長度（已廢棄）
15. zone：區域
16. des：詳細描述
17. game_type：比賽類型
18. stand：打者站立方向
19. p_throws：投手投球方向
20. home_team：主場隊伍
21. away_team：客場隊伍
22. type：球類型（好球或壞球）
23. hit_location：擊球位置
24. bb_type：擊球類型
25. balls：壞球數
26. strikes：好球數
27. game_year：比賽年份
28. pfx_x：橫向運動
29. pfx_z：縱向運動
30. plate_x：球通過本壘板時的水平位置（x 座標）
31. plate_z：球通過本壘板時的垂直位置（z 座標）
32. on_3b：三壘上的跑者
33. on_2b：二壘上的跑者
34. on_1b：一壘上的跑者
35. outs_when_up：出局數
36. inning：局數
37. inning_topbot：局數上半或下半
38. hc_x：擊球點 x 座標
39. hc_y：擊球點 y 座標
40. tfs_deprecated：投球時間（已廢棄）
41. tfs_zulu_deprecated：投球時間（已廢棄）
42. fielder_2：捕手
43. umpire：裁判
44. sv_id：投球識別碼
45. vx0：x 方向初速度
46. vy0：y 方向初速度
47. vz0：z 方向初速度
48. ax：x 方向加速度
49. ay：y 方向加速度
50. az：z 方向加速度
51. sz_top：好球區上邊界
52. sz_bot：好球區下邊界
53. hit_distance_sc：擊球距離
54. launch_speed：擊球速度
55. launch_angle：擊球角度
56. effective_speed：有效速度
57. release_spin_rate：發球旋轉速率
58. release_extension：發球延伸距離
59. game_pk：比賽唯一識別碼
60. pitcher.1：投手（重複）
61. fielder_2.1：捕手（重複）
62. fielder_3：一壘手
63. fielder_4：二壘手
64. fielder_5：三壘手
65. fielder_6：游擊手
66. fielder_7：左外野手
67. fielder_8：中外野手
68. fielder_9：右外野手
69. release_pos_y：發球位置 y 座標
70. estimated_ba_using_speedangle：依擊球初速與仰角估計的打擊率
71. estimated_woba_using_speedangle：依擊球初速與仰角估計的加權上壘率（wOBA）
72. woba_value：加權上壘率（wOBA）的值
73. woba_denom：加權上壘率（wOBA）的分母
74. babip_value：場內球安打率（BABIP）的值
75. iso_value：純長打率（ISO）的值
76. launch_speed_angle：依擊球初速與仰角劃分的擊球品質分類
77. at_bat_number：該場比賽的第幾個打席
78. pitch_number：該打席的第幾球
79. pitch_name：投球名稱
80. home_score：主場得分
81. away_score：客場得分
82. bat_score：打擊方得分
83. fld_score：防守方得分
84. post_away_score：該球結束後的客隊得分
85. post_home_score：該球結束後的主隊得分
86. post_bat_score：該球結束後的打擊方得分
87. post_fld_score：該球結束後的防守方得分
88. if_fielding_alignment：內野防守位置
89. of_fielding_alignment：外野防守位置
90. spin_axis：旋轉軸
91. delta_home_win_exp：主場勝率變化
92. delta_run_exp：得分期望值變化


### 暫時用不到這些數據

In [15]:
# List the distinct values that appear in the `events` column.
unique_events = data_from_csv_statcast_2022['events'].unique()
print(unique_events)


['field_out' 'single' nan 'strikeout' 'walk' 'home_run' 'force_out'
 'hit_by_pitch' 'grounded_into_double_play' 'fielders_choice_out' 'double'
 'strikeout_double_play' 'sac_fly' 'field_error' 'sac_bunt_double_play'
 'fielders_choice' 'sac_bunt' 'double_play' 'caught_stealing_2b' 'triple'
 'other_out' 'pickoff_3b' 'catcher_interf' 'caught_stealing_3b'
 'pickoff_caught_stealing_2b' 'triple_play' 'caught_stealing_home'
 'sac_fly_double_play' 'wild_pitch' 'pickoff_1b'
 'pickoff_caught_stealing_home' 'pickoff_caught_stealing_3b'
 'game_advisory' 'pickoff_2b']


### 暫時用不到這些 events

### 調查如何根據個別球員，找到個別年度的出賽資料，並且判斷救援成功以及救援失敗的情況

In [3]:
import pandas as pd
from pybaseball import get_splits

# Raise pandas' display limits so the large split tables are truncated less.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 100)

# Pitching splits for Jon Lester (player id 'lestejo01').
pitching_split_stats_Jon_Lester = get_splits('lestejo01', pitching_splits=True)

# Show what kind of object came back, then display its first element.
print(type(pitching_split_stats_Jon_Lester))

pitching_split_stats_Jon_Lester[0]


  commentsoup = bs.BeautifulSoup(comment[i], 'lxml')


<class 'tuple'>


Unnamed: 0_level_0,Unnamed: 1_level_0,G,PA,AB,R,H,2B,3B,HR,SB,CS,...,TB,GDP,HBP,SH,SF,IBB,ROE,BAbip,tOPS+,1B
Split Type,Split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Season Totals,Career Totals,452,11487.0,10376,1215.0,2610,517,40,294,246,108,...,4089.0,250.0,90.0,61,67,13,99,0.302,100,1759
Platoon Splits,vs RHB,452,8785.0,7898,880.0,2003,396,26,229,191,74,...,3138.0,199.0,67.0,49,49,12,77,0.302,103,1352
Platoon Splits,vs LHB,426,2702.0,2478,266.0,607,121,14,65,55,34,...,951.0,51.0,23.0,12,18,1,22,0.304,92,407
Platoon Splits,vs RHB as LHP,452,8785.0,7898,880.0,2003,396,26,229,191,74,...,3138.0,199.0,67.0,49,49,12,77,0.302,103,1352
Platoon Splits,vs LHB as LHP,426,2702.0,2478,266.0,607,121,14,65,55,34,...,951.0,51.0,23.0,12,18,1,22,0.304,92,407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
By Umpire,Tony Randazzo,7,164.0,151,19.0,40,9,0,7,4,1,...,70.0,2.0,2.0,2,0,0,0,0.317,118,24
By Umpire,Tripp Gibson,3,80.0,70,8.0,14,1,0,4,6,1,...,27.0,4.0,0.0,1,1,0,0,0.204,87,9
By Umpire,Vic Carapazza,7,182.0,167,12.0,37,5,2,5,6,5,...,61.0,4.0,1.0,1,2,0,1,0.267,79,25
By Umpire,Wally Bell,3,78.0,72,5.0,18,3,1,1,2,1,...,26.0,4.0,0.0,0,1,0,0,0.298,86,13


In [4]:
# Check the type of the first element of the object returned by get_splits.
print(type(pitching_split_stats_Jon_Lester[0]))


<class 'pandas.core.frame.DataFrame'>


In [5]:
# Check the type of the first element of the object returned by get_splits.
# NOTE(review): this cell is an exact duplicate of the previous one and can be deleted.
print(type(pitching_split_stats_Jon_Lester[0]))


<class 'pandas.core.frame.DataFrame'>


In [7]:
# Column labels of Jon Lester's split table.
print(pitching_split_stats_Jon_Lester[0].columns)


Index(['G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'SB', 'CS', 'BB', 'SO',
       'SO/W', 'BA', 'OBP', 'SLG', 'OPS', 'TB', 'GDP', 'HBP', 'SH', 'SF',
       'IBB', 'ROE', 'BAbip', 'tOPS+', '1B'],
      dtype='object')


In [20]:
# Flatten both levels of the split table's MultiIndex into plain lists.
lester_split_index = pitching_split_stats_Jon_Lester[0].index
split_type_list_for_starter = lester_split_index.get_level_values('Split Type').tolist()
split_list_for_starter = lester_split_index.get_level_values('Split').tolist()

print(split_type_list_for_starter)
print(split_list_for_starter)


['Season Totals', 'Platoon Splits', 'Platoon Splits', 'Platoon Splits', 'Platoon Splits', 'Home or Away', 'Home or Away', 'First or Second Half', 'First or Second Half', 'Months', 'Months', 'Months', 'Months', 'Months', 'Months', 'Game Outcome for Pitcher', 'Game Outcome for Pitcher', 'Game Outcome for Pitcher', 'Game Outcome for Pitcher', 'Pitching Role', 'Pitching Role', 'Run Support', 'Run Support', 'Run Support', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Swung or Took First Pitch of PA', 'Swung or Took First Pitch of PA', 'Count/Balls-Strikes', 'Count/Balls-Strikes', 'Count/Balls-Strikes', 'Count/Balls-Strikes', 'Count/Balls-Strikes', 'Count/Balls-Strikes', 'Count/Balls-Strikes', 'Count/Balls-Strikes', '

In [21]:
# Distinct 'Split Type' values present in the starter's split table.
unique_starter_split_types = set(split_type_list_for_starter)
unique_starter_split_types

{'Ballparks',
 'Bases Occupied',
 'Batting Order Positions',
 'By Umpire',
 'Clutch Stats',
 'Count/Balls-Strikes',
 'Days of Rest',
 'First or Second Half',
 'Game Conditions',
 'Game Outcome for Pitcher',
 'Hit Location',
 'Hit Trajectory',
 'Home or Away',
 'Leading Off Inning',
 'Leverage',
 'Months',
 'Number of Outs in Inning',
 'Opponent',
 'Opposition Defensive Position',
 'Pitch Count',
 'Pitching Role',
 'Platoon Splits',
 'Run Support',
 'Season Totals',
 'Swung or Took First Pitch of PA',
 'Times Facing Opponent in Game'}

In [22]:
# Distinct 'Split' values present in the starter's split table.
unique_starter_splits = set(split_list_for_starter)
# Show ten of them. A set has no defined order, so this is an arbitrary sample
# rather than a "top 10", and it may differ between runs.
list(unique_starter_splits)[:10]

['Paul Schrieber',
 '3rd PA in G, as SP',
 'San Diego Padres',
 '4th+ PA in G, as SP',
 'SDP-Petco Pk',
 'Pitch 51-75',
 'Opp Fld-LHB',
 'Pitch 101+',
 'Chris Guccione',
 'in non-Sv']

### 仔細看了一下，只有看到 `non-save situation`，但沒有看到 `save situation`，或許是因為投手的關係？
將前面的變數重新命名，並找救援投手分析。

In [44]:
# Fetch the pitching split stats for Mariano Rivera (player id 'riverma01'),
# a closer, to compare against the starter examined above.
pitching_split_stats_Mariano_Rivera = get_splits('riverma01',
                                                 pitching_splits=True)


  commentsoup = bs.BeautifulSoup(comment[i], 'lxml')


In [29]:
# Flatten both MultiIndex levels into plain lists, this time for Mariano Rivera.
rivera_split_index = pitching_split_stats_Mariano_Rivera[0].index
split_type_list_for_closer = rivera_split_index.get_level_values('Split Type').tolist()
split_list_for_closer = rivera_split_index.get_level_values('Split').tolist()

print(split_type_list_for_closer)
print(split_list_for_closer)


['Season Totals', 'Platoon Splits', 'Platoon Splits', 'Platoon Splits', 'Platoon Splits', 'Home or Away', 'Home or Away', 'First or Second Half', 'First or Second Half', 'Months', 'Months', 'Months', 'Months', 'Months', 'Months', 'Game Outcome for Pitcher', 'Game Outcome for Pitcher', 'Game Outcome for Pitcher', 'Game Outcome for Pitcher', 'Game Outcome for Pitcher', 'Game Outcome for Pitcher', 'Pitching Role', 'Pitching Role', 'Run Support', 'Run Support', 'Run Support', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Batting Order Positions', 'Swung or Took First Pitch of PA', 'Swung or Took First Pitch of PA', 'Count/Balls-Strikes', 'Count/Balls-Strikes', 'Count/Balls-Strikes', 'Count/Balls-Strikes', 'Count/Balls-Strikes', 'Count/Balls-S

In [39]:
# Keep only the rows whose 'Split Type' is 'Game Outcome for Pitcher',
# 'Season Totals' or 'Pitching Role'.
rivera_split_table = pitching_split_stats_Mariano_Rivera[0]
rivera_rows_of_interest = rivera_split_table.index.get_level_values('Split Type').isin(
    ['Game Outcome for Pitcher', 'Season Totals', 'Pitching Role']
)
outcome_split_stats_Mariano_Rivera = rivera_split_table.loc[rivera_rows_of_interest, :]
outcome_split_stats_Mariano_Rivera


Unnamed: 0_level_0,Unnamed: 1_level_0,G,PA,AB,R,H,2B,3B,HR,SB,CS,...,TB,GDP,HBP,SH,SF,IBB,ROE,BAbip,tOPS+,1B
Split Type,Split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Season Totals,Career Totals,1115,5103.0,4719,340.0,998,141,15,71,70,21,...,1382.0,88.0,46.0,26,25,41,56,0.265,100,771
Game Outcome for Pitcher,in Wins,82,567.0,509,35.0,113,18,0,12,8,2,...,167.0,7.0,3.0,6,2,9,4,0.272,123,83
Game Outcome for Pitcher,in Losses,60,405.0,351,134.0,162,30,5,23,16,6,...,271.0,3.0,4.0,4,1,16,11,0.519,364,104
Game Outcome for Pitcher,in No Dec.,321,1596.0,1473,117.0,332,36,7,26,26,4,...,460.0,29.0,10.0,10,9,15,21,0.278,111,263
Game Outcome for Pitcher,in Saves,652,2535.0,2386,54.0,391,57,3,10,20,9,...,484.0,49.0,29.0,6,13,1,20,0.216,48,321
Game Outcome for Pitcher,in Sv Situ,763,3267.0,3031,192.0,620,90,8,40,36,12,...,846.0,57.0,36.0,18,21,19,29,0.258,91,482
Game Outcome for Pitcher,in non-Sv,342,1603.0,1479,113.0,314,40,6,23,32,7,...,435.0,29.0,8.0,8,2,22,26,0.268,103,245
Pitching Role,as Starter,10,233.0,209,35.0,64,11,1,8,2,2,...,101.0,2.0,2.0,0,2,0,1,0.339,206,44
Pitching Role,as Reliever,1105,4870.0,4510,305.0,934,130,14,63,68,19,...,1281.0,86.0,44.0,26,23,41,55,0.261,95,727


In [40]:
# Plate appearances Rivera faced in save situations and in non-save situations.
rivera_split_table = pitching_split_stats_Mariano_Rivera[0]
outcome_split_type = 'Game Outcome for Pitcher'
in_sv_situ_PA = rivera_split_table.loc[(outcome_split_type, 'in Sv Situ'), 'PA']
in_non_sv_PA = rivera_split_table.loc[(outcome_split_type, 'in non-Sv'), 'PA']

# Both lookups are expected to be plain floats.
for pa_value in (in_sv_situ_PA, in_non_sv_PA):
    assert isinstance(pa_value, float)

# Sum of the two situation buckets.
both_in_sv_non_sv_combined = in_sv_situ_PA + in_non_sv_PA
print(both_in_sv_non_sv_combined)


4870.0


In [43]:
# Compare Rivera's career PA total with his PA as a reliever.
rivera_split_table = pitching_split_stats_Mariano_Rivera[0]

total_career_PA = rivera_split_table.loc[('Season Totals', 'Career Totals'), 'PA']
print('Total career PA: {}'.format(total_career_PA))

as_reliever_PA = rivera_split_table.loc[('Pitching Role', 'as Reliever'), 'PA']
print('Total career PA as reliever: {}'.format(as_reliever_PA))


Total career PA: 5103.0
Total career PA as reliever: 4870.0


### 假設： both_in_sv_non_sv_combined 與 as_reliever_PA 應該要對得起來
需要再一個投手來驗證

In [46]:
# Archie Bradley has both started games and pitched in relief without being a
# closer, which makes him a good second case for checking the hypothesis.
pitching_split_stats_Archie_Bradley = get_splits('bradlar01', pitching_splits=True)

# Keep only the rows whose 'Split Type' is 'Game Outcome for Pitcher',
# 'Season Totals' or 'Pitching Role'.
bradley_split_table = pitching_split_stats_Archie_Bradley[0]
bradley_rows_of_interest = bradley_split_table.index.get_level_values('Split Type').isin(
    ['Game Outcome for Pitcher', 'Season Totals', 'Pitching Role']
)
outcome_split_stats_Archie_Bradley = bradley_split_table.loc[bradley_rows_of_interest, :]
outcome_split_stats_Archie_Bradley




  commentsoup = bs.BeautifulSoup(comment[i], 'lxml')


Unnamed: 0_level_0,Unnamed: 1_level_0,G,PA,AB,R,H,2B,3B,HR,SB,CS,...,TB,GDP,HBP,SH,SF,IBB,ROE,BAbip,tOPS+,1B
Split Type,Split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Season Totals,Career Totals,329,2077.0,1840,224.0,459,99,12,44,33,8,...,714.0,46.0,20.0,7,12,16,16,0.313,100,304
Season Totals,Last 365 days,13,43.0,38,6.0,9,2,0,0,3,0,...,11.0,2.0,0.0,0,0,0,1,0.3,74,7
Game Outcome for Pitcher,in Wins,30,364.0,328,23.0,69,16,1,4,2,2,...,99.0,10.0,4.0,0,3,0,3,0.271,63,48
Game Outcome for Pitcher,in Losses,29,373.0,315,97.0,114,23,3,22,12,1,...,209.0,5.0,7.0,2,2,6,4,0.42,209,66
Game Outcome for Pitcher,in No Dec.,238,1208.0,1078,95.0,252,52,8,16,18,5,...,368.0,26.0,8.0,5,6,10,9,0.303,82,176
Game Outcome for Pitcher,in Saves,32,132.0,119,9.0,24,8,0,2,1,0,...,38.0,5.0,1.0,0,1,0,0,0.247,66,14
Game Outcome for Pitcher,in Sv Situ,142,610.0,546,58.0,129,25,1,13,8,1,...,195.0,18.0,4.0,4,4,1,3,0.304,85,90
Game Outcome for Pitcher,in non-Sv,152,657.0,592,55.0,135,28,6,12,8,1,...,211.0,15.0,10.0,0,0,6,8,0.291,85,89
Pitching Role,as Starter,35,810.0,702,111.0,195,46,5,19,17,6,...,308.0,13.0,6.0,3,8,9,5,0.337,124,125
Pitching Role,as Reliever,294,1267.0,1138,113.0,264,53,7,25,16,2,...,406.0,33.0,14.0,4,4,7,11,0.297,85,179


In [48]:
# Repeat the check for Archie Bradley: does save-situation PA plus non-save PA
# equal his PA as a reliever?
bradley_split_table = pitching_split_stats_Archie_Bradley[0]

in_sv_situ_PA_Archie_Bradley = bradley_split_table.loc[('Game Outcome for Pitcher', 'in Sv Situ'), 'PA']
in_non_sv_PA_Archie_Bradley = bradley_split_table.loc[('Game Outcome for Pitcher', 'in non-Sv'), 'PA']

# Both lookups are expected to be plain floats.
assert isinstance(in_sv_situ_PA_Archie_Bradley, float)
assert isinstance(in_non_sv_PA_Archie_Bradley, float)

both_in_sv_non_sv_combined_Archie_Bradley = in_sv_situ_PA_Archie_Bradley + in_non_sv_PA_Archie_Bradley

total_career_PA_Archie_Bradley = bradley_split_table.loc[('Season Totals', 'Career Totals'), 'PA']
as_reliever_PA_Archie_Bradley = bradley_split_table.loc[('Pitching Role', 'as Reliever'), 'PA']

print('Total career PA: {}'.format(total_career_PA_Archie_Bradley))

# Report whether the two situation buckets add up to the relief PA total.
pa_totals_match = both_in_sv_non_sv_combined_Archie_Bradley == as_reliever_PA_Archie_Bradley
print(
    "Archie Bradley's total plate appearances as a reliever, including both in saves and non-save situations, is equal to the sum of those two situations"
    if pa_totals_match
    else 'not equal'
)




Total career PA: 2077.0
Archie Bradley's total plate appearances as a reliever, including both in saves and non-save situations, is equal to the sum of those two situations


### 小結：`in Sv Situ` 與 `in non-Sv` 的 PA 加總等於 `as Reliever` 的 PA，也就是說這兩個分類只涵蓋後援出賽（先發出賽不計入）

### 以下看看 team-level 的投球資料

In [None]:
from pybaseball import team_game_logs, schedule_and_record

# Per-game team pitching logs for the 2019 Atlanta Braves ("ATL").
team_pitching_logs_2019_ATL = team_game_logs(2019, "ATL", "pitching")
team_pitching_logs_2019_ATL


Unnamed: 0,Game,Date,Home,Opp,Rslt,IP,H,R,ER,UER,...,2B,3B,IBB,SH,SF,ROE,GDP,NumPlayers,Umpire,PitchersUsed
0,1,Mar 28,False,PHI,"L,4-10",8.0,7,10,10,0,...,0,0,2,0,0,0,0,5,Mike Winters,"J.Teheran (99-52-L), S.Carle (99), W.Parsons (..."
1,2,Mar 30,False,PHI,"L,6-8",8.0,9,8,8,0,...,2,1,1,0,0,0,0,6,Tim Timmons,"B.Wilson (99-33), W.Parsons (1-L), J.Venters (..."
2,3,Mar 31,False,PHI,"L,1-5",8.0,5,5,5,0,...,1,0,0,0,1,0,1,4,Rob Drake,"K.Wright (99-46-L), M.Fried (2), S.Carle (2), ..."
3,4,Apr 1,True,CHC,"W,8-0",9.0,9,0,0,0,...,3,0,0,0,0,1,3,5,Gary Cederstrom,"S.Newcomb (99-49), W.Parsons (1-W), J.Biddle (..."
4,5,Apr 3,True,CHC,"W,6-4",9.0,12,4,4,0,...,0,0,0,0,0,0,1,6,Marvin Hudson,"J.Teheran (5-55), J.Venters (3-BSv), W.Parsons..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,158,Sep 24,False,KCR,"L,6-9",8.0,12,9,9,0,...,7,0,0,0,2,0,0,7,Marvin Hudson,"J.Teheran (5-21-L), G.Dayton (6), A.Swarzak (2..."
158,159,Sep 25,False,KCR,"W,10-2",9.0,8,2,2,0,...,1,1,0,0,0,0,0,7,Ryan Blakney,"J.Tomlin (4-50), S.Newcomb (3), D.O'Day (0), L..."
159,160,Sep 27,False,NYM,"L,2-4",8.0,9,4,4,0,...,3,0,0,0,0,0,1,4,Mark Wegner,"D.Keuchel (4-37-L), A.Swarzak (2), S.Greene (1..."
160,161,Sep 28,False,NYM,"L,0-3",8.0,4,3,3,0,...,0,0,0,0,0,0,1,3,Stu Scheurwater,"M.Foltynewicz (7-48-L), M.Fried (6), C.Martin (3)"


In [None]:
# Game-by-game schedule and results for the 2019 Atlanta Braves ("ATL").
atlanta_braves_2019_schedule_and_record = schedule_and_record(2019, "ATL")
atlanta_braves_2019_schedule_and_record

Unnamed: 0,Date,Tm,Home_Away,Opp,W/L,R,RA,Inn,W-L,Rank,GB,Win,Loss,Save,Time,D/N,Attendance,cLI,Streak,Orig. Scheduled
1,"Thursday, Mar 28",ATL,@,PHI,L,4.0,10.0,9.0,0-1,3.0,1.0,Nola,Teheran,,3:04,D,44469.0,1.08,-1,
2,"Saturday, Mar 30",ATL,@,PHI,L,6.0,8.0,9.0,0-2,4.0,2.0,Morgan,Parsons,,3:27,D,44597.0,1.07,-2,
3,"Sunday, Mar 31",ATL,@,PHI,L,1.0,5.0,9.0,0-3,5.0,3.0,Arrieta,Wright,,3:17,N,41410.0,1.01,-3,
4,"Monday, Apr 1",ATL,Home,CHC,W,8.0,0.0,9.0,1-3,5.0,2.5,Parsons,Hendricks,,3:12,N,41912.0,.80,1,
5,"Wednesday, Apr 3",ATL,Home,CHC,W,6.0,4.0,9.0,2-3,3.0,2.5,Jackson,Cishek,Vizcaíno,3:32,N,37398.0,.87,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,"Tuesday, Sep 24",ATL,@,KCR,L,6.0,9.0,9.0,96-62,1.0,up 7.5,Duffy,Teheran,,3:28,N,16688.0,.07,-2,
159,"Wednesday, Sep 25",ATL,@,KCR,W,10.0,2.0,9.0,97-62,1.0,up 7.5,Jackson,Barnes,,3:31,N,16931.0,.06,1,
160,"Friday, Sep 27",ATL,@,NYM,L,2.0,4.0,9.0,97-63,1.0,up 6.0,Stroman,Keuchel,Lugo,3:09,N,26264.0,.06,-1,
161,"Saturday, Sep 28",ATL,@,NYM,L,0.0,3.0,9.0,97-64,1.0,up 5.0,Matz,Foltynewicz,Díaz,2:33,N,32210.0,.05,-2,


In [None]:
# Column labels of the schedule-and-record table.
atlanta_braves_2019_schedule_and_record.columns

Index(['Date', 'Tm', 'Home_Away', 'Opp', 'W/L', 'R', 'RA', 'Inn', 'W-L',
       'Rank', 'GB', 'Win', 'Loss', 'Save', 'Time', 'D/N', 'Attendance', 'cLI',
       'Streak', 'Orig. Scheduled'],
      dtype='object')

In [None]:
# Column labels of the team pitching game-log table.
team_pitching_logs_2019_ATL.columns

Index(['Game', 'Date', 'Home', 'Opp', 'Rslt', 'IP', 'H', 'R', 'ER', 'UER',
       'BB', 'SO', 'HR', 'HBP', 'ERA', 'BF', 'Pit', 'Str', 'IR', 'IS', 'SB',
       'CS', 'AB', '2B', '3B', 'IBB', 'SH', 'SF', 'ROE', 'GDP', 'NumPlayers',
       'Umpire', 'PitchersUsed'],
      dtype='object')

In [None]:
# First row of the schedule-and-record table, shown field by field.
atlanta_braves_2019_schedule_and_record.iloc[0]

Date               Thursday, Mar 28
Tm                              ATL
Home_Away                         @
Opp                             PHI
W/L                               L
R                               4.0
RA                             10.0
Inn                             9.0
W-L                             0-1
Rank                            3.0
GB                              1.0
Win                            Nola
Loss                        Teheran
Save                           None
Time                           3:04
D/N                               D
Attendance                  44469.0
cLI                            1.08
Streak                           -1
Orig. Scheduled                None
Name: 1, dtype: object