In [2]:
import pandas as pd

# Locations of the labelled at-bat table and the labelled pitch table.
abs_path = "./labeled/50_abs_jose_ramos_20240308.csv"
pitches_path = "./labeled/95_pitches_jose_ramos_20240308.csv"

# Read both CSVs in one pass (at-bats first, then pitches).
abs_df, pitches_df = (pd.read_csv(csv_path) for csv_path in (abs_path, pitches_path))

abs_df.head(1)

Unnamed: 0,abs_id,file_name,file_name_date,file_name_player_name,file_name_pitcher_type,file_name_at_bat_number,file_name_result,is_filename_result_consistent,inning,at_bat_result,weather,game_time
0,1,2022-09-10 Jose Ramos VS 右投 打席2_二飛.mp4,2022-09-10,Jose Ramos,RHP,2,二飛,True,4,4F,sunny,day


In [3]:
# Show the first row of the pitch table to see which columns it carries.
pitches_df.head(1)

Unnamed: 0,id,file_name,file_name_date,file_name_player_name,file_name_pitcher_type,file_name_at_bat_number,file_name_result,inning,abs_id,nth_pitch_ab,...,score_diffrence,is_team_leading,weather,game_time,has_swing_intention,horizontal_ending,vertical_ending,is_swing_delayed,is_swing_early,is_whiff
0,4e1e4436-2ba9-41a0-b88a-5cc4641edcbe,2022-09-10 Jose Ramos VS 右投 打席2_二飛.mp4,2022-09-10,Jose Ramos,RHP,2.0,二飛,4.0,1,1,...,1,True,sunny,day,False,outside,low,False,False,False


### 確認欄位資訊 & 清理

In [4]:
# Report the dimensions of both tables.
abs_rows, abs_cols = abs_df.shape
pitch_rows, pitch_cols = pitches_df.shape
print(f"abs_df row count: {abs_rows}, column count: {abs_cols}")
print(f"pitches_df row count: {pitch_rows}, column count: {pitch_cols}")


abs_df row count: 50, column count: 12
pitches_df row count: 93, column count: 36


In [5]:
# List every column of the pitch table before deciding what to drop.
pitches_df.columns


Index(['id', 'file_name', 'file_name_date', 'file_name_player_name',
       'file_name_pitcher_type', 'file_name_at_bat_number', 'file_name_result',
       'inning', 'abs_id', 'nth_pitch_ab', 'out_count', 'strike_count',
       'ball_count', 'is_rhp', 'at_bat_result',
       'is_filename_result_consistent', 'has_stealing_attempt', 'pitch_type',
       'pitch_release', 'is_obvious_off_zone', 'is_pitch_ended_catcher_want',
       'has_swing', 'has_first_base_runner', 'has_second_base_runner',
       'has_third_base_runner', 'has_visible_shift', 'score_diffrence',
       'is_team_leading', 'weather', 'game_time', 'has_swing_intention',
       'horizontal_ending', 'vertical_ending', 'is_swing_delayed',
       'is_swing_early', 'is_whiff'],
      dtype='object')

In [22]:
# List every column of the at-bat table.
abs_df.columns

Index(['abs_id', 'file_name', 'file_name_date', 'file_name_player_name',
       'file_name_pitcher_type', 'file_name_at_bat_number', 'file_name_result',
       'is_filename_result_consistent', 'inning', 'at_bat_result', 'weather',
       'game_time'],
      dtype='object')

In [20]:
# Drop the pitch-table columns that are not needed here:
# abs_df already holds the correct, more detailed version of each of them.
columns_to_drop = [
    'file_name', 'file_name_date', 'file_name_player_name',
    'file_name_pitcher_type', 'file_name_at_bat_number', 'file_name_result',
    'inning', 'is_filename_result_consistent', 'at_bat_result', 'game_time',  'weather'
]

# Size of the pitch-by-pitch data after the cleanup.
# `columns=` already selects the column axis, so the redundant `axis=1` is omitted.
pitches_df_cleaned = pitches_df.drop(columns=columns_to_drop)
cleaned_rows, cleaned_cols = pitches_df_cleaned.shape
print(f"Columns count: {cleaned_cols}")
print(f"row count: {cleaned_rows}")
# DataFrame.size is rows * columns.
print(f"cell count: {pitches_df_cleaned.size}")

pitches_df_cleaned.columns


Columns count: 25
row count: 93
cell count: 2325


Index(['id', 'abs_id', 'nth_pitch_ab', 'out_count', 'strike_count',
       'ball_count', 'is_rhp', 'has_stealing_attempt', 'pitch_type',
       'pitch_release', 'is_obvious_off_zone', 'is_pitch_ended_catcher_want',
       'has_swing', 'has_first_base_runner', 'has_second_base_runner',
       'has_third_base_runner', 'has_visible_shift', 'score_diffrence',
       'is_team_leading', 'has_swing_intention', 'horizontal_ending',
       'vertical_ending', 'is_swing_delayed', 'is_swing_early', 'is_whiff'],
      dtype='object')

In [21]:
# Number of cells (rows x columns) in the at-bat table.
print(f"abs_df cell counts: {abs_df.size}")

abs_df row count: 50, column count: 12
abs_df cell counts: 600


In [7]:
# Check that each categorical pitch field only contains the preset values.
fields_to_validate = ['vertical_ending', 'horizontal_ending', 'is_rhp', 'pitch_type', 'pitch_release']

for field in fields_to_validate:
    observed_values = pitches_df_cleaned[field].unique()
    print(f"{field} unique value: {observed_values}")


vertical_ending unique value: ['low' 'high' 'middle']
horizontal_ending unique value: ['outside' 'middle' 'inside']
is_rhp unique value: [ True False]
pitch_type unique value: ['slider' 'four_seam' 'sinker' 'curve' 'changeup' 'cutter']
pitch_release unique value: ['overhand' 'side_arm']


In [60]:
import os

output_file_path = './jose_ramos_pitches_df_cleaned.csv'

# Write the cleaned pitch table only once; an existing file is left untouched.
if os.path.exists(output_file_path):
    print("File already exists.")
else:
    pitches_df_cleaned.to_csv(output_file_path, index=False)
    print(f"File saved: {output_file_path}")

File saved: ./jose_ramos_pitches_df_cleaned.csv


### 完成初步清理

### 影片分類邏輯 & 基礎數據



In [65]:
# Re-list the at-bat columns before deriving the result groups.
abs_df.columns

Index(['abs_id', 'file_name', 'file_name_date', 'file_name_player_name',
       'file_name_pitcher_type', 'file_name_at_bat_number', 'file_name_result',
       'is_filename_result_consistent', 'inning', 'at_bat_result', 'weather',
       'game_time'],
      dtype='object')

In [80]:
# Enumerate the distinct result codes that the grouping rules must cover.
abs_df['at_bat_result'].unique()


array(['4F', 'SK', '6G', '7_HR', 'CK', '7_1B', '5G', '7F', 'BB', '7_2B',
       '6_1B', '9F', '8_2B', '1G', '4_1B', '8F', 'DB', '8_1B'],
      dtype=object)

In [81]:
def categorize_at_bat_result(result):
    """Map a raw at-bat result code to a coarse outcome group.

    Rules are checked in this order:
      * contains '_'          -> '安打' (hit), e.g. '7_1B', '7_HR'
      * is 'DB' or 'BB'       -> '保送' (walk group)
      * ends with 'F'         -> '飛球' (fly ball)
      * ends with 'G'         -> '滾地球' (ground ball)
      * contains 'K'          -> '三振' (strikeout)
      * anything else         -> '其他' (other)

    Args:
        result: The result code. Expected to be a string; any non-string
            value (for example NaN or None from a blank CSV cell) is
            assigned to '其他'.

    Returns:
        The group label as a string.
    """
    # A missing label is not a string and would raise TypeError on the
    # substring checks below; send it to the fallback group so it shows up
    # when rows labelled '其他' are inspected.
    if not isinstance(result, str):
        return '其他'

    if '_' in result:
        return '安打'
    if result in ('DB', 'BB'):
        return '保送'
    if result.endswith('F'):
        return '飛球'
    if result.endswith('G'):
        return '滾地球'
    if 'K' in result:
        return '三振'
    return '其他'

# Tag every at-bat with its outcome group.
abs_df['at_bat_result_group'] = abs_df['at_bat_result'].apply(categorize_at_bat_result)

# Sanity check: show any row that landed in the fallback group.
is_uncategorized = abs_df['at_bat_result_group'] == '其他'
print(abs_df.loc[is_uncategorized, ['at_bat_result', 'at_bat_result_group']])


Empty DataFrame
Columns: [at_bat_result, at_bat_result_group]
Index: []


In [100]:
# Share of each outcome group, in percent.
group_share = abs_df['at_bat_result_group'].value_counts(normalize=True)
group_counts = group_share.mul(100)

print(group_counts)

at_bat_result_group
三振     34.0
安打     20.0
飛球     18.0
滾地球    18.0
保送     10.0
Name: proportion, dtype: float64


In [117]:
# Compute total bases.
def calculate_tb(result):
    """Return the number of bases credited for one at-bat result code.

    The code is searched for the markers '1B', '2B', '3B' and 'HR', in that
    order; the first marker found decides the value (1, 2, 3 or 4).
    A code containing none of them yields 0.
    """
    bases_by_marker = (('1B', 1), ('2B', 2), ('3B', 3), ('HR', 4))
    for marker, bases in bases_by_marker:
        if marker in result:
            return bases
    return 0

# Bases credited per at-bat, then the total over the whole table.
abs_df['total_bases'] = abs_df['at_bat_result'].map(calculate_tb)
total_bases = abs_df['total_bases'].sum()

print(f"Total bases from hits: {total_bases}")


Total bases from hits: 18


In [118]:
# Count rows in the ground-ball and fly-ball groups.
outcome_group = abs_df['at_bat_result_group']
ground_outs = int((outcome_group == '滾地球').sum())
fly_outs = int((outcome_group == '飛球').sum())

# Ground-out / fly-out ratio; reported as 0 when there are no fly balls.
if fly_outs > 0:
    go_ao_ratio = ground_outs / fly_outs
else:
    go_ao_ratio = 0
print(f"GO/AO ratio: {go_ao_ratio:.2f}")


GO/AO ratio: 1.00


In [141]:
# Overall batting line.
# NOTE(review): `at_bats` is the number of rows in abs_df, which includes the
# rows grouped as '保送' (walks). AVG and SLG are therefore computed per row,
# not per official at-bat — confirm this is the intended definition.
outcome_group = abs_df['at_bat_result_group']
hits = int((outcome_group == '安打').sum())
walks = int((outcome_group == '保送').sum())
at_bats = len(abs_df)

overall_avg = hits / at_bats
overall_obp = (hits + walks) / at_bats
overall_slg = total_bases / at_bats
overall_ops = overall_obp + overall_slg

print(f"AVG: {overall_avg:.3f}, OBP: {overall_obp:.3f}, SLG: {overall_slg:.3f}, OPS: {overall_ops:.3f}")


AVG: 0.200, OBP: 0.300, SLG: 0.360, OPS: 0.660


In [142]:
# Split the batting line by situation.

def calculate_different_situation_traditional_stats(group, exclude_walks_from_at_bats=False):
    """Compute AVG / OBP / SLG / OPS for one group of at-bat rows.

    Args:
        group: DataFrame with an 'at_bat_result_group' column (rows labelled
            '安打' count as hits, rows labelled '保送' count as walks) and a
            numeric 'total_bases' column.
        exclude_walks_from_at_bats: When False (default) every row is used as
            the denominator of AVG and SLG, which reproduces the numbers this
            notebook has always reported. When True, rows labelled '保送' are
            removed from that denominator, giving the conventional
            hits-per-at-bat AVG and bases-per-at-bat SLG. OBP always uses
            every row. With ``groupby(...).apply`` pass it as a keyword:
            ``.apply(calculate_different_situation_traditional_stats,
            exclude_walks_from_at_bats=True)``.

    Returns:
        pd.Series indexed by 'AVG', 'OBP', 'SLG', 'OPS'. Any rate whose
        denominator is 0 is reported as 0.
    """
    outcome_group = group['at_bat_result_group']
    hits = int((outcome_group == '安打').sum())
    walks = int((outcome_group == '保送').sum())
    plate_appearances = len(group)
    total_bases = group['total_bases'].sum()

    # Denominator for AVG and SLG; see the flag description above.
    if exclude_walks_from_at_bats:
        at_bats = plate_appearances - walks
    else:
        at_bats = plate_appearances

    avg = hits / at_bats if at_bats > 0 else 0
    obp = (hits + walks) / plate_appearances if plate_appearances > 0 else 0
    slg = total_bases / at_bats if at_bats > 0 else 0
    ops = slg + obp

    return pd.Series({'AVG': avg, 'OBP': obp, 'SLG': slg, 'OPS': ops})


# One stats table per split column: pitcher hand, game time, weather.
stats_by_pitcher_type, stats_by_game_time, stats_by_weather = (
    abs_df.groupby(split_column).apply(calculate_different_situation_traditional_stats)
    for split_column in ('file_name_pitcher_type', 'game_time', 'weather')
)

print("根據左右投:")
print(stats_by_pitcher_type)
print("\n根據比賽時間不同:")
print(stats_by_game_time)
print("\n根據天氣不同:")
print(stats_by_weather)


根據左右投:
                             AVG       OBP       SLG       OPS
file_name_pitcher_type                                        
LHP                     0.058824  0.176471  0.235294  0.411765
RHP                     0.272727  0.363636  0.424242  0.787879

根據比賽時間不同:
                AVG       OBP       SLG       OPS
game_time                                        
day        0.178571  0.285714  0.285714  0.571429
night      0.238095  0.333333  0.476190  0.809524
unknown    0.000000  0.000000  0.000000  0.000000

根據天氣不同:
              AVG       OBP       SLG       OPS
weather                                        
cloudy   0.285714  0.285714  0.285714  0.571429
sunny    0.190476  0.309524  0.380952  0.690476
unknown  0.000000  0.000000  0.000000  0.000000


In [136]:
# Is the LHP split really worse? Count the rows behind each pitcher-hand group.
at_bats_by_pitcher_type = abs_df.groupby('file_name_pitcher_type').size()

print("abs according to LHP/RHP:")
print(at_bats_by_pitcher_type)


abs according to LHP/RHP:
file_name_pitcher_type
LHP    17
RHP    33
dtype: int64


In [145]:
# Difference between each pitcher-hand split and the overall line.
overall_line = {
    'AVG': overall_avg,
    'OBP': overall_obp,
    'SLG': overall_slg,
    'OPS': overall_ops,
}
for metric, overall_value in overall_line.items():
    stats_by_pitcher_type[f'{metric}_Diff'] = stats_by_pitcher_type[metric] - overall_value

print("左右投與整體的差距:")
print(stats_by_pitcher_type[[f'{metric}_Diff' for metric in overall_line]])


左右投與整體的差距:
                        AVG_Diff  OBP_Diff  SLG_Diff  OPS_Diff
file_name_pitcher_type                                        
LHP                    -0.141176 -0.123529 -0.124706 -0.248235
RHP                     0.072727  0.063636  0.064242  0.127879


In [137]:
# Count the rows behind each weather value (sample size for the weather split).
at_bats_by_weather = abs_df.groupby('weather').size()

print("abs according to weather:")
print(at_bats_by_weather)


abs according to weather:
weather
cloudy      7
sunny      42
unknown     1
dtype: int64


### 結論
1. 對左投打不好，OPS 比對右投低約 0.38（0.412 對 0.788）
2. 夜間比賽表現較好，SLG 約 0.48，相較白天只有 0.29
#### 次要結論：
1. 天氣陰陰的較容易出現安打，但整體攻擊水準沒有提升
2. 極高三振率，但可能是因為小樣本誤差


### 觀察：第一球似乎不愛出棒

In [133]:
# Join every pitch row with its at-bat row.
combined_df = pitches_df_cleaned.merge(abs_df, on='abs_id', how='left')

# Row label of the lowest-numbered pitch inside each at-bat.
earliest_pitch_labels = combined_df.groupby('abs_id')['nth_pitch_ab'].idxmin()
first_pitches_df = combined_df.loc[earliest_pitch_labels]

print(f"original row counts: {first_pitches_df.shape[0]}")

# Some at-bats do not start from pitch 1, so keep only genuine first pitches.
is_first_pitch = first_pitches_df['nth_pitch_ab'] == 1
first_pitches_df = first_pitches_df[is_first_pitch]

print(f"latter row counts: {first_pitches_df.shape[0]}")

original row counts: 26
latter row counts: 23


In [121]:
# Overall swing-intention rate, swing rate, and whiff rate.
overall_swing_intention_rate = pitches_df_cleaned['has_swing_intention'].mean()
overall_swing_rate = pitches_df_cleaned['has_swing'].mean()

# The whiff rate is measured over pitches that were actually swung at.
swung_mask = pitches_df_cleaned['has_swing'] == True
swung_pitches = pitches_df_cleaned[swung_mask]
overall_whiff_rate = swung_pitches['is_whiff'].mean()

print(f"overall swing intention: {overall_swing_intention_rate:.2f}")
print(f"overall swing: {overall_swing_rate:.2f}")
print(f"overall whiff rate: {overall_whiff_rate:.2f}")

overall swing intention: 0.58
overall swing: 0.52
overall whiff rate: 0.33


#### 是相對愛出棒的打者

以 Juan Soto 為例：
```
Juan Soto, 2021:

35.0% swing rate, lowest in @MLB (min 1750 pitches seen)
```
https://twitter.com/SlangsOnSports/status/1508600071130132485
來源

In [122]:
# Swing behaviour on the first pitch of an at-bat.
first_ball_swing_intention_rate = first_pitches_df['has_swing_intention'].mean()
first_ball_swing_rate = first_pitches_df['has_swing'].mean()

# The whiff rate is measured over first pitches that were swung at.
first_pitch_swung = first_pitches_df['has_swing'] == True
first_pitch_swings = first_pitches_df[first_pitch_swung]
first_ball_whiff_rate = first_pitch_swings['is_whiff'].mean()

print(f"Intention to swing on first pitch: {first_ball_swing_intention_rate:.2f}")
print(f"Swing rate on the first pitch: {first_ball_swing_rate:.2f}")
print(f"Whiff rate on the first pitch when swinging: {first_ball_whiff_rate:.2f}")


Intention to swing on first pitch: 0.43
Swing rate on the first pitch: 0.39
Whiff rate on the first pitch when swinging: 0.33


### 結論
第一球的揮棒意圖比整體低了 15 個百分點（0.43 對 0.58），實際揮棒率也降低了 13 個百分點（0.39 對 0.52），說明比較容易看第一球。

喜歡先等一球，如果有機會再精細分析球種以及不同投手的比較。

### 觀察：壘上有人較積極?
亦即出棒意圖，比壘上沒人的時候還要多

In [123]:
# Add a flag that is True when at least one base is occupied.
runner_columns = ['has_first_base_runner', 'has_second_base_runner', 'has_third_base_runner']
pitches_df_cleaned['any_on_base'] = pitches_df_cleaned[runner_columns].any(axis=1)

pitches_df_cleaned['any_on_base'].head()

0    False
1    False
2     True
3     True
4     True
Name: any_on_base, dtype: bool

In [124]:
# Swing-intention rate with runners on base versus bases empty.
intention_by_runner_state = pitches_df_cleaned.groupby('any_on_base')['has_swing_intention']
swing_intention_rates = intention_by_runner_state.mean()

swing_intention_rates

any_on_base
False    0.509804
True     0.666667
Name: has_swing_intention, dtype: float64

In [125]:
# Encode the eight possible base-occupancy states.
def label_bases(row):
    """Return a three-character '0'/'1' code for first, second and third base.

    Each of the row's has_first_base_runner, has_second_base_runner and
    has_third_base_runner values is converted with int() and the three
    digits are concatenated in that order, e.g. '101'.
    """
    runner_flags = ('has_first_base_runner', 'has_second_base_runner', 'has_third_base_runner')
    return ''.join(str(int(row[flag])) for flag in runner_flags)

# TODO: produce a more readable label than the 0/1 code; sketch kept below.
# def label_bases(row):
#     bases = ['一', '二', '三']
#     runners = [row['has_first_base_runner'], row['has_second_base_runner'], row['has_third_base_runner']]
#     description = ''.join([bases[i] for i, runner in enumerate(runners) if runner])
#     return f"{description}壘有人" if description else "壘上無人"

# Label every pitch with its base-occupancy code.
pitches_df_cleaned['base_combination'] = pitches_df_cleaned.apply(label_bases, axis=1)

# Swing-intention rate for each base-occupancy code.
intention_by_base_state = pitches_df_cleaned.groupby('base_combination')['has_swing_intention']
swing_intention_rates_by_combination = intention_by_base_state.mean()

print(swing_intention_rates_by_combination)


base_combination
000    0.509804
001    0.166667
010    0.666667
011    1.000000
100    0.631579
101    0.750000
110    1.000000
111    1.000000
Name: has_swing_intention, dtype: float64


### 結論
壘上有人的時候較積極，整體出棒意圖大概多出 15%。
二三壘有人以及滿壘，為小樣本數，印象中僅一個打席。

### 觀察：喜歡揮高球且容易揮空？

In [126]:
# Pitches that ended high and were flagged as obviously off the zone.
ended_high = pitches_df_cleaned['vertical_ending'] == 'high'
obviously_off_zone = pitches_df_cleaned['is_obvious_off_zone'] == True
high_off_zone_pitch_df = pitches_df_cleaned[ended_high & obviously_off_zone]

# Swing-intention rate on those pitches.
high_pitch_swing_intention_rate = high_off_zone_pitch_df['has_swing_intention'].mean()

# Whiff rate, measured over the swings taken at those pitches.
swung_at_high = high_off_zone_pitch_df['has_swing'] == True
high_off_zone_swings = high_off_zone_pitch_df[swung_at_high]
high_pitch_whiff_rate = high_off_zone_swings['is_whiff'].mean()

print(f"釣高球時的出棒意圖比例: {high_pitch_swing_intention_rate:.2f}")
print(f"釣高球時的揮空比例: {high_pitch_whiff_rate:.2f}")


釣高球時的出棒意圖比例: 0.35
釣高球時的揮空比例: 0.40


但此數據跟看影片時的印象不一樣，因此進一步根據內外角去分析

In [127]:
# Swing-intention rate on high off-zone pitches, split by horizontal location.
swing_intention_rates_by_position = high_off_zone_pitch_df.groupby([ 'horizontal_ending'])['has_swing_intention'].mean()

print("釣高球不同橫向位置的出棒意圖：")
print(swing_intention_rates_by_position)

釣高球不同橫向位置的出棒意圖：
horizontal_ending
inside     0.000000
middle     0.666667
outside    0.333333
Name: has_swing_intention, dtype: float64


In [128]:
# Whiff rate on swings at high off-zone pitches, split by location.
swung_at_high = high_off_zone_pitch_df['has_swing'] == True
high_off_zone_swings = high_off_zone_pitch_df[swung_at_high]

whiff_by_location = high_off_zone_swings.groupby(['vertical_ending', 'horizontal_ending'])['is_whiff']
whiff_rates_by_position = whiff_by_location.mean()

print("\n不同橫向位置釣高球揮空比例：")
print(whiff_rates_by_position)



不同橫向位置釣高球揮空比例：
vertical_ending  horizontal_ending
high             middle               0.5
                 outside              0.0
Name: is_whiff, dtype: float64


釣高球要投在中間的位置最有效

In [129]:
# Same breakdown keys for both tables: location, pitch type and strike count.
detail_keys = ['vertical_ending', 'horizontal_ending', 'pitch_type', 'strike_count']

# Swing-intention rate on all high off-zone pitches.
high_off_zone_intension_detailed = (
    high_off_zone_pitch_df.groupby(detail_keys)['has_swing_intention'].mean()
)

print("釣高球在不同橫向位置，在不同球數及球種的出棒意圖")
print(high_off_zone_intension_detailed)

# Whiff rate on the swings taken at those pitches.
high_off_zone_whiff_detailed = (
    high_off_zone_swings.groupby(detail_keys)['is_whiff'].mean()
)

print("\n釣高球在不同橫向位置，球種以及球數，揮空統計")
print(high_off_zone_whiff_detailed)

釣高球在不同橫向位置，在不同球數及球種的出棒意圖
vertical_ending  horizontal_ending  pitch_type  strike_count
high             inside             changeup    1               0.000000
                                    four_seam   0               0.000000
                                                2               0.000000
                                    sinker      2               0.000000
                                    slider      0               0.000000
                 middle             four_seam   0               1.000000
                                                1               0.333333
                                                2               1.000000
                 outside            curve       1               0.000000
                                    cutter      2               0.000000
                                    four_seam   1               0.000000
                                                2               0.500000
                                    sl

In [130]:
# TODO: 先不要這一段
# # 計算差距
# middle_swing_intention = high_off_zone_intension_detailed.xs('middle', level='horizontal_ending')
# middle_whiff_rate = high_off_zone_whiff_detailed.xs('middle', level='horizontal_ending')

# middle_swing_intention_avg = middle_swing_intention.mean()
# middle_whiff_rate_avg = middle_whiff_rate.mean()

# swing_intention_diff = middle_swing_intention_avg - overall_swing_intention_rate
# whiff_rate_diff = middle_whiff_rate_avg - overall_whiff_rate

# swing_intention_diff_pct = (swing_intention_diff / overall_swing_intention_rate) * 100
# whiff_rate_diff_pct = (whiff_rate_diff / overall_whiff_rate) * 100

# print(f"中間位置揮棒意圖跟整體的差異: {swing_intention_diff_pct:.2f}%")
# print(f"中間位置揮空比例跟整體的差異: {whiff_rate_diff_pct:.2f}%")


#### 中間位置釣高球是有效的，但不能太外角
且不論球數是否領先都有一定效果

好球帶附近位置又是如何呢？

In [163]:
# By vertical location: swing-intention rate and whiff rate of the pitches
# that ended at that height.
# NOTE(review): the original comment speaks of pitches "near the strike zone",
# but the only filter applied is vertical_ending, so pitches flagged
# is_obvious_off_zone are included as well — confirm the intended population.
# TODO: visualise as a 3x3 grid
for vertical_position in ['high', 'middle', 'low']:
    specific_near_zone_df = pitches_df_cleaned[(pitches_df_cleaned['vertical_ending'] == vertical_position)]

    # Neutral names: this loop previously assigned the `rhp_*` variables that
    # the RHP breaking-ball cell also assigns, so re-running the loop would
    # overwrite that cell's results.
    position_swing_intention_rate = specific_near_zone_df['has_swing_intention'].mean()
    position_swings_df = specific_near_zone_df[specific_near_zone_df['has_swing'] == True]
    position_whiff_rate = position_swings_df['is_whiff'].mean()

    print(f"進壘點在 {vertical_position} 位 zone 時的出棒意圖比例: {position_swing_intention_rate:.2f}")
    print(f"進壘點在 {vertical_position} 位 zone 時的揮空比例: {position_whiff_rate:.2f}")


進壘點在 high 位 zone 時的出棒意圖比例: 0.54
進壘點在 high 位 zone 時的揮空比例: 0.39
進壘點在 middle 位 zone 時的出棒意圖比例: 0.79
進壘點在 middle 位 zone 時的揮空比例: 0.14
進壘點在 low 位 zone 時的出棒意圖比例: 0.51
進壘點在 low 位 zone 時的揮空比例: 0.44


喜歡中間高度的球，也不容易揮空

In [164]:
# By vertical location, restricted to pitches flagged as obviously off-zone.
# A height with no such pitches yields NaN for both rates.
for vertical_position in ['high', 'middle', 'low']:
    specific_off_zone_df = pitches_df_cleaned[(pitches_df_cleaned['vertical_ending'] == vertical_position) &
                                              (pitches_df_cleaned['is_obvious_off_zone'] == True)]

    # Neutral names: this loop previously assigned the `rhp_*` variables that
    # the RHP breaking-ball cell also assigns, so re-running the loop would
    # overwrite that cell's results.
    off_zone_swing_intention_rate = specific_off_zone_df['has_swing_intention'].mean()
    off_zone_swings_df = specific_off_zone_df[specific_off_zone_df['has_swing'] == True]
    off_zone_whiff_rate = off_zone_swings_df['is_whiff'].mean()

    print(f"進壘點在 {vertical_position} 位 offzone 時的出棒意圖比例: {off_zone_swing_intention_rate:.2f}")
    print(f"進壘點在 {vertical_position} 位 offzone 時的揮空比例: {off_zone_whiff_rate:.2f}")


進壘點在 high 位 offzone 時的出棒意圖比例: 0.35
進壘點在 high 位 offzone 時的揮空比例: 0.40
進壘點在 middle 位 offzone 時的出棒意圖比例: nan
進壘點在 middle 位 offzone 時的揮空比例: nan
進壘點在 low 位 offzone 時的出棒意圖比例: 0.53
進壘點在 low 位 offzone 時的揮空比例: 0.80


**外角 offzone 低球** 容易吸引出棒且揮空

但是出棒意圖需要進一步評估，需要驗證是否因為球種不同而有差

這次的集合沒有觀察到 Middle off zone 的情況
例如腰帶附近的觸身球

### 結論：中間位置釣高球是有效的
1. 但不能太外角，且不論球數是否領先都有一定效果
2. 喜歡打中間位置的球
3. 外角 offzone 低球，容易吸引出棒且揮空

### 觀察：極度拉打？
或許可以從兩個面向來分析
1. 佈陣
   - 即使看得到的佈陣不多，但仍不失為一個資訊
2. 打席結果
   - 如果落點在 7, 5, 6 超過一定比例

In [146]:
# 看佈陣的種類
pitches_df_cleaned['has_visible_shift'].unique()

array(['FALSE', '4_2B'], dtype=object)

In [147]:
# Count the pitches whose shift label is anything other than the string 'FALSE'.
shift_was_visible = pitches_df_cleaned['has_visible_shift'] != 'FALSE'
non_false_shifts = int(shift_was_visible.sum())

non_false_shifts

3

In [151]:
# Batted-ball direction derived from the at-bat result code.
# Flag the codes whose first character is a digit.
abs_df['is_numeric_start'] = abs_df['at_bat_result'].apply(lambda code: code[0].isdigit())

# Flag the codes that begin with 7, 5 or 6 (presumably fielder position
# numbers on the left side of the field — TODO confirm). A code starting with
# one of these characters necessarily starts with a digit, so no separate
# digit check is needed.
abs_df['is_756_start'] = abs_df['at_bat_result'].apply(lambda code: code.startswith(('7', '5', '6')))

numeric_start_count = abs_df['is_numeric_start'].sum()
extreme_pull_count = abs_df['is_756_start'].sum()

# Share of digit-leading results that begin with 7/5/6; 0 when there are none.
if numeric_start_count > 0:
    extreme_pull_ratio = extreme_pull_count / numeric_start_count
else:
    extreme_pull_ratio = 0

print(f"打向左半邊的比例: {extreme_pull_ratio:.2%}")



打向左半邊的比例: 67.86%


### 結論：是極度拉打
1. 有看到佈陣的情況，都是二壘手往二壘方向移動
2. 高達 68% 的球是打向左半邊的
3. 影片的觀察過程，界外球也都多往左半邊打

### 觀察：右投手外角 breaking ball 打不好？
breaking ball：滑球、曲球（下方程式的篩選條件另外納入了變速球 changeup）

In [154]:
# Show the available values of the three columns used in the filter below.
for column_name in ('horizontal_ending', 'vertical_ending', 'pitch_type'):
    print(pitches_df_cleaned[column_name].unique())

['outside' 'middle' 'inside']
['low' 'high' 'middle']
['slider' 'four_seam' 'sinker' 'curve' 'changeup' 'cutter']


In [159]:
# Pitches from right-handed pitchers that ended outside AND low, limited to
# slider / curve / changeup.
# NOTE(review): the notebook text defines breaking balls as slider and curve
# only, while this filter also admits changeups; the printed labels say
# "outside" although the filter also requires vertical_ending == 'low'.
# Confirm both are intended.
from_rhp = pitches_df_cleaned['is_rhp'] == True
ended_outside = pitches_df_cleaned['horizontal_ending'] == 'outside'
ended_low = pitches_df_cleaned['vertical_ending'] == 'low'
selected_pitch_type = pitches_df_cleaned['pitch_type'].isin(['slider', 'curve', 'changeup'])
rhp_out_side_low_breaking_balls = pitches_df_cleaned[
    from_rhp & ended_outside & ended_low & selected_pitch_type]

rhp_swing_intention_rate = rhp_out_side_low_breaking_balls['has_swing_intention'].mean()
rhp_out_side_low_has_swing_rate = rhp_out_side_low_breaking_balls['has_swing'].mean()

# The whiff rate is measured over the swings only.
swung_at_selection = rhp_out_side_low_breaking_balls['has_swing'] == True
rhp_out_side_low_swings_df = rhp_out_side_low_breaking_balls[swung_at_selection]
rhp_out_side_low_whiff_rate = rhp_out_side_low_swings_df['is_whiff'].mean()

print(f"右投手外角 breaking ball 出棒意圖比例: {rhp_swing_intention_rate:.2f}")
print(f"右投手外角 breaking ball 出棒比例: {rhp_out_side_low_has_swing_rate:.2f}")
print(f"右投手外角 breaking ball 揮空比例: {rhp_out_side_low_whiff_rate:.2f}")


右投手外角 breaking ball 出棒意圖比例: 0.62
右投手外角 breaking ball 出棒比例: 0.50
右投手外角 breaking ball 揮空比例: 0.62


In [160]:
# Re-print the overall whiff rate as the baseline for the comparison above.
print(f"overall whiff rate: {overall_whiff_rate:.2f}")

overall whiff rate: 0.33


In [165]:
# Subset of the selection above where the pitch ended where the catcher
# wanted it (is_pitch_ended_catcher_want is True).
ended_at_target = rhp_out_side_low_breaking_balls['is_pitch_ended_catcher_want'] == True
rhp_out_side_low_breaking_balls_catcher_called = rhp_out_side_low_breaking_balls[ended_at_target]

rhp_out_side_low_breaking_balls_catcher_called_swing_intention_rate = (
    rhp_out_side_low_breaking_balls_catcher_called['has_swing_intention'].mean()
)
rhp_out_side_low_breaking_balls_catcher_called__swing_rate = (
    rhp_out_side_low_breaking_balls_catcher_called['has_swing'].mean()
)

# The whiff rate is measured over the swings only.
swung_at_target = rhp_out_side_low_breaking_balls_catcher_called['has_swing'] == True
rhp_out_side_low_breaking_balls_catcher_called_swings_df = (
    rhp_out_side_low_breaking_balls_catcher_called[swung_at_target]
)
rhp_catcher_want_whiff_rate = (
    rhp_out_side_low_breaking_balls_catcher_called_swings_df['is_whiff'].mean()
)

print(f"捕手期望的進壘點時的出棒意圖比例: {rhp_out_side_low_breaking_balls_catcher_called_swing_intention_rate:.2f}")
print(f"捕手期望的進壘點時的出棒比例: {rhp_out_side_low_breaking_balls_catcher_called__swing_rate:.2f}")
print(f"捕手期望的進壘點時的揮空比例: {rhp_catcher_want_whiff_rate:.2f}")


捕手期望的進壘點時的出棒意圖比例: 0.80
捕手期望的進壘點時的出棒比例: 0.80
捕手期望的進壘點時的揮空比例: 0.62


In [None]:
# TODO: 造成好球的分析
# 時間不足，待有機會分析
# pitches_df_sorted = pitches_df_cleaned.sort_values(by=['abs_id', 'nth_pitch_ab'])

# pitches_df_sorted['is_rhp_out_side_low_breaking_ball'] = (
#     (pitches_df_sorted['is_rhp'] == True) &
#     (pitches_df_sorted['horizontal_ending'] == 'outside') &
#     (pitches_df_sorted['vertical_ending'] == 'low') &
#     (pitches_df_sorted['pitch_type'].isin(['slider', 'curve', 'changeup']))
# )
# pitches_df_sorted['next_strike_count'] = pitches_df_sorted.groupby('abs_id')['strike_count'].shift(-1)
# pitches_df_sorted['is_strike_count_increased'] = pitches_df_sorted['next_strike_count'] > pitches_df_sorted['strike_count']


### 結論：
1. 右投手外角 breaking ball 的揮空比例為 0.62，相較整體的 0.33 多出將近 1 倍
2. 若捕手配球並且投手投到期望的外角低的位置，出棒的比例會從 50% 上升至 80%，增幅達 6 成

### 觀察：外角低速球容易拉成滾地球？
待驗證

### 觀察：時機大致上正確，但是擊球點不好？
待驗證