In [2]:
import pandas as pd

# Locations of the two labeled CSV exports (at-bat table and pitch table).
abs_path = "./labeled/50_abs_jose_ramos_20240308.csv"
pitches_path = "./labeled/95_pitches_jose_ramos_20240308.csv"

# Read both files in the same order as the paths above.
abs_df, pitches_df = (
    pd.read_csv(csv_path) for csv_path in (abs_path, pitches_path)
)

# Preview the first at-bat row.
abs_df.head(1)

Unnamed: 0,abs_id,file_name,file_name_date,file_name_player_name,file_name_pitcher_type,file_name_at_bat_number,file_name_result,is_filename_result_consistent,inning,at_bat_result,weather,game_time
0,1,2022-09-10 Jose Ramos VS 右投 打席2_二飛.mp4,2022-09-10,Jose Ramos,RHP,2,二飛,True,4,4F,sunny,day


In [3]:
# Preview the first row of the pitch table to see which fields are available.
pitches_df.head(1)

Unnamed: 0,id,file_name,file_name_date,file_name_player_name,file_name_pitcher_type,file_name_at_bat_number,file_name_result,inning,abs_id,nth_pitch_ab,...,score_diffrence,is_team_leading,weather,game_time,has_swing_intention,horizontal_ending,vertical_ending,is_swing_delayed,is_swing_early,is_whiff
0,4e1e4436-2ba9-41a0-b88a-5cc4641edcbe,2022-09-10 Jose Ramos VS 右投 打席2_二飛.mp4,2022-09-10,Jose Ramos,RHP,2.0,二飛,4.0,1,1,...,1,True,sunny,day,False,outside,low,False,False,False


### 確認欄位資訊 & 清理

In [4]:
# Report the dimensions of both tables.
print(f"abs_df row count: {len(abs_df)}, column count: {len(abs_df.columns)}")
print(f"pitches_df row count: {len(pitches_df)}, column count: {len(pitches_df.columns)}")


abs_df row count: 50, column count: 12
pitches_df row count: 93, column count: 36


In [5]:
# List every column of the pitch table.
pitches_df.columns


Index(['id', 'file_name', 'file_name_date', 'file_name_player_name',
       'file_name_pitcher_type', 'file_name_at_bat_number', 'file_name_result',
       'inning', 'abs_id', 'nth_pitch_ab', 'out_count', 'strike_count',
       'ball_count', 'is_rhp', 'at_bat_result',
       'is_filename_result_consistent', 'has_stealing_attempt', 'pitch_type',
       'pitch_release', 'is_obvious_off_zone', 'is_pitch_ended_catcher_want',
       'has_swing', 'has_first_base_runner', 'has_second_base_runner',
       'has_third_base_runner', 'has_visible_shift', 'score_diffrence',
       'is_team_leading', 'weather', 'game_time', 'has_swing_intention',
       'horizontal_ending', 'vertical_ending', 'is_swing_delayed',
       'is_swing_early', 'is_whiff'],
      dtype='object')

In [22]:
# List every column of the at-bat table.
abs_df.columns

Index(['abs_id', 'file_name', 'file_name_date', 'file_name_player_name',
       'file_name_pitcher_type', 'file_name_at_bat_number', 'file_name_result',
       'is_filename_result_consistent', 'inning', 'at_bat_result', 'weather',
       'game_time'],
      dtype='object')

In [20]:
# Remove the pitch-table columns that are not needed here:
# abs_df already carries the correct and more detailed version of each.
columns_to_drop = [
    'file_name',
    'file_name_date',
    'file_name_player_name',
    'file_name_pitcher_type',
    'file_name_at_bat_number',
    'file_name_result',
    'inning',
    'is_filename_result_consistent',
    'at_bat_result',
    'game_time',
    'weather',
]

# Size of the pitch-by-pitch record after the drop.
# `columns=` alone already targets the column axis.
pitches_df_cleaned = pitches_df.drop(columns=columns_to_drop)
print(f"Columns count: {len(pitches_df_cleaned.columns)}")
print(f"row count: {len(pitches_df_cleaned)}")
# DataFrame.size is rows x columns.
print(f"cell count: {pitches_df_cleaned.size}")

pitches_df_cleaned.columns


Columns count: 25
row count: 93
cell count: 2325


Index(['id', 'abs_id', 'nth_pitch_ab', 'out_count', 'strike_count',
       'ball_count', 'is_rhp', 'has_stealing_attempt', 'pitch_type',
       'pitch_release', 'is_obvious_off_zone', 'is_pitch_ended_catcher_want',
       'has_swing', 'has_first_base_runner', 'has_second_base_runner',
       'has_third_base_runner', 'has_visible_shift', 'score_diffrence',
       'is_team_leading', 'has_swing_intention', 'horizontal_ending',
       'vertical_ending', 'is_swing_delayed', 'is_swing_early', 'is_whiff'],
      dtype='object')

In [21]:
# Size of the at-bat table: DataFrame.size is rows x columns.
print(f"abs_df cell counts: {abs_df.size}")

abs_df row count: 50, column count: 12
abs_df cell counts: 600


In [7]:
# Check that each per-pitch categorical field only contains the preset values
# by printing the distinct values actually present.
fields_to_validate = [
    'vertical_ending',
    'horizontal_ending',
    'is_rhp',
    'pitch_type',
    'pitch_release',
]

for field in fields_to_validate:
    distinct_values = pitches_df_cleaned[field].unique()
    print(f"{field} unique value: {distinct_values}")


vertical_ending unique value: ['low' 'high' 'middle']
horizontal_ending unique value: ['outside' 'middle' 'inside']
is_rhp unique value: [ True False]
pitch_type unique value: ['slider' 'four_seam' 'sinker' 'curve' 'changeup' 'cutter']
pitch_release unique value: ['overhand' 'side_arm']


In [60]:
import os

output_file_path = './jose_ramos_pitches_df_cleaned.csv'

# Export the cleaned pitch table once; an existing file is never overwritten.
# NOTE(review): this cell's execution count (60) is later than the cells that
# add `any_on_base` / `base_combination` to pitches_df_cleaned, so the saved
# file may include those derived columns -- confirm that is intended.
if os.path.exists(output_file_path):
    print("File already exists.")
else:
    pitches_df_cleaned.to_csv(output_file_path, index=False)
    print(f"File saved: {output_file_path}")

File saved: ./jose_ramos_pitches_df_cleaned.csv


### 完成初步清理

### 觀察：第一球似乎不愛出棒

In [47]:
# Join the two tables: every pitch row gains the columns of its at-bat.
combined_df = pitches_df_cleaned.merge(abs_df, how='left', on='abs_id')

# For each at-bat, keep the row with the smallest recorded pitch number.
first_pitches_df = combined_df.loc[
    combined_df.groupby('abs_id')['nth_pitch_ab'].idxmin()
]
print(f"original row counts: {len(first_pitches_df)}")

# Some at-bats do not start from pitch 1, so keep only rows that really are
# the first pitch.
first_pitches_df = first_pitches_df.loc[first_pitches_df['nth_pitch_ab'].eq(1)]
print(f"latter row counts: {len(first_pitches_df)}")

first_pitches_df.head()

original row counts: 26
latter row counts: 23


Unnamed: 0,id,abs_id,nth_pitch_ab,out_count,strike_count,ball_count,is_rhp,has_stealing_attempt,pitch_type,pitch_release,...,file_name_date,file_name_player_name,file_name_pitcher_type,file_name_at_bat_number,file_name_result,is_filename_result_consistent,inning,at_bat_result,weather,game_time
0,4e1e4436-2ba9-41a0-b88a-5cc4641edcbe,1,1,0,0,0,True,False,slider,overhand,...,2022-09-10,Jose Ramos,RHP,2,二飛,True,4,4F,sunny,day
2,bc4092b0-4083-4fd7-bf98-3863f7a4c82b,2,1,1,0,0,False,False,four_seam,overhand,...,2022-09-01,Jose Ramos,LHP,1,三振,True,1,SK,sunny,day
5,4feecff9-2d3c-4b1e-a22f-9638e3dcb921,3,1,1,0,0,True,False,four_seam,overhand,...,2022-08-19,Jose Ramos,RHP,3,游滾,True,5,6G,unknown,unknown
8,6c7c1ff4-8380-4f9f-9e64-09b87a69549e,4,1,1,0,0,False,False,four_seam,overhand,...,2022-08-23,Jose Ramos,LHP,1,左全,True,1,7_HR,sunny,day
12,6bb91152-4119-40ee-b644-457ce8f5281a,5,1,2,0,0,True,False,curve,overhand,...,2022-08-10,Jose Ramos,RHP,4,三振,True,8,CK,sunny,night


In [48]:
# Overall swing-intention rate, swing rate, and whiff rate across all pitches.
overall_swing_intention_rate = pitches_df_cleaned['has_swing_intention'].mean()
overall_swing_rate = pitches_df_cleaned['has_swing'].mean()

# Whiff rate is computed only over pitches that were actually swung at.
overall_whiff_rate = pitches_df_cleaned.loc[
    pitches_df_cleaned['has_swing'].eq(True), 'is_whiff'
].mean()

print(f"overall swing intention: {overall_swing_intention_rate:.2f}")
print(f"overall swing: {overall_swing_rate:.2f}")
print(f"overall whiff rate: {overall_whiff_rate:.2f}")

overall swing intention: 0.58
overall swing: 0.52
overall whiff rate: 0.33


In [49]:
# Swing behaviour on the first pitch of an at-bat.
first_ball_swing_intention_rate = first_pitches_df['has_swing_intention'].mean()
first_ball_swing_rate = first_pitches_df['has_swing'].mean()

# Whiff rate is computed only over first pitches that were swung at.
first_ball_whiff_rate = first_pitches_df.loc[
    first_pitches_df['has_swing'].eq(True), 'is_whiff'
].mean()

print(f"Intention to swing on first pitch: {first_ball_swing_intention_rate:.2f}")
print(f"Swing rate on the first pitch: {first_ball_swing_rate:.2f}")
print(f"Whiff rate on the first pitch when swinging: {first_ball_whiff_rate:.2f}")


Intention to swing on first pitch: 0.43
Swing rate on the first pitch: 0.39
Whiff rate on the first pitch when swinging: 0.33


第一球的揮棒意圖比整體低了約 15 個百分點（0.58 → 0.43），實際揮棒率也降低了約 13 個百分點（0.52 → 0.39），說明他比較傾向先看第一球。


### 結論
喜歡先等一球，如果有機會再精細分析球種以及不同投手的比較。

### 觀察：壘上有人較積極?
亦即出棒意圖，比壘上沒人的時候還要多

In [50]:
# Add a column that is True when at least one of the three bases is occupied.
pitches_df_cleaned['any_on_base'] = (
    pitches_df_cleaned
    .loc[:, ['has_first_base_runner', 'has_second_base_runner', 'has_third_base_runner']]
    .any(axis='columns')
)

pitches_df_cleaned['any_on_base'].head()

0    False
1    False
2     True
3     True
4     True
Name: any_on_base, dtype: bool

In [51]:
# Swing-intention rate split by whether any runner is on base.
swing_intention_rates = (
    pitches_df_cleaned['has_swing_intention']
    .groupby(pitches_df_cleaned['any_on_base'])
    .mean()
)

swing_intention_rates

any_on_base
False    0.509804
True     0.666667
Name: has_swing_intention, dtype: float64

In [52]:
# 定義八種壘上有人情況
def label_bases(row):
    """Encode base occupancy as a three-character string of 0/1 digits.

    The digits are ordered first, second, third base, so "101" means runners
    on first and third and "000" means the bases are empty.

    Args:
        row: Any object indexable by the keys 'has_first_base_runner',
            'has_second_base_runner' and 'has_third_base_runner', whose
            values can be converted with int().

    Returns:
        A string of three digits.
    """
    runner_keys = (
        'has_first_base_runner',
        'has_second_base_runner',
        'has_third_base_runner',
    )
    return ''.join(str(int(row[key])) for key in runner_keys)

# TODO: make string format better
# def label_bases(row):
#     bases = ['一', '二', '三']
#     runners = [row['has_first_base_runner'], row['has_second_base_runner'], row['has_third_base_runner']]
#     description = ''.join([bases[i] for i, runner in enumerate(runners) if runner])
#     return f"{description}壘有人" if description else "壘上無人"

# Tag every pitch with its base-occupancy code, built row by row.
pitches_df_cleaned['base_combination'] = pitches_df_cleaned.apply(
    label_bases, axis='columns'
)

# Swing-intention rate for each occupancy code.
swing_intention_rates_by_combination = (
    pitches_df_cleaned['has_swing_intention']
    .groupby(pitches_df_cleaned['base_combination'])
    .mean()
)

print(swing_intention_rates_by_combination)


base_combination
000    0.509804
001    0.166667
010    0.666667
011    1.000000
100    0.631579
101    0.750000
110    1.000000
111    1.000000
Name: has_swing_intention, dtype: float64


### 結論
壘上有人的時候較積極，出棒意圖比壘上無人時高出約 16 個百分點（0.51 → 0.67）。
二三壘有人以及滿壘，為小樣本數，印象中僅一個打席。

### 觀察：喜歡揮高球且容易揮空？

#### 是相對愛出棒的打者

以 Juan Soto 為例：
```
Juan Soto, 2021:

35.0% swing rate, lowest in @MLB (min 1750 pitches seen)
```
https://twitter.com/SlangsOnSports/status/1508600071130132485
來源

In [53]:
# Pitches that ended high and were flagged as obviously off the zone.
high_off_zone_pitch_df = pitches_df_cleaned.loc[
    pitches_df_cleaned['vertical_ending'].eq('high')
    & pitches_df_cleaned['is_obvious_off_zone'].eq(True)
]

# Swing-intention rate on those high off-zone pitches.
high_pitch_swing_intention_rate = high_off_zone_pitch_df['has_swing_intention'].mean()

# Whiff rate, restricted to the high off-zone pitches that were swung at.
high_off_zone_swings = high_off_zone_pitch_df.loc[
    high_off_zone_pitch_df['has_swing'].eq(True)
]
high_pitch_whiff_rate = high_off_zone_swings['is_whiff'].mean()

print(f"釣高球時的出棒意圖比例: {high_pitch_swing_intention_rate:.2f}")
print(f"釣高球時的揮空比例: {high_pitch_whiff_rate:.2f}")


釣高球時的出棒意圖比例: 0.35
釣高球時的揮空比例: 0.40


但此數據跟看影片時的印象不一樣，因此進一步根據內外角去分析

In [54]:
# Swing-intention rate on high off-zone pitches, split by horizontal location.
swing_intention_rates_by_position = (
    high_off_zone_pitch_df
    .groupby(['horizontal_ending'])['has_swing_intention']
    .mean()
)

print("釣高球不同橫向位置的出棒意圖：")
print(swing_intention_rates_by_position)

釣高球不同橫向位置的出棒意圖：
horizontal_ending
inside     0.000000
middle     0.666667
outside    0.333333
Name: has_swing_intention, dtype: float64


In [55]:
# Whiff rate on high off-zone pitches, split by horizontal location.
# Only pitches that were swung at count toward the rate.
high_off_zone_swings = high_off_zone_pitch_df.loc[
    high_off_zone_pitch_df['has_swing'].eq(True)
]
whiff_rates_by_position = (
    high_off_zone_swings
    .groupby(['vertical_ending', 'horizontal_ending'])['is_whiff']
    .mean()
)

print("\n不同橫向位置釣高球揮空比例：")
print(whiff_rates_by_position)



不同橫向位置釣高球揮空比例：
vertical_ending  horizontal_ending
high             middle               0.5
                 outside              0.0
Name: is_whiff, dtype: float64


釣高球要投在中間的位置最有效

In [56]:
# Both breakdowns below use the same grouping keys.
detail_group_keys = ['vertical_ending', 'horizontal_ending', 'pitch_type', 'strike_count']

# Swing-intention rate on high off-zone pitches per location, pitch type and strike count.
high_off_zone_intension_detailed = (
    high_off_zone_pitch_df
    .groupby(detail_group_keys)['has_swing_intention']
    .mean()
)

print("釣高球在不同橫向位置，在不同球數及球種的出棒意圖")
print(high_off_zone_intension_detailed)

# Whiff rate with the same breakdown, restricted to swung-at pitches.
high_off_zone_whiff_detailed = (
    high_off_zone_swings
    .groupby(detail_group_keys)['is_whiff']
    .mean()
)

print("\n釣高球在不同橫向位置，球種以及球數，揮空統計")
print(high_off_zone_whiff_detailed)

釣高球在不同橫向位置，在不同球數及球種的出棒意圖
vertical_ending  horizontal_ending  pitch_type  strike_count
high             inside             changeup    1               0.000000
                                    four_seam   0               0.000000
                                                2               0.000000
                                    sinker      2               0.000000
                                    slider      0               0.000000
                 middle             four_seam   0               1.000000
                                                1               0.333333
                                                2               1.000000
                 outside            curve       1               0.000000
                                    cutter      2               0.000000
                                    four_seam   1               0.000000
                                                2               0.500000
                                    sl

In [57]:
# TODO: skip this section for now
# # Compute the differences versus the overall rates
# middle_swing_intention = high_off_zone_intension_detailed.xs('middle', level='horizontal_ending')
# middle_whiff_rate = high_off_zone_whiff_detailed.xs('middle', level='horizontal_ending')

# middle_swing_intention_avg = middle_swing_intention.mean()
# middle_whiff_rate_avg = middle_whiff_rate.mean()

# swing_intention_diff = middle_swing_intention_avg - overall_swing_intention_rate
# whiff_rate_diff = middle_whiff_rate_avg - overall_whiff_rate

# swing_intention_diff_pct = (swing_intention_diff / overall_swing_intention_rate) * 100
# whiff_rate_diff_pct = (whiff_rate_diff / overall_whiff_rate) * 100

# print(f"中間位置揮棒意圖跟整體的差異: {swing_intention_diff_pct:.2f}%")
# print(f"中間位置揮空比例跟整體的差異: {whiff_rate_diff_pct:.2f}%")


#### 中間位置釣高球是有效的，但不能太外角
且不論球數是否領先都有一定效果

好球帶附近位置又是如何呢？

In [58]:
# For each vertical ending location, report the swing-intention rate and the
# whiff rate (whiffs among swung-at pitches).
# NOTE(review): the labels say "zone" / near-zone, but this filter keeps every
# pitch at that height, including rows where is_obvious_off_zone is True --
# confirm whether off-zone pitches should be excluded here.
# TODO: visualize as a 3x3 grid
for vertical_position in ('high', 'middle', 'low'):
    specific_near_zone_df = pitches_df_cleaned.loc[
        pitches_df_cleaned['vertical_ending'].eq(vertical_position)
    ]

    swing_intention_rate = specific_near_zone_df['has_swing_intention'].mean()
    swings_df = specific_near_zone_df.loc[specific_near_zone_df['has_swing'].eq(True)]
    whiff_rate = swings_df['is_whiff'].mean()

    print(f"投球終點在 {vertical_position} 位 zone 時的出棒意圖比例: {swing_intention_rate:.2f}")
    print(f"投球終點在 {vertical_position} 位 zone 時的揮空比例: {whiff_rate:.2f}")


投球終點在 high 位 zone 時的出棒意圖比例: 0.54
投球終點在 high 位 zone 時的揮空比例: 0.39
投球終點在 middle 位 zone 時的出棒意圖比例: 0.79
投球終點在 middle 位 zone 時的揮空比例: 0.14
投球終點在 low 位 zone 時的出棒意圖比例: 0.51
投球終點在 low 位 zone 時的揮空比例: 0.44


喜歡中間高度（middle）的球，也不容易揮空

In [59]:
# Same per-height report, restricted to pitches flagged as obviously off-zone.
# A height with no such pitches yields NaN for both rates.
for vertical_position in ('high', 'middle', 'low'):
    specific_off_zone_df = pitches_df_cleaned.loc[
        pitches_df_cleaned['vertical_ending'].eq(vertical_position)
        & pitches_df_cleaned['is_obvious_off_zone'].eq(True)
    ]

    swing_intention_rate = specific_off_zone_df['has_swing_intention'].mean()
    swings_df = specific_off_zone_df.loc[specific_off_zone_df['has_swing'].eq(True)]
    whiff_rate = swings_df['is_whiff'].mean()

    print(f"投球終點在 {vertical_position} 位 offzone 時的出棒意圖比例: {swing_intention_rate:.2f}")
    print(f"投球終點在 {vertical_position} 位 offzone 時的揮空比例: {whiff_rate:.2f}")


投球終點在 high 位 offzone 時的出棒意圖比例: 0.35
投球終點在 high 位 offzone 時的揮空比例: 0.40
投球終點在 middle 位 offzone 時的出棒意圖比例: nan
投球終點在 middle 位 offzone 時的揮空比例: nan
投球終點在 low 位 offzone 時的出棒意圖比例: 0.53
投球終點在 low 位 offzone 時的揮空比例: 0.80


**外角 offzone 低球** 容易吸引出棒且揮空

但是出棒意圖需要進一步評估，需要驗證是否因為球種不同而有差

這次的集合沒有觀察到 Middle off zone 的情況
例如腰帶附近的觸身球

### 極度拉打？

### 外角低速球容易拉成滾地球？

### 右投手外角滑球打不好？

### 時機大致上正確，但是擊球點不好？