In [7]:
%%bash

total_duration=0

for file in ./videos/*.mp4; do
  duration_seconds=$(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "$file")
  duration_minutes=$(echo "$duration_seconds" | awk '{printf "%.2f", $1 / 60}')
  echo "$file: $duration_minutes minutes"
  total_duration=$(echo "$total_duration + $duration_minutes" | bc)
done

total_rounded=$(echo "scale=2; $total_duration / 1" | bc)
echo "Total duration: $total_rounded minutes"


./videos/2022-08-10 Jose Ramos VS 右投 打席1_觸身.mp4: 1.83 minutes
./videos/2022-08-10 Jose Ramos VS 右投 打席2_左安.mp4: 0.88 minutes
./videos/2022-08-10 Jose Ramos VS 右投 打席3_三滾.mp4: 0.77 minutes
./videos/2022-08-10 Jose Ramos VS 右投 打席4_三振.mp4: 2.50 minutes
./videos/2022-08-13 Jose Ramos VS 右投 打席1_中安.mp4: 1.40 minutes
./videos/2022-08-13 Jose Ramos VS 右投 打席2_雙殺.mp4: 0.20 minutes
./videos/2022-08-13 Jose Ramos VS 右投 打席3_左飛.mp4: 1.42 minutes
./videos/2022-08-13 Jose Ramos VS 左投 打席4_三振.mp4: 1.68 minutes
./videos/2022-08-19 Jose Ramos VS 右投 打席1_三振.mp4: 1.18 minutes
./videos/2022-08-19 Jose Ramos VS 右投 打席2_左安.mp4: 0.23 minutes
./videos/2022-08-19 Jose Ramos VS 右投 打席3_游滾.mp4: 0.60 minutes
./videos/2022-08-19 Jose Ramos VS 右投 打席4_游滾.mp4: 0.98 minutes
./videos/2022-08-19 Jose Ramos VS 右投 打席5_中二安.mp4: 0.40 minutes
./videos/2022-08-20 Jose Ramos VS 右投 打席1_三振.mp4: 1.30 minutes
./videos/2022-08-20 Jose Ramos VS 右投 打席2_三振.mp4: 1.12 minutes
./videos/2022-08-20 Jose Ramos VS 右投 打席3_三振.mp4: 1.85 minutes
./video

In [17]:
import os
import pandas as pd
import re

# 根據 field_analysis.md
df_columns = [
    'id', 'inning', 'out_count', 'strike_count', 'ball_count', 'is_rhp', 'at_bat_result',
    'pitch_type', 'pitch_release', 'is_obvious_off_zone', 'is_pitch_ended_catcher_want',
    'has_swing', 'has_first_base_runner', 'has_second_base_runner', 'has_third_base_runner',
    'file_name', 'RE24', 'has_stealing_attempt', 'has_visible_shift', 'is_filename_result_consistent'
]

df = pd.DataFrame(columns=df_columns)
df

Unnamed: 0,id,inning,out_count,strike_count,ball_count,is_rhp,at_bat_result,pitch_type,pitch_release,is_obvious_off_zone,is_pitch_ended_catcher_want,has_swing,has_first_base_runner,has_second_base_runner,has_third_base_runner,file_name,RE24,has_stealing_attempt,has_visible_shift,is_filename_result_consistent


將影片檔名結構化

In [18]:
directory = './videos'

for filename in os.listdir(directory):
    if filename.endswith(".mp4"):
        file_name_pattern = r'(\d{4}-\d{2}-\d{2}) (.+?) VS (左投|右投) 打席(\d)_([^.]+)\.mp4'
        match = re.match(file_name_pattern, filename)
        if match:
            file_name_date, file_name_player_name, file_name_pitcher_type, file_name_at_bat_number, file_name_result = match.groups()
            # 將檔案名稱結構化成資料
            new_row = pd.DataFrame([{
                'file_name': filename,
                'file_name_date': file_name_date,
                'file_name_player_name': file_name_player_name,
                'file_name_pitcher_type': 'RHP' if file_name_pitcher_type == '右投' else 'LHP',
                'file_name_at_bat_number': file_name_at_bat_number,
                'file_name_result': file_name_result
            }])
            
            df = pd.concat([df, new_row], ignore_index=True)

df.head()


Unnamed: 0,id,inning,out_count,strike_count,ball_count,is_rhp,at_bat_result,pitch_type,pitch_release,is_obvious_off_zone,...,file_name,RE24,has_stealing_attempt,has_visible_shift,is_filename_result_consistent,file_name_date,file_name_player_name,file_name_pitcher_type,file_name_at_bat_number,file_name_result
0,,,,,,,,,,,...,2022-09-10 Jose Ramos VS 右投 打席2_二飛.mp4,,,,,2022-09-10,Jose Ramos,RHP,2,二飛
1,,,,,,,,,,,...,2022-09-01 Jose Ramos VS 左投 打席1_三振.mp4,,,,,2022-09-01,Jose Ramos,LHP,1,三振
2,,,,,,,,,,,...,2022-08-19 Jose Ramos VS 右投 打席3_游滾.mp4,,,,,2022-08-19,Jose Ramos,RHP,3,游滾
3,,,,,,,,,,,...,2022-08-23 Jose Ramos VS 左投 打席1_左全.mp4,,,,,2022-08-23,Jose Ramos,LHP,1,左全
4,,,,,,,,,,,...,2022-08-10 Jose Ramos VS 右投 打席4_三振.mp4,,,,,2022-08-10,Jose Ramos,RHP,4,三振


轉擋成 csv 方便在 google sheet 記錄

In [None]:
df.to_csv("jose_ramos_20240308.csv", index=False)

標記完 16 部以後，剩餘的影片再選 10 部進行標記

In [3]:
import random

start = 17
end = 50

random_numbers = random.sample(range(start, end + 1), 10)

random_numbers.sort()

random_numbers

[17, 22, 23, 27, 30, 34, 37, 39, 41, 45]