# YouTube 自動逐字稿產生器 (避免重複 + 儲存音訊至 Google Drive)


## 功能：

YouTube 自動逐字稿產生器, 使用 `yt-dlp` 下載音訊, 並使用 `Whisper` 模型產生逐字稿. 此程式會自動略過已處理的影片, 並將結果儲存至 Google Drive.

- 上傳 `_youtube_links.txt`, 每行一個連結
- 自動略過已處理影片 (由 `processed_titles.txt` 記錄, 並保存至 Google Drive)
- 使用 `yt-dlp` 下載音訊 (保留至 Google Drive)
- 使用 `Whisper` 模型產生逐字稿
- 逐字稿 (包括 `srt` 和 `txt`) 輸出至 Google Drive


## Step 1 - 準備工作


### 1.1. Mount Google Drive


In [None]:
# Mount Google Drive at /content/drive; the GD_* paths configured later in
# this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


### 1.2. 上傳 `_youtube_links.txt` 檔案

請確保檔案內容格式如下 (每行一個連結, 無需雙引號):

```
https://www.youtube.com/watch?v=VIDEO_ID
https://www.youtube.com/watch?v=ANOTHER_VIDEO_ID
```

也可用拖曳上傳 `_youtube_links.txt` 取代下面 cell


In [None]:
# Upload the links file (the notebook later reads it by the name "_youtube_links.txt").
# The return value of files.upload() is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()


### 1.3. 設定全局變量


In [None]:
# Name of the uploaded links file and the local directory for downloaded audio.
INPUT_FILE_NAME = "_youtube_links.txt"
AUDIO_DIR = "_audio_files"

# yt-dlp parameters
AUDIO_FORMAT = "m4a"
# Output template; the rendered name is also what gets recorded as a processed title.
TITLE_FORMAT = "%(title)s [%(id)s]"

# On My Google Drive
GD_PROCESSED_TITLES_FILE_PATH = "/content/drive/MyDrive/_字幕/youtube/processed_titles.txt"
# NOTE(review): GD_PROCESSED_IDS_FILE_PATH is not referenced by any cell visible here.
GD_PROCESSED_IDS_FILE_PATH = "/content/drive/MyDrive/_字幕/youtube/processed_ids.txt"
GD_AUDIO_DIR = "/content/drive/MyDrive/_字幕/youtube/_audios"
GD_OUTPUT_DIR = "/content/drive/MyDrive/_字幕/youtube/_批量處理"

# Whisper parameters
WHISPER_LANGUAGE = "Chinese"  # Whisper's output defaults to Simplified Chinese. Use "AUTO" to auto-detect the language
WHISPER_MODEL = "large"  # base, small, medium, large

# Music played when the job finishes
# "YT": download and play a random YT_MUSICS entry; otherwise play_finish_music
# falls back to its default sound URL.
MUSIC_SRC = "YT"
YT_MUSICS = [
    "https://www.youtube.com/watch?v=pd3eV-SG23E",  # Mayday - Here, After, Us (Official Music Video)
    "https://www.youtube.com/watch?v=GSx8Olkop50",  # Yanzi Sun - What I Miss 4K MV (Official 4K UltraHD Video)
    "https://www.youtube.com/watch?v=a8KMIwG4usA",  # Sun Yan-Zi - Kite (official full-length MV)
    "https://www.youtube.com/watch?v=vFxjK0WTdTQ",  # Fei Yu-ching - Good Night Song
    "https://www.youtube.com/watch?v=iBB6gkEwEhk",  # Lady Gaga - Hold My Hand (Top Gun: Maverick theme song, Chinese-subtitled Official Music Video)
]
# Local file the chosen track is downloaded to and later read from.
YT_MUSIC_FILE_PATH = "_finished.mp3"


### 1.4. 安裝必要的套件

In [None]:
# Install the required packages
!pip install -q yt-dlp
!pip install -q openai-whisper


## Step 2 - 獲取新影片的連結


### 2.1. Utils


In [None]:
from pathlib import Path


def _get_input_file_path(file_path: str) -> Path:
    """Return the path of the uploaded input file.

    Args:
        file_path: Location of the links file.

    Returns:
        ``file_path`` wrapped in a ``Path``.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``. An explicit
            raise is used instead of ``assert`` so the check still runs when
            assertions are disabled (``python -O``).
    """
    input_path = Path(file_path)
    if not input_path.exists():
        raise FileNotFoundError(f"請上傳 {file_path} 檔案")
    return input_path

# Create processed_titles.txt (and its parent folders) on first use
def _get_processed_titles_file_path(file_path: str) -> Path:
    """Return the path of the processed-titles record, creating it if absent.

    Args:
        file_path: Location of the record file.

    Returns:
        ``file_path`` as a ``Path``. After the call the file exists; a file
        that was already there is left untouched.
    """
    processed_titles_path = Path(file_path)
    # On a first run the target folder may not exist yet, in which case
    # writing the file would fail; create the parent directories first.
    processed_titles_path.parent.mkdir(parents=True, exist_ok=True)
    if not processed_titles_path.exists():
        processed_titles_path.write_text("")
    return processed_titles_path


### 2.2. yt_dlp_helpers

In [None]:
import subprocess


# Fetch the titles of all videos with yt-dlp
def _get_youtube_video_titles(urls: list[str]) -> dict[str, str]:
    """Map each URL to the name yt-dlp renders for it with ``TITLE_FORMAT``.

    Args:
        urls: Video links to look up. Repeated links are queried only once.

    Returns:
        A dict of ``url -> title`` in first-seen order. URLs for which yt-dlp
        exits with an error are reported on stdout and left out of the dict.
    """
    url_2_title = {}
    # dict.fromkeys drops repeated URLs while keeping first-seen order, so a
    # link listed twice costs only one yt-dlp invocation.
    for url in dict.fromkeys(urls):
        try:
            title = subprocess.check_output(
                ["yt-dlp", "--get-filename", "-o", TITLE_FORMAT, url], text=True
            ).strip()
        except subprocess.CalledProcessError as e:
            print(f"❌ 無法取得標題 - {url}: {e}")
        else:
            url_2_title[url] = title
    return url_2_title


### 2.3. 抓影片 title, 並與 `processed_titles.txt` 比對避免重複, 最後返回新影片的連結

In [None]:
# Fetch video titles via yt-dlp and compare them with processed_titles.txt to avoid duplicates
input_path = _get_input_file_path(INPUT_FILE_NAME)
processed_titles_path = _get_processed_titles_file_path(GD_PROCESSED_TITLES_FILE_PATH)

# Read the already-processed titles. Titles are non-ASCII, so the encoding is
# spelled out instead of depending on the runtime's locale default.
processed_titles = {
    line.strip()
    for line in processed_titles_path.read_text(encoding="utf-8").splitlines()
    if line.strip()
}

# Read every input link, skipping blank lines. "utf-8-sig" also strips a
# leading BOM, which would otherwise end up glued to the first URL.
with open(input_path, "r", encoding="utf-8-sig") as f:
    all_input_links = [line.strip() for line in f if line.strip()]

# Resolve the title of every input link with yt-dlp
url_2_title = _get_youtube_video_titles(all_input_links)

# Drop the links whose title has already been processed
filtered_links = [url for url, title in url_2_title.items() if title not in processed_titles]

print(f"共有 {len(filtered_links)} 個新影片：")
for link in filtered_links:
    print("  ", link)

# Write the batch file that yt-dlp reads in the download step
with open("_new_links.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(filtered_links))


## Step 3 - 下載音訊檔並儲存副本至 Google Drive

In [None]:
# Download the audio files and keep a copy on Google Drive
!mkdir -p "{AUDIO_DIR}"
!mkdir -p "{GD_AUDIO_DIR}"

# When interpolating a variable, the surrounding quotes may be omitted if its value contains no special characters
!yt-dlp --extract-audio --audio-format {AUDIO_FORMAT} -o "{AUDIO_DIR}/{TITLE_FORMAT}.%(ext)s" -a _new_links.txt

# Copy the audio files to Google Drive
!cp "{AUDIO_DIR}"/*.{AUDIO_FORMAT} "{GD_AUDIO_DIR}" || echo "無新檔案可複製"


## Step 4 - 使用 Whisper 模型產生逐字稿, 並儲存至 Google Drive 且同步更新 processed_titles.txt

### 4.1. Helpers

In [None]:
def _get_whisper_command(file_path: str, model: str, output_dir: str, language: str) -> list[str]:
    """Build the argument list for one ``whisper`` CLI transcription run.

    Args:
        file_path: Audio file to transcribe (converted with ``str``).
        model: Value passed to ``--model``.
        output_dir: Value passed to ``--output_dir``.
        language: Value passed to ``--language``; the sentinel ``"AUTO"``
            omits the flag altogether.

    Returns:
        The command as a list of arguments, starting with ``"whisper"``.
    """
    # --output_format is left unset, so the CLI's own default applies.
    language_args = [] if language == "AUTO" else ["--language", language]
    return [
        "whisper",
        str(file_path),
        "--model",
        model,
        "--output_dir",
        output_dir,
        *language_args,
    ]


### 4.2. 產生逐字稿

In [None]:
# Generate the transcripts
import subprocess

from pathlib import Path


output_dir = GD_OUTPUT_DIR
Path(output_dir).mkdir(parents=True, exist_ok=True)

processed_titles_path = _get_processed_titles_file_path(GD_PROCESSED_TITLES_FILE_PATH)
# Titles are non-ASCII, so the encoding is spelled out rather than inherited
# from the runtime's locale default.
processed_titles = set(processed_titles_path.read_text(encoding="utf-8").splitlines())

with open(processed_titles_path, "a", encoding="utf-8") as f:
    for audio_file in Path(AUDIO_DIR).glob(f"*.{AUDIO_FORMAT}"):
        # The download step only fetched links that were not processed yet,
        # so this check is a second line of defence (e.g. for audio files
        # left over in AUDIO_DIR from an earlier run of this session).
        title = audio_file.stem
        if title in processed_titles:
            print(f"⏭️ 略過已處理：{title}")
            continue

        print(f"--- START: {audio_file.name} ---")
        whisper_command = _get_whisper_command(
            str(audio_file), WHISPER_MODEL, output_dir, WHISPER_LANGUAGE
        )
        result = subprocess.run(whisper_command)
        print(f"--- END: {audio_file.name} ---")

        # Only record the title when whisper exited cleanly; otherwise the
        # video would be skipped on every later run without a transcript.
        # NOTE(review): the whisper CLI may report a per-file error yet still
        # exit 0 — confirm whether an output-file check is needed as well.
        if result.returncode != 0:
            print(f"❌ Whisper 轉錄失敗 (exit code {result.returncode})：{title}")
            continue

        f.write(title + "\n")
        # Flush after every title so progress survives an interrupted session.
        f.flush()
        processed_titles.add(title)
        print(f"✅ 已處理：{title}")


## Step 5 - 完成後, 播放音樂提醒


### 5.1. Helpers

In [None]:
import base64

from IPython.display import Audio, display, HTML


def play_finish_music(volume=0.15):
    """Display a "task finished" banner and an auto-playing audio element.

    When ``MUSIC_SRC`` is ``"YT"`` the file at ``YT_MUSIC_FILE_PATH`` is
    embedded as a base64 data URI. If that file cannot be read (for example
    because the download failed), or for any other ``MUSIC_SRC`` value, a
    default online sound is used instead.

    Args:
        volume: Value assigned to the ``<audio>`` element's ``volume``
            property (0.0–1.0 in the HTML media API).
    """
    # Show the completion banner
    display(HTML("<h3 style='color:green;'>✅ 任務完成，自動播放結束提示音：</h3>"))

    music_src = "https://www.soundhelix.com/examples/mp3/SoundHelix-Song-1.mp3"  # default sound
    if MUSIC_SRC == "YT":
        # Read the downloaded audio and inline it as base64. A missing or
        # unreadable file must not crash the final cell, so fall back to the
        # default sound in that case.
        try:
            with open(YT_MUSIC_FILE_PATH, "rb") as f:
                audio_b64 = base64.b64encode(f.read()).decode()
        except OSError as e:
            print(f"⚠️ 無法讀取 {YT_MUSIC_FILE_PATH}, 改用預設提示音：{e}")
        else:
            music_src = f"data:audio/mp3;base64,{audio_b64}"

    # Embed the audio with HTML and set the volume from JavaScript
    display(HTML(f'''
        <audio id="done-audio" autoplay controls>
            <source src="{music_src}" type="audio/mp3">
            Your browser does not support the audio element.
        </audio>
        <script>
            const audio = document.getElementById("done-audio");
            audio.volume = {volume};
        </script>
    '''))


### 5.2. 從 Youtube 隨機下載指定音樂

In [None]:
import random


if MUSIC_SRC == "YT":
    chosen_music = random.choice(YT_MUSICS)
    print(f"🎵 正在下載隨機選取的音樂：{chosen_music}")
    !yt-dlp -x --audio-format mp3 -o {YT_MUSIC_FILE_PATH} {chosen_music}


### 5.3. 播放音樂提醒

In [None]:
# Play the finish sound quietly (0.05, below the function's 0.15 default)
play_finish_music(0.05)


### 5.4. 音樂結束後, 彈窗提醒

In [None]:
from IPython.display import Javascript


# Show a browser alert reminding the user to close the session manually to free resources
display(Javascript("alert('✅ 任務完成，請手動關閉 session 以釋放資源!!!');"))


### 5.5. Kill 當前 session

In [None]:
# import os


# os.kill(os.getpid(), 9)  # Cannot stop Colab's "auto-reconnect" behavior; it is controlled by Google's servers and the program cannot turn it off
