## YouTubeダウンロード

In [None]:
# List of video links -> audio files
# Each URL is passed to the yt-dlp command-line tool; stdout/stderr are
# captured so that a per-link success or failure report can be printed.

import os
import subprocess

youtube_links = [
    "https://www.youtube.com/watch?v=xVmShH0-9xY"
]

output_dir = "src"
os.makedirs(output_dir, exist_ok=True)

# The %(title)s / %(ext)s placeholders are expanded by yt-dlp, not by Python,
# so the template is identical for every link and is built once.
output_template = os.path.join(output_dir, "%(title)s.%(ext)s")

# Audio-extraction options. The original note on "--audio-format wav" said to
# comment that option out when the video itself is wanted.
audio_flags = ["-x", "--audio-format", "wav"]

for link in youtube_links:
    command = ["yt-dlp", *audio_flags, "-o", output_template, link]

    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        completed = subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as err:
        print("❌ 失敗:", link)
        print("標準エラー出力:\n", err.stderr)
    else:
        print("✅ 成功:", link)
        print(completed.stdout)

In [None]:
# List of playlist links -> downloaded media files
# NOTE: with `extra_flags` empty (the default below) yt-dlp is called without
# -x, exactly as the previously active command was. Switch to one of the
# alternatives to extract audio instead.
import os
import subprocess

youtube_playlist_links = [
    "https://youtube.com/playlist?list=PLsbq1qh5ApJK0vcKhSkZfWyRBW82NzSRr&si=VnOYmKQyrzXc9bZ9"
]

# Create the output directory
output_dir = "src"
os.makedirs(output_dir, exist_ok=True)

# Placeholders are expanded by yt-dlp itself.
output_template = os.path.join(output_dir, "%(title)s.%(ext)s")

# Optional yt-dlp flags; pick one of the alternatives to get audio files.
extra_flags = []
# extra_flags = ["-x", "--audio-format", "wav"]
# extra_flags = ["-x", "--audio-format", "mp3"]

# Download every playlist
for link in youtube_playlist_links:
    # The arguments are passed as a list (no shell). The playlist URL contains
    # "&" and "?", which a shell would treat as a control operator and a glob
    # character when the URL is interpolated unquoted into a command string.
    command = ["yt-dlp", *extra_flags, "-o", output_template, link]
    try:
        completed = subprocess.run(command)
    except OSError as err:
        # e.g. the yt-dlp executable is not on PATH; keep going with the next link.
        print("❌ 失敗:", link)
        print(err)
        continue
    if completed.returncode != 0:
        print("❌ 失敗:", link, f"(exit code {completed.returncode})")

## 文字起こし & 話者分類

In [6]:
# Import the libraries and load the models
import os

import torch
from dotenv import load_dotenv
from pyannote.audio import Pipeline as PyannotePipeline
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline as transformers_pipeline

# Device settings: use the first GPU when available, otherwise fall back to
# the CPU. Both models below are placed on this same device.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Speaker-diarization model (pyannote.audio) pipeline setup.
# The access token is read from the PYANNOTE_AUTH_TOKEN entry of ".env".
load_dotenv(dotenv_path=".env")
PYANNOTE_AUTH_TOKEN = os.getenv("PYANNOTE_AUTH_TOKEN")
pyannote_pipeline = PyannotePipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=PYANNOTE_AUTH_TOKEN,
)
# NOTE(review): from_pretrained appears to return None when the gated model
# cannot be downloaded (missing/invalid token) - fail with a clear message
# instead of an AttributeError on the next line.
if pyannote_pipeline is None:
    raise RuntimeError(
        "Could not load pyannote/speaker-diarization-3.1; "
        "check PYANNOTE_AUTH_TOKEN in .env and the model's access conditions."
    )
# Previously hard-coded to "cuda", which failed on machines without a GPU.
pyannote_pipeline.to(torch.device(device))

# Load the speech-recognition model (whisper-large-v3-turbo)
model_id = "openai/whisper-large-v3-turbo"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
transformers_pipe = transformers_pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=256,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps="word",  # the transcription cell writes one CSV row per returned chunk
    torch_dtype=torch_dtype,
    device=device,
)

cuda:0


In [None]:
# Speech recognition: transcribe every audio file in `folder_path` with the
# `transformers_pipe` pipeline created in the model-loading cell and write one
# "<name>-text.csv" per file with the columns start, end, text.
import os
import csv
import time

folder_path = "src"
output_dir = "dst/speech-recognition"
# No visible cell creates this directory; without it every open() below
# fails and the error is only printed by the except clause.
os.makedirs(output_dir, exist_ok=True)

# ".wav" is what the single-video download cell produces and what the
# diarization cell reads; ".mp3" was the only extension accepted before.
# NOTE: "x.mp3" and "x.wav" map to the same CSV name; the later one wins.
audio_extensions = (".mp3", ".wav")

# sorted() gives a deterministic processing order.
for file_path in sorted(os.listdir(folder_path)):
    if not file_path.endswith(audio_extensions):
        continue

    full_path = os.path.join(folder_path, file_path)
    print(full_path)

    # Best effort: a failure on one file is reported and the loop continues.
    try:
        tmp_result = transformers_pipe(full_path, generate_kwargs={"language": "english"})
        file_name = os.path.splitext(file_path)[0]

        # newline="" is what the csv module expects of the file object;
        # otherwise extra blank rows appear on platforms that translate "\n".
        with open(
            os.path.join(output_dir, file_name + "-text.csv"),
            "w",
            encoding="utf-8",
            newline="",
        ) as f:
            writer = csv.writer(f)
            writer.writerow(["start", "end", "text"])
            for chunk in tmp_result["chunks"]:
                start, end = chunk["timestamp"][0], chunk["timestamp"][1]
                writer.writerow([start, end, chunk["text"]])

        # Drop the reference to the result before processing the next file.
        del tmp_result

    except Exception as e:
        print(f"エラーが発生しました: {e}")


In [None]:
# Speaker diarization: run `pyannote_pipeline` (created in the model-loading
# cell) on every .wav file in `input_dir` and write one CSV per file with the
# columns start, end, speaker.
# Imported here as well so the cell does not depend on another cell's imports.
import csv
import os

input_dir = "src"
output_dir = "dst/diarization"
# No visible cell creates this directory; open() would fail without it.
os.makedirs(output_dir, exist_ok=True)

# sorted() gives a deterministic processing order.
for file_name in sorted(os.listdir(input_dir)):
    if not file_name.endswith(".wav"):
        continue

    file_path = os.path.join(input_dir, file_name)
    diarization = pyannote_pipeline(file_path)
    base_name = os.path.splitext(file_name)[0]
    # NOTE(review): the "-asr" suffix is kept unchanged because later steps
    # may look for it, even though the file holds diarization output.
    output_file = os.path.join(output_dir, f"{base_name}-asr.csv")

    # newline="" is what the csv module expects of the file object.
    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["start", "end", "speaker"])
        # One row per speaker turn, times rounded to two decimals.
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            writer.writerow([round(turn.start, 2), round(turn.end, 2), speaker])