# In [2]:
import os
import re
import tempfile
import uuid

import whisper
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import PlainTextResponse

# FastAPI application object; the /transcribe route below is registered on it.
app = FastAPI()
# Whisper model, loaded once at import time and shared by every request.
model = whisper.load_model("small")  # can be swapped for another model name such as "base" or "medium"

# Timestamp marker formats that may prefix a transcript line.
# "[HH:MM:SS.mmm --> HH:MM:SS.mmm]"
pattern_hhmmss = r"\[\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}\]"
# "[MM:SS.mmm --> MM:SS.mmm]"
pattern_mmss   = r"\[\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}\.\d{3}\]"
# "[S.mmm --> S.mmm]" -- plain seconds, i.e. what formatting a float with
# ".3f" produces (e.g. "[0.000 --> 2.500]"). Without this alternative such
# markers are never stripped.
pattern_seconds = r"\[\d+\.\d{3} --> \d+\.\d{3}\]"

# Single regex (kept as a plain string) that matches any of the formats above.
timestamp_pattern = f"(?:{pattern_hhmmss}|{pattern_mmss}|{pattern_seconds})"

def clean_text(txt_path: str) -> str:
    """Return the contents of a transcript file as one line of plain text.

    Every match of ``timestamp_pattern`` is removed from each line, the
    remainder is stripped of surrounding whitespace, lines that end up
    empty are dropped, and the surviving pieces are joined with single
    spaces.

    Args:
        txt_path: Path of the UTF-8 encoded transcript file to read.

    Returns:
        The cleaned text, or an empty string if no line has content left.
    """
    with open(txt_path, "r", encoding="utf-8") as source:
        # Lazily strip the timestamp markers and outer whitespace per line.
        stripped = (re.sub(timestamp_pattern, "", raw).strip() for raw in source)
        return " ".join(piece for piece in stripped if piece)

@app.post("/transcribe", response_class=PlainTextResponse)
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return the text without timestamps.

    The upload is written to a temporary file, passed to the module-level
    Whisper ``model``, and the stripped text of every non-empty segment is
    joined with single spaces.

    Args:
        file: The uploaded audio file.

    Returns:
        The transcript as a single plain-text string (empty if Whisper
        produced no non-empty segments).
    """
    # Keep only the extension of the client-supplied name. The name is
    # untrusted (it may be missing or contain path separators), so it must
    # not be embedded in the temp path as-is. The extension is preserved in
    # case the decoder uses it as a format hint.
    suffix = os.path.splitext(os.path.basename(file.filename or ""))[1]

    # 1. Save the uploaded audio to a uniquely named temporary file.
    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    temp_audio = tmp.name
    try:
        # Close the file before transcription so the data is flushed to disk.
        with tmp:
            tmp.write(await file.read())

        # 2. Run Whisper on the saved file.
        # NOTE(review): this call is synchronous, so it blocks the event loop
        # for the duration of the transcription.
        result = model.transcribe(temp_audio)
    finally:
        # 3. Always remove the temporary audio, even if transcription failed.
        os.remove(temp_audio)

    # 4. Build the plain text straight from the segments. Writing timestamped
    # lines to a text file and stripping them again is unnecessary, and the
    # "[start --> end]" markers used for that were never actually removed.
    pieces = (seg["text"].strip() for seg in result["segments"])
    return " ".join(piece for piece in pieces if piece)
