导入包

In [None]:
import cv2 
import json
import csv
#python -m pip install opencv-python
# 或者
#python -m pip install opencv-python-headless
import os
import subprocess

这一部分是定位视频元素的代码

In [2]:


# --- Configuration ---
# Video file the reference frame is taken from.
VIDEO_PATH = "Douyin-videos/60万开的稀烂.mp4"
# Timestamp (in seconds) of the frame to grab; must exist in the video.
TIME_SEC = 2
# Maximum preview-window size, in pixels.
MAX_W = 800
MAX_H = 600

# Module-level accumulator for clicked points, in original-frame coordinates.
orig_points: list = []

def on_mouse(event, x, y, flags, param):
    """Mouse callback that records left-clicks in original-frame coordinates.

    ``x`` and ``y`` are divided by ``param['scale']`` and truncated to ints,
    mapping a click on the down-scaled preview back to the full-size frame.
    The mapped point is printed and appended to the module-level
    ``orig_points`` list. Any event other than a left-button press is ignored.
    """
    if event != cv2.EVENT_LBUTTONDOWN:
        return

    factor = param['scale']
    mapped = (int(x / factor), int(y / factor))
    print(f"Display=({x},{y}) → Original=({mapped[0]},{mapped[1]})")
    orig_points.append(mapped)

# 1. Grab the single frame located TIME_SEC seconds into the video.
cap = cv2.VideoCapture(VIDEO_PATH)
fps = cap.get(cv2.CAP_PROP_FPS)
cap.set(cv2.CAP_PROP_POS_FRAMES, TIME_SEC * fps)
ret, frame = cap.read()
cap.release()
if not ret:
    raise RuntimeError("读帧失败，请检查 TIME_SEC 和 VIDEO_PATH")

# 2. Scale down proportionally so the preview fits in MAX_W x MAX_H.
#    The factor is capped at 1.0, so small frames are never enlarged.
h, w = frame.shape[:2]
scale = min(MAX_W/w, MAX_H/h, 1.0)
display = cv2.resize(frame, (int(w*scale), int(h*scale)))

# 3. Create a resizable window and attach the click callback; the scale is
#    passed along so the callback can map clicks back to the original frame.
cv2.namedWindow("pick-point", cv2.WINDOW_NORMAL)
cv2.resizeWindow("pick-point", int(w*scale), int(h*scale))
cv2.setMouseCallback("pick-point", on_mouse, {'scale': scale})

print("请点击想要的点 (Esc 退出)，终端会输出对应原始坐标。")
try:
    # Show the frame once. Calling imshow on every iteration would re-create
    # the window after the user closes it, making the loop impossible to
    # leave without Esc or a KeyboardInterrupt.
    cv2.imshow("pick-point", display)
    while True:
        # waitKey also pumps GUI events; 20 ms keeps the UI responsive
        # without spinning the CPU.
        if cv2.waitKey(20) & 0xFF == 27:  # Esc
            break
        # Also stop when the window was closed via its title-bar button.
        try:
            visible = cv2.getWindowProperty("pick-point", cv2.WND_PROP_VISIBLE)
        except cv2.error:
            # Defensive: treat a backend error on a destroyed window as "closed".
            visible = 0
        if visible < 1:
            break
finally:
    # Always tear the window down, even if the cell is interrupted.
    cv2.destroyAllWindows()
print("所有点击的原始坐标：", orig_points)


请点击想要的点 (Esc 退出)，终端会输出对应原始坐标。


KeyboardInterrupt: 

划分分割区域

In [8]:
def crop_and_extract_frames(
    video_path: str,
    output_dir: str,
    crop_region: tuple,
    frame_interval: int = 30,
    title_region: tuple = None
):
    """
    Crop a fixed region out of sampled video frames and save them as JPEGs.

    Frames 0, ``frame_interval``, 2 * ``frame_interval``, ... are cropped to
    ``crop_region`` and written as ``frame_0000.jpg``, ``frame_0001.jpg``, ...
    inside ``output_dir``. If ``title_region`` is given, that region is
    additionally cropped once, from the frame at the 1-second mark, and
    written as ``title_frame.jpg``.

    Regions are applied with array slicing, so a region reaching past the
    frame edge is clipped to the frame.

    :param video_path: Path to the input video file.
    :param output_dir: Directory to save cropped frames (created if missing).
    :param crop_region: Tuple (x, y, w, h) for the main ROI.
    :param frame_interval: Extract one frame every `frame_interval` frames;
                           must be positive.
    :param title_region: Tuple (x, y, w, h) for title area; if provided,
                         extract that region once at the 1s mark.
    :raises ValueError: If `frame_interval` is not positive.
    :raises FileNotFoundError: If the video cannot be opened.
    """
    # Validate before touching the filesystem or the video: an interval of 0
    # would otherwise fail with ZeroDivisionError in the middle of decoding.
    if frame_interval <= 0:
        raise ValueError(
            f"frame_interval must be positive, got {frame_interval!r}"
        )

    os.makedirs(output_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise FileNotFoundError(f"Cannot open video: {video_path}")

    x, y, w, h = crop_region
    title_saved = False
    frame_count = 0
    saved_count = 0

    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Index of the frame at t = 1 s (0 when the container reports no fps).
        title_frame_idx = int(fps * 1)

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Title region: extracted once, from the frame at the 1 s mark.
            if title_region and not title_saved and frame_count == title_frame_idx:
                x_t, y_t, w_t, h_t = title_region
                title_crop = frame[y_t:y_t+h_t, x_t:x_t+w_t]
                # imwrite reports failure through its return value, so only
                # claim success when the file was actually written.
                title_saved = bool(cv2.imwrite(
                    os.path.join(output_dir, "title_frame.jpg"),
                    title_crop
                ))
                if not title_saved:
                    print(f"Warning: failed to write title frame to {output_dir}")

            # Main ROI: sampled every `frame_interval` frames.
            if frame_count % frame_interval == 0:
                cropped = frame[y:y+h, x:x+w]
                fname = os.path.join(output_dir, f"frame_{saved_count:04d}.jpg")
                if cv2.imwrite(fname, cropped):
                    saved_count += 1
                else:
                    print(f"Warning: failed to write {fname}")

            frame_count += 1
    finally:
        # Release the capture even if decoding or writing raised.
        cap.release()

    print(f"Extracted title frame? {title_saved}, plus {saved_count} other frames to {output_dir}")


# --- Usage example ---
# NOTE(review): crop_and_extract_frames documents its regions as (x, y, w, h),
# but the values below look like (x1, y1, x2, y2) corner coordinates — confirm
# which was intended.
crop_and_extract_frames(
    video_path="Douyin-videos/60万开的稀烂.mp4",
    output_dir="rois",
    crop_region=(540,469,1079,1428),   # region cropped from every sampled frame
    frame_interval=300,                  # sample one frame every 300 frames
    title_region=(9, 230, 1079, 464)     # title area, extracted only once at t=1s
)


Extracted title frame? True, plus 30 other frames to rois


In [9]:
# -*- coding: utf-8 -*-

def extract_audio_same_name(
    video_path: str,
    start_time: str = None,
    duration: str = None,
    sample_rate: int = 16000,
    channels: int = 1,
    output_dir: str = "audio-output"
):
    """
    Extract the audio track of a video into a WAV file via ffmpeg.

    The output file keeps the video's base filename and is written to
    ``output_dir`` as 16-bit PCM. An existing file of the same name is
    overwritten (ffmpeg ``-y``).

    :param video_path: Path to the input video file.
    :param start_time: Optional start offset passed to ffmpeg ``-ss``;
                       omitted when falsy.
    :param duration: Optional length passed to ffmpeg ``-t``; omitted
                     when falsy.
    :param sample_rate: Output sample rate in Hz (ffmpeg ``-ar``).
    :param channels: Number of output channels (ffmpeg ``-ac``).
    :param output_dir: Directory for the WAV file (created if missing).
    :raises subprocess.CalledProcessError: If ffmpeg exits with a non-zero
                                           status.
    """
    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(video_path))[0]
    audio_path = os.path.join(output_dir, f"{base_name}.wav")

    # Arguments are passed as a list (no shell), so filenames containing
    # spaces, '#' or other special characters need no quoting.
    cmd = ["ffmpeg", "-y", "-i", video_path]
    if start_time:
        cmd += ["-ss", str(start_time)]
    if duration:
        cmd += ["-t", str(duration)]
    cmd += [
        "-vn",                      # drop the video stream
        "-acodec", "pcm_s16le",     # 16-bit little-endian PCM
        "-ar", str(sample_rate),
        "-ac", str(channels),
        audio_path
    ]

    subprocess.run(cmd, check=True)
    print(f"Extracted audio to: {audio_path}")


def batch_extract_from_folder(
    folder_path: str,
    exts: tuple = (".mp4", ".mov", ".avi", ".mkv"),
    start_time: str = None,
    duration: str = None,
    sample_rate: int = 16000,
    channels: int = 1
):
    """
    Extract audio from every video file directly inside ``folder_path``.

    Entries are processed in sorted order. Only regular files whose name ends
    with one of ``exts`` (compared case-insensitively) are handled;
    sub-directories are not descended into. A failing ffmpeg run is reported
    and the batch continues with the next file.

    :param folder_path: Directory to scan.
    :param exts: File extensions to accept; a single string or any iterable
                 of strings.
    :param start_time: Forwarded to `extract_audio_same_name`.
    :param duration: Forwarded to `extract_audio_same_name`.
    :param sample_rate: Forwarded to `extract_audio_same_name`.
    :param channels: Forwarded to `extract_audio_same_name`.
    """
    # str.endswith only accepts a str or a tuple, so normalise lists, sets
    # and other iterables here instead of failing with TypeError.
    if isinstance(exts, str):
        suffixes = (exts.lower(),)
    else:
        suffixes = tuple(ext.lower() for ext in exts)

    # Sorted so the processing order is reproducible across runs.
    for entry in sorted(os.listdir(folder_path)):
        if not entry.lower().endswith(suffixes):
            continue

        video_file = os.path.join(folder_path, entry)
        # A directory may carry a video-like name; ffmpeg cannot read it.
        if not os.path.isfile(video_file):
            continue

        try:
            extract_audio_same_name(
                video_path=video_file,
                start_time=start_time,
                duration=duration,
                sample_rate=sample_rate,
                channels=channels
            )
        except subprocess.CalledProcessError as e:
            print(f"❌ 处理失败: {video_file}", e)


if __name__ == "__main__":
    # Directory that holds the source videos.
    video_folder = "Douyin-videos"
    # start_time and duration are left out, so each video's audio is
    # extracted in full.
    batch_extract_from_folder(video_folder)


Extracted audio to: audio-output\60万开的稀烂.wav
Extracted audio to: audio-output\8000人大学门口40万买铺 这个位置干点什么好？#筷便利大刘  #便利店经营选址 #硬折扣便利店.wav
Extracted audio to: audio-output\中山200万开店.wav
Extracted audio to: audio-output\捡漏！月租2800国道口神铺 如何经营才能“钱景”无限？ #筷便利大刘 #便利店经营选址 #硬折扣便利店.wav


OpenAI API：音频转文字（audio-to-text）

In [None]:
# Placeholder for the OpenAI API key. A non-empty key already present in the
# environment takes precedence: assigning unconditionally would overwrite it
# with the empty placeholder. Avoid committing a real key in this cell.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = ""

In [9]:
# -*- coding: utf-8 -*-
import os
from openai import OpenAI
import json

# Read the API key from the environment rather than hard-coding it here.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

INPUT_DIR  = "audio-output"
OUTPUT_DIR = "transcripts_ts"
os.makedirs(OUTPUT_DIR, exist_ok=True)
EXTS = (".wav", ".mp3", ".m4a", ".flac")


def _format_clock(seconds):
    """Format a duration in seconds as ``MM:SS.ss``.

    The value is rounded to centiseconds before being split into minutes and
    seconds, so 59.996 becomes ``01:00.00`` instead of ``00:60.00``.
    """
    centis = int(round(seconds * 100))
    minutes, rest = divmod(centis, 6000)
    return f"{minutes:02d}:{rest / 100:05.2f}"


# Sorted so files are transcribed in a reproducible order.
for fn in sorted(os.listdir(INPUT_DIR)):
    if not fn.lower().endswith(EXTS):
        continue

    src = os.path.join(INPUT_DIR, fn)
    base = os.path.splitext(fn)[0]
    dst = os.path.join(OUTPUT_DIR, base + ".txt")

    print(f"⏳ Transcribing {fn} with timestamps...")
    with open(src, "rb") as audio_f:
        # verbose_json is requested because the per-segment start/end times
        # are read from the response below.
        res = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_f,
            response_format="verbose_json"
        )
    # The response is read via attributes (not dict keys); fall back to an
    # empty list in case no segments are returned.
    segments = res.segments or []

    # Write one "[start → end]  text" line per segment.
    with open(dst, "w", encoding="utf-8") as out_f:
        for seg in segments:
            start = seg.start    # seconds (float)
            end   = seg.end      # seconds (float)
            text  = seg.text.strip()
            ts = f"[{_format_clock(start)} → {_format_clock(end)}]"
            out_f.write(f"{ts}  {text}\n")

    print(f"✅ Saved with timestamps: {dst}")


⏳ Transcribing 60万开的稀烂.wav with timestamps...
✅ Saved with timestamps: transcripts_ts\60万开的稀烂.txt
⏳ Transcribing 8000人大学门口40万买铺 这个位置干点什么好？#筷便利大刘  #便利店经营选址 #硬折扣便利店.wav with timestamps...
✅ Saved with timestamps: transcripts_ts\8000人大学门口40万买铺 这个位置干点什么好？#筷便利大刘  #便利店经营选址 #硬折扣便利店.txt
⏳ Transcribing 中山200万开店.wav with timestamps...
✅ Saved with timestamps: transcripts_ts\中山200万开店.txt
⏳ Transcribing 捡漏！月租2800国道口神铺 如何经营才能“钱景”无限？ #筷便利大刘 #便利店经营选址 #硬折扣便利店.wav with timestamps...
✅ Saved with timestamps: transcripts_ts\捡漏！月租2800国道口神铺 如何经营才能“钱景”无限？ #筷便利大刘 #便利店经营选址 #硬折扣便利店.txt
