In [1]:
import os
from pydantic import BaseModel
from openai import OpenAI
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment
# before the API key is looked up.
load_dotenv()

# Shared OpenAI client used by every cell below. The key is read from the
# OPENAI_API_KEY environment variable; the lookup yields None when it is unset.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [2]:
def transcribe(audio_filepath, **kwargs) -> str:
    """Transcribe an audio file with the ``whisper-1`` model.

    Args:
        audio_filepath: Path of the audio file to upload; it is opened in
            binary mode and closed again before this function returns.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``client.audio.transcriptions.create`` (for example ``prompt``
            or ``response_format``).

    Returns:
        The value returned by ``client.audio.transcriptions.create``. With the
        default ``response_format="text"`` set here, the notebook's outputs
        show a plain string; NOTE(review): other formats such as "json"
        presumably return a non-string object - confirm before relying on the
        ``str`` annotation.
    """
    # The return format varies with response_format, so default to "text"
    # rather than the API's own default ("json") to avoid errors downstream.
    kwargs.setdefault('response_format', 'text')
    # Open the file in a context manager so the handle is released even when
    # the API call raises.
    with open(audio_filepath, "rb") as audio_file:
        return client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-1",
            **kwargs
        )

Transcribe Audio File

In [3]:
# Transcribe a local sample file with transcribe()'s default
# response_format ("text") and print the returned text.
audio_file_path = "speech.mp3"
print(transcribe(audio_file_path))

사람들이 좋아할 만한 것을 만들어보아요.


In [5]:
# For optimal performance, use pytubefix version '8.12.1'. Version '8.1.1' would encounter errors
from pytubefix import YouTube

video_url = 'https://www.youtube.com/watch?v=TLRTml68cYM'

# Take the first audio-only stream of the video (https://stackoverflow.com/a/50041191).
# first() yields None when no stream matches the filter, so fail with a clear
# message instead of an AttributeError on None.download().
audio_stream = YouTube(video_url).streams.filter(only_audio=True).first()
if audio_stream is None:
    raise ValueError(f"No audio-only stream found for {video_url}")

# `video` is what download() returns; the cells below pass it to transcribe()
# as the path of the audio file to open.
video = audio_stream.download()

In [6]:
# Baseline transcription of the downloaded audio with no prompt; as the cell's
# last expression, the returned text is shown as the cell output.
transcribe(video)

"AI just got some serious shakeup and it's all because of DeepSync. This new AI model isn't just another release, it's a game changer. It's got OpenAI scrambling, NVIDIA stop taking hits, and the AI industry rethinking everything. And you might ask why? This is because it delivers top-tier AI performance for a fraction of the usual cost and it was built in the last two months. If AI companies no longer need billion-dollar budgets to train their models, what happens next? Let's break it down. So, what is DeepSync R1? In simple terms, it's an open-source AI model out of China that's punching way above its weight. In fact, it's on par with OpenAI's $200 per month O1 model at a lot of things like coding, research, and even maths. And it's free. You can even host it yourself if you don't trust them. But what makes it really special is the fact that it was trained for just $5.5 million, roughly $6 million. In the AI world right now, that's like getting a brand new Tesla for the price of a us

In [7]:
# Transcribe again, this time with a context prompt and SRT output, and keep
# the result in `subtitle` for later cells.
subtitle = transcribe(
    video,
    prompt="DeepSeek Explained: The AI Game-Changer You Need to Know!",  # describe the video's context to improve accuracy
    response_format="srt",  # include timestamps so the output can be used as subtitles
)
print(subtitle)

1
00:00:00,000 --> 00:00:03,920
AI just got some serious shake-up, and it's all because of DeepSeek.

2
00:00:03,920 --> 00:00:09,440
This new AI model isn't just another release, it's a game-changer. It's got OpenAI scrambling,

3
00:00:09,440 --> 00:00:14,560
NVIDIA stopped taking hits, and the AI industry rethinking everything. And you might ask why?

4
00:00:14,560 --> 00:00:18,480
This is because it delivers top-tier AI performance for a fraction

5
00:00:18,480 --> 00:00:21,280
of the usual cost, and it was built in the last two months.

6
00:00:21,280 --> 00:00:26,320
If AI companies no longer need billion-dollar budgets to train their models, what happens next?

7
00:00:26,320 --> 00:00:27,360
Let's break it down.

8
00:00:27,360 --> 00:00:32,959
So, what is DeepSeek R1? In simple terms, it's an open-source AI model out of China

9
00:00:32,959 --> 00:00:37,279
that's punching way above its weight. In fact, it's on par with OpenAI's $200-per-month

10
00:00:37,279 --> 00:00:42,

In [8]:
# Save the SRT text to disk. The encoding is set explicitly so the file is
# always UTF-8 rather than the platform's default encoding, which could fail
# on (or mis-encode) non-ASCII transcript text.
with open("subtitle.srt", "w", encoding="utf-8") as f:
    f.write(subtitle)

In [ ]:
# A way to get complete, punctuated sentences (e.g. when producing a written transcript)
plaintext = transcribe(
    video,
    prompt="어요., 네요., 까요?, 니다."  # Korean sentence endings with punctuation, so that periods are added
)
print(plaintext)