In [None]:
!git clone https://github.com/Zyphra/Zonos
%cd Zonos
!pip install -e .
!pip install --no-build-isolation -e .[compile]

# Install the extra Python packages the following cells import.
# pip's stdout/stderr are discarded, so a failed install is only visible as the
# CalledProcessError raised by check_call.
# NOTE(review): the packages are unpinned; the later cells import `moviepy.editor`
# and call `set_audio`/`subclip`, which presumably needs moviepy 1.x - confirm.
import subprocess
subprocess.check_call(['pip', 'install', 'moviepy', 'demucs', 'pydub', 'dotenv', 'openai', 'noisereduce', 'pyannote.audio', 'openai-whisper', 'gradio'],
                      stdout=subprocess.DEVNULL,
                      stderr=subprocess.DEVNULL)

import subprocess

# Install the espeak-ng system package.
# `env=` replaces the child's whole environment, so only LC_ALL is set for apt-get.
# The return codes are not checked here (no check=True).
subprocess.run(['apt-get', 'update'], env={'LC_ALL': 'C.UTF-8'})
subprocess.run(['apt-get', 'install', '-y', 'espeak-ng'], env={'LC_ALL': 'C.UTF-8'})

Cloning into 'Zonos'...
remote: Enumerating objects: 340, done.[K
remote: Counting objects: 100% (192/192), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 340 (delta 156), reused 115 (delta 115), pack-reused 148 (from 1)[K
Receiving objects: 100% (340/340), 3.03 MiB | 43.06 MiB/s, done.
Resolving deltas: 100% (211/211), done.
/content/Zonos
Obtaining file:///content/Zonos
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting kanjize>=1.5.0 (from zonos==0.1.0)
  Downloading kanjize-1.6.0-py3-none-any.whl.metadata (2.5 kB)
Collecting numpy>=2.2.2 (from zonos==0.1.0)
  Downloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [3

CompletedProcess(args=['apt-get', 'install', '-y', 'espeak-ng'], returncode=0)

In [None]:
import gradio as gr
import tempfile
import os
import shutil
from pathlib import Path
import json
import gc
import warnings
warnings.filterwarnings("ignore")

import cv2
from moviepy.editor import VideoFileClip, ImageSequenceClip, AudioClip, concatenate_videoclips
import torch
import torchaudio
import numpy as np
from pydub import AudioSegment
import soundfile as sf
import whisper
from demucs.pretrained import get_model
from demucs.apply import apply_model
from pyannote.audio import Pipeline
from openai import OpenAI
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict
import librosa
from transformers import AutoTokenizer, ClapModel, AutoFeatureExtractor

import torch
import torch.nn as nn
import numpy as np
import random
import os

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, plus all CUDA devices
    when CUDA is available), switches cuDNN to deterministic mode with
    benchmarking disabled, and records the seed in ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Registry of loaded models, keyed by name; filled by initialize_models().
global_models = {}
# Scratch storage used to hand stage-1 results (paths, arrays, transcript) to stage 2.
temp_data = {}

class EmotionMLP(nn.Module):
    """Three-layer MLP that maps a feature vector to 8 sigmoid scores.

    Layout: Linear -> BatchNorm1d -> ReLU -> Dropout(0.3), twice
    (``input_size`` -> ``hidden_size`` -> 128), followed by a final
    Linear(128, 8) and an element-wise sigmoid.

    The attribute names are the ``state_dict`` keys, so they must not be
    renamed if existing checkpoints are to keep loading.

    Args:
        input_size: Width of the input feature vector.
        hidden_size: Width of the first hidden layer.
    """

    def __init__(self, input_size=1024, hidden_size=256):
        super().__init__()
        # First hidden stage.
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.batch_norm1 = nn.BatchNorm1d(hidden_size)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        # Second hidden stage.
        self.linear2 = nn.Linear(hidden_size, 128)
        self.batch_norm2 = nn.BatchNorm1d(128)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        # Output head: one score per output unit.
        self.linear3 = nn.Linear(128, 8)

    def forward(self, x):
        """Return sigmoid scores of shape ``(batch, 8)`` for input ``x``."""
        hidden = self.dropout1(self.relu1(self.batch_norm1(self.linear1(x))))
        hidden = self.dropout2(self.relu2(self.batch_norm2(self.linear2(hidden))))
        return torch.sigmoid(self.linear3(hidden))

def initialize_models():
    """Load every model used by the pipeline into ``global_models``.

    Populates these keys: ``whisper``, ``vad``, ``demucs``, ``zonos``,
    ``clap``, ``feature_extractor``, ``tokenizer`` and ``emotion``. All models
    are moved to the module-level ``device``; the torch modules are put in
    eval mode.

    The Hugging Face token for the pyannote VAD pipeline is read from the
    ``HF_TOKEN`` (or ``HUGGING_FACE_HUB_TOKEN``) environment variable. When
    neither is set the previous behaviour (an empty token) is kept.

    Raises:
        RuntimeError: If the VAD pipeline could not be loaded.
    """
    print("🚀 모델 초기화 시작...")

    print("📝 Whisper 모델 로딩...")
    global_models['whisper'] = whisper.load_model("medium", device=device)

    print("🎤 VAD 파이프라인 로딩...")
    # Take the access token from the environment instead of hard-coding it.
    hf_token = (
        os.environ.get("HF_TOKEN")
        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
        or ''
    )
    vad_pipeline = Pipeline.from_pretrained(
        "pyannote/voice-activity-detection",
        use_auth_token=hf_token
    )
    # Fail with a readable message rather than an AttributeError on `.to`.
    if vad_pipeline is None:
        raise RuntimeError(
            "Could not load 'pyannote/voice-activity-detection'. "
            "Set the HF_TOKEN environment variable to a Hugging Face token "
            "that has access to this pipeline."
        )
    global_models['vad'] = vad_pipeline
    global_models['vad'].to(device)

    print("🎵 Demucs 모델 로딩...")
    global_models['demucs'] = get_model('htdemucs').to(device)
    global_models['demucs'].eval()

    print("🗣️ Zonos TTS 모델 로딩...")
    global_models['zonos'] = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=device)
    global_models['zonos'].eval()

    print("🧠 CLAP 모델 로딩...")
    global_models['clap'] = ClapModel.from_pretrained("laion/larger_clap_general").to(device)
    global_models['clap'].eval()
    global_models['feature_extractor'] = AutoFeatureExtractor.from_pretrained("laion/larger_clap_general")
    global_models['tokenizer'] = AutoTokenizer.from_pretrained("laion/larger_clap_general")

    print("😊 Emotion MLP 모델 로딩...")
    # The checkpoint path is fixed; the file must already exist at this location.
    global_models['emotion'] = EmotionMLP(input_size=1024, hidden_size=256)
    global_models['emotion'].load_state_dict(torch.load("/content/best_emotion_model.pth", map_location=device))
    global_models['emotion'].to(device)
    global_models['emotion'].eval()

    print("✅ 모든 모델 로딩 완료!")
    print(f"🔧 사용 중인 디바이스: {device}")

# Load all models once, when this cell runs, so the later functions can read
# them from global_models.
print("Starting model initialization...")
initialize_models()


In [None]:
def get_normalized_audio(video_path, target_sr=48000):
    """Decode the audio track of an MP4 file into float32 samples.

    Integer PCM samples are divided by ``2 ** (8 * sample_width - 1)``, which
    scales them into the range [-1, 1).

    Args:
        video_path: Path to the MP4 file to read.
        target_sr: Sample rate the audio is converted to before it is
            returned. The default matches the 48 kHz rate the rest of this
            pipeline assumes when it writes and slices the audio. Pass
            ``None`` to keep the file's native rate.

    Returns:
        A 1-D array for mono audio, otherwise an array of shape
        ``(n_samples, n_channels)``.
    """
    audio_segment = AudioSegment.from_file(video_path, format="mp4")

    # Downstream code labels this audio with a fixed rate, so bring it to that
    # rate here instead of silently mislabelling e.g. 44.1 kHz material.
    if target_sr is not None and audio_segment.frame_rate != target_sr:
        audio_segment = audio_segment.set_frame_rate(target_sr)

    n_channels = audio_segment.channels
    audio_samples = np.array(audio_segment.get_array_of_samples())

    # Samples are interleaved per frame; de-interleave for any channel count,
    # not just stereo.
    if n_channels > 1:
        audio_samples = audio_samples.reshape((-1, n_channels))

    sample_width = audio_segment.sample_width
    max_val = float(2 ** (8 * sample_width - 1))
    audio_float = audio_samples.astype(np.float32) / max_val

    # Drop the integer copy and the decoded segment before returning.
    del audio_samples, audio_segment
    gc.collect()

    return audio_float

def get_image_sequence_clip(video_path):
    """Load all frames of a video into an in-memory ``ImageSequenceClip``.

    Args:
        video_path: Path to the video file.

    Returns:
        An ``ImageSequenceClip`` built from every frame, using the source
        video's frame rate.
    """
    video_clip = VideoFileClip(video_path)
    try:
        fps_video = video_clip.fps
        frames = list(video_clip.iter_frames())
    finally:
        # All frames are now held in `frames`, so the file reader opened by
        # VideoFileClip can be released instead of leaking.
        video_clip.close()
    return ImageSequenceClip(frames, fps=fps_video)

def separate_vocals_and_background(audio, device):
    """Split audio into a vocal track and a background track with Demucs.

    Args:
        audio: Samples shaped ``(n_samples, n_channels)``, or a 1-D mono
            array, which is duplicated into two identical channels.
        device: Torch device the input tensor is moved to.

    Returns:
        ``(vocals, background)`` as arrays shaped ``(n_samples, n_channels)``.
        ``vocals`` is stem 3 of the model output and ``background`` is the sum
        of stems 0-2.
    """
    audio = np.asarray(audio)
    if audio.ndim == 1:
        # A 1-D array would become a (1, samples) tensor below, without the
        # channel axis; duplicate the mono signal into two channels instead.
        audio = np.stack([audio, audio], axis=1)

    # (samples, channels) -> (1, channels, samples)
    audio_tensor = torch.tensor(audio.T).float().unsqueeze(0).to(device)

    with torch.no_grad():
        estimates = apply_model(global_models['demucs'], audio_tensor, shifts=1, overlap=0.5)

    estimates_np = estimates.squeeze(0).cpu().numpy()
    # NOTE(review): assumes the htdemucs stem order drums, bass, other, vocals - confirm.
    vocals = estimates_np[3].T
    background = (estimates_np[0] + estimates_np[1] + estimates_np[2]).T

    # Free the large intermediates before returning.
    del audio_tensor, estimates, estimates_np
    torch.cuda.empty_cache()
    gc.collect()

    return vocals, background

def speech_to_text_with_vad(audio_file, margin=0.0):
    """Transcribe audio with Whisper and tighten segment times using VAD.

    Each Whisper segment is compared with the speech regions reported by the
    VAD pipeline. If any region overlaps the segment, the shortest overlapping
    region is chosen and the segment is clipped to the intersection with it;
    otherwise the Whisper times are kept. ``margin`` seconds are then added on
    both sides and the start is clamped at 0.

    Args:
        audio_file: Path of the audio file to transcribe.
        margin: Seconds of padding added before and after each segment.

    Returns:
        ``{'segments': [{'start': float, 'end': float, 'text': str}, ...]}``.
    """
    with torch.no_grad():
        result = global_models['whisper'].transcribe(audio_file, word_timestamps=True)

    vad_output = global_models['vad'](audio_file)
    # The speech regions do not change between segments, so extract them once
    # instead of rebuilding the timeline for every Whisper segment.
    speech_regions = [(speech.start, speech.end) for speech in vad_output.get_timeline()]

    refined_segments = []
    for segment in result['segments']:
        seg_start = segment['start']
        seg_end = segment['end']

        vad_matches = [
            (region_start, region_end)
            for region_start, region_end in speech_regions
            if region_start <= seg_end and region_end >= seg_start
        ]

        if vad_matches:
            # NOTE(review): picks the *shortest* overlapping region, not the one
            # with the largest overlap - confirm this is intended.
            best_start, best_end = min(vad_matches, key=lambda region: region[1] - region[0])
            extended_start = max(seg_start, max(best_start, 0)) - margin
            extended_end = min(seg_end, best_end) + margin
            extended_start = max(extended_start, 0.0)
        else:
            extended_start = max(seg_start - margin, 0.0)
            extended_end = seg_end + margin

        refined_segments.append({
            'start': extended_start,
            'end': extended_end,
            'text': segment['text'].strip()
        })

    return {'segments': refined_segments}

def translate_text_with_gpt(text, request_text, openai_key):
    """Translate a timestamped transcript into Korean with gpt-4o-mini.

    The model is forced (via ``tool_choice``) to answer through the
    ``get_transcript`` tool, whose arguments follow a JSON schema of
    ``{"transcript": [{"start", "end", "line"}, ...]}``.

    Args:
        text: Transcript text to translate.
        request_text: Extra translation instructions from the user.
        openai_key: OpenAI API key used to create the client.

    Returns:
        The parsed tool arguments on success. On failure, a dict with the keys
        ``"error"`` and ``"raw_output"`` (tool call missing or arguments not
        valid JSON).
    """
    # Schema of one translated line, then of the whole tool payload.
    line_properties = {
        "start": {
            "type": "number",
            "description": "문장이 시작하는 시간 (초 단위)"
        },
        "end": {
            "type": "number",
            "description": "문장이 끝나는 시간 (초 단위)"
        },
        "line": {
            "type": "string",
            "description": "Translated Text"
        }
    }
    transcript_schema = {
        "type": "object",
        "properties": {
            "transcript": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": line_properties,
                    "required": ["start", "end", "line"]
                }
            }
        },
        "required": ["transcript"]
    }

    system_prompt = """
                ### Persona ###
                You are a Translator who receives text formatted as a Whisper-style transcript. Your role is to translate this transcript to korean.

                ### Task ###
                Translate the provided transcript text into **Korean**. You must structure the result strictly according to the specified JSON schema.
                All keys and string values must use double quotes.
                The resulting JSON object must contain a "transcript" key, with its value being an array of objects, each containing timestamps and the translated dialogue line.
                The translation should capture the satirical, humorous, and cynical style characteristic of 'The Simpsons', taking into account the video title and the provided summary information.
                Include liberal translations (paraphrasing) to naturally convey the original context, characters, and humor to a Korean audience.

                Translation Guidelines:
                - Prioritize context and cultural nuance over literal translation (use paraphrasing).
                - Maintain the tone appropriate for the original characters and situations.
                - Choose expressions that effectively convey American humor and satire in Korean.

                ### Output Instructions ###
                You must call the `get_transcript` tool (function) to return the result in JSON format. Generate *only* the JSON object resulting from this tool call, with no other text or explanations.

                ### [Critically Important] Exclusion Rule: ###
                If a line of the transcript consists *only* of single interjections, onomatopoeia, short cheers, brief affirmative/negative responses, or laughter (such as 'Yeah', 'Yummy', 'Oh', 'Ah', 'Mmm', 'Huh', 'Wow', 'Haha', 'Woo hoo', 'Uh-huh', etc.),
                that entire line **must absolutely not be included** in the final JSON output. For example, if an original transcript line is just "0:04.000 --> 0:06.000\\nYeah! Woo hoo!", the corresponding object for that line must be **completely omitted** from the `transcript` array in the resulting JSON.
                **Failure to adhere strictly to this rule negates your primary function.**

                ### Output Example (Illustrating exclusion of single interjection lines): ###
                ```json
                {
                    "transcript": [
                        {"start": 0.03, "end": 4.00, "line": "Translated meaningful dialogue line 1"},
                        # Omitted lines containing only interjections like "Yeah!", "Oh!", etc.
                        {"start": 7.00, "end": 9.00, "line": "Translated meaningful dialogue line 2"}

                    ]
                }
                """

    user_prompt = "".join([
        "Please translate to korean the following text, considering the video title and the user's requests, and structure it in the specified format.\n\n",
        "Video Title: \"The Simpsons\"\n",
        "User's requests for the translation:\n",
        f"{request_text}\n\n",
        "Text to translate and structure:\n",
        f"{text}\n",
    ])

    def _utf8_clean(prompt):
        # Round-trip through UTF-8, dropping anything that cannot be encoded.
        return prompt.encode('utf-8', errors='ignore').decode('utf-8')

    transcript_tool = {
        "type": "function",
        "function": {
            "name": "get_transcript",
            "description":"Converts and returns the text in transcript JSON format, including timeline segments.",
            "parameters": transcript_schema
        }
    }

    client = OpenAI(api_key=openai_key)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": _utf8_clean(system_prompt)},
            {"role": "user", "content": _utf8_clean(user_prompt)}
        ],
        tools=[transcript_tool],
        tool_choice={"type": "function", "function": {"name": "get_transcript"}}
    )

    message = response.choices[0].message
    tool_calls = message.tool_calls

    # Guard: the model did not answer through the expected tool.
    if not tool_calls or tool_calls[0].function.name != "get_transcript":
        print("오류: 모델이 예상된 get_transcript 툴을 호출하지 않았습니다.")
        print(f"모델 응답: {message}")
        return {"error": "툴 호출 누락", "raw_output": message.content or "내용 없음"}

    function_args = tool_calls[0].function.arguments
    try:
        return json.loads(function_args)
    except json.JSONDecodeError as e:
        print(f"JSON 파싱 실패: {e}")
        print(f"모델이 반환한 원본 인수 문자열: {function_args}")
        return {"error": "JSON 파싱 실패", "raw_output": function_args}

def count_phonemes(text):
    """Count the jamo in the precomposed Hangul syllables of ``text``.

    Each character in U+AC00..U+D7A3 counts as 2 when it has no final
    consonant (its offset from U+AC00 is a multiple of 28) and as 3 otherwise.
    All other characters are ignored.

    Args:
        text: String to analyse.

    Returns:
        The total count as an int (0 for text without Hangul syllables).
    """
    hangul_first, hangul_last = 0xAC00, 0xD7A3
    return sum(
        2 if (ord(ch) - hangul_first) % 28 == 0 else 3
        for ch in text
        if hangul_first <= ord(ch) <= hangul_last
    )

def process_waveform(wavs, start_sec, end_sec, sample_rate=44100):
    """Time-stretch a waveform so it lasts ``end_sec - start_sec`` seconds.

    Args:
        wavs: CPU tensor holding the waveform. A leading batch axis of size 1
            (2-D or 3-D input) is stripped; anything else is flattened.
        start_sec: Start of the target slot in seconds.
        end_sec: End of the target slot in seconds.
        sample_rate: Sample rate used to turn the slot duration into a sample
            count.

    Returns:
        A tensor of shape ``(1, 1, n)``. If time stretching raises, the error
        is printed and the unstretched samples are returned instead.
    """
    # Target length in samples, never less than one.
    target_length = max(1, int((end_sec - start_sec) * sample_rate))

    if wavs.ndim in (2, 3) and wavs.shape[0] == 1:
        samples = wavs[0, 0] if wavs.ndim == 3 else wavs[0]
    else:
        samples = wavs.view(-1)
    wavs_np = samples.numpy()

    current_length = len(wavs_np)
    # rate > 1 shortens the audio, rate < 1 lengthens it.
    stretch_rate = current_length / target_length if current_length > 0 else 1.0

    try:
        stretched_wav = librosa.effects.time_stretch(wavs_np, rate=stretch_rate)
    except Exception as e:
        print(f"Time stretch error: {e}")
        stretched_wav = wavs_np

    return torch.from_numpy(stretched_wav).unsqueeze(0).unsqueeze(1)

def create_video_with_combined_audio(vocals, background, video_clip, output_file, sr_audio=44100):
    """Mix vocals and background, attach the mix to the video and write it.

    The shorter of the two tracks is zero-padded along the sample axis so both
    can be added. Whichever of audio and video is longer is cut to the other's
    duration before the file is written with libx264 video and AAC audio.

    Args:
        vocals: 2-D array, samples along axis 0.
        background: 2-D array, samples along axis 0.
        video_clip: moviepy clip providing the picture.
        output_file: Path of the video file to write.
        sr_audio: Sample rate of both audio arrays.
    """
    total_samples = max(vocals.shape[0], background.shape[0])

    def _pad_to_total(track):
        # Append silence so the track reaches the common length.
        missing = total_samples - track.shape[0]
        if missing > 0:
            return np.pad(track, ((0, missing), (0, 0)), mode='constant')
        return track

    combined = _pad_to_total(vocals) + _pad_to_total(background)
    last_index = combined.shape[0] - 1
    duration_audio = combined.shape[0] / sr_audio

    def make_audio_frame(t):
        # `t` may be a scalar or an array of times; map it to sample indices.
        sample_idx = (np.atleast_1d(t) * sr_audio).astype(int)
        sample_idx = np.clip(sample_idx, 0, last_index)
        frame = combined[sample_idx]
        if frame.shape[0] == 1:
            return frame[0]
        return frame

    audio_clip = AudioClip(make_audio_frame, duration=duration_audio, fps=sr_audio)
    video_duration = video_clip.duration

    # Trim the longer stream so both end together.
    if video_duration < duration_audio:
        audio_clip = audio_clip.set_duration(video_duration)
    elif video_duration > duration_audio:
        video_clip = video_clip.subclip(0, duration_audio)

    video_clip.set_audio(audio_clip).write_videofile(output_file, codec="libx264", audio_codec="aac")

# Stage 1: speech recognition and translation
def stage1_transcribe_and_translate(video_file, openai_key, translation_request, progress=gr.Progress()):
    """Stage 1: transcribe the uploaded video and translate the transcript.

    Copies the upload into a fresh temp directory, separates vocals from the
    background, transcribes the vocals and translates the transcript. On
    success the artefacts stage 2 needs are stored in ``temp_data``.

    Args:
        video_file: Uploaded video, either a path string or an object exposing
            the path as ``.name``.
        openai_key: OpenAI API key used for the translation.
        translation_request: Optional extra instructions; a default request is
            used when empty or ``None``.
        progress: Gradio progress callback.

    Returns:
        ``(translated_result, translation_display, status_message)``. The first
        two items are ``None`` when the stage fails.
    """
    if not video_file:
        return None, None, "비디오 파일을 업로드해주세요."

    if not openai_key:
        return None, None, "OpenAI API 키를 입력해주세요."

    temp_dir = None
    completed = False
    try:
        progress(0.05, desc="처리 시작...")

        # Set up the temporary working directory.
        temp_dir = tempfile.mkdtemp()
        video_path = os.path.join(temp_dir, "input_video.mp4")
        # Accept both a plain path and a file wrapper that carries `.name`.
        source_path = getattr(video_file, "name", video_file)
        shutil.copy(source_path, video_path)

        # 1. Split audio and video.
        progress(0.1, desc="오디오/비디오 분리 중...")
        audio = get_normalized_audio(video_path)
        video_clip = get_image_sequence_clip(video_path)

        # 2. Separate vocals from background.
        progress(0.3, desc="보컬/배경음악 분리 중...")
        vocals, background = separate_vocals_and_background(audio, device)

        # Save the vocals so the speech models can read them from disk.
        vocals_file = os.path.join(temp_dir, "vocals.wav")
        sf.write(vocals_file, vocals, 48000)

        # 3. Speech recognition.
        progress(0.6, desc="음성 인식 중...")
        stt_result = speech_to_text_with_vad(vocals_file)

        # 4. Translation.
        progress(0.8, desc="번역 중...")
        transcript_text = "".join(
            f"[{segment['start']:05.3f} ~ {segment['end']:05.3f}] {segment['text']}\n"
            for segment in stt_result['segments']
        )

        # Fall back to the default request when none was given.
        if not (translation_request or "").strip():
            translation_request = "GPT 4o-mini를 활용하여 번역함. 의성어 부분은 제외하도록 함. 추가적으로 번역에 반영하고 싶은 부분들 체크"

        translated_result = translate_text_with_gpt(transcript_text, translation_request, openai_key)

        if "error" in translated_result:
            return None, None, f"번역 오류: {translated_result['error']}"

        # Format the translation for display next to the source text.
        display_lines = []
        for segment in translated_result["transcript"]:
            start = segment["start"]
            end = segment["end"]
            korean_text = segment["line"].strip()

            # Source text = first recognised segment starting within 0.5 s.
            original_text = next(
                (
                    orig_segment['text']
                    for orig_segment in stt_result['segments']
                    if abs(orig_segment['start'] - start) < 0.5
                ),
                "",
            )

            display_lines.append(f"[{start:05.3f} ~ {end:05.3f}] {korean_text}, {original_text}\n")
        translation_display = "".join(display_lines)

        # Publish the artefacts for stage 2 only once everything has succeeded,
        # so temp_data never points at a directory removed below.
        previous_dir = temp_data.get('temp_dir')
        temp_data['temp_dir'] = temp_dir
        temp_data['video_path'] = video_path
        temp_data['vocals_file'] = vocals_file
        temp_data['background'] = background
        temp_data['video_clip'] = video_clip
        temp_data['stt_result'] = stt_result
        temp_data['sampling_rate'] = 48000

        # A stage-1 run that was never followed by stage 2 is superseded now.
        if previous_dir and previous_dir != temp_dir:
            shutil.rmtree(previous_dir, ignore_errors=True)

        progress(1.0, desc="1단계 완료!")

        completed = True
        return translated_result, translation_display, "1단계 완료! 번역 결과를 확인하고 2단계로 진행하세요."

    except Exception as e:
        return None, None, f"1단계 처리 중 오류가 발생했습니다: {str(e)}"
    finally:
        # Do not leave the working directory behind when the stage failed.
        if temp_dir is not None and not completed:
            shutil.rmtree(temp_dir, ignore_errors=True)
        # Free memory.
        gc.collect()
        torch.cuda.empty_cache()

# Stage 2: TTS generation and video synthesis
def stage2_generate_tts_and_video(translated_data, progress=gr.Progress()):
    """Stage 2: synthesise Korean speech for each line and build the video.

    For every translated line a speaker embedding is taken from the matching
    slice of the original vocals, an emotion vector is predicted from CLAP
    text/audio features, and Zonos generates the speech. The result is
    resampled to the pipeline rate, time-stretched to the line's slot and
    mixed into a new vocal track, which is combined with the background and
    the video frames.

    Args:
        translated_data: Stage-1 result, a dict with a ``"transcript"`` list
            of ``{"start", "end", "line"}`` items.
        progress: Gradio progress callback.

    Returns:
        ``(output_file, status_message)``; ``output_file`` is ``None`` on
        failure or when stage 1 has not been run.
    """
    if not translated_data or 'temp_dir' not in temp_data:
        return None, "먼저 1단계를 완료해주세요."

    try:
        progress(0.1, desc="TTS 생성 시작...")

        # Load the artefacts stored by stage 1.
        temp_dir = temp_data['temp_dir']
        vocals_file = temp_data['vocals_file']
        background = temp_data['background']
        video_clip = temp_data['video_clip']
        stt_result = temp_data['stt_result']
        sampling_rate = temp_data['sampling_rate']

        zonos = global_models['zonos']
        # Zonos decodes at its autoencoder's own sample rate, which is not
        # necessarily the pipeline rate. Falls back to 44100 if not exposed.
        tts_rate = getattr(zonos.autoencoder, 'sampling_rate', 44100)

        # The new vocal track starts as silence with the original's length.
        wav, _ = torchaudio.load(vocals_file)
        translated_vocal = np.zeros(wav.shape[1], dtype=np.float32)
        track_length = len(translated_vocal)

        transcript = translated_data["transcript"]
        total_segments = len(transcript)
        max_samples = int(30 * sampling_rate)

        # Match each translated line with its source segment, then run TTS.
        for i, trans_segment in enumerate(transcript):
            progress(0.1 + 0.7 * (i / total_segments), desc=f"TTS 생성 중... ({i+1}/{total_segments})")

            start = trans_segment["start"]
            end = trans_segment["end"]
            korean_text = trans_segment["line"].strip()

            # Source text = first recognised segment starting within 0.5 s.
            original_text = next(
                (
                    orig_segment['text']
                    for orig_segment in stt_result['segments']
                    if abs(orig_segment['start'] - start) < 0.5
                ),
                "",
            )

            if not korean_text or not original_text:
                continue

            segment_duration = end - start
            if segment_duration <= 0:
                # A zero-length slot would divide by zero below.
                continue

            start_sample = int(start * sampling_rate)
            end_sample = int(end * sampling_rate)
            seg = wav[:, start_sample:end_sample]
            if seg.shape[1] == 0:
                # The slot lies outside the vocal track: no reference audio.
                continue

            # Tile the reference slice to exactly 30 s for the speaker model.
            repeat_times = max(1, int(30 / segment_duration))
            repeated_seg = torch.cat([seg] * repeat_times, dim=1)

            if repeated_seg.shape[1] > max_samples:
                repeated_seg = repeated_seg[:, :max_samples]
            elif repeated_seg.shape[1] < max_samples:
                padding_samples = max_samples - repeated_seg.shape[1]
                repeated_seg = torch.nn.functional.pad(repeated_seg, (0, padding_samples))

            speaker = zonos.make_speaker_embedding(repeated_seg, sampling_rate)
            speed = count_phonemes(korean_text) / segment_duration

            # NOTE(review): the CLAP audio features come from the whole vocal
            # track (`wav`), not this segment (`seg`) - confirm the intent.
            wav_numpy = wav.squeeze().cpu().numpy()
            resampled_numpy = librosa.resample(y=wav_numpy, orig_sr=sampling_rate, target_sr=48000)
            inputs_text = global_models['tokenizer']([original_text], padding=True, return_tensors="pt")
            inputs_audio = global_models['feature_extractor'](resampled_numpy, sampling_rate=48000, return_tensors="pt")

            for key in inputs_text:
                inputs_text[key] = inputs_text[key].to(device)
            for key in inputs_audio:
                inputs_audio[key] = inputs_audio[key].to(device)

            with torch.no_grad():
                text_features = global_models['clap'].get_text_features(**inputs_text)
                audio_features = global_models['clap'].get_audio_features(**inputs_audio)[0].unsqueeze(0)
                concatenated_features = torch.cat((text_features, audio_features), dim=1).to(device)
                emotion_vector = global_models['emotion'](concatenated_features)

            with torch.no_grad():
                cond_dict = make_cond_dict(
                    text=korean_text,
                    speaker=speaker,
                    language='ko',
                    fmax=sampling_rate/2,
                    pitch_std=50,
                    speaking_rate=speed,
                    emotion=emotion_vector
                )
                conditioning = zonos.prepare_conditioning(cond_dict)
                codes = zonos.generate(conditioning)
                wavs = zonos.autoencoder.decode(codes).cpu()

            # Convert to the pipeline rate first. The stretch below only fixes
            # the sample count, so skipping this would shift the pitch.
            if tts_rate != sampling_rate:
                wavs = torchaudio.functional.resample(wavs, tts_rate, sampling_rate)

            # Fit the speech into the slot of the original line.
            wavs_processed = process_waveform(wavs, start, end, sampling_rate)
            generated_audio = wavs_processed.reshape(-1).numpy()

            target_start = max(0, int(start * sampling_rate))
            available = track_length - target_start
            if available <= 0:
                continue
            # Clip audio that runs past the end of the track rather than
            # dropping the whole line.
            generated_audio = generated_audio[:available]
            translated_vocal[target_start:target_start + len(generated_audio)] += generated_audio

        # Build the final video.
        progress(0.9, desc="최종 비디오 생성 중...")
        output_file = os.path.join(temp_dir, "translated_video.mp4")
        stereo_vocals = np.column_stack((translated_vocal, translated_vocal))

        create_video_with_combined_audio(
            stereo_vocals, background, video_clip,
            output_file, sr_audio=sampling_rate
        )

        progress(1.0, desc="완료!")

        # Stage-1 artefacts are no longer needed.
        temp_data.clear()

        return output_file, "번역 비디오 생성이 완료되었습니다!"

    except Exception as e:
        return None, f"2단계 처리 중 오류가 발생했습니다: {str(e)}"
    finally:
        # Free memory.
        gc.collect()
        torch.cuda.empty_cache()

# Gradio interface
def create_interface():
    """Build the two-tab Gradio Blocks UI and wire both stage handlers.

    Tab 1 collects the video, the OpenAI key and an optional translation
    request and shows the stage-1 result. Tab 2 starts stage 2 and offers the
    finished video. The stage-1 translation is passed to stage 2 through a
    ``gr.State``.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(title="2-Stage Video Translation Pipeline", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🎬 Zero-Shot Voice cloning 기술을 활용한 영상 번역 시스템")
        gr.Markdown("영어 비디오를 한국어로 번역하고 음성을 합성하여 새로운 비디오를 생성합니다.")

        # Stage 1: speech recognition and translation
        with gr.Tab("1단계: 음성 인식 & 번역"):
            gr.Markdown("## 🎤 1단계: 음성 인식 및 번역")

            with gr.Row():
                with gr.Column():
                    video_input = gr.File(
                        label="비디오 파일 업로드",
                        file_types=["video"],
                        file_count="single"
                    )

                    openai_key = gr.Textbox(
                        label="OpenAI API 키",
                        placeholder="sk-...",
                        type="password"
                    )

                    translation_request = gr.Textbox(
                        label="번역 요청사항",
                        placeholder="예: 캐릭터의 성격을 더 강조해서 번역해주세요",
                        lines=3,
                        value=""
                    )

                    stage1_btn = gr.Button("🎯 1단계 시작: 음성인식 & 번역", variant="primary", size="lg")

                with gr.Column():
                    translation_output = gr.Textbox(
                        label="번역 결과",
                        lines=15,
                        interactive=False
                    )
                    stage1_status = gr.Textbox(label="1단계 상태", interactive=False)

            gr.Markdown("""
            ### 1단계 처리 과정:
            - 🎵 **오디오/비디오 분리** - 영상에서 음성과 화면 분리
            - 🎤 **보컬/배경음악 분리** - AI 음원 분리 (Demucs)
            - 🗣️ **음성 인식** - 영어 음성을 텍스트로 변환 (Whisper + VAD)
            - 🌐 **번역** - 영어를 한국어로 자연스럽게 번역 (GPT-4o-mini)
            """)

        # Stage 2: TTS and video generation
        with gr.Tab("2단계: 음성 합성 & 비디오 생성"):
            gr.Markdown("## 🤖 2단계: 음성 합성 및 비디오 생성")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 번역 결과 확인")
                    gr.Markdown("1단계에서 생성된 번역 결과를 확인하고 2단계를 진행하세요.")

                    stage2_btn = gr.Button("🚀 2단계 시작: TTS & 비디오 생성", variant="primary", size="lg")

                with gr.Column():
                    video_output = gr.File(label="완성된 번역 비디오")
                    stage2_status = gr.Textbox(label="2단계 상태", interactive=False)


            gr.Markdown("""
            ### 2단계 처리 과정:
            - 🎭 **감정 분석** - 원본 음성의 감정 상태 분석 (CLAP + Emotion MLP)
            - 🗣️ **음성 합성** - 한국어 텍스트를 감정이 담긴 음성으로 변환 (Zonos TTS)
            - ⏱️ **시간 동기화** - 원본과 동일한 타이밍으로 음성 조정
            - 🎬 **최종 비디오 생성** - 새로운 음성과 원본 영상 결합
            """)

        # Holds the stage-1 translation result so it can be fed to stage 2.
        translated_data_state = gr.State()

        # Event handlers: stage 1 fills the state, the result box and its
        # status; stage 2 reads the state and fills the video and its status.
        stage1_btn.click(
            fn=stage1_transcribe_and_translate,
            inputs=[video_input, openai_key, translation_request],
            outputs=[translated_data_state, translation_output, stage1_status]
        )

        stage2_btn.click(
            fn=stage2_generate_tts_and_video,
            inputs=[translated_data_state],
            outputs=[video_output, stage2_status]
        )

        # Usage guide
        gr.Markdown("""
        ---
        ## 📋 사용 가이드

        ### 🎯 **1단계: 음성 인식 & 번역**
        1. **비디오 파일**: MP4 형식의 영어 비디오를 업로드
        2. **OpenAI API 키**: GPT-4o-mini 번역을 위해 필요
        3. **번역 요청사항**: 특별한 번역 스타일이나 요구사항 입력
        4. **1단계 시작** 버튼 클릭
        5. 번역 결과 확인 및 검토

        ### 🚀 **2단계: 음성 합성 & 비디오 생성**
        1. 1단계 완료 후 번역 결과 확인
        2. **2단계 시작** 버튼 클릭
        3. TTS 생성 및 비디오 합성 대기
        4. 완성된 번역 비디오 다운로드

        ### 💡 **장점**
        - **단계별 진행**: 번역 결과를 먼저 확인한 후 TTS 진행
        - **번역 수정 가능**: 1단계에서 번역 품질을 검토 가능
        - **효율적 처리**: 각 단계별로 최적화된 처리
        - **Simpson 특화**: Simpson 특유의 유머와 풍자 번역

        ### ⚠️ **주의사항**
        - **GPU 메모리**: 12GB 이상 권장
        - **순서 준수**: 반드시 1단계 완료 후 2단계 진행
        """)

    return demo

# Build and start the app when this cell/script is run directly.
if __name__ == "__main__":
    demo = create_interface()
    # share=True asks Gradio for a public share link in addition to the local
    # server, so the app is reachable from outside this machine.
    demo.launch(
        share=True
    )