**NEW FILE**

In [None]:
!pip install torchvision librosa==0.9.2 opencv-python-headless ffmpeg-python

In [None]:
import os
import shutil
import torch
from IPython.display import FileLink

In [None]:
# Report GPU availability and pick the torch device used by the rest of the notebook.
print("GPU Available:", torch.cuda.is_available())
if not torch.cuda.is_available():
    # CPU fallback
    print("No GPU available - using CPU")
    device = torch.device("cpu")
else:
    print("CUDA Device:", torch.cuda.get_device_name(0))
    # Set device to GPU
    device = torch.device("cuda")

In [None]:
# Create the working folders this notebook expects (no-op when they already exist).
for _required_dir in ('checkpoints', 'face_detection/detection/sfd', 'results'):
    os.makedirs(_required_dir, exist_ok=True)

In [None]:
# Input media for the lip-sync run.
# Note: the video file name contains spaces, so it has to stay quoted wherever
# it is interpolated into a shell command (as the ffmpeg cell below does).
face_video = "/kaggle/input/audio-viddeo/trump2 - Made with Clipchamp.mp4"  # Video with face
audio_file = "/kaggle/input/audio-viddeo/generated_speech2.wav"  # Audio file

In [None]:
preprocessed_video = "preprocessed_face2.mp4"
!ffmpeg -i "{face_video}" -c:v libx264 -profile:v high -crf 20 -pix_fmt yuv420p -strict -2 "{preprocessed_video}"

In [None]:
import os

# Make sure the scratch ("temp") and output ("results") folders are present.
for _folder in ("temp", "results"):
    os.makedirs(_folder, exist_ok=True)



In [None]:
# Same folders as the previous cell; exist_ok=True makes repeating this harmless.
for _folder in ("temp", "results"):
    os.makedirs(_folder, exist_ok=True)


In [None]:
import os

# Sanity-check the inputs of the inference cell before starting it.
# The video check uses the `preprocessed_video` variable (not a repeated
# literal) so it always tests the exact file that is passed to --face.
print("Preprocessed video exists:", os.path.exists(preprocessed_video))
print("Audio file exists:", os.path.exists(audio_file))
print("Checkpoint exists:", os.path.exists("/kaggle/input/wav2lip/Wav2Lip/checkpoints/wav2lip_gan.pth"))



In [None]:
# Run the Wav2Lip inference script from the attached Kaggle dataset on the
# re-encoded video (`preprocessed_video`) and the audio track (`audio_file`);
# the result is written to temp/result3.avi.
# NOTE(review): the remaining flags are interpreted by inference.py, which is
# not part of this notebook - confirm their meaning against that script.
!python /kaggle/input/wav2lip/Wav2Lip/inference.py \
--checkpoint_path /kaggle/input/wav2lip/Wav2Lip/checkpoints/wav2lip_gan.pth \
--face "{preprocessed_video}" \
--audio "{audio_file}" \
--outfile temp/result3.avi \
--resize_factor 1 \
--fps 25 \
--face_det_batch_size 4 \
--wav2lip_batch_size 16 \
--nosmooth

In [None]:
!ffmpeg -i temp/result3.avi -vcodec libx264 -acodec aac results/result3.mp4


In [None]:
# Fail loudly when the pipeline produced no file; otherwise offer a download link.
if not os.path.exists("results/result3.mp4"):
    raise FileNotFoundError("Output video was not generated. Check logs for errors.")

print("Lip-synced video generated successfully!")
display(FileLink("results/result3.mp4"))

In [None]:
# Install dependencies
!pip install torchvision librosa==0.9.2 opencv-python-headless ffmpeg-python gradio torchaudio tortoise-tts ffmpeg-python

import os
import torch
import torchaudio
import gradio as gr
import subprocess
import shutil
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio

# Constants
# Root directory for the files written by the Gradio app below.
WORKING_DIR = "/kaggle/working/"
# Audio file written by generate_voice_clone and read back by lip_sync.
CLONED_VOICE_PATH = os.path.join(WORKING_DIR, "cloned_voice1.wav")
# NOTE(review): not referenced anywhere in the visible code - possibly a leftover.
OUTPUT_VIDEO_PATH = os.path.join(WORKING_DIR, "lip_sync_result1.mp4")

# Setup directories
os.makedirs(WORKING_DIR, exist_ok=True)
os.makedirs(os.path.join(WORKING_DIR, "temp"), exist_ok=True)

# Initialize TTS
# One shared TextToSpeech instance, created once here and reused by
# generate_voice_clone on every request.
tts = TextToSpeech()

def generate_voice_clone(audio_file, text, quality):
    """Clone the voice from a reference sample and synthesise ``text`` with it.

    The generated speech is written to ``CLONED_VOICE_PATH``. The reference
    sample is never stored at that path, so a failed generation cannot leave
    the reference audio behind where ``lip_sync`` would mistake it for a
    finished clone.

    Args:
        audio_file: Reference voice sample. Either a path to an audio file or
            a ``(sample_rate, samples)`` tuple.
        text: The text the cloned voice should speak.
        quality: Preset name forwarded to ``tts.tts_with_preset``.

    Returns:
        A ``(path, status)`` pair. ``path`` is ``CLONED_VOICE_PATH`` on
        success and ``None`` on failure; ``status`` is a message for the UI.
        Errors are reported through ``status`` rather than raised.
    """
    # Reject incomplete requests early with a readable message instead of an
    # opaque exception text from deeper in the pipeline.
    if audio_file is None:
        return None, "Error: please provide a reference voice sample."
    if not text or not str(text).strip():
        return None, "Error: please enter the text to speak."

    try:
        if isinstance(audio_file, tuple):
            # In-memory audio: persist it to a scratch file (separate from the
            # output path) so it can be loaded like any other sample.
            # NOTE(review): torchaudio.save is given the array as-is; confirm
            # the expected (channels, frames) layout if this branch is used.
            sample_rate, audio_data = audio_file
            reference_path = os.path.join(WORKING_DIR, "temp", "reference_voice.wav")
            os.makedirs(os.path.dirname(reference_path), exist_ok=True)
            torchaudio.save(reference_path, torch.from_numpy(audio_data).float(), sample_rate)
        else:
            # Load straight from the uploaded file so its real name/extension
            # is preserved instead of being renamed to ".wav".
            reference_path = audio_file

        # Generate cloned voice
        voice_samples = [load_audio(reference_path, 22050)]
        # NOTE(review): diffusion_iterations is passed alongside the preset;
        # confirm whether it is meant to override the preset's own setting.
        gen = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            preset=quality,
            diffusion_iterations=100
        )

        # Save the final output
        torchaudio.save(CLONED_VOICE_PATH, gen.squeeze(0).cpu(), 24000)

        return CLONED_VOICE_PATH, "Voice cloned successfully!"

    except Exception as e:
        # Best effort for the UI: surface the failure as a status message.
        return None, f"Error: {str(e)}"

def lip_sync(video_path):
    """Lip-sync a video to the previously generated cloned voice.

    Steps: resample the cloned voice to 16000 Hz, re-encode the input video
    with libx264/yuv420p, run the Wav2Lip inference script, then convert the
    resulting AVI to MP4.

    Args:
        video_path: The uploaded video. Either a path, a dict with a
            ``"name"`` entry holding the path, or a tuple whose second
            element is the path.

    Returns:
        Always a 3-tuple ``(video, status, audio)`` matching the three outputs
        wired to this handler: the final MP4 path (or ``None``), a status
        message, and the voice file used (or ``None``). Failures are reported
        through ``status`` rather than raised.
    """
    try:
        if not os.path.exists(CLONED_VOICE_PATH):
            # Must be three values like every other exit of this function.
            return None, "Please generate voice clone first!", None

        # Handle video input
        if isinstance(video_path, dict):
            video_path = video_path["name"]
        elif isinstance(video_path, tuple):
            video_path = video_path[1]
        if not video_path:
            return None, "Please upload a video first!", None
        video_path = str(video_path)

        # Create directories
        temp_dir = os.path.join(WORKING_DIR, "temp")
        results_dir = os.path.join(WORKING_DIR, "results")
        os.makedirs(temp_dir, exist_ok=True)
        os.makedirs(results_dir, exist_ok=True)

        # Prepare paths
        processed_video_path = os.path.join(temp_dir, "preprocessed_input.mp4")
        temp_audio_path = os.path.join(temp_dir, "audio.wav")
        temp_output_path = os.path.join(temp_dir, "result.avi")
        final_output_path = os.path.join(results_dir, "lip_sync_output.mp4")

        # All commands are passed as argument lists (no shell): the uploaded
        # file name is user-controlled and may contain spaces, quotes or shell
        # metacharacters.

        # Convert audio to 16000Hz
        subprocess.run(
            ["ffmpeg", "-y", "-i", CLONED_VOICE_PATH, "-ar", "16000", temp_audio_path],
            check=True,
        )

        # Preprocess video (critical step!)
        subprocess.run(
            [
                "ffmpeg", "-y", "-i", video_path,
                "-c:v", "libx264", "-profile:v", "high", "-crf", "20",
                "-pix_fmt", "yuv420p", "-strict", "-2",
                processed_video_path,
            ],
            check=True,
        )

        # Run Wav2Lip with parameters matching the working version
        subprocess.run(
            [
                "python", "/kaggle/input/wav2lip/Wav2Lip/inference.py",
                "--checkpoint_path", "/kaggle/input/wav2lip/Wav2Lip/checkpoints/wav2lip_gan.pth",
                "--face", processed_video_path,
                "--audio", temp_audio_path,
                "--outfile", temp_output_path,
                "--resize_factor", "1",
                "--fps", "25",
                "--face_det_batch_size", "4",
                "--wav2lip_batch_size", "16",
                "--nosmooth",
            ],
            check=True,
        )

        # Convert output to MP4
        subprocess.run(
            ["ffmpeg", "-y", "-i", temp_output_path, "-vcodec", "libx264", "-acodec", "aac", final_output_path],
            check=True,
        )

        if os.path.exists(final_output_path):
            return final_output_path, "Lip-sync complete!", CLONED_VOICE_PATH
        else:
            return None, "Lip-sync failed - no output generated", None

    except subprocess.CalledProcessError as e:
        # e.cmd is an argument list here; join it for a readable message.
        failed_cmd = e.cmd if isinstance(e.cmd, str) else " ".join(map(str, e.cmd))
        return None, f"Command failed: {failed_cmd} with return code {e.returncode}", None
    except Exception as e:
        return None, f"Error: {str(e)}", None
 


# Custom CSS with dark theme.
# Passed to gr.Blocks(css=...) below. The class names used here (header, tab,
# input-section, output-section, btn, status-box) are the ones attached to
# components via elem_classes / inline HTML in the interface definition.
custom_css = """
.gradio-container {
    font-family: 'Helvetica', Arial, sans-serif;
    background: #000000 !important;
    color: #ffffff !important;
}
.header {
    text-align: center;
    margin-bottom: 20px;
    padding: 20px;
    background: #1a1a1a !important;
    border-radius: 12px;
    box-shadow: 0 4px 6px rgba(0,0,0,0.3);
}
.header h1 {
    color: #ffffff !important;
    margin-bottom: 10px;
    font-weight: 700;
}
.header p {
    color: #b3b3b3 !important;
    font-size: 1.1em;
}
.tab {
    background: #2d2d2d !important;
    border-radius: 12px;
    padding: 20px;
    box-shadow: 0 4px 12px rgba(0,0,0,0.3);
    border: 1px solid #404040 !important;
    color: #ffffff !important;
}
.input-section, .output-section {
    background: #3d3d3d !important;
    border-radius: 12px;
    padding: 20px;
    margin-bottom: 20px;
    box-shadow: 0 2px 8px rgba(0,0,0,0.3);
    border: 1px solid #505050 !important;
    color: #ffffff !important;
}
.input-section h2, .output-section h2 {
    color: #ffffff !important;
    margin-top: 0;
    padding-bottom: 12px;
    border-bottom: 2px solid #505050 !important;
    font-weight: 600;
}
.btn {
    background: linear-gradient(135deg, #4a6baf 0%, #3a5a9f 100%) !important;
    color: white !important;
    border: none !important;
    padding: 12px 24px !important;
    border-radius: 8px !important;
    font-weight: 600 !important;
    box-shadow: 0 2px 4px rgba(0,0,0,0.3) !important;
    transition: all 0.3s ease !important;
}
.btn:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 4px 8px rgba(0,0,0,0.4) !important;
    background: linear-gradient(135deg, #3a5a9f 0%, #2a4a8f 100%) !important;
}
.status-box {
    background: #2d2d2d !important;
    padding: 12px;
    border-radius: 8px;
    border-left: 4px solid #4a6baf !important;
    font-family: monospace;
    color: #ffffff !important;
}
label {
    font-weight: 500 !important;
    color: #cccccc !important;
    margin-bottom: 8px !important;
}
.gr-interface {
    background: transparent !important;
}
.tabs {
    gap: 16px !important;
}
.gr-box {
    border-color: #505050 !important;
    background: #3d3d3d !important;
    color: white !important;
}
input, textarea, select {
    background: #2d2d2d !important;
    color: white !important;
    border-color: #505050 !important;
}
"""

# Gradio Interface with dark theme
# Two-tab UI: tab 1 produces the cloned voice, tab 2 lip-syncs an uploaded
# video to that voice. Styling comes from `custom_css`.
with gr.Blocks(css=custom_css, theme=gr.themes.Default(primary_hue="blue")) as app:
    # Page header
    with gr.Column():
        gr.Markdown("""
        <div class="header">
            <h1>🎤 AI Voice Cloning + 🎬 Lip Sync Studio</h1>
            <p>Transform any voice and create perfectly synced videos with cutting-edge AI</p>
        </div>
        """)
    
    with gr.Tabs():
        # Tab 1: reference sample + text + preset -> cloned voice audio
        with gr.Tab("1. Voice Cloning", elem_classes="tab"):
            with gr.Row(equal_height=True):
                with gr.Column(scale=1, elem_classes="input-section"):
                    gr.Markdown("### 🛠️ Input Parameters")
                    # type="filepath": the handler receives the sample as a file path
                    audio_input = gr.Audio(label="🎙️ Reference Voice Sample", 
                                         type="filepath",
                                         elem_id="audio-input")
                    text_input = gr.Textbox(label="📝 Text to Speak", 
                                           placeholder="Type what you want the cloned voice to say...",
                                           lines=3)
                    # The selected value is passed to generate_voice_clone as `quality`
                    quality = gr.Dropdown(
                        label="⚡ Quality Preset", 
                        choices=["fast", "standard", "high_quality"], 
                        value="standard",
                        info="Higher quality = better results but longer processing"
                    )
                    clone_btn = gr.Button("✨ Generate Cloned Voice", elem_classes="btn")
                
                with gr.Column(scale=1, elem_classes="output-section"):
                    gr.Markdown("### 🎧 Results")
                    voice_output = gr.Audio(label="🔊 Cloned Voice Output", 
                                           interactive=False,
                                           elem_id="audio-output")
                    with gr.Group():
                        gr.Markdown("**📊 Status**")
                        clone_status = gr.Textbox(label="", 
                                                show_label=False, 
                                                elem_classes="status-box",
                                                placeholder="Waiting for voice generation...")
        
        # Tab 2: uploaded video -> lip-synced video (uses the voice from tab 1)
        with gr.Tab("2. Lip Sync", elem_classes="tab"):
            with gr.Row(equal_height=True):
                with gr.Column(scale=1, elem_classes="input-section"):
                    gr.Markdown("### 🎥 Video Input")
                    video_input = gr.Video(label="📽️ Upload Target Video", 
                                         elem_id="video-input")
                    sync_btn = gr.Button("🎬 Generate Lip Sync", elem_classes="btn")
                    gr.Markdown("""
                    <div style="color: #b3b3b3; font-size: 0.9em; margin-top: 10px;">
                    ℹ️ You must generate a voice clone in the first tab before lip-syncing.
                    </div>
                    """)
                
                with gr.Column(scale=1, elem_classes="output-section"):
                    gr.Markdown("### 🎞️ Final Output")
                    video_output = gr.Video(label="📺 Lip-Synced Video", 
                                         elem_id="video-output")
                    with gr.Group():
                        gr.Markdown("**📊 Status**")
                        sync_status = gr.Textbox(label="", 
                                               show_label=False, 
                                               elem_classes="status-box",
                                               placeholder="Waiting for lip sync...")
                    voice_review = gr.Audio(label="🔈 Voice Used for Lip Sync", 
                                          interactive=False)
    
    # Voice cloning action
    # Two outputs are wired here: the audio component and the status box.
    clone_btn.click(
        fn=generate_voice_clone,
        inputs=[audio_input, text_input, quality],
        outputs=[voice_output, clone_status]
    )
    
    # Lip-sync action
    # Three outputs are wired here: video, status box and the voice preview.
    sync_btn.click(
        fn=lip_sync,
        inputs=[video_input],
        outputs=[video_output, sync_status, voice_review]
    )

# Start the Gradio app.
# NOTE(review): share=True is understood to request a public share link from
# Gradio - confirm that exposing voice cloning this way is intended.
app.launch(share=True)