# Installation

In [6]:
# !pip install openai-whisper
# !pip install ffmpeg-python
# !pip install transformers
# !pip install yt_dlp
# !pip install moviepy==1.0.3
# !pip install gtts
# !pip install pydub
!pip install gradio
# !pip install openai-whisper
# !pip install google-generativeai
# !pip install gTTS
# !pip install pydub
# !pip install moviepy
# !pip install spleeter



# Importing dependencies

In [1]:
import gradio as gr
import whisper
import google.generativeai as genai
from gtts import gTTS
from pydub import AudioSegment
import os
from moviepy.editor import VideoFileClip , AudioFileClip
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from datetime import timedelta
from gtts import gTTS
from pydub import AudioSegment
import re
from datetime import datetime

# Extract Audio from Video

In [2]:
def extract_audio(video_path):
    """Extract the audio track of a video into ``temp_audio.wav``.

    Args:
        video_path (str): Path to the input video file.

    Returns:
        str: The fixed output path ``"temp_audio.wav"`` (relative to the
        current working directory; overwritten on every call).

    Raises:
        ValueError: If the video has no audio track.
    """
    output_path = "temp_audio.wav"
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(output_path)
    finally:
        # Always release the reader resources held by the clip, even on failure.
        video.close()
    print("Extracted Audio")
    return output_path

# print(extract_audio("/content/temp/test_video_2.mp4"))

# Extract Vocals from audio

In [3]:
def isolate_vocals(audio_path):
    """Split an audio file into vocals and accompaniment with the spleeter CLI.

    Runs ``spleeter separate -p spleeter:2stems -o output <audio_path>``.
    The stems are expected under ``output/<audio file name without extension>/``,
    the same layout ``process_video`` relies on for the accompaniment track.

    Args:
        audio_path (str): Path to the audio file to separate.

    Returns:
        str: Path to the isolated vocals, ``output/<stem>/vocals.wav``.

    Raises:
        FileNotFoundError: If the ``spleeter`` executable is not installed.
        subprocess.CalledProcessError: If spleeter exits with a non-zero status.
    """
    import subprocess  # local import: only this function launches an external process

    # Argument list instead of a shell string: paths containing spaces or shell
    # metacharacters are passed through verbatim and cannot be interpreted.
    subprocess.run(
        ["spleeter", "separate", "-p", "spleeter:2stems", "-o", "output", audio_path],
        check=True,  # fail loudly instead of reporting success for a failed run
    )
    print("Extracted Vocals")

    # Derive the output folder from the input name rather than assuming "temp_audio".
    stem = os.path.splitext(os.path.basename(audio_path))[0]
    return f"output/{stem}/vocals.wav"  # Path to isolated vocals

# print(isolate_vocals("/content/temp_audio.wav"))

# Extracting text, translating it, and creating subtitles

In [None]:

# Load models (module-level, shared by the functions below)

# Speech-to-text: Whisper "base" checkpoint.
whisper_model = whisper.load_model("base")

# Text-to-text translation: M2M100 checkpoint plus its matching tokenizer.
translation_model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(translation_model_name)
translator = M2M100ForConditionalGeneration.from_pretrained(translation_model_name)
# Run the translator on the GPU when one is available, otherwise on the CPU.
translator = translator.to("cuda" if torch.cuda.is_available() else "cpu")

def format_timestamp(seconds):
    """Format a time offset as an SRT timestamp ``HH:MM:SS,mmm``.

    Keeps millisecond precision (rounded to the nearest millisecond) so that
    segments shorter than one second do not collapse to zero length.

    Args:
        seconds (float): Offset from the start of the media, in seconds.
            Negative values are clamped to zero.

    Returns:
        str: Zero-padded timestamp, e.g. ``"00:01:05,250"``.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

def translate(text, target_lang, src_lang="en"):
    """Translate ``text`` with the module-level M2M100 ``translator``.

    Args:
        text (str): Text to translate.
        target_lang (str): Target language code, resolved through
            ``tokenizer.get_lang_id``.
        src_lang (str): Source language code assigned to ``tokenizer.src_lang``.
            Defaults to ``"en"``, the value that used to be hard-coded.

    Returns:
        str: The decoded translation, or ``""`` when ``text`` is empty or
        whitespace-only (the model is not invoked in that case).
    """
    # Nothing to translate: skip the model instead of letting it generate
    # output from an empty prompt.
    if not text or not text.strip():
        return ""

    tokenizer.src_lang = src_lang
    encoded = tokenizer(text, return_tensors="pt").to(translator.device)
    generated = translator.generate(
        **encoded,
        # Forces the first generated token to be the target-language tag.
        forced_bos_token_id=tokenizer.get_lang_id(target_lang),
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def transcribe_and_translate(audio_path, target_lang="hi", save_srt=True,
                             srt_path="translated_subtitles.srt"):
    """Transcribe audio with Whisper and write translated subtitles.

    Each Whisper segment is translated with ``translate`` and emitted as one
    subtitle block (index, ``start --> end`` timing line, translated text).
    Segments whose text is empty after stripping are skipped, so the file never
    contains blocks without a text line.

    Args:
        audio_path (str): Path to the audio file to transcribe.
        target_lang (str): Target language code passed to ``translate``.
        save_srt (bool): When True, write the subtitles to ``srt_path``.
        srt_path (str): Destination subtitle file. Defaults to the previously
            hard-coded ``"translated_subtitles.srt"``.

    Returns:
        str: ``srt_path``. The path is returned even when ``save_srt`` is False,
        in which case no file is written by this call.
    """
    # NOTE(review): translate() treats its input as English, while Whisper
    # transcribes in the spoken language — confirm inputs are English speech.
    result = whisper_model.transcribe(audio_path, verbose=True)

    # Collect blocks in a list and join once instead of growing a string.
    blocks = []
    for segment in result["segments"]:
        original_text = segment["text"].strip()
        if not original_text:
            continue  # nothing to translate or speak for this segment

        start = format_timestamp(segment["start"])
        end = format_timestamp(segment["end"])
        translated_text = translate(original_text, target_lang)

        # Number blocks consecutively, counting only the ones actually written.
        blocks.append(f"{len(blocks) + 1}\n{start} --> {end}\n{translated_text}\n\n")

    srt_output = "".join(blocks)

    if save_srt:
        with open(srt_path, "w", encoding="utf-8") as f:
            f.write(srt_output)
    print("translated text")
    return srt_path

# === Run it ===
# audio_path = "/content/temp_audio.wav"
# print(transcribe_and_translate(audio_path, target_lang="hi"))


In [5]:
from pydub import AudioSegment
from datetime import datetime
import re
from gtts import gTTS
import os

def time_to_ms(time_str):
    """Convert an ``H:M:S`` timestamp to integer milliseconds.

    Accepts SRT-style ``HH:MM:SS,mmm`` as well as ``H:MM:SS`` and a ``.``
    decimal separator; surrounding whitespace is ignored.

    Args:
        time_str (str): Timestamp with exactly three ``:``-separated fields.

    Returns:
        int: Milliseconds, rounded to the nearest integer.

    Raises:
        ValueError: If the string does not have three numeric fields.
    """
    h, m, s = (float(part) for part in time_str.strip().replace(",", ".").split(":"))
    # round() rather than int(): a float product such as 1.001 * 1000 lands just
    # below the integer and would otherwise be truncated to 1000.
    return int(round((h * 3600 + m * 60 + s) * 1000))

def sync_audio(srt_path, output_path="synced_translated_audio.mp3", lang='hi', speed_factor=1.3):
    """
    Generate synchronized audio from translated subtitles

    Every subtitle block is spoken with gTTS, optionally sped up, trimmed to
    its time slot and placed on a single timeline padded with silence. Blocks
    that cannot be parsed or synthesized are reported and skipped.

    Args:
        srt_path (str): Path to SRT subtitle file
        output_path (str): Output audio file path
        lang (str): Language code for TTS (default: 'hi' for Hindi)
        speed_factor (float): Audio speed multiplier (1.0 = normal). Values of
            1.0 or less leave the speech at its original speed.

    Returns:
        str: Path to generated audio file
    """
    import tempfile  # local import: private scratch directory for per-line TTS clips

    # Read SRT file
    with open(srt_path, "r", encoding="utf-8") as f:
        content = f.read()

    blocks = re.split(r"\n\s*\n", content.strip())
    final_audio = AudioSegment.silent(duration=0)

    # A unique temporary directory avoids clashing with (or deleting) a
    # pre-existing "temp_tts" folder and is removed even if an error occurs.
    with tempfile.TemporaryDirectory(prefix="temp_tts_") as tts_dir:
        for i, block in enumerate(blocks):
            lines = block.strip().split("\n")
            if len(lines) < 3:
                continue

            # Parse subtitle block
            timing = lines[1]
            text = " ".join(lines[2:])
            try:
                start_time, end_time = timing.split("-->")
                start_ms = time_to_ms(start_time)
                end_ms = time_to_ms(end_time)
            except ValueError as e:
                # Malformed timing line: skip this block, keep the rest.
                print(f"⚠️ Error processing block {i+1}: {str(e)}")
                continue

            print(f"🔊 [{start_time.strip()}] {text}")

            # Add silence gap if needed. The timeline position is always the
            # real length of the audio built so far, so a failed block cannot
            # leave a stale position behind and shift later lines.
            current_time = len(final_audio)
            if start_ms > current_time:
                final_audio += AudioSegment.silent(duration=start_ms - current_time)

            # Generate TTS audio
            temp_file = os.path.join(tts_dir, f"line_{i+1}.mp3")
            try:
                gTTS(text=text, lang=lang).save(temp_file)
                segment = AudioSegment.from_mp3(temp_file)

                if speed_factor > 1.0:
                    try:
                        segment = segment.speedup(playback_speed=speed_factor)
                    except Exception as e:
                        # NOTE(review): pydub appears to refuse very short clips;
                        # keep the line at normal speed rather than dropping it.
                        print(f"⚠️ Could not speed up block {i+1}: {str(e)}")

                # Trim to fit the time slot, measured from where the clip really
                # starts on the timeline; never negative, since a negative slice
                # bound would cut from the end instead.
                slot_start = max(start_ms, len(final_audio))
                max_duration = max(0, end_ms - slot_start)
                if len(segment) > max_duration:
                    segment = segment[:max_duration]

                final_audio += segment

            except Exception as e:
                print(f"⚠️ Error processing block {i+1}: {str(e)}")
                continue

    # Export final audio (clips are already in memory, scratch files are gone)
    final_audio.export(output_path, format="mp3")

    print(f"✅ Successfully generated synchronized audio at: {output_path}")
    print("audio_synced.")
    return output_path

# Example usage
# audio_path = sync_audio(
#     srt_path="translated_subtitles.srt",
#     output_path="final_audio.mp3",
#     lang='hi',
#     speed_factor=1.25
# )

In [6]:
def merge_audio(vocals_path, original_audio_path):
    """Overlay a voice track on the background music and export the mix.

    The background is resolved from ``original_audio_path`` (previously the
    argument was ignored). The first existing file among these is used:

    1. ``output/<stem of original_audio_path>/accompaniment.wav`` — for callers
       that pass the original audio that was given to spleeter;
    2. ``original_audio_path`` itself — for callers that already pass the
       accompaniment file (as ``process_video`` does);
    3. ``output/temp_audio/accompaniment.wav`` — the previously hard-coded path.

    Args:
        vocals_path (str): Path to the voice track to lay over the background.
        original_audio_path (str): Original audio path or accompaniment path.

    Returns:
        str: The fixed output path ``"final_audio.wav"``.

    Raises:
        FileNotFoundError: If none of the candidate background files exists.
    """
    stem = os.path.splitext(os.path.basename(original_audio_path))[0]
    candidates = [
        f"output/{stem}/accompaniment.wav",
        original_audio_path,
        "output/temp_audio/accompaniment.wav",
    ]
    background_path = next((path for path in candidates if os.path.exists(path)), None)
    if background_path is None:
        raise FileNotFoundError(
            f"No background audio found; looked for: {', '.join(candidates)}"
        )

    background = AudioSegment.from_file(background_path)
    voice = AudioSegment.from_file(vocals_path)
    combined = background.overlay(voice)
    combined.export("final_audio.wav", format="wav")
    print("merged audio")
    return "final_audio.wav"

# print(merge_audio("final_audio.mp3", "/content/temp_audio.wav"))

In [9]:
from moviepy.editor import VideoFileClip, AudioFileClip
import gradio as gr
import os


def process_video(video_path, target_langs):
    """
    Process video and translate to multiple languages

    The audio is extracted and separated once; then, for each language, the
    speech is transcribed and translated, spoken with TTS, mixed with the
    background track and muxed back onto the original video.

    Args:
        video_path (str): Path to input video
        target_langs (list): List of language codes (e.g., ['hi', 'ta', 'te'])

    Returns:
        dict: Paths to translated videos {lang: video_path}
    """
    results = {}

    # Step 1: Extract and process original audio
    audio_path = extract_audio(video_path)
    # Called for its side effect: it produces the accompaniment stem used below.
    isolate_vocals(audio_path)
    bg_audio_path = f"output/{os.path.splitext(os.path.basename(audio_path))[0]}/accompaniment.wav"

    for lang in target_langs:
        print(f"\n🌐 Processing {lang} translation...")

        # Step 2: Transcribe and translate
        # NOTE: this re-runs the Whisper transcription for every language.
        srt_path = transcribe_and_translate(audio_path, target_lang=lang)

        # Step 3: Generate synchronized TTS audio
        synced_audio_path = sync_audio(srt_path, lang=lang)

        # Step 4: Merge with background
        final_audio_path = merge_audio(synced_audio_path, bg_audio_path)

        # Step 5: Create translated video
        output_path = f"output_video_{lang}.mp4"
        video_clip = VideoFileClip(video_path)
        try:
            audio_clip = AudioFileClip(final_audio_path)
            try:
                final_clip = video_clip.set_audio(audio_clip)
                final_clip.write_videofile(output_path)
            finally:
                # Close per-language clips so their readers do not pile up
                # across iterations.
                audio_clip.close()
        finally:
            video_clip.close()

        results[lang] = output_path
        print(f"✅ {lang.upper()} version saved to {output_path}")

    return results

# Gradio Interface with Multi-Language Support
# (display name, language code) pairs, in the order shown in the UI.
# NOTE(review): confirm every code is accepted by both the translation
# tokenizer and gTTS before offering it to users.
lang_choices = list({
    "Hindi": "hi",
    "Tamil": "ta",
    "Telugu": "te",
    "Bengali": "bn",
    "Marathi": "mr",
    "Gujarati": "gu",
    "Kannada": "kn",
    "Malayalam": "ml",
    "Punjabi": "pa",
    "Odia": "or",
    "Assamese": "as",
    "Urdu": "ur",
    "Sanskrit": "sa",
}.items())

def gradio_interface(video, language_checkboxes):
    """Gradio callback: translate the uploaded video into the ticked languages.

    Args:
        video (str): Path of the uploaded video as provided by ``gr.Video``;
            empty/None when nothing was uploaded.
        language_checkboxes (list): Display names ticked in the checkbox group.

    Returns:
        list: Paths of the translated videos, one per selected language, in
        ``lang_choices`` order.

    Raises:
        gr.Error: If no video was uploaded or no language was selected. Gradio
            shows the message in the UI; the previous plain-string return value
            is not a valid payload for the gallery output.
    """
    if not video:
        raise gr.Error("⚠️ Please upload a video")

    # Treat a missing selection as empty instead of failing on `in None`.
    chosen_names = set(language_checkboxes or [])
    selected_langs = [code for (name, code) in lang_choices if name in chosen_names]
    if not selected_langs:
        raise gr.Error("⚠️ Please select at least one language")

    results = process_video(video, selected_langs)
    return list(results.values())

# Create Gradio interface: a video upload and language checkboxes as inputs,
# a gallery as output, wired to gradio_interface by the "Translate" button.
with gr.Blocks() as demo:
    gr.Markdown("## 🌍 Multi-Language Video Translator")

    # Top row: the source video next to the gallery that receives the results.
    with gr.Row():
        video_input = gr.Video(label="Input Video")
        # NOTE(review): gradio_interface returns a list of video file paths;
        # confirm the installed Gradio version renders videos in a Gallery.
        output_gallery = gr.Gallery(label="Translated Videos")

    # Checkbox labels are the display names from lang_choices; "Hindi" is pre-selected.
    lang_checkboxes = gr.CheckboxGroup(
        choices=[name for (name, code) in lang_choices],
        label="Target Languages",
        value=["Hindi"]
    )

    # Clicking the button calls gradio_interface(video, selected names) and
    # routes its return value to the gallery.
    submit_btn = gr.Button("Translate")
    submit_btn.click(
        fn=gradio_interface,
        inputs=[video_input, lang_checkboxes],
        outputs=output_gallery
    )

# Start the app. Per the recorded output below, Colab enables share=True
# automatically; pass debug=True to see errors inside the notebook.
demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d819dcc8115d26ebcc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


