<a href="https://colab.research.google.com/github/Nishanth-nishu/NDVI_/blob/main/OpenVoice_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup: fetch the OpenVoice sources and install them together
# with the extra packages used by the rest of the notebook.
# Clone the OpenVoice repository
!git clone https://github.com/myshell-ai/OpenVoice.git
# Move into the checkout so the editable install below can refer to ".".
%cd OpenVoice

# Install dependencies
# Editable install of the cloned package; this also pulls in its own pinned requirements.
!pip install -e .
# NOTE(review): these packages are unpinned, so the versions installed may vary between runs.
!pip install torch torchaudio numpy scipy soundfile gradio
# MeloTTS is installed straight from its Git repository.
!pip install git+https://github.com/myshell-ai/MeloTTS.git
!python -m unidic download  # Required for multi-lingual text processing


Cloning into 'OpenVoice'...
remote: Enumerating objects: 453, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 453 (delta 4), reused 3 (delta 3), pack-reused 447 (from 2)[K
Receiving objects: 100% (453/453), 3.85 MiB | 12.77 MiB/s, done.
Resolving deltas: 100% (214/214), done.
/content/OpenVoice
Obtaining file:///content/OpenVoice
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting librosa==0.9.1 (from MyShell-OpenVoice==0.0.0)
  Downloading librosa-0.9.1-py3-none-any.whl.metadata (6.9 kB)
Collecting faster-whisper==0.9.0 (from MyShell-OpenVoice==0.0.0)
  Downloading faster_whisper-0.9.0-py3-none-any.whl.metadata (11 kB)
Collecting pydub==0.25.1 (from MyShell-OpenVoice==0.0.0)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting wavmark==0.0.3 (from MyShell-OpenVoice==0.0.0)
  Downloading wavmark-0.0.3-py3-none-any.whl.metadata (5.0 kB)
Collecting numpy==1.22.0 (from MyShell-Open

Collecting git+https://github.com/myshell-ai/MeloTTS.git
  Cloning https://github.com/myshell-ai/MeloTTS.git to /tmp/pip-req-build-4hugu_ri
  Running command git clone --filter=blob:none --quiet https://github.com/myshell-ai/MeloTTS.git /tmp/pip-req-build-4hugu_ri
  Resolved https://github.com/myshell-ai/MeloTTS.git to commit 209145371cff8fc3bd60d7be902ea69cbdb7965a
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting txtsplit (from melotts==0.1.2)
  Downloading txtsplit-1.0.0.tar.gz (6.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cached_path (from melotts==0.1.2)
  Downloading cached_path-1.6.7-py3-none-any.whl.metadata (19 kB)
Collecting transformers==4.27.4 (from melotts==0.1.2)
  Downloading transformers-4.27.4-py3-none-any.whl.metadata (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting num2words==0.5.12 (from melotts==0.1.2)
  Downloading num2words-0

In [1]:
from IPython.display import HTML, display, clear_output
import ipywidgets as widgets
from google.colab import files
import os

class VideoDubbingUI:
    """Notebook widget panel for a Japanese-to-English video dubbing demo.

    Constructing an instance builds and displays the whole panel: upload
    button, (locked) language selectors, start/pause/stop controls, a progress
    bar with a status line, a preview placeholder, feedback inputs and a
    download button.

    The processing itself is simulated: ``start_process`` only walks through
    ``PROCESS_STEPS`` with a delay between them.

    Attributes:
        upload_path: File name of the uploaded video, or ``None`` before upload.
        last_feedback: Dict with the most recently submitted feedback, or
            ``None`` if nothing has been submitted yet.
    """

    # Names of the simulated pipeline stages, reported in this order.
    PROCESS_STEPS = (
        "Speech recognition",
        "Translation",
        "Voice synthesis",
        "Lip sync",
        "Final processing",
    )

    # Seconds to wait per simulated stage; set to 0 to run through instantly.
    STEP_DELAY_SECONDS = 1

    def __init__(self):
        self.upload_path = None
        self.last_feedback = None
        self.create_ui()

    def create_ui(self):
        """Create every widget, wire the click handlers and display the layout."""
        # Title
        display(HTML("<h2>Japanese-to-English Video Dubbing System</h2>"))

        # File Upload Section
        self.upload_button = widgets.Button(
            description='Upload Video',
            button_style='primary',
            icon='upload'
        )
        self.upload_button.on_click(self.handle_upload)
        self.file_label = widgets.Label('No file selected')

        # Language Selection (for future extensibility)
        # Both dropdowns offer a single option and are disabled, so they are
        # purely informational for now.
        self.source_lang = widgets.Dropdown(
            options=['Japanese'],
            value='Japanese',
            description='Source:',
            disabled=True
        )

        self.target_lang = widgets.Dropdown(
            options=['English'],
            value='English',
            description='Target:',
            disabled=True
        )

        # Process Control
        # All three start disabled; Start is enabled once a file is uploaded.
        self.start_button = widgets.Button(
            description='Start Process',
            button_style='success',
            disabled=True
        )
        self.start_button.on_click(self.start_process)

        self.pause_button = widgets.Button(
            description='Pause',
            button_style='warning',
            disabled=True
        )
        self.pause_button.on_click(self.pause_process)

        self.stop_button = widgets.Button(
            description='Stop',
            button_style='danger',
            disabled=True
        )
        self.stop_button.on_click(self.stop_process)

        # Progress Bar
        self.progress = widgets.FloatProgress(
            value=0,
            min=0,
            max=100,
            description='Progress:',
            bar_style='info',
            orientation='horizontal'
        )

        # Status Message
        self.status = widgets.HTML(value="System Ready")

        # Preview Section (placeholder)
        self.preview = widgets.HTML(value='<div style="background-color: #f0f0f0; padding: 10px; text-align: center;">Video preview will appear here</div>')

        # Feedback Section
        # Disabled until processing completes (see enable_feedback).
        self.translation_rating = widgets.IntSlider(
            value=5,
            min=1,
            max=10,
            description='Translation:',
            disabled=True
        )

        self.voice_rating = widgets.IntSlider(
            value=5,
            min=1,
            max=10,
            description='Voice:',
            disabled=True
        )

        self.comments = widgets.Textarea(
            placeholder='Additional comments...',
            disabled=True
        )

        self.feedback_button = widgets.Button(
            description='Submit Feedback',
            button_style='info',
            disabled=True
        )
        self.feedback_button.on_click(self.submit_feedback)

        # Download Section
        self.download_button = widgets.Button(
            description='Download Result',
            button_style='success',
            disabled=True
        )
        self.download_button.on_click(self.download_result)

        # Layout
        upload_box = widgets.VBox([self.upload_button, self.file_label])
        lang_box = widgets.HBox([self.source_lang, self.target_lang])
        control_box = widgets.HBox([self.start_button, self.pause_button, self.stop_button])
        progress_box = widgets.VBox([self.progress, self.status])
        feedback_box = widgets.VBox([
            self.translation_rating,
            self.voice_rating,
            self.comments,
            self.feedback_button
        ])

        main_layout = widgets.VBox([
            upload_box,
            lang_box,
            control_box,
            progress_box,
            self.preview,
            feedback_box,
            self.download_button
        ])

        display(main_layout)

    def handle_upload(self, button):
        """Handle video file upload.

        Opens the Colab upload dialog. If at least one file comes back, the
        first file name is remembered in ``upload_path`` and the Start button
        is enabled. Any exception is reported in the status line instead of
        being raised.
        """
        try:
            uploaded = files.upload()
            if uploaded:
                # Only the first uploaded file is used.
                filename = list(uploaded.keys())[0]
                self.upload_path = filename
                self.file_label.value = f'Uploaded: {filename}'
                self.start_button.disabled = False
                self.status.value = "Video uploaded successfully"
        except Exception as e:
            self.status.value = f"Upload error: {str(e)}"

    def start_process(self, button):
        """Start the (simulated) dubbing process.

        Steps through ``PROCESS_STEPS``, updating the progress bar and status
        line after each one, then unlocks the feedback and download controls
        and returns the process buttons to their idle state.

        NOTE(review): the loop runs synchronously inside the click callback,
        so Pause/Stop clicks presumably cannot take effect until it returns.
        Confirm, and move the work off the callback if real pausing is needed.
        """
        import time

        self.progress.value = 0
        self.status.value = "Processing started..."
        self.start_button.disabled = True
        self.pause_button.disabled = False
        self.stop_button.disabled = False

        # Simulate processing steps (replace with actual processing)
        total_steps = len(self.PROCESS_STEPS)
        for step_number, step_name in enumerate(self.PROCESS_STEPS, start=1):
            time.sleep(self.STEP_DELAY_SECONDS)
            # Derive progress from the step count so it always ends at 100.
            self.progress.value = 100 * step_number / total_steps
            self.status.value = f"Step {step_number}/{total_steps}: {step_name}"

        self.status.value = "Processing complete!"
        # Nothing is running any more: Pause/Stop must not stay clickable and
        # Start has to be available again for another run.
        self.start_button.disabled = False
        self.pause_button.disabled = True
        self.stop_button.disabled = True
        self.enable_feedback()

    def pause_process(self, button):
        """Pause the current process (updates the status and re-enables Start)."""
        self.status.value = "Process paused"
        self.start_button.disabled = False

    def stop_process(self, button):
        """Stop the current process and reset progress and control buttons."""
        self.progress.value = 0
        self.status.value = "Process stopped"
        self.start_button.disabled = False
        self.pause_button.disabled = True
        self.stop_button.disabled = True

    def enable_feedback(self):
        """Enable feedback controls and the download button after processing."""
        self.translation_rating.disabled = False
        self.voice_rating.disabled = False
        self.comments.disabled = False
        self.feedback_button.disabled = False
        self.download_button.disabled = False

    def submit_feedback(self, button):
        """Handle feedback submission.

        Collects both ratings and the comment text into a dict, keeps it in
        ``last_feedback`` so it is not lost, and thanks the user.
        """
        self.last_feedback = {
            'translation_rating': self.translation_rating.value,
            'voice_rating': self.voice_rating.value,
            'comments': self.comments.value
        }
        self.status.value = "Thank you for your feedback!"

    def download_result(self, button):
        """Handle result download (placeholder: only updates the status line)."""
        if self.upload_path:
            # In a real implementation, this would download the processed video
            self.status.value = "Download started..."

# Create and display the UI
# Instantiation is all that is needed: __init__ calls create_ui(), which renders the widgets.
dubbing_ui = VideoDubbingUI()

VBox(children=(VBox(children=(Button(button_style='primary', description='Upload Video', icon='upload', style=…

In [None]:
# First cell - Install dependencies
%%capture
!apt-get update && apt-get install -y ffmpeg
!pip install moviepy pysrt deep_translator pydub
!git clone https://github.com/myshell-ai/OpenVoice
!git clone https://github.com/myshell-ai/MeloTTS
!pip install git+https://github.com/myshell-ai/MeloTTS.git
!python -m unidic download
!mv OpenVoice/checkpoints .
!pip install torch==2.0.1 fairseq soundfile praat-parselmouth pytest-shutil torchcrepe
!pip install -e OpenVoice

# Second cell - Import required libraries
import os
import torch
import moviepy.editor as mp
import pysrt
from deep_translator import GoogleTranslator
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
from pydub import AudioSegment
import tempfile
from melotts import FastSpeech2TTSP
import numpy as np
import soundfile as sf

# Third cell - Initialize OpenVoice and MeloTTS
def initialize_openvoice():
    """Load the OpenVoice tone-color converter and make sure the output folder exists.

    The converter config and weights are read from ``checkpoints/converter``
    relative to the current working directory. CUDA device 0 is used when
    PyTorch reports CUDA as available, otherwise the CPU.

    Returns:
        tuple: ``(tone_color_converter, output_dir, device)`` where
        ``output_dir`` is ``'outputs'`` and ``device`` is the device string
        the converter was created with.
    """
    converter_root = 'checkpoints/converter'

    if torch.cuda.is_available():
        device = "cuda:0"
    else:
        device = "cpu"

    # Create the output folder up front; an existing folder is not an error.
    output_dir = 'outputs'
    os.makedirs(output_dir, exist_ok=True)

    config_path = f'{converter_root}/config.json'
    weights_path = f'{converter_root}/checkpoint.pth'

    tone_color_converter = ToneColorConverter(config_path, device=device)
    tone_color_converter.load_ckpt(weights_path)

    return tone_color_converter, output_dir, device

# Initialize MeloTTS for different languages
def setup_melotts(device):
    """Load one TTS model per supported language.

    NOTE(review): ``FastSpeech2TTSP`` and the checkpoint ids below are taken
    on trust from the notebook's imports - confirm that the installed MeloTTS
    package really exposes this class and these model names.

    Args:
        device: Device string forwarded to every ``from_pretrained`` call.

    Returns:
        dict: Language code (``'en'``, ``'es'``, ``'fr'``, ``'de'``, ``'zh'``,
        ``'ja'``, ``'ko'``) mapped to its loaded model, in that order.
    """
    # (language code, pretrained checkpoint id), loaded in this order.
    checkpoints = (
        ('en', 'melotts/fastspeech2-en-ljspeech'),
        ('es', 'melotts/fastspeech2-es-css10'),
        ('fr', 'melotts/fastspeech2-fr-css10'),
        ('de', 'melotts/fastspeech2-de-css10'),
        ('zh', 'melotts/fastspeech2-zh-aishell3'),
        ('ja', 'melotts/fastspeech2-ja-jsut'),
        ('ko', 'melotts/fastspeech2-ko-kss'),
    )

    tts_models = {}
    for lang_code, checkpoint_id in checkpoints:
        tts_models[lang_code] = FastSpeech2TTSP.from_pretrained(checkpoint_id, device=device)
    return tts_models

# Fourth cell - Translation and audio generation functions
def translate_srt(input_srt, target_languages, source_language="en"):
    """Translate a subtitle file into several languages.

    One file named ``subtitles_<lang>.srt`` is written (UTF-8) to the current
    working directory for every entry in ``target_languages``.

    Every language is translated from the ORIGINAL subtitle text. The text is
    captured once before the loop, because the subtitle objects are reused
    (and overwritten) for each language; without the snapshot the second
    language would be produced from the first language's output.

    Args:
        input_srt: Path of the subtitle file to read.
        target_languages: Iterable of target language codes.
        source_language: Language code of ``input_srt``. Defaults to ``"en"``.

    Returns:
        bool: ``True`` when every language was written, ``False`` if any error
        occurred (the error is printed, not raised).
    """
    try:
        subs = pysrt.open(input_srt)
        # Snapshot of the untranslated text, in subtitle order.
        original_texts = [sub.text for sub in subs]

        for lang in target_languages:
            translator = GoogleTranslator(source=source_language, target=lang)

            for sub, original_text in zip(subs, original_texts):
                translated_text = translator.translate(original_text)
                # Keep the original line if the translator gives nothing back.
                sub.text = original_text if translated_text is None else translated_text

            output_srt = f"subtitles_{lang}.srt"
            subs.save(output_srt, encoding="utf-8")
            print(f"Translated subtitles saved to: {output_srt}")

        return True
    except Exception as e:
        print(f"Error during translation: {e}")
        return False

def generate_dubbed_audio(srt_file, output_audio, lang, tone_color_converter, source_se, target_se, tts_model):
    """Synthesize one audio track that follows the timing of a subtitle file.

    For every subtitle the text is synthesized with ``tts_model``, passed
    through ``tone_color_converter`` and placed on a timeline: silence is
    inserted so the line starts at the subtitle's start time, and the line is
    cut off at the subtitle's end time. The joined track is exported as MP3.

    The silence is measured against the audio assembled so far, not against
    the previous subtitle's end time. A spoken line that is shorter than its
    subtitle slot therefore no longer pulls all following lines earlier.

    NOTE(review): ``tts_model.synthesize`` / ``tts_model.sampling_rate`` are
    used exactly as the surrounding notebook expects them - confirm they match
    the TTS object actually passed in.

    Args:
        srt_file: Path of the subtitle file to voice.
        output_audio: Path of the MP3 file to write.
        lang: Language code, only used in the completion message.
        tone_color_converter: Object whose ``convert`` method rewrites a WAV file.
        source_se: Passed to ``convert`` as ``src_se``.
        target_se: Passed to ``convert`` as ``tgt_se``.
        tts_model: Object providing ``synthesize(text)`` and ``sampling_rate``.

    Raises:
        ValueError: If the subtitle file contains no entries.
    """
    subs = pysrt.open(srt_file)
    audio_segments = []
    timeline_ms = 0  # length of the audio assembled so far, in milliseconds

    # All scratch files live in one directory that is removed on exit, even
    # when synthesis or conversion raises part-way through.
    with tempfile.TemporaryDirectory() as work_dir:
        for index, sub in enumerate(subs):
            text = sub.text.replace("\n", " ")
            base_wav = os.path.join(work_dir, f"tts_{index}.wav")
            converted_wav = os.path.join(work_dir, f"converted_{index}.wav")

            # Generate speech with the TTS model
            audio_array = tts_model.synthesize(text)
            sf.write(base_wav, audio_array, tts_model.sampling_rate)

            # Convert tone color
            tone_color_converter.convert(
                audio_src_path=base_wav,
                src_se=source_se,
                tgt_se=target_se,
                output_path=converted_wav,
                message="@MyShell"
            )

            # Load and normalise the spoken line
            audio_clip = AudioSegment.from_wav(converted_wav)
            audio_clip = audio_clip.set_frame_rate(22050).set_channels(1)

            # Subtitle times are used as whole milliseconds to avoid float rounding.
            start_ms = sub.start.ordinal
            slot_ms = max(sub.end.ordinal - sub.start.ordinal, 0)

            # Pad with silence up to the subtitle start.
            if start_ms > timeline_ms:
                silence = AudioSegment.silent(duration=start_ms - timeline_ms, frame_rate=22050)
                audio_segments.append(silence)
                timeline_ms += len(silence)

            # Never let a line run past the end of its subtitle.
            audio_clip = audio_clip[:slot_ms]
            audio_segments.append(audio_clip)
            timeline_ms += len(audio_clip)

    if not audio_segments:
        raise ValueError(f"No subtitles found in {srt_file}")

    final_audio = sum(audio_segments)
    final_audio.export(output_audio, format="mp3")
    print(f"Generated dubbed audio for language {lang}")

# Fifth cell - Video processing function
def merge_subtitles_audio_video(video_path, audio_path, subtitles_path, output_path):
    """Render a video with the dubbed audio track and burned-in subtitles.

    The audio replaces the video's own track. An audio track longer than the
    video is cut to the video's length. A shorter one is used as it is - it
    is deliberately NOT looped, because looping would replay the dubbed
    speech from the beginning once the track runs out.

    Every subtitle becomes a white caption at the bottom centre, shown from
    its start time for its duration. The result is written with H.264 video
    and AAC audio.

    Errors are printed rather than raised, so a caller processing several
    videos keeps going. All opened clips are closed afterwards in either case.

    Args:
        video_path: Path of the source video.
        audio_path: Path of the dubbed audio track.
        subtitles_path: Path of the subtitle file to render.
        output_path: Path of the video file to write.
    """
    video = audio = final_video = None
    try:
        video = mp.VideoFileClip(video_path)
        audio = mp.AudioFileClip(audio_path)

        if audio.duration > video.duration:
            audio = audio.subclip(0, video.duration)

        subs = pysrt.open(subtitles_path)
        subtitle_clips = []

        for sub in subs:
            subtitle = mp.TextClip(
                sub.text,
                fontsize=24,
                color='white',
                font='DejaVuSans',
                method='caption',
                size=(video.w, None)
            ).set_start(
                sub.start.ordinal / 1000.0
            ).set_duration(
                (sub.end.ordinal - sub.start.ordinal) / 1000.0
            ).set_pos(('center', 'bottom'))

            subtitle_clips.append(subtitle)

        final_video = mp.CompositeVideoClip([video.set_audio(audio), *subtitle_clips])

        final_video.write_videofile(
            output_path,
            codec='libx264',
            audio_codec='aac'
        )

        print(f"Video with subtitles and dubbed audio created successfully: {output_path}")

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Release whatever was opened, including after a failure.
        for clip in (final_video, audio, video):
            if clip is not None:
                clip.close()

# Sixth cell - Main processing function
def process_multiple_languages(video_file, reference_speaker, subtitles_file_template, output_file_template, languages):
    """Produce one dubbed, subtitled video per requested language.

    The converter and TTS models are loaded once and the reference speaker's
    embedding is extracted once; then, for each language, the dubbed audio is
    generated from that language's subtitle file and merged into the video.

    A language is skipped (with a message) when no TTS model is available for
    it or when its subtitle file does not exist, so one missing input does
    not abort the remaining languages.

    NOTE(review): the same embedding is passed as both source and target to
    the tone-color conversion, which presumably leaves the voice unchanged -
    confirm whether the source should be the TTS voice's own embedding.

    Args:
        video_file: Path of the source video.
        reference_speaker: Path of the reference speaker recording.
        subtitles_file_template: Subtitle path pattern containing ``{lang}``.
        output_file_template: Output video path pattern containing ``{lang}``.
        languages: Iterable of language codes to process.
    """
    # Initialize OpenVoice and MeloTTS
    tone_color_converter, output_dir, device = initialize_openvoice()
    tts_models = setup_melotts(device)

    # Extract reference speaker embedding
    source_se, _ = se_extractor.get_se(reference_speaker, tone_color_converter, vad=True)
    target_se = source_se  # For voice cloning, we use the same embedding

    for lang in languages:
        print(f"Processing language: {lang}")

        if lang not in tts_models:
            print(f"Skipping unsupported language: {lang}")
            continue

        # Construct file paths
        dubbed_audio_file = f"{output_dir}/audio_{lang}_synced.mp3"
        subtitles_file = subtitles_file_template.format(lang=lang)
        output_video_file = output_file_template.format(lang=lang)

        # Without subtitles there is nothing to voice for this language.
        if not os.path.isfile(subtitles_file):
            print(f"Skipping {lang}: subtitle file not found: {subtitles_file}")
            continue

        # Generate dubbed audio
        generate_dubbed_audio(
            subtitles_file,
            dubbed_audio_file,
            lang,
            tone_color_converter,
            source_se,
            target_se,
            tts_models[lang]
        )

        # Merge everything together
        merge_subtitles_audio_video(
            video_file,
            dubbed_audio_file,
            subtitles_file,
            output_video_file
        )

# Seventh cell - Run the process
# Upload your video and reference audio files to Colab first
# Inputs - replace the placeholder paths with the files you uploaded.
languages = ["en", "es", "fr", "de", "ja", "zh", "ko"]  # Supported languages
video_file = "your_video.mp4"  # Replace with your video file path
reference_speaker = "your_reference.mp3"  # Replace with your reference audio file path

# File name patterns; "{lang}" is replaced by each language code in turn.
subtitles_file_template = "subtitles_{lang}.srt"
output_file_template = "{lang}_final_video_with_subtitles.mp4"

# Run the whole pipeline once for every language listed above.
process_multiple_languages(
    video_file=video_file,
    reference_speaker=reference_speaker,
    subtitles_file_template=subtitles_file_template,
    output_file_template=output_file_template,
    languages=languages,
)