In [None]:
!pip install TTS
!pip install -U openai-whisper
!pip install moviepy
!pip install transformers

In [None]:
from pathlib import Path

import whisper
from moviepy.editor import (
    AudioFileClip,
    VideoFileClip,
    concatenate_audioclips,
    concatenate_videoclips,
)
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from TTS.api import TTS

# Disabled imports kept from earlier experiments:
# # from deep_translator import GoogleTranslator
# import os
# # from gtts import gTTS
# from transformers import pipeline
# # import spacy


# --- Speech-to-text -------------------------------------------------------
# Whisper checkpoint name handed to whisper.load_model().
model_type_stt = "medium"
model = whisper.load_model(model_type_stt)

# --- Text-to-speech -------------------------------------------------------
# Multilingual XTTS v2 model; gpu=False keeps synthesis on the CPU.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)

# --- Translation ----------------------------------------------------------
# NLLB-200 (distilled, 600M) tokenizer and seq2seq model share one checkpoint
# name; translate() uses both.
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_t = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Both tables are keyed by the same user-facing (Russian) language label and
# must stay in sync: main() looks the label up in each of them.
#
# The Dutch entry used to be labelled 'Датский' (Danish) and pointed at the
# Danish translation code while the speech table already used Dutch ('nl'),
# so Danish text was synthesized with a Dutch voice. XTTS v2 ships no Danish
# voice, so the entry is now Dutch in both tables.

# Label -> NLLB-200 target-language code consumed by translate().
d_langs_t = {
    'Английский': 'eng_Latn',
    'Испанский': 'spa_Latn',
    'Итальянский': 'ita_Latn',
    'Китайский': 'zho_Hans',
    'Немецкий': 'deu_Latn',
    'Нидерландский': 'nld_Latn',
    'Польский': 'pol_Latn',
    'Португальский': 'por_Latn',
    'Турецкий': 'tur_Latn',
    'Французский': 'fra_Latn',
    'Чешский': 'ces_Latn',
    # 'Японский': 'jpn_Jpan',
}

# Label -> XTTS language code consumed by get_wav_translate().
d_langs_xtts = {
    'Английский': 'en',
    'Испанский': 'es',
    'Итальянский': 'it',
    'Китайский': 'zh-cn',
    'Немецкий': 'de',
    'Нидерландский': 'nl',
    'Польский': 'pl',
    'Португальский': 'pt',
    'Турецкий': 'tr',
    'Французский': 'fr',
    'Чешский': 'cs',
    # 'Японский': 'ja',
}

def extract_audio_from_video(video_file, output_audio_format="mp3"):
    """Write the audio track of ``video_file`` next to it and return its path.

    Args:
        video_file: Path of the source video (``str`` or path-like).
        output_audio_format: Extension, without the dot, of the audio file
            to create. The file keeps the video's directory and base name.

    Returns:
        str: Path of the audio file that was written.

    Raises:
        ValueError: If the video has no audio track.
    """
    # Swap only the final suffix; splitting the whole string on '.' would
    # truncate paths whose directory names contain a dot.
    audio_file = str(Path(video_file).with_suffix(f".{output_audio_format}"))

    video = VideoFileClip(str(video_file))
    try:
        if video.audio is None:
            raise ValueError(f"{str(video_file)!r} has no audio track to extract")
        video.audio.write_audiofile(audio_file, logger=None)  # logger=None silences the progress bar
    finally:
        # Release the clip's readers even when writing fails.
        video.close()
    return audio_file

def transcribe_with_whisper(audio_file, model=model):
    """Run speech recognition on ``audio_file`` and return the raw result.

    Args:
        audio_file: Audio input handed to ``model.transcribe``.
        model: Object exposing ``transcribe``. The default is the
            module-level ``model`` as it exists when this function is
            defined.

    Returns:
        Whatever ``model.transcribe`` returns, unmodified.
    """
    return model.transcribe(audio_file)

def speech2text(video_file):
    """Transcribe the speech of a video.

    The audio track is extracted with ``extract_audio_from_video`` and then
    transcribed with ``transcribe_with_whisper``.

    Args:
        video_file: Path of the video to transcribe.

    Returns:
        dict: ``'timecode_with_text'`` holds one ``[start, end, text]`` list
        per entry of the transcription's ``'segments'``; ``'text'`` holds
        the transcription's ``'text'`` value.
    """
    transcription = transcribe_with_whisper(extract_audio_from_video(video_file))

    timed_rows = []
    for segment in transcription['segments']:
        timed_rows.append([segment['start'], segment['end'], segment['text']])

    return {'timecode_with_text': timed_rows, 'text': transcription['text']}

def cut_video(video_path, intervals):
    """Cut one time range out of a video.

    Args:
        video_path: Path of the source video; it is opened on every call.
        intervals: Sequence whose first two items are the start and end of
            the range. Further items (e.g. the segment text) are ignored.

    Returns:
        list: A single-element list holding the sub-clip. The sub-clip
        depends on the video opened here, so that video is left open.
    """
    video = VideoFileClip(video_path)

    start, end = intervals[0], intervals[1]
    # Timestamps produced by a transcriber can overshoot the real length of
    # the file; keep the range inside the video so no frames past its end
    # are requested.
    if video.duration is not None:
        end = min(end, video.duration)

    return [video.subclip(start, end)]

def concat_video_and_wav(clip, audio_path='./wavs_for_xtts/wav_translated.wav'):
    """Return the first clip of ``clip`` with its audio replaced.

    Args:
        clip: Sequence of video clips (as returned by ``cut_video``); only
            the first element is used.
        audio_path: Audio file to use as the new soundtrack. Defaults to
            the file written by ``get_wav_translate``.

    Returns:
        The clip produced by ``set_audio`` on ``clip[0]``.
    """
    audio = AudioFileClip(audio_path)
    return clip[0].set_audio(audio)
# video_gen_path = '/content/Wav2Lip/results/result_voice.mp4'

def get_wav_translate(text, path_to_speaker, lang):
    """Synthesize ``text`` with the module-level ``tts`` model.

    The result is always written to ``./wavs_for_xtts/wav_translated.wav``.

    Args:
        text: Text to speak.
        path_to_speaker: Passed to the model as ``speaker_wav`` (the
            reference recording for the voice).
        lang: Passed to the model as ``language``.
    """
    print(text, path_to_speaker, lang)

    synthesis_args = {
        "text": text,
        "file_path": "./wavs_for_xtts/wav_translated.wav",
        "speaker_wav": path_to_speaker,
        "language": lang,
    }
    tts.tts_to_file(**synthesis_args)

def concatenate_to_wav(clips, path='./concat_wavs'):
    """Join the audio of ``clips`` and save it as one wav file.

    Args:
        clips: Iterable of clips; each one's ``audio`` attribute is used.
        path: Directory prefix of the output. The file name is always
            ``wav_after_concat.wav``.
    """
    merged_audio = concatenate_audioclips([segment.audio for segment in clips])
    target = path + '/wav_after_concat.wav'
    merged_audio.write_audiofile(target)

def translate(text, target_language):
    """Translate ``text`` with the module-level NLLB model and tokenizer.

    Args:
        text: Text to translate.
        target_language: NLLB language code of the output (for example
            ``'fra_Latn'``), i.e. one of the values of ``d_langs_t``.

    Returns:
        list[str]: Decoded translations as returned by
        ``tokenizer.batch_decode``; callers take element ``0``.
    """
    # NOTE(review): the tokenizer's source language is never set, so it stays
    # at the tokenizer's default — confirm that matches the transcript language.
    inputs = tokenizer(text, return_tensors="pt")

    # Language codes are ordinary tokens of the vocabulary. Looking the id up
    # this way works on every transformers release, whereas the
    # `lang_code_to_id` mapping no longer exists on recent NLLB tokenizers.
    target_token_id = tokenizer.convert_tokens_to_ids(target_language)

    outputs = model_t.generate(**inputs, forced_bos_token_id=target_token_id)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
def main(video_file, lang='Английский'):
    """Dub ``video_file`` into ``lang`` and save it under ``./result_videos``.

    The video is transcribed, then every transcribed segment is cut out,
    translated, re-voiced using the segment's own audio as the speaker
    reference, and finally all re-voiced segments are joined into
    ``./result_videos/<video name>.mp4``.

    Args:
        video_file: Path of the source video.
        lang: Language label; must be a key of both ``d_langs_t`` and
            ``d_langs_xtts``. Any other label falls back to English.

    Returns:
        str: ``'OK'`` once the result video has been written.

    Raises:
        ValueError: If the transcription contains no segments.
    """
    # Resolve both codes together so that translation and speech synthesis
    # can never end up with different languages.
    if lang in d_langs_t and lang in d_langs_xtts:
        lang_t, lang_xtts = d_langs_t[lang], d_langs_xtts[lang]
    else:
        print(f"Language {lang!r} is not supported; falling back to English.")
        lang_t, lang_xtts = 'eng_Latn', 'en'

    segments = speech2text(video_file)['timecode_with_text']
    if not segments:
        raise ValueError(f"No speech segments were transcribed from {str(video_file)!r}")

    wav_dir = Path("./wavs_for_xtts")
    for folder in (wav_dir, Path("./concat_wavs"), Path("./result_videos")):
        folder.mkdir(exist_ok=True)

    name = Path(video_file).stem
    synthesized_wav = wav_dir / "wav_translated.wav"  # where get_wav_translate() writes
    source_clips = []
    lst_clips = []
    final_clip = None

    try:
        for index, segment in enumerate(segments):
            clips = cut_video(video_file, segment)
            source_clips.extend(clips)

            # The segment's original audio is the voice reference for cloning.
            concatenate_to_wav(clips)
            translated_text = translate(segment[-1], lang_t)[0]
            get_wav_translate(translated_text, './concat_wavs/wav_after_concat.wav', lang_xtts)

            # Audio clips read their file on demand, and the synthesized file
            # is overwritten by the next segment. Give each segment its own
            # copy so earlier segments keep their speech until the final
            # video is written.
            segment_wav = wav_dir / f"{name}_{index:05d}.wav"
            synthesized_wav.replace(segment_wav)
            lst_clips.append(clips[0].set_audio(AudioFileClip(str(segment_wav))))

        # Concatenate the video clips
        final_clip = concatenate_videoclips(lst_clips)

        # Write the final clip to an mp4 file
        final_clip.write_videofile(f"./result_videos/{name}.mp4")
    finally:
        # One video is opened per segment; release all of them (and the
        # per-segment audio readers) whether or not writing succeeded.
        for clip in [final_clip, *lst_clips, *source_clips]:
            if clip is not None:
                clip.close()

    return 'OK'


# Example usage:
if __name__ == "__main__":
    video_file = "/content/drive/MyDrive/wav2lip/33.mp4"  # Update with your video file path
    # NOTE(review): 'Японский' has no active entry in d_langs_t / d_langs_xtts
    # (both entries are commented out), so main() resolves this run to its
    # English defaults — confirm that is intended or pick a listed language.
    lang = 'Японский'
    print(main(video_file, lang))