<a href="https://colab.research.google.com/github/VinzentBuecheler/Deepfake/blob/main/DeepFake_Generator_GradioUI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Fake Video Generator (Run in Google Colab)

## 1. Text translation (German → English) - Setup

In [1]:
import functools


@functools.lru_cache(maxsize=1)
def _load_de_en_translator(model_name="facebook/wmt19-de-en"):
  """Load the FSMT tokenizer and model for ``model_name`` once and cache them.

  Missing packages are installed on demand (first call only) instead of on
  every translation. Only the packages this cell actually uses are installed.

  Returns:
    A ``(tokenizer, model)`` tuple.
  """
  import importlib
  import subprocess
  import sys

  try:
    # sacremoses is installed next to transformers because the FSMT tokenizer
    # is expected to need it -- NOTE(review): confirm against the installed
    # transformers version.
    import sacremoses  # noqa: F401
    from transformers import FSMTForConditionalGeneration, FSMTTokenizer
  except ImportError:
    # Use the kernel's own interpreter so the packages land in the environment
    # this notebook is running in.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-q",
         "transformers", "torch", "sacremoses"])
    importlib.invalidate_caches()
    from transformers import FSMTForConditionalGeneration, FSMTTokenizer

  tokenizer = FSMTTokenizer.from_pretrained(model_name)
  model = FSMTForConditionalGeneration.from_pretrained(model_name)
  return tokenizer, model


def translation_en_de(input):
  """Translate German text to English with ``facebook/wmt19-de-en``.

  Despite the ``en_de`` in the function name, the model loaded here is the
  German -> English one; the name is kept so existing callers keep working.

  Args:
    input: German source text. (The parameter name shadows the built-in
      ``input``; it is kept for backward compatibility.)

  Returns:
    The decoded English translation, or ``""`` when ``input`` is ``None``,
    empty or whitespace-only (the model is not loaded in that case).
  """
  if input is None or not str(input).strip():
    return ""

  tokenizer, model = _load_de_en_translator()
  input_ids = tokenizer.encode(input, return_tensors="pt")
  outputs = model.generate(input_ids)
  # outputs[0] is the first (only) generated sequence for the single input.
  return tokenizer.decode(outputs[0], skip_special_tokens=True)

## 2. Voice Cloning - Setup

In [2]:
SAMPLE_RATE = 22050
embedding = None

def voice_cloning(text,celebName): #voice_cloning_setup(celebName):

  #2.1 Setup
  %tensorflow_version 1.x
  import os
  from os.path import exists, join, basename, splitext

  from IPython.display import display, Audio, clear_output
  
  git_repo_url = 'https://github.com/CorentinJ/Real-Time-Voice-Cloning.git'
  project_name = splitext(basename(git_repo_url))[0]
  if not exists(project_name):
    # clone and install
    !git clone -q --recursive {git_repo_url}
    # install dependencies
    !cd {project_name} && pip install -q -r requirements.txt
    !pip install -q --upgrade gdown
    !apt-get install -qq libportaudio2
    !pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip

    # download pretrained model
    !cd {project_name} && wget https://github.com/blue-fish/Real-Time-Voice-Cloning/releases/download/v1.0/pretrained.zip && unzip -o pretrained.zip
    !cd {project_name} && mkdir -p saved_models/default/
    !cd {project_name}/saved_models/default/ && gdown https://drive.google.com/uc?id=1f9z6OHKwCRa7CteX6AV5XN68CCPHwCI1 #https://drive.google.com/uc?id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1
    !cd {project_name}/saved_models/default/ && gdown https://drive.google.com/uc?id=19Uqcr2an7ha0Xymur4AtXV7a9lZN7mqj #https://drive.google.com/uc?id=1EqFMIbvxffxtjiVrtykroF6_mUh-5Z3s
    !cd {project_name}/saved_models/default/ && gdown https://drive.google.com/uc?id=14qJzfTehtjvBwUBlWFWnvZnfYvT9m9aW #https://drive.google.com/uc?id=1cf2NO6FtI0jDuy8AV3Xgn6leO6dHjIgu

  import sys
  sys.path.append(project_name)

  from IPython.utils import io
  import ipywidgets as widgets
  import numpy as np
  from dl_colab_notebooks.audio import record_audio, upload_audio

  from synthesizer.inference import Synthesizer
  from encoder import inference as encoder
  from vocoder import inference as vocoder
  from pathlib import Path

  !ls

  encoder.load_model(project_name / Path("saved_models/default/encoder.pt"))
  synthesizer = Synthesizer(project_name / Path("saved_models/default/synthesizer.pt"))
  vocoder.load_model(project_name / Path("saved_models/default/vocoder.pt"))
  
  #2.2 Generating Embedding

  #Choose between the celebrity audios.
  if celebName == "Trump":
    !cd sample_data && gdown https://drive.google.com/uc?id=1i0WhVsQh-7ptZQ5TTUfmsOo_yhnAqu3_
    audio = "/content/sample_data/Trump_WEF_2018-trimmed.mp3"
  elif celebName == "Merkel":
    !cd sample_data && gdown https://drive.google.com/uc?id=1cuHpYW8slLF34Es5HPBY3zUUDc_SMKgW
    audio = "/content/sample_data/Angela Merkel final.mp3"
  elif celebName == "Mickey":
    # Mickey gdrive link
    audio = "/content/sample_data/Mickey_audio.mp3"
  elif celebName == "Modi":
    # Modi gdrive link
    audio = "/content/sample_data/Modi_audio.mp3"

  embedding = None
  embedding = encoder.embed_utterance(encoder.preprocess_wav(audio, SAMPLE_RATE))

  if embedding is None:
    print("Error fetching the reference audio file. Check the link to gdrive file")
  else:
    
    #2.3 Synthesizing new Audio
    from scipy.io.wavfile import write
    print("Synthesizing new audio...")
    #with io.capture_output() as captured:
    specs = synthesizer.synthesize_spectrograms([text], [embedding])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    # clear_output()
    #display(Audio(generated_wav, rate=synthesizer.sample_rate, autoplay=False))

    #Save the generated audio file in a directory.
    scaled_audio = np.int16(generated_wav/np.max(np.abs(generated_wav)) * 32767)
    
    if celebName == "Trump":
      outputAudioName = 'voiceClone_output_Trump.wav'
    elif celebName == "Merkel":
      outputAudioName = 'voiceClone_output_Merkel.wav'
    elif celebName == "Mickey":
      outputAudioName = 'voiceClone_output_Mickey.wav'
    elif celebName == "Modi":
      outputAudioName = 'voiceClone_output_Modi.wav'

    write(outputAudioName, synthesizer.sample_rate, scaled_audio )

## 3. Lip Syncing - Setup

In [3]:
def lip_syncing_setup():
  !git clone https://github.com/Rudrabha/Wav2Lip.git
  !cd /content/Wav2Lip/checkpoints/ && gdown https://drive.google.com/uc?id=1by1m-0RCx5v34G0ejXy9Zt6wNueNaDpW

  !cd /content/Wav2Lip && pip install -r requirements.txt

  !wget "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth" -O "Wav2Lip/face_detection/detection/sfd/s3fd.pth"

  #Fetch reference videos of
  #Trump
  !cd /content/sample_data && gdown https://drive.google.com/uc?id=1KgJd4Jix3U7lr2BYymb5u8uGD0rjHzDW
  #Merkel
  !cd /content/sample_data && gdown https://drive.google.com/uc?id=15EfLI5b_bNrmcmHKovb5QtbxyG8eJV0p
  #Mickey
  # Modi gdrive link
  #Modi
  # Modi gdrive link

def lip_syncing(celebName):
  
  lip_syncing_setup()

  if celebName == "Trump":
    !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip.pth --face "../sample_data/Trim.mp4" --audio "/content/voiceClone_output_Trump.wav"
  elif celebName == "Merkel":
    !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip.pth --face "../sample_data/Merkel (online-video-cutter.com).mp4" --audio "/content/voiceClone_output_Merkel.wav"
  elif celebName == "Mickey":
    !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip.pth --face "../sample_data/Mickey_video.mp4" --audio "/content/voiceClone_output_Mickey.wav"
  elif celebName == "Modi":
    !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip.pth --face "../sample_data/Modi_video.mp4" --audio "/content/voiceClone_output_Modi.wav"

## 4. Pipelining - Setup

In [4]:
def create_deepfake(inputText, celebName):
  """Run the full pipeline: translate, clone the voice, lip-sync the video.

  Args:
    inputText: Text passed to ``translation_en_de``, e.g.
      "Ich freue mich, heute so viele Gesichter zu sehen. Ist es nicht
      erstaunlich, dass mein Video heute für die Demo gefälscht ist?"
      (roughly: "Happy to see so many faces today. Isn't this amazing that my
      video is faked for demo today?").
    celebName: Celebrity whose voice and video are used.

  Returns:
    Path of the generated video, ``/content/Wav2Lip/results/result_voice.mp4``.

  Raises:
    RuntimeError: If the pipeline finished without producing the video.
  """
  import os

  result_path = '/content/Wav2Lip/results/result_voice.mp4'

  # The shell steps inside the pipeline do not raise when they fail. Remove any
  # video left over from a previous run so a failure cannot be mistaken for
  # success and return a stale result.
  if os.path.exists(result_path):
    os.remove(result_path)

  final_text = translation_en_de(inputText)
  voice_cloning(final_text,celebName)
  lip_syncing(celebName)

  if not os.path.exists(result_path):
    raise RuntimeError(
        "Deepfake generation did not produce %s; check the cell output of the "
        "voice cloning and lip syncing steps." % result_path)
  return result_path

## 5. Deep Fake creation - Execution & Output

In [None]:
!pip install gradio
from gradio.templates import Text
import gradio as gr

demo = gr.Interface(fn=create_deepfake, 
                    inputs=[gr.inputs.Textbox(label="Input a text in German"), 
                            gr.inputs.Dropdown(["Trump", "Merkel","Mickey", "Modi"], 
                                               label = "Choose a Celebrity")], 
                    outputs="playable_video")
demo.launch(inbrowser=True,show_error=True, debug=True)