# Real-Time Voice Cloning

This is a colab demo notebook using the open source project [CorentinJ/Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning)
to clone a voice.

For other deep-learning Colab notebooks, visit [tugstugi/dl-colab-notebooks](https://github.com/tugstugi/dl-colab-notebooks).


Original issue: https://github.com/tugstugi/dl-colab-notebooks/issues/18

## Setup CorentinJ/Real-Time-Voice-Cloning

In [2]:
!pip install -r requirements_deepFake.txt
# !pip install multiprocess
# !pip install multiprocess
# !pip install librosa
# !pip install llvmlite
# !pip install numba
# !pip install llvmlite 0.33
# !pip uninstall -y numba
# !pip uninstall -y llvmlite

# !pip --use-feature=2020-resolver install numba==0.43.0
# !pip --use-feature=2020-resolver install llvmlite==0.31.0
# !pip install llvm
# !pip install fastparquet
# !pip install --upgrade pip
# !python --version
# !sudo apt install python3-llvmlite


Collecting appdirs==1.4.4
  Downloading https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl
Collecting audioread==2.1.8
  Downloading https://files.pythonhosted.org/packages/2e/0b/940ea7861e0e9049f09dcfd72a90c9ae55f697c17c299a323f0148f913d2/audioread-2.1.8.tar.gz
Collecting cffi==1.14.2
[?25l  Downloading https://files.pythonhosted.org/packages/82/7e/9cc46f072c9a414b5a6e08c5c2da5db3bff2601e69c4a6d4f6a34e6f9cfc/cffi-1.14.2-cp36-cp36m-manylinux1_x86_64.whl (400kB)
[K     |████████████████████████████████| 409kB 10.1MB/s eta 0:00:01
Collecting dill==0.3.2
[?25l  Downloading https://files.pythonhosted.org/packages/e2/96/518a8ea959a734b70d2e95fef98bcbfdc7adad1c1e5f5dd9148c835205a5/dill-0.3.2.zip (177kB)
[K     |████████████████████████████████| 184kB 16.9MB/s eta 0:00:01
Collecting inflect==4.1.0
  Downloading https://files.pythonhosted.org/packages/b3/27/15edd6e1519f4e489ff50def9367a62d138baa9c000

In [13]:
!pip freeze > requirements_deepFake.txt

In [3]:
#@title Setup CorentinJ/Real-Time-Voice-Cloning

#@markdown * clone the project
#@markdown * download pretrained models
#@markdown * initialize the voice cloning models

# %tensorflow_version 1.x
import os
from os.path import exists, join, basename, splitext

# git_repo_url = 'https://github.com/CorentinJ/Real-Time-Voice-Cloning.git'
# project_name = splitext(basename(git_repo_url))[0]
# print(project_name)
# if not exists(project_name):
#     print("Downloading Files")
#     # clone and install
#     !git clone -q --recursive {git_repo_url}
#     # install dependencies
#     !cd {project_name} && pip install -q -r requirements.txt
#     !pip install -q gdown
#     !apt-get install -qq libportaudio2
#     !pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip

#   # download pretrained model
#     !cd {project_name} && gdown https://drive.google.com/uc?id=1n1sPXvT34yXFLT47QZA6FIRGrwMeSsZc && unzip pretrained.zip
BASE_PATH_VOICE_CLONE = "./voice_clone/"
import sys
sys.path.append(BASE_PATH_VOICE_CLONE)
import numpy as np
from pathlib import Path
from scipy.io import wavfile
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder


SAMPLE_RATE = 22050
embedding = None


# loading Models
encoder.load_model(BASE_PATH_VOICE_CLONE / Path("encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(BASE_PATH_VOICE_CLONE / Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"))
vocoder.load_model(BASE_PATH_VOICE_CLONE / Path("vocoder/saved_models/pretrained/pretrained.pt"))
print("All models Load Sucessfully")

Loaded encoder "pretrained.pt" trained to step 1564501
Found synthesizer "pretrained" trained to step 278000
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at voice_clone/vocoder/saved_models/pretrained/pretrained.pt
All models Load Sucessfully


  warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")


In [4]:
import librosa

def _compute_embedding(audio):
    '''
    Description 
        Loading Embedding from the audio file to clone
        
    Input:
        audio: Audio File 
        
    Output
        Embeddings
    
    '''
    global embedding
    embedding = None
    embedding = encoder.embed_utterance(encoder.preprocess_wav(audio, SAMPLE_RATE))

def read_audio_file(path):
    samples, sample_rate = librosa.load(path)
    _compute_embedding(samples)

audio_file_path = "./voices/sundarPichai.wav"
read_audio_file(audio_file_path)
print("Embedding Loads Sucessfully")

Embedding Loads Sucessfully


In [8]:

def clone_voice(text):
    
    def synthesize(embed, text):
        print("Synthesizing new audio...")
        #with io.capture_output() as captured:
        specs = synthesizer.synthesize_spectrograms([text], [embed])
        generated_wav = vocoder.infer_waveform(specs[0])
        generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
        print(type(generated_wav))
#         display(Audio(generated_wav, rate=synthesizer.sample_rate, autoplay=True))

    if embedding is None:
        print("first record a voice or upload a voice file!")
    else:
        synthesize(embedding, text)
        print("Voice Clonned Sucessfully")
        
text = "I am bhola record I am here to see you in the middle of the earth hello there" #@param {type:"string"}
clone_voice(text)

Synthesizing new audio...
{| ████████████████ 76000/76800 | Batch Size: 8 | Gen Rate: 2.6kHz | }<class 'numpy.ndarray'>
Voice Clonned Sucessfully


In [None]:
import os
import sys
sys.path.append("./Deepfake/Wav2Lip-master/")
checkpoint_path = "./Deepfake/weights/wav2lip_gan.pth"
video_path = "./Deepfake/Wav2Lip-master/samples/WhatsApp Video 2020-08-30 at 3.52.46 PM (1).mp4"
audio_path = "./Deepfake/Wav2Lip-master/samples/sundarPichai.wav"
infrence_path = "./Deepfake/Wav2Lip-master/inference.py"



# !wget "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth" 
!cd './Deepfake/Wav2Lip-master/' && python "inference.py"  --checkpoint_path "./weights/wav2lip_gan.pth" --face "./samples/WhatsApp Video 2020-08-30 at 3.52.46 PM (1).mp4" --audio './samples/sundarPichai.wav'


Using cpu for inference.
Reading video frames...
Number of frames available for inference: 100
(80, 24001)
Length of mel chunks: 4183
  0%|                                                    | 0/33 [00:00<?, ?it/s]
  0%|                                                     | 0/7 [00:00<?, ?it/s][A
 14%|██████▍                                      | 1/7 [00:46<04:37, 46.30s/it][A
 29%|████████████▊                                | 2/7 [01:31<03:49, 45.83s/it][A
 43%|███████████████████▎                         | 3/7 [02:15<03:01, 45.50s/it][A
 57%|█████████████████████████▋                   | 4/7 [03:00<02:15, 45.31s/it][A
 71%|████████████████████████████████▏            | 5/7 [03:45<01:30, 45.09s/it][A
 86%|██████████████████████████████████████▌      | 6/7 [04:29<00:44, 44.96s/it][A
100%|█████████████████████████████████████████████| 7/7 [04:41<00:00, 40.17s/it][A
Load checkpoint from: ./weights/wav2lip_gan.pth
Model loaded
  3%|█▏                                       | 1/33

In [15]:
from os import listdir, path
import numpy as np
import scipy, cv2, os, sys, argparse, audio
import json, subprocess, random, string
from tqdm import tqdm
from glob import glob
import torch, face_detection
from models import Wav2Lip

parser = argparse.ArgumentParser(description='Inference code to lip-sync videos in the wild using Wav2Lip models')

parser.add_argument('--checkpoint_path', type=str, 
					help='Name of saved checkpoint to load weights from', required=True)

parser.add_argument('--face', type=str, 
					help='Filepath of video/image that contains faces to use', required=True)
parser.add_argument('--audio', type=str, 
					help='Filepath of video/audio file to use as raw audio source', required=True)
parser.add_argument('--outfile', type=str, help='Video path to save result. See default for an e.g.', 
								default='results/result_voice.mp4')

parser.add_argument('--static', type=bool, 
					help='If True, then use only first video frame for inference', default=False)
parser.add_argument('--fps', type=float, help='Can be specified only if input is a static image (default: 25)', 
					default=25., required=False)

parser.add_argument('--pads', nargs='+', type=int, default=[0, 10, 0, 0], 
					help='Padding (top, bottom, left, right). Please adjust to include chin at least')

parser.add_argument('--face_det_batch_size', type=int, 
					help='Batch size for face detection', default=16)
parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip model(s)', default=128)

parser.add_argument('--resize_factor', default=1, type=int, 
			help='Reduce the resolution by this factor. Sometimes, best results are obtained at 480p or 720p')

args = parser.parse_args()
img_size = 96
static = False
fps = 25
pads = [0, 10, 0, 0]
face_det_batch_size = 16
wav2lip_batch_size = 128
resize_factor



if os.path.isfile(video_path) and video_path.split('.')[1] in ['jpg', 'png', 'jpeg']:
	args.static = True

def get_smoothened_boxes(boxes, T):
	for i in range(len(boxes)):
		if i + T > len(boxes):
			window = boxes[len(boxes) - T:]
		else:
			window = boxes[i : i + T]
		boxes[i] = np.mean(window, axis=0)
	return boxes

def face_detect(images):
	detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D, 
											flip_input=False, device=device)

	batch_size = args.face_det_batch_size
	
	while 1:
		predictions = []
		try:
			for i in tqdm(range(0, len(images), batch_size)):
				predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size])))
		except RuntimeError:
			if batch_size == 1: 
				raise RuntimeError('Image too big to run face detection on GPU. Please use the --resize_factor argument')
			batch_size //= 2
			print('Recovering from OOM error; New batch size: {}'.format(batch_size))
			continue
		break

	results = []
	pady1, pady2, padx1, padx2 = args.pads
	for rect, image in zip(predictions, images):
		if rect is None:
			raise ValueError('Face not detected! Ensure the video contains a face in all the frames.')

		y1 = max(0, rect[1] - pady1)
		y2 = min(image.shape[0], rect[3] + pady2)
		x1 = max(0, rect[0] - padx1)
		x2 = min(image.shape[1], rect[2] + padx2)
		
		results.append([x1, y1, x2, y2])

	boxes = get_smoothened_boxes(np.array(results), T=5)
	results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)]

	del detector
	return results 

def datagen(frames, mels):
	img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

	if not args.static:
		face_det_results = face_detect(frames) # BGR2RGB for CNN face detection
	else:
		face_det_results = face_detect([frames[0]])

	for i, m in enumerate(mels):
		idx = 0 if args.static else i%len(frames)
		frame_to_save = frames[idx].copy()
		face, coords = face_det_results[idx].copy()

		face = cv2.resize(face, (img_size, img_size))
			
		img_batch.append(face)
		mel_batch.append(m)
		frame_batch.append(frame_to_save)
		coords_batch.append(coords)

		if len(img_batch) >= args.wav2lip_batch_size:
			img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)

			img_masked = img_batch.copy()
			img_masked[:, img_size//2:] = 0

			img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
			mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])

			yield img_batch, mel_batch, frame_batch, coords_batch
			img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

	if len(img_batch) > 0:
		img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)

		img_masked = img_batch.copy()
		img_masked[:, img_size//2:] = 0

		img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
		mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])

		yield img_batch, mel_batch, frame_batch, coords_batch

mel_step_size = 16
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} for inference.'.format(device))

def _load(checkpoint_path):
	if device == 'cuda':
		checkpoint = torch.load(checkpoint_path)
	else:
		checkpoint = torch.load(checkpoint_path,
								map_location=lambda storage, loc: storage)
	return checkpoint

def load_model(path):
	model = Wav2Lip()
	print("Load checkpoint from: {}".format(path))
	checkpoint = _load(path)
	s = checkpoint["state_dict"]
	new_s = {}
	for k, v in s.items():
		new_s[k.replace('module.', '')] = v
	model.load_state_dict(new_s)

	model = model.to(device)
	return model.eval()

def main():
	if not os.path.isfile(video_path):
		fnames = list(glob(os.path.join(video_path, '*.jpg')))
		sorted_fnames = sorted(fnames, key=lambda f: int(os.path.basename(f).split('.')[0]))
		full_frames = [cv2.imread(f) for f in sorted_fnames]

	elif video_path.split('.')[1] in ['jpg', 'png', 'jpeg']:
		full_frames = [cv2.imread(video_path)]
		fps = args.fps

	else:
		video_stream = cv2.VideoCapture(video_path)
		fps = video_stream.get(cv2.CAP_PROP_FPS)

		print('Reading video frames...')

		full_frames = []
		while 1:
			still_reading, frame = video_stream.read()
			if not still_reading:
				video_stream.release()
				break
			if args.resize_factor > 1:
				frame = cv2.resize(frame, (frame.shape[1]//args.resize_factor, frame.shape[0]//args.resize_factor))

			full_frames.append(frame)

	print ("Number of frames available for inference: "+str(len(full_frames)))

	if not audio_path.endswith('.wav'):
		print('Extracting raw audio...')
		command = 'ffmpeg -y -i {} -strict -2 {}'.format(audio_path, 'temp/temp.wav')

		subprocess.call(command, shell=True)
		audio_path = 'temp/temp.wav'

	wav = audio.load_wav(audio_path, 16000)
	mel = audio.melspectrogram(wav)
	print(mel.shape)

	if np.isnan(mel.reshape(-1)).sum() > 0:
		raise ValueError('Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')

	mel_chunks = []
	mel_idx_multiplier = 80./fps 
	i = 0
	while 1:
		start_idx = int(i * mel_idx_multiplier)
		if start_idx + mel_step_size > len(mel[0]):
			break
		mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
		i += 1

	print("Length of mel chunks: {}".format(len(mel_chunks)))

	full_frames = full_frames[:len(mel_chunks)]

	batch_size = args.wav2lip_batch_size
	gen = datagen(full_frames.copy(), mel_chunks)

	for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen, 
											total=int(np.ceil(float(len(mel_chunks))/batch_size)))):
		if i == 0:
			model = load_model(checkpoint_path)
			print ("Model loaded")

			frame_h, frame_w = full_frames[0].shape[:-1]
			out = cv2.VideoWriter('temp/result.avi', 
									cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))

		img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
		mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)

		with torch.no_grad():
			pred = model(mel_batch, img_batch)

		pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
		
		for p, f, c in zip(pred, frames, coords):
			y1, y2, x1, x2 = c
			p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))

			f[y1:y2, x1:x2] = p
			out.write(f)

	out.release()

	command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format(audio_path, 'temp/result.avi', args.outfile)
	subprocess.call(command, shell=True)

if __name__ == '__main__':
	main()


mv: cannot stat 'home/Deepfake/weights': No such file or directory
