# Demonstration of Voice Synthesis using Only a Single Face Photo

### Please first set up an environment that includes the following libraries
* face-alignment (pip install face-alignment)
* numpy
* cv2 (pip install opencv-python)
* torch
* torchvision
* matplotlib

### Please go through each of the following steps

#### Step 1: Put all test images in the folder "../Test_Images/". Name each person's face photo as "{NAME}.jpg".

#### Step 2: Modify the following parameters

In [None]:
# Name of the person to process. It must match the name of the image file
# (for example, person = "Alice" for the photo named Alice.jpg).
person = "Alice"
# The number of audio output files to generate.
num = 100
# The content of the audio to be generated. Make sure the spoken numbers match
# the digits displayed on the screen (e.g., 12345678).
text = 'One two three four five six seven eight.'
# Latent size handed to the CVAE model; 48 is used by default.
latent_dim = 48

#### Step 3: Run the following cell. The audio outputs will be written to the output folder "../Test_Outputs/" and named after the person (e.g., Alice_1.wav).

In [None]:
import face_alignment
import cv2
import numpy as np
from synthesize import *
from torchvision import transforms
import matplotlib.pyplot as plt
from cvae_model import CVAE
import torch

# Shared landmark detector; align() reads it as a module-level global.
# Newer face-alignment releases name the 3D landmark type THREE_D while older
# ones name it _3D, so use whichever attribute this installation provides.
_landmarks_type = getattr(face_alignment.LandmarksType, "THREE_D", None)
if _landmarks_type is None:
    _landmarks_type = face_alignment.LandmarksType._3D

# Run the detector on the GPU when one is available and on the CPU otherwise,
# matching the CPU fallback used for the rest of the pipeline.
fa = face_alignment.FaceAlignment(
    _landmarks_type,
    flip_input=False,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

def _eye_center(landmark, indices):
    """Return the integer (x, y) centroid of the landmark rows in ``indices``."""
    eye_points = landmark[indices]
    # Each coordinate is truncated to an int before averaging, and the mean is
    # truncated again, so the centre is always a whole-pixel position.
    xs = np.array([int(point[0]) for point in eye_points])
    ys = np.array([int(point[1]) for point in eye_points])
    return (xs.mean().astype("int"), ys.mean().astype("int"))


def align(image, desiredLeftEye=(0.3, 0.3), desiredFaceWidth=256):
    """Rotate, scale and crop ``image`` so the eyes land at fixed positions.

    Landmarks are obtained from the module-level ``fa`` detector. The two eye
    centres define an affine transform that scales the inter-eye distance to
    ``(1 - 2 * desiredLeftEye[0]) * desiredFaceWidth`` pixels and moves the
    midpoint between the eyes to
    ``(0.5 * desiredFaceWidth, desiredLeftEye[1] * desiredFaceWidth)``.

    Args:
        image: Image array; it is passed to ``fa.get_landmarks`` and
            ``cv2.warpAffine``.
        desiredLeftEye: Fractional (x, y) position of the left eye in the
            output image.
        desiredFaceWidth: Side length, in pixels, of the square output image.

    Returns:
        The warped ``desiredFaceWidth`` x ``desiredFaceWidth`` image, or
        ``None`` (after printing ``'Not Aligned'``) when the detector does not
        report exactly one face or the two eye centres coincide.
    """
    ########################## Detect 68 landmarks ##########################
    detections = fa.get_landmarks(image)
    # Alignment is only attempted when exactly one face was detected;
    # otherwise None is returned.
    if detections is None or len(detections) != 1:
        print('Not Aligned')
        return None

    landmark = detections[0]

    # Landmark indices of the two eye contours.
    right_eye_points = list(range(36, 42))
    left_eye_points = list(range(42, 48))

    # compute the center of mass for each eye
    leftEyeCenter = _eye_center(landmark, left_eye_points)
    rightEyeCenter = _eye_center(landmark, right_eye_points)

    # offset between the eye centroids
    dY = rightEyeCenter[1] - leftEyeCenter[1]
    dX = rightEyeCenter[0] - leftEyeCenter[0]

    # A zero inter-eye distance would make the scale below infinite, so such a
    # degenerate detection is treated as an alignment failure.
    dist = np.sqrt((dX ** 2) + (dY ** 2))
    if dist == 0:
        print('Not Aligned')
        return None

    # Angle of the eye line, shifted by 180 degrees.
    # NOTE(review): presumably the shift is needed because the "right" eye lies
    # at a smaller x than the "left" eye in the image, which puts the raw angle
    # near 180 degrees for an upright face. Confirm.
    angle = np.degrees(np.arctan2(dY, dX)) - 180

    # compute the desired right eye x-coordinate based on the
    # desired x-coordinate of the left eye
    desiredRightEyeX = 1.0 - desiredLeftEye[0]

    # the scale is the ratio of the desired inter-eye distance in the output
    # image to the inter-eye distance measured in the input image
    desiredDist = (desiredRightEyeX - desiredLeftEye[0])
    desiredDist *= desiredFaceWidth
    scale = desiredDist / dist

    # midpoint between the two eyes in the input image, as plain Python ints
    eyesCenter = (int((leftEyeCenter[0] + rightEyeCenter[0]) // 2),
                  int((leftEyeCenter[1] + rightEyeCenter[1]) // 2))

    # grab the rotation matrix for rotating and scaling the face
    M = cv2.getRotationMatrix2D(eyesCenter, angle, scale)

    # update the translation component so the eye midpoint maps to (tX, tY)
    tX = desiredFaceWidth * 0.5
    tY = desiredFaceWidth * desiredLeftEye[1]
    M[0, 2] += (tX - eyesCenter[0])
    M[1, 2] += (tY - eyesCenter[1])

    # apply the affine transformation
    (w, h) = (desiredFaceWidth, desiredFaceWidth)
    output = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC)

    # return the aligned face
    return output

# Choose the inference device first, then report what was selected.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type != 'cuda':
    print("Using CPU for inference.\n")
else:
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    ## Environment information, printed for debugging purposes
    _gpu_summary = (
        torch.cuda.device_count(),
        device_id,
        gpu_properties.name,
        gpu_properties.major,
        gpu_properties.minor,
        gpu_properties.total_memory / 1e9,
    )
    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
        "%.1fGb total memory.\n" % _gpu_summary)

# Build the CVAE on the selected device and restore its saved weights.
# NOTE(review): the checkpoint file name is fixed to CVAE_48.pth; presumably
# latent_dim must stay at 48 for the weights to fit. Confirm.
model = CVAE(latent_size=latent_dim).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load("./F2V_models/CVAE_48.pth", map_location=device))
# Inference only: switch the model to evaluation mode.
model.eval()

# Load the encoder, synthesizer and vocoder weights.
# (encoder, Synthesizer, vocoder and Path are presumably provided by
# `from synthesize import *`. Verify.)
encoder.load_model(Path('./F2V_models/encoder.pt'))
synthesizer = Synthesizer(Path('./F2V_models/synthesizer.pt'))
vocoder.load_model(Path('./F2V_models/vocoder.pt'))

# Read the photo of the selected person.
image_path = "Test_Images/{}.jpg".format(person)
input_face = cv2.imread(image_path)
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, so fail here with a clear message.
if input_face is None:
    raise FileNotFoundError("Could not read the face photo at {}".format(image_path))

aligned_face = align(input_face)
# align() returns None when it cannot align the photo.
if aligned_face is None:
    raise RuntimeError(
        "Face alignment failed for {}; the photo must contain exactly one detectable face".format(image_path))

# Reverse the channel axis for display (OpenCV loads images as BGR, matplotlib expects RGB).
plt.imshow(aligned_face[...,::-1])

# Preprocessing applied to the aligned face before it is handed to the model:
# array -> PIL image -> 224x224 resize -> tensor -> per-channel normalisation.
# The random-crop / horizontal-flip augmentations are left disabled here.
# NOTE(review): aligned_face is produced from a cv2.imread image and is passed
# in without a channel swap, while the mean/std values look like the usual RGB
# statistics. Confirm the channel order matches what the model was trained on.
_preprocess_steps = [
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform_fn = transforms.Compose(_preprocess_steps)

# Preprocess the face and add a leading batch dimension of size 1.
image = transform_fn(aligned_face).unsqueeze(0)

import os


def _decode_embeddings(face_batch):
    """Draw one latent sample and decode it together with ``face_batch``.

    Returns the (cvae, cnn) outputs of ``model.decode`` for the first item of
    the batch, as NumPy arrays.
    """
    sampled_z = torch.tensor(np.random.normal(0, 1, (face_batch.size(0), latent_dim))).to(device)
    # Inference only: no autograd graph is needed for the decode step.
    with torch.no_grad():
        cvae_out, cnn_out = model.decode(sampled_z.float(), face_batch)
    cvae_out = cvae_out.squeeze(1).detach().cpu().numpy()
    cnn_out = cnn_out.detach().cpu().numpy()
    return cvae_out[0], cnn_out[0]


def _write_utterance(embed, filename):
    """Synthesize ``text`` conditioned on ``embed`` and write the audio to ``filename``."""
    generated_wav = synthesize_audio(synthesizer, vocoder, [text], [embed])
    sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)


# Make sure the output folder exists before the first file is written.
os.makedirs("./Test_Outputs", exist_ok=True)

# The face tensor is the same for every sample, so move it to the device once.
face_batch = image.to(device)

# Generate `num` utterances, each from a freshly sampled latent vector.
cnn_B = None
for i in range(num):
    embed, cnn_B = _decode_embeddings(face_batch)
    _write_utterance(embed, "./Test_Outputs/{}_{}.wav".format(person, i))

# With num == 0 the loop never runs; decode once so the CNN output below is
# still produced from the current image rather than being undefined or stale.
if cnn_B is None:
    _, cnn_B = _decode_embeddings(face_batch)

# Save CNN: one extra utterance using the CNN output of the last decode.
_write_utterance(cnn_B, "./Test_Outputs/{}_CNN.wav".format(person))

print("{} Completed".format(person))

### Step 4: Now use a digital speaker to play each of the generated audio files, one by one, in an attempt to bypass the Voiceprint.