In [None]:
!pip install facenet_pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import cv2
import numpy as np
import sys
import dlib
import re
import torch
from torch import nn
# from model import AutoEncoder
from skimage import img_as_ubyte
from facenet_pytorch import MTCNN
import torch
import cv2
from PIL import Image

import math

In [None]:
# Mount Google Drive in this Colab runtime. The landmark model, the input
# video and the saved network weights used further down are all read from
# paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# dlib frontal face detector: called below as detector(image, 1) to get face rectangles.
detector = dlib.get_frontal_face_detector()
# Landmark model file on the mounted Drive (a 68-point predictor, judging by the file name).
PREDICTOR_PATH = '/content/drive/MyDrive/Minor_Project/models/shape_predictor_68_face_landmarks.dat'
# Landmark predictor: called below as predictor(image, rect) to get the landmark points of one face.
predictor = dlib.shape_predictor(PREDICTOR_PATH)

In [None]:
def rotate(image, output_size=256):
    """Return an upright, centred crop of the first usable face in ``image``.

    The channel axis of ``image`` is reversed first (the processing loop in
    this notebook passes BGR frames read by OpenCV, so the working copy is
    RGB). Faces are found with the module-level dlib ``detector`` and their
    landmarks with ``predictor``. The first face whose eye distance is large
    enough is rotated so the eye line is horizontal, scaled, and moved so that
    landmark 27 sits in the middle of the output.

    Parameters
    ----------
    image : numpy.ndarray
        H x W x 3 colour image.
    output_size : int, optional
        Width and height of the returned square image, in pixels.

    Returns
    -------
    numpy.ndarray or None
        ``output_size`` x ``output_size`` x 3 image in the reversed channel
        order, or ``None`` when no face could be aligned.
    """
    # Reverse the channel axis (BGR -> RGB). ascontiguousarray turns the
    # negative-stride view into a plain array before handing it to OpenCV/dlib.
    image = np.ascontiguousarray(image[:, :, ::-1])
    # The image is RGB at this point, so the RGB->gray conversion is the right one.
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # Detect faces in the grayscale image (1 = upsample the image once).
    for rect in detector(gray, 1):
        landmarks = predictor(gray, rect)
        shape = np.array([(landmarks.part(j).x, landmarks.part(j).y)
                          for j in range(landmarks.num_parts)])

        # Landmark 27 is used as the point between the eyes; 36 and 45 as the
        # two eye reference points.
        center_eyes = shape[27].astype(int)
        eyes_d = np.linalg.norm(shape[36] - shape[45])
        face_size_x = int(eyes_d * 2.)
        if face_size_x < 50:
            # Face too small to be useful (this also protects the division below).
            continue

        # Angle of the eye line relative to the horizontal axis.
        d = (shape[45] - shape[36]) / eyes_d  # unit vector along the eye line
        a = np.rad2deg(np.arctan2(d[1], d[0]))
        scale_factor = float(output_size) / float(face_size_x * 2.)  # scale to fit in output_size

        # Rotation (around center_eyes) + scale, as a 3x3 homogeneous matrix.
        M = np.append(
            cv2.getRotationMatrix2D((int(center_eyes[0]), int(center_eyes[1])), a, scale_factor),
            [[0, 0, 1]],
            axis=0,
        )
        # Translation that moves center_eyes to the middle of the output image.
        M1 = np.array([[1., 0., -center_eyes[0] + output_size / 2.],
                       [0., 1., -center_eyes[1] + output_size / 2.],
                       [0, 0, 1.]])
        # Compose both transforms and keep the 2x3 affine part.
        M = M1.dot(M)[:2]

        try:
            face = cv2.warpAffine(image, M, (output_size, output_size),
                                  borderMode=cv2.BORDER_REPLICATE)
        except cv2.error:
            # Best effort: try the next detected face instead of aborting.
            continue
        # warpAffine already produces an output_size x output_size image, so
        # no further resize is needed.
        return face

    return None

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))
# facenet_pytorch face detector; extract_face below only calls its detect() method.
mtcnn = MTCNN(keep_all=True, device=device, margin=50, select_largest=True, image_size=256)


def extract_face(frame, align=True, margin=5):
    """Crop a roughly square face region out of ``frame``.

    Parameters
    ----------
    frame : array-like
        Image to search. With ``align=True`` it is converted with
        ``np.array`` and passed through ``rotate`` first.
    align : bool, optional
        Whether to align the face with ``rotate`` before detection.
    margin : int, optional
        Number of pixels removed from the top and from the bottom of the
        detected box before the square is computed.

    Returns
    -------
    PIL.Image.Image or None
        Crop around the first box reported by ``mtcnn.detect``, or ``None``
        when alignment fails, no face is detected, or the box is too small
        to survive the margin.
    """
    if align:
        frame = rotate(np.array(frame))
        if frame is None:
            # rotate() found no usable face.
            return None
    frame = Image.fromarray(frame)

    boxes, _ = mtcnn.detect(frame)
    # detect() reports "no face" as None rather than as an empty array.
    if boxes is None or len(boxes) == 0:
        return None

    # Only the first reported box is used.
    x1, y1, x2, y2 = (int(value) for value in boxes[0].tolist()[:4])

    # MTCNN returns a rectangle; shrink it vertically by the margin and then
    # take a horizontally centred strip of the same height so the crop is
    # (approximately) square, which is easier to train on.
    y1 += margin
    y2 -= margin
    if y2 <= y1:
        # Box is not taller than twice the margin: nothing left to crop.
        return None
    diff = y2 - y1
    mid_x = (x2 + x1) // 2
    x1 = mid_x - (diff // 2)
    x2 = mid_x + (diff // 2)
    return frame.crop((x1, y1, x2, y2))


Running on device: cuda:0


In [None]:
 # Read points from text file
def readPoints(path):
    """Read integer (x, y) points from a whitespace-separated text file.

    Each non-blank line must contain exactly two integer fields. Blank or
    whitespace-only lines are skipped.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    list of tuple of int
        The points in file order.

    Raises
    ------
    ValueError
        If a non-blank line does not hold exactly two integers.
    """
    points = []
    with open(path) as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing empty line at the end of the file).
                continue
            x, y = fields
            points.append((int(x), int(y)))
    return points


# Apply affine transform calculated using srcTri and dstTri to src and
# output an image of size.
def applyAffineTransform(src, srcTri, dstTri, size):
    """Warp ``src`` with the affine map that sends ``srcTri`` onto ``dstTri``.

    ``srcTri`` and ``dstTri`` are triangles (three points each); ``size`` is
    the (width, height) of the returned image. Pixels are interpolated
    linearly and the border is filled by reflection (BORDER_REFLECT_101).
    """
    source_points = np.float32(srcTri)
    target_points = np.float32(dstTri)
    affine = cv2.getAffineTransform(source_points, target_points)

    out_width, out_height = size[0], size[1]
    return cv2.warpAffine(
        src,
        affine,
        (out_width, out_height),
        None,
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REFLECT_101,
    )


# Check if a point is inside a rectangle
def rectContains(rect, point):
    """Tell whether ``point`` lies inside ``rect`` (edges included).

    ``rect`` is (x, y, width, height) and ``point`` is (x, y).
    """
    outside = (
        point[0] < rect[0]
        or point[1] < rect[1]
        or point[0] > rect[0] + rect[2]
        or point[1] > rect[1] + rect[3]
    )
    return not outside


# calculate delanauy triangle
def calculateDelaunayTriangles(rect, points):
    """Delaunay-triangulate ``points`` and return the triangles as index triples.

    ``rect`` is the (x, y, width, height) area handed to ``cv2.Subdiv2D``.
    Triangles with a vertex outside ``rect`` are dropped. Each remaining
    vertex is matched back to the entries of ``points`` that lie within one
    pixel of it on both axes; a triangle is kept only when exactly three
    indices were matched in total.

    Returns a list of ``(i, j, k)`` tuples indexing into ``points``.
    """
    subdivision = cv2.Subdiv2D(rect)
    for point in points:
        subdivision.insert(tuple(point))

    triangles = []
    for tri in subdivision.getTriangleList():
        # Each row is (x1, y1, x2, y2, x3, y3).
        vertices = [(tri[0], tri[1]), (tri[2], tri[3]), (tri[4], tri[5])]
        if not all(rectContains(rect, vertex) for vertex in vertices):
            continue

        # Recover the index of the input point behind every vertex.
        matched = [
            index
            for vx, vy in vertices
            for index in range(len(points))
            if abs(vx - points[index][0]) < 1.0 and abs(vy - points[index][1]) < 1.0
        ]
        if len(matched) == 3:
            triangles.append((matched[0], matched[1], matched[2]))

    return triangles


# Warps and alpha blends triangular regions from img1 and img2 to img
def warpTriangle(img1, img2, t1, t2):
    """Warp triangle ``t1`` of ``img1`` onto triangle ``t2`` of ``img2``, in place.

    Only the bounding rectangles of the two triangles are processed. Both
    rectangles are clipped to their image first, so a triangle that sticks
    out of an image is warped from/into the visible part instead of
    producing an empty patch (which makes ``cv2.warpAffine`` fail) or a
    shape mismatch on assignment. A triangle whose rectangle lies completely
    outside either image is skipped.

    Parameters
    ----------
    img1 : numpy.ndarray
        Source image (H x W x 3).
    img2 : numpy.ndarray
        Destination image (H x W x 3); modified in place.
    t1, t2 : sequence of three (x, y) points
        Corresponding triangles in ``img1`` and ``img2``.

    Returns
    -------
    None
    """
    # Bounding rectangles (x, y, width, height) of both triangles.
    r1 = cv2.boundingRect(np.float32([t1]))
    r2 = cv2.boundingRect(np.float32([t2]))

    # Clip the rectangles to the image they index into.
    src_h, src_w = img1.shape[:2]
    dst_h, dst_w = img2.shape[:2]
    sx0, sy0 = max(r1[0], 0), max(r1[1], 0)
    sx1, sy1 = min(r1[0] + r1[2], src_w), min(r1[1] + r1[3], src_h)
    dx0, dy0 = max(r2[0], 0), max(r2[1], 0)
    dx1, dy1 = min(r2[0] + r2[2], dst_w), min(r2[1] + r2[3], dst_h)
    if sx1 <= sx0 or sy1 <= sy0 or dx1 <= dx0 or dy1 <= dy0:
        # Nothing of this triangle is visible in one of the images.
        return

    # Triangle coordinates relative to the top-left corner of each patch.
    t1Rect = [(p[0] - sx0, p[1] - sy0) for p in t1]
    t2Rect = [(p[0] - dx0, p[1] - dy0) for p in t2]
    t2RectInt = [(int(x), int(y)) for x, y in t2Rect]

    size = (dx1 - dx0, dy1 - dy0)  # (width, height) of the destination patch

    # Mask that is 1 inside the destination triangle and 0 elsewhere.
    mask = np.zeros((size[1], size[0], 3), dtype=np.float32)
    cv2.fillConvexPoly(mask, np.int32(t2RectInt), (1.0, 1.0, 1.0), 16, 0)

    # Warp the source patch into the destination patch and keep the triangle only.
    img1Rect = img1[sy0:sy1, sx0:sx1]
    img2Rect = applyAffineTransform(img1Rect, t1Rect, t2Rect, size)
    img2Rect = img2Rect * mask

    # Clear the triangle in the destination, then add the warped pixels.
    img2[dy0:dy1, dx0:dx1] = img2[dy0:dy1, dx0:dx1] * ((1.0, 1.0, 1.0) - mask)
    img2[dy0:dy1, dx0:dx1] = img2[dy0:dy1, dx0:dx1] + img2Rect


def get_landmarks(im):
    """Return the landmark points of the single face found in ``im``.

    Uses the module-level ``detector`` and ``predictor``. When the detector
    reports no face or more than one face, an empty list is returned;
    otherwise the result is an ``np.matrix`` with one ``[x, y]`` row per
    landmark.
    """
    faces = detector(im, 1)

    # Exactly one face is required: zero and "too many" both yield [].
    if len(faces) != 1:
        return []

    landmarks = predictor(im, faces[0])
    return np.matrix([[part.x, part.y] for part in landmarks.parts()])


def atof(text):
    """Convert ``text`` to ``float`` when possible, otherwise return it unchanged."""
    try:
        return float(text)
    except ValueError:
        return text


def natural_keys(text):
    """Split ``text`` into text and number tokens, e.g. for use as a sort key.

    Numeric runs (optionally with a decimal part) become floats via ``atof``;
    everything else stays a string. A sign directly in front of a number is
    consumed by the split but is not part of the captured number.
    """
    tokens = re.split(r'[+-]?([0-9]+(?:[.][0-9]*)?|[.][0-9]+)', text)
    return list(map(atof, tokens))


def swap_faces(img1, img2):
    """Paste the face of ``img1`` onto the face of ``img2``.

    Landmarks are located in both images with ``get_landmarks``. The convex
    hull of the ``img2`` landmarks is triangulated, every triangle is warped
    from ``img1`` into a copy of ``img2`` with ``warpTriangle``, and the
    result is blended into ``img2`` with ``cv2.seamlessClone``.

    Parameters
    ----------
    img1 : numpy.ndarray
        Image providing the face.
    img2 : numpy.ndarray
        Image receiving the face; it is not modified.

    Returns
    -------
    numpy.ndarray
        The blended image returned by ``cv2.seamlessClone``.

    Raises
    ------
    ValueError
        If ``get_landmarks`` finds no landmarks in one of the images, or if
        the hull cannot be triangulated. (Previously the first case failed
        with an ``AttributeError`` and the second one called ``quit()``,
        whose ``SystemExit`` is not caught by ``except Exception``.)
    """
    points1 = get_landmarks(img1)
    points2 = get_landmarks(img2)
    # get_landmarks returns an empty list unless exactly one face was found.
    if len(points1) == 0 or len(points2) == 0:
        raise ValueError("swap_faces: could not find exactly one face in each image")
    points1 = points1.tolist()
    points2 = points2.tolist()

    img1Warped = np.copy(img2)

    # Convex hull of the destination landmarks; the same landmark indices are
    # used to pick the corresponding source points.
    hullIndex = cv2.convexHull(np.array(points2), returnPoints=False)
    hull_ids = [int(index) for index in np.ravel(hullIndex)]
    hull1 = [points1[index] for index in hull_ids]
    hull2 = [points2[index] for index in hull_ids]

    # Delaunay triangulation of the hull points inside the img2 frame.
    sizeImg2 = img2.shape
    rect = (0, 0, sizeImg2[1], sizeImg2[0])
    dt = calculateDelaunayTriangles(rect, hull2)
    if len(dt) == 0:
        raise ValueError("swap_faces: Delaunay triangulation produced no triangles")

    # Warp every triangle from img1 into the working copy of img2.
    for triangle in dt:
        t1 = [hull1[triangle[j]] for j in range(3)]
        t2 = [hull2[triangle[j]] for j in range(3)]
        warpTriangle(img1, img1Warped, t1, t2)

    # Mask covering the destination hull.
    hull8U = [(point[0], point[1]) for point in hull2]
    mask = np.zeros(img2.shape, dtype=img2.dtype)
    cv2.fillConvexPoly(mask, np.int32(hull8U), (255, 255, 255))

    # Centre of the hull's bounding rectangle: where the clone is placed.
    r = cv2.boundingRect(np.float32([hull2]))
    center = ((int(r[0]) + int(r[2] / 2)), (int(r[1]) + int(r[3] / 2)))

    # Clone seamlessly.
    return cv2.seamlessClone(np.uint8(img1Warped), img2, mask, center, cv2.NORMAL_CLONE)

In [None]:
class Flatten(nn.Module):
    """Collapse every dimension after the first (batch) one into a single axis."""

    def forward(self, inputs):
        batch_size = inputs.size(0)
        return inputs.view(batch_size, -1)


class UnFlatten(nn.Module):
    """Reshape a flat ``(N, C*H*W)`` tensor into ``(N, C, H, W)``.

    Parameters
    ----------
    channels, height, width : int, optional
        Target feature-map shape. The defaults (128, 4, 4) reproduce the
        shape that used to be hard-coded in ``forward``.
    """

    def __init__(self, channels=128, height=4, width=4):
        super().__init__()
        self.channels = channels
        self.height = height
        self.width = width

    def forward(self, inputs, size=512):
        # ``size`` is accepted for backward compatibility only; it has never
        # influenced the result.
        return inputs.view(inputs.size(0), self.channels, self.height, self.width)


class Discriminator(nn.Module):
    """Convolutional discriminator.

    Five 5x5, stride-2 convolutions (3 -> 128 -> 64 -> 32 -> 16 -> 4
    channels), each followed by ``LeakyReLU(0.2)``, with a final ``Sigmoid``.
    The output keeps its spatial dimensions: one 4-channel map per image.
    """

    def __init__(self):
        super().__init__()

        widths = (3, 128, 64, 32, 16, 4)
        layers = []
        for in_channels, out_channels in zip(widths[:-1], widths[1:]):
            layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=5, stride=2, padding=2))
            layers.append(nn.LeakyReLU(0.2, inplace=True))
        layers.append(nn.Sigmoid())

        self.model = nn.Sequential(*layers)

    def forward(self, img):
        """Run ``img`` (N x 3 x H x W) through the network."""
        return self.model(img)

In [None]:
class ResBlock(nn.Module):
    """Residual block that keeps the channel count and spatial size.

    The residual branch is Conv3x3 -> BatchNorm -> ReLU -> Dropout ->
    Conv3x3 -> BatchNorm (convolutions without bias); its output is added to
    the block input.
    """

    def __init__(self, n_ch) -> None:
        super().__init__()

        branch = [
            nn.Conv2d(n_ch, n_ch, kernel_size=3, bias=False, padding=1),
            nn.BatchNorm2d(n_ch),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Conv2d(n_ch, n_ch, kernel_size=3, bias=False, padding=1),
            nn.BatchNorm2d(n_ch),
        ]
        self.resblock_model = nn.Sequential(*branch)

    def forward(self, inputs):
        """Return ``inputs`` plus the output of the residual branch."""
        residual = self.resblock_model(inputs)
        return residual + inputs

In [None]:
class Downscale(nn.Module):
    """Stride-2 convolution followed by BatchNorm, LeakyReLU(0.1) and Dropout2d.

    Parameters
    ----------
    in_ch, out_ch : int
        Input and output channel counts.
    kernel_size : int, optional
        Convolution kernel size.
    padding : int, optional
        Convolution padding.
    """

    def __init__(self, in_ch, out_ch, kernel_size=3, padding=1):
        super().__init__()
        self.in_ch, self.out_ch, self.kernel_size = in_ch, out_ch, kernel_size
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, stride=2, padding=padding)
        self.batch_norm = nn.BatchNorm2d(out_ch)
        self.relu = nn.LeakyReLU(0.1)
        self.drop = nn.Dropout2d()

    def forward(self, x):
        """Apply conv -> batch norm -> activation -> dropout to ``x``."""
        features = self.batch_norm(self.conv(x))
        return self.drop(self.relu(features))

In [None]:
class Upscale(nn.Module):
    """Stride-2 transposed convolution followed by BatchNorm, LeakyReLU(0.1) and Dropout2d.

    Parameters
    ----------
    in_ch, out_ch : int
        Input and output channel counts.
    kernel_size : int, optional
        Kernel size of the transposed convolution.
    padding : int, optional
        Accepted but not used: the transposed convolution is always built
        with ``padding=1``.
    """

    def __init__(self, in_ch, out_ch, kernel_size=5, padding=2):
        super().__init__()
        # NOTE: padding is fixed to 1 here regardless of the ``padding`` argument.
        self.conv = nn.ConvTranspose2d(in_ch, out_ch, kernel_size=kernel_size, stride=2, padding=1)
        self.batch_norm = nn.BatchNorm2d(out_ch)
        self.relu = nn.LeakyReLU(0.1)
        self.drop = nn.Dropout2d()

    def forward(self, x):
        """Apply transposed conv -> batch norm -> activation -> dropout to ``x``."""
        features = self.batch_norm(self.conv(x))
        return self.drop(self.relu(features))

In [None]:
class AutoEncoder(nn.Module):
    """Shared-encoder autoencoder with two interchangeable decoders.

    ``encoder`` and ``inter_layer`` are shared; ``decoder`` (version ``'a'``)
    and ``decoder_b`` (any other version) have the same architecture but
    separate weights.

    Parameters
    ----------
    image_channels : int, optional
        Number of channels of the input and output images.
    h_dim : int, optional
        Size of the flattened encoder output. It must match what the encoder
        produces: six stride-2 convolutions ending in 512 channels, which for
        128 x 128 inputs is 512 * 2 * 2 = 2048. It must also equal the
        128 * 4 * 4 elements the decoders reshape their input to.
    z_dim : int, optional
        Size of the bottleneck inside ``inter_layer``.
    """

    def __init__(self, image_channels=3, h_dim=2048, z_dim=128):
        super(AutoEncoder, self).__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.encoder = nn.Sequential(
            Downscale(image_channels, 64),
            Downscale(64, 128),
            Downscale(128, 256),
            Downscale(256, 256),
            Downscale(256, 512),
            Downscale(512, 512),
            Flatten(),
        )

        # Linear bottleneck: h_dim -> z_dim -> z_dim -> h_dim (no activations).
        self.inter_layer = nn.Sequential(
            nn.Linear(h_dim, z_dim),
            nn.Linear(z_dim, z_dim),
            nn.Linear(z_dim, h_dim),
        )

        # Same architecture twice, built in this order so that parameter
        # names and initialisation order stay as they were.
        self.decoder = self._build_decoder(image_channels)
        self.decoder_b = self._build_decoder(image_channels)

    @staticmethod
    def _build_decoder(image_channels):
        """Build one decoder stack (layer order fixes the state_dict keys)."""
        return nn.Sequential(
            UnFlatten(),
            Upscale(128, 256, kernel_size=4),
            Upscale(256, 256, kernel_size=4),
            Upscale(256, 128, kernel_size=4),
            Upscale(128, 64, kernel_size=4),
            ResBlock(64),
            Upscale(64, 32, kernel_size=4),
            Upscale(32, 32, kernel_size=4),
            nn.Conv2d(32, image_channels, kernel_size=1, stride=2),
            nn.Sigmoid(),
        )

    def forward(self, x, version='a'):
        """Encode ``x`` and decode it with the decoder selected by ``version``.

        ``version == 'a'`` uses ``decoder``; every other value uses
        ``decoder_b``.
        """
        z = self.inter_layer(self.encoder(x))
        if version == 'a':
            return self.decoder(z)
        return self.decoder_b(z)

In [None]:
# Face-swap every frame of the input video and write the result to
# Swapped_Video.mp4 in the current working directory.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cap = cv2.VideoCapture("/content/drive/MyDrive/Minor_Project/Dataset/videos/elon.mp4")
fps = cap.get(cv2.CAP_PROP_FPS)
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))

model = AutoEncoder(image_channels=3).to(device)
# map_location lets weights that were saved on a GPU load on a CPU-only runtime.
model.load_state_dict(
    torch.load("/content/drive/MyDrive/Minor_Project/saved_models/final.pth", map_location=device)
)
# Inference only: switch dropout / batch norm to eval mode once, up front.
model.eval()
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_tracked = cv2.VideoWriter('{}.mp4'.format("Swapped_Video"), fourcc, fps, (width, height))
i = 0
decoder = "b"  # which decoder of the autoencoder generates the new face

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        try:
            print('\rTracking frame: {}'.format(i + 1), end='')
            i += 1
            # Retrieve the face from the frame, align it, and resize it to
            # the 128 x 128 input size used for the model.
            img1_face = extract_face(frame)
            if img1_face is None:
                # No face in this frame; like any other failure, the frame is skipped.
                print(' no face found')
                continue
            img1_face = np.array(img1_face)
            img1_face = cv2.resize(img1_face, (128, 128))

            frame = np.array(frame)

            # PyTorch expects channel, height, width. The cvtColor call and
            # the [::-1] slice each reverse the channel axis, so together
            # they leave the channel order of extract_face's output unchanged.
            img1_face = cv2.cvtColor(img1_face, cv2.COLOR_BGR2RGB)
            img_tensor = img1_face[:, :, ::-1].transpose((2, 0, 1)).copy()  # chw, [0, 255]
            img_tensor = torch.from_numpy(img_tensor).float().div(255)  # chw, float, [0, 1]
            img_tensor = img_tensor.unsqueeze(0)  # nchw
            x = img_tensor.to(device)
            with torch.no_grad():
                out = model(x, version=decoder)

            # Convert the network output back into an 8-bit OpenCV image.
            out = out.detach().cpu().squeeze().numpy()
            out = np.transpose(out, (1, 2, 0))
            out = cv2.cvtColor(out, cv2.COLOR_RGB2BGR)
            out = img_as_ubyte(out)
            out2 = swap_faces(out, frame)
            video_tracked.write(out2)

        except Exception as e:
            # Best effort: report the problem and drop this frame from the output.
            print(e)
finally:
    # Release both handles so the output file is finalised even after an
    # interruption or an unexpected error.
    cap.release()
    video_tracked.release()

Tracking frame: 11OpenCV(4.6.0) /io/opencv/modules/imgproc/src/imgwarp.cpp:2595: error: (-215:Assertion failed) src.cols > 0 && src.rows > 0 in function 'warpAffine'

Tracking frame: 16OpenCV(4.6.0) /io/opencv/modules/imgproc/src/imgwarp.cpp:2595: error: (-215:Assertion failed) src.cols > 0 && src.rows > 0 in function 'warpAffine'

Tracking frame: 19OpenCV(4.6.0) /io/opencv/modules/imgproc/src/imgwarp.cpp:2595: error: (-215:Assertion failed) src.cols > 0 && src.rows > 0 in function 'warpAffine'

Tracking frame: 20OpenCV(4.6.0) /io/opencv/modules/imgproc/src/imgwarp.cpp:2595: error: (-215:Assertion failed) src.cols > 0 && src.rows > 0 in function 'warpAffine'

Tracking frame: 42OpenCV(4.6.0) /io/opencv/modules/imgproc/src/imgwarp.cpp:2595: error: (-215:Assertion failed) src.cols > 0 && src.rows > 0 in function 'warpAffine'

Tracking frame: 43OpenCV(4.6.0) /io/opencv/modules/imgproc/src/imgwarp.cpp:2595: error: (-215:Assertion failed) src.cols > 0 && src.rows > 0 in function 'warpAffine'