In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from google.colab.patches import cv2_imshow
from torchvision.utils import save_image
import numpy as np
import cv2  
import random

In [2]:
# Mount Google Drive: the checkpoint, landmark model and input video used further down are all
# read from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
class Flatten(nn.Module):
    """Collapse every non-batch dimension of a tensor into one axis."""

    def forward(self, inputs):
        """Return ``inputs`` viewed as ``(batch, -1)``; the leading (batch) axis is preserved."""
        batch = inputs.size(0)
        return inputs.view(batch, -1)


class UnFlatten(nn.Module):
    """Reshape a flat feature vector back into a ``128 x 4 x 4`` feature map."""

    def forward(self, inputs, size=512):
        """View ``inputs`` as ``(batch, 128, 4, 4)``.

        ``size`` is accepted for signature compatibility but is not used; the
        target shape is fixed, so each row of ``inputs`` must hold 2048 values.
        """
        target_shape = (inputs.size(0), 128, 4, 4)
        return inputs.view(*target_shape)


class Discriminator(nn.Module):
    """Convolutional discriminator producing a 4-channel map of scores in (0, 1).

    Five ``5x5`` stride-2 convolutions (3 -> 128 -> 64 -> 32 -> 16 -> 4 channels),
    each followed by ``LeakyReLU(0.2)``, with a final ``Sigmoid``.
    """

    def __init__(self):
        super().__init__()

        channel_plan = (3, 128, 64, 32, 16, 4)
        stages = []
        for c_in, c_out in zip(channel_plan[:-1], channel_plan[1:]):
            # every conv halves the spatial resolution
            stages.append(nn.Conv2d(c_in, c_out, kernel_size=5, stride=2, padding=2))
            stages.append(nn.LeakyReLU(0.2, inplace=True))
        stages.append(nn.Sigmoid())

        self.model = nn.Sequential(*stages)

    def forward(self, img):
        """Score ``img`` (``N x 3 x H x W``) and return the sigmoid-activated map."""
        return self.model(img)

In [4]:
class ResBlock(nn.Module):
    """Residual block: conv-BN-ReLU-dropout-conv-BN with an identity skip connection.

    Channel count and spatial size are unchanged (``3x3`` convs with padding 1).
    """

    def __init__(self, n_ch) -> None:
        super().__init__()

        def conv3x3():
            # bias is omitted because a BatchNorm layer follows each convolution
            return nn.Conv2d(n_ch, n_ch, kernel_size=3, padding=1, bias=False)

        branch = [
            conv3x3(),
            nn.BatchNorm2d(n_ch),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            conv3x3(),
            nn.BatchNorm2d(n_ch),
        ]
        self.resblock_model = nn.Sequential(*branch)

    def forward(self, inputs):
        """Return ``F(inputs) + inputs`` where ``F`` is the convolutional branch."""
        residual = self.resblock_model(inputs)
        return residual + inputs

In [5]:
class Downscale(nn.Module):
    """Stride-2 convolution stage: Conv2d -> BatchNorm2d -> LeakyReLU(0.1) -> Dropout2d.

    With the default ``kernel_size=3, padding=1`` the spatial size is halved.
    """

    def __init__(self, in_ch, out_ch, kernel_size=3, padding=1):
        super().__init__()
        # configuration is kept on the instance for introspection
        self.in_ch, self.out_ch, self.kernel_size = in_ch, out_ch, kernel_size
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, stride=2, padding=padding)
        self.batch_norm = nn.BatchNorm2d(out_ch)
        self.relu = nn.LeakyReLU(0.1)
        self.drop = nn.Dropout2d()

    def forward(self, x):
        """Apply the four sub-layers in order and return the result."""
        for stage in (self.conv, self.batch_norm, self.relu, self.drop):
            x = stage(x)
        return x

In [6]:
class Upscale(nn.Module):
    """Stride-2 transposed-convolution stage: ConvTranspose2d -> BatchNorm2d -> LeakyReLU(0.1) -> Dropout2d.

    NOTE: the ``padding`` argument is accepted but not used; the transposed
    convolution is always built with ``padding=1``. With ``kernel_size=4`` this
    exactly doubles the spatial size.
    """

    def __init__(self, in_ch, out_ch, kernel_size=5, padding=2):
        super().__init__()
        self.conv = nn.ConvTranspose2d(in_ch, out_ch, kernel_size, stride=2, padding=1)
        self.batch_norm = nn.BatchNorm2d(out_ch)
        self.relu = nn.LeakyReLU(0.1)
        self.drop = nn.Dropout2d()

    def forward(self, x):
        """Apply the four sub-layers in order and return the result."""
        upsampled = self.conv(x)
        normalised = self.batch_norm(upsampled)
        return self.drop(self.relu(normalised))

In [7]:
class AutoEncoder(nn.Module):
    """Autoencoder with one shared encoder and two structurally identical decoders.

    The encoder is six stride-2 ``Downscale`` stages followed by ``Flatten``;
    ``inter_layer`` is a linear bottleneck ``h_dim -> z_dim -> z_dim -> h_dim``.
    Each decoder starts with ``UnFlatten`` (a fixed ``128 x 4 x 4`` view, so
    ``h_dim`` has to be 2048 for the decoders to accept the bottleneck output),
    runs six ``Upscale`` stages with a ``ResBlock`` in between, and finishes
    with a stride-2 ``1x1`` convolution and a sigmoid.
    """

    def __init__(self, image_channels=3, h_dim=2048, z_dim=128):
        super().__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # --- shared encoder -------------------------------------------------
        encoder_widths = (image_channels, 64, 128, 256, 256, 512, 512)
        encoder_stages = [
            Downscale(c_in, c_out)
            for c_in, c_out in zip(encoder_widths[:-1], encoder_widths[1:])
        ]
        self.encoder = nn.Sequential(*encoder_stages, Flatten())

        # --- linear bottleneck ----------------------------------------------
        self.inter_layer = nn.Sequential(
            nn.Linear(h_dim, z_dim),
            nn.Linear(z_dim, z_dim),
            nn.Linear(z_dim, h_dim),
        )

        # --- decoders ---------------------------------------------------------
        def build_decoder():
            # Both decoders share this layout but own independent weights.
            return nn.Sequential(
                UnFlatten(),
                Upscale(128, 256, kernel_size=4),
                Upscale(256, 256, kernel_size=4),
                Upscale(256, 128, kernel_size=4),
                Upscale(128, 64, kernel_size=4),
                ResBlock(64),
                Upscale(64, 32, kernel_size=4),
                Upscale(32, 32, kernel_size=4),
                nn.Conv2d(32, image_channels, kernel_size=1, stride=2),
                nn.Sigmoid(),
            )

        self.decoder = build_decoder()
        self.decoder_b = build_decoder()

    def forward(self, x, version='a'):
        """Encode ``x`` and reconstruct it with the requested decoder.

        ``version == 'a'`` selects ``self.decoder``; any other value selects
        ``self.decoder_b``.
        """
        latent = self.inter_layer(self.encoder(x))
        head = self.decoder if version == 'a' else self.decoder_b
        return head(latent)

In [8]:
import os
from math import exp
import torch.nn.functional as F
from torch.autograd import Variable

In [9]:
class Iterator:
    """Endless mini-batch sampler over an in-memory, sliceable dataset.

    Batches are served sequentially from the start of the dataset. When fewer
    than ``batch_size`` items remain, the dataset is shuffled **in place**
    (``np.random.shuffle``) and serving restarts from index 0, so a trailing
    partial batch is never returned. The iterator never finishes for a
    non-empty dataset; the consumer is expected to ``break`` out of the loop.

    Args:
        dataset: mutable sequence / array supporting ``len`` and slicing.
        batch_size: number of items per batch.
    """

    def __init__(self, dataset, batch_size=32):
        self.datset = dataset  # (sic) attribute name kept for backward compatibility
        self.max = len(dataset)
        self.batch_size = batch_size
        self.idx = 0

    def __iter__(self):
        self.idx = 0
        return self

    def __next__(self):
        """Return the next batch (a slice of the dataset).

        Raises:
            StopIteration: only when the dataset is empty, so a ``for`` loop
                over an empty dataset terminates instead of spinning on
                empty batches.
        """
        if self.max == 0:
            raise StopIteration
        # Not enough items left for a full batch: reshuffle and start over.
        if self.idx + self.batch_size > self.max:
            np.random.shuffle(self.datset)
            self.idx = 0
        # Slice first, advance afterwards, so the batch at the current index
        # (including the very first one) is actually served.
        start = self.idx
        self.idx = start + self.batch_size
        return self.datset[start:self.idx]

In [10]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AutoEncoder(image_channels=3).to(device)
# map_location remaps tensors that were saved from a GPU session, so the checkpoint
# also loads on a CPU-only runtime (consistent with the device fallback above).
model.load_state_dict(
    torch.load('/content/drive/MyDrive/Minor_Project/saved_models/final.pth', map_location=device)
)

<All keys matched successfully>

In [11]:
# Hand cached-but-unused GPU memory back to the driver before the face-detection models are loaded.
torch.cuda.empty_cache()

In [12]:
import os
from PIL import Image

In [13]:
!pip install facenet-pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting facenet-pytorch
  Downloading facenet_pytorch-2.5.2-py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 33.7 MB/s 
Installing collected packages: facenet-pytorch
Successfully installed facenet-pytorch-2.5.2


In [15]:
from skimage import img_as_ubyte
from facenet_pytorch import MTCNN

In [16]:
# Run face detection on the first GPU when one is present, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))
# MTCNN face detector; `extract_face` below only calls its `detect` method.
mtcnn = MTCNN(
    image_size=256,
    margin=50,
    keep_all=True,
    select_largest=True,
    device=device,
)


def extract_face(frame, align=True, margin=5):
    """Crop a square region around the first face MTCNN detects in ``frame``.

    Args:
        frame: image as an ``H x W x 3`` array (or anything ``np.asarray`` accepts).
        align: when true, the frame is first passed through ``rotate``, which
            returns an aligned face image with the channel order reversed, or
            ``None`` when it finds no usable face.
        margin: number of pixels trimmed from the top and the bottom of the
            detected box before the square is computed.

    Returns:
        A ``PIL.Image`` crop whose width equals the (trimmed) box height and
        which is centred horizontally on the box, or ``None`` when no face was
        found by ``rotate`` or by MTCNN.
    """
    image = np.asarray(frame)
    if align:
        image = rotate(image)
        if image is None:
            # alignment found no face; let the caller skip this frame
            return None

    pil_frame = Image.fromarray(image)
    boxes, _ = mtcnn.detect(pil_frame)
    # detect() can come back without any box when nothing is found
    if boxes is None or len(boxes) == 0:
        return None

    # Only the first detection is used.
    x1, y1, x2, y2 = (int(v) for v in boxes[0].tolist()[:4])

    # MTCNN returns a rectangle; a square crop is easier to train on, so the
    # trimmed box height is used as the side length, centred on the box.
    y1 += margin
    y2 -= margin
    side = abs(y1 - y2)
    mid_x = (x2 + x1) // 2
    x1 = mid_x - (side // 2)
    x2 = mid_x + (side // 2)
    return pil_frame.crop((x1, y1, x2, y2))

# dlib face detector and landmark predictor used by `rotate` below.
import dlib
detector = dlib.get_frontal_face_detector()
# Landmark model file is read from Google Drive, so the drive must be mounted first.
PREDICTOR_PATH = '/content/drive/MyDrive/Minor_Project/models/shape_predictor_68_face_landmarks.dat'
predictor = dlib.shape_predictor(PREDICTOR_PATH)

def rotate(image, output_size=256):
    """Return an aligned, scaled crop of the first usable face in ``image``.

    The face is rotated so the line between landmarks 36 and 45 (the eye
    corners used here) is horizontal, scaled relative to the eye distance and
    translated so landmark 27 sits in the middle of the output.

    Args:
        image: ``H x W x 3`` array; its channel order is reversed before
            processing (BGR -> RGB for frames read with OpenCV).
        output_size: side length in pixels of the square output.

    Returns:
        ``output_size x output_size x 3`` array in the reversed channel order,
        or ``None`` when no face is detected, every detected face is too small
        (twice the eye distance under 50 px) or warping fails.
    """
    image = image[:, :, ::-1]  # BGR to RGB
    # The image is RGB at this point, so the RGB grayscale weights are used.
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # detect faces in the grayscale image (upsampled once)
    for rect in detector(gray, 1):
        landmarks = predictor(gray, rect)  # facial features
        shape = np.array([(landmarks.part(j).x, landmarks.part(j).y)
                          for j in range(landmarks.num_parts)])

        # centre and scale the face around the mid point between the eyes
        center_eyes = shape[27].astype(int)
        eyes_d = np.linalg.norm(shape[36] - shape[45])
        face_size_x = int(eyes_d * 2.)
        if face_size_x < 50:
            # too small to align reliably (also guards the division below)
            continue

        # rotate to normalized angle
        d = (shape[45] - shape[36]) / eyes_d  # normalized eye-difference vector (direction)
        a = np.rad2deg(np.arctan2(d[1], d[0]))  # angle
        scale_factor = float(output_size) / float(face_size_x * 2.)  # scale to fit in output_size

        # rotation (around center_eyes) + scale, as a 3x3 homogeneous matrix
        rot = cv2.getRotationMatrix2D((int(center_eyes[0]), int(center_eyes[1])), a, scale_factor)
        M = np.append(rot, [[0, 0, 1]], axis=0)
        # shift from center_eyes to the middle of the output
        M1 = np.array([[1., 0., -center_eyes[0] + output_size / 2.],
                       [0., 1., -center_eyes[1] + output_size / 2.],
                       [0, 0, 1.]])
        # concatenate transforms (rotation-scale + translation)
        M = M1.dot(M)[:2]

        try:
            # warpAffine already produces an output_size x output_size image,
            # so no further resize is needed.
            face = cv2.warpAffine(image, M, (output_size, output_size), borderMode=cv2.BORDER_REPLICATE)
        except cv2.error:
            # try the next detection instead of aborting
            continue
        return face

    return None

Running on device: cuda:0


In [17]:
def transfer(model, x, version):
    """Run one image through ``model`` and return it stacked with the result.

    Args:
        model: module called as ``model(batch, version=...)``.
        x: NumPy array holding a single image; a leading batch axis is added.
            It must already have the dtype and layout the model expects.
        version: ``'a'`` or ``'b'``, forwarded to the model to pick a decoder.

    Returns:
        ``torch.cat([input_batch, output])`` along dim 0 (input first, model
        output second), or ``None`` for any other ``version``.

    Side effects:
        Puts ``model`` in eval mode.
    """
    # Follow the model's own device instead of assuming CUDA is present.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device('cpu')

    x = torch.from_numpy(x).unsqueeze(0).to(target)
    model.eval()
    if version not in ('a', 'b'):
        return None
    with torch.no_grad():  # inference only; no autograd graph needed
        out = model(x, version=version)
    return torch.cat([x, out])


# The capture, model and video writer are created once, immediately before the frame loop
# below. Opening them here as well would leave an unreleased VideoCapture / VideoWriter
# behind (and a stray, empty output file), because every name gets re-bound there.

def write_images(model, image_a, dir_name):
    """Swap ``image_a`` with decoder ``'b'`` and append the result to the output video.

    Args:
        model: autoencoder passed on to ``transfer``.
        image_a: NumPy image accepted by ``transfer``.
        dir_name: directory that is created if missing (nothing is written
            into it by this function).

    Side effects:
        Writes one frame to the module-level ``video_tracked`` writer.
    """
    os.makedirs(dir_name, exist_ok=True)

    pair = transfer(model, image_a, 'b')
    # transfer() returns the input stacked with the generated image; only the
    # generated one (last entry) is a frame to write.
    generated = pair[-1].detach().cpu().numpy()
    # convert the pytorch output (C x H x W, RGB, float) into a cv2 frame (H x W x C, BGR, uint8)
    generated = np.transpose(generated, (1, 2, 0))
    generated = cv2.cvtColor(generated, cv2.COLOR_RGB2BGR)
    generated = img_as_ubyte(generated)
    # VideoWriter only accepts frames of the size it was opened with.
    generated = cv2.resize(generated, (width, height))
    video_tracked.write(generated)
# --- input video -------------------------------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cap = cv2.VideoCapture("/content/drive/MyDrive/Minor_Project/Dataset/videos/elon.mp4")
if not cap.isOpened():
    # Fail loudly: otherwise the frame loop below silently does nothing.
    raise IOError("Could not open input video")
fps = cap.get(cv2.CAP_PROP_FPS)
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))

# --- model ---------------------------------------------------------------------
model = AutoEncoder(image_channels=3).to(device)
# map_location lets a checkpoint saved from a GPU session load on a CPU-only runtime.
model.load_state_dict(
    torch.load("/content/drive/MyDrive/Minor_Project/saved_models/final.pth", map_location=device)
)
model.eval()  # inference only: freezes dropout and batch-norm statistics

# --- output video (same frame rate and frame size as the input) ----------------
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_tracked = cv2.VideoWriter('{}.mp4'.format("Swapped_Video"), fourcc, fps, (width, height))
i = 0  # number of frames processed so far
decoder = "b"  # which decoder of the autoencoder produces the swapped face

# Swap the face in every frame of the input video and append the result to the output video.
# Frames in which no face is found (or that fail for any other reason) are skipped.
model.eval()  # inference only
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        print('\rTracking frame: {}'.format(i + 1), end='')
        i += 1
        try:
            # Retrieve the aligned face from the frame and resize it to the model's input size
            img1_face = extract_face(frame)
            if img1_face is None:
                continue  # no face in this frame
            img1_face = cv2.resize(np.array(img1_face), (128, 128))

            # extract_face returns an RGB crop; pytorch wants channel, height, width
            img_tensor = np.ascontiguousarray(img1_face.transpose((2, 0, 1)))  # chw, RGB order, [0,255]
            img_tensor = torch.from_numpy(img_tensor).float().div(255)  # chw, FloatTensor, [0,1]
            x = img_tensor.unsqueeze(0).to(device)  # n*c*h*w

            with torch.no_grad():  # no autograd graph needed for inference
                out = model(x, version=decoder)

            # convert the pytorch output into a cv2 frame (H x W x C, BGR, uint8)
            out = out.detach().cpu().squeeze(0).numpy()
            out = np.transpose(out, (1, 2, 0))
            out = cv2.cvtColor(out, cv2.COLOR_RGB2BGR)
            out = img_as_ubyte(out)
            # The writer was opened with (width, height); frames of any other size are not written.
            out = cv2.resize(out, (width, height))
            video_tracked.write(out)

        except Exception as e:
            # best effort: report the problem and carry on with the next frame
            print(e)
finally:
    # Release both handles so the output file is finalised and playable.
    cap.release()
    video_tracked.release()

Tracking frame: 90