# Inference for Xception baseline model
### Forked from: https://www.kaggle.com/humananalog/inference-demo
### View this kernal for training of this model: https://www.kaggle.com/greatgamedota/xception-binary-classifier-with-ffhq-training
### This kernal takes ~3-3.5 hours to submit with GPU

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
zip_path = '/content/drive/My Drive/Deepfake_Data/deepfake-detection-challenge.zip'
!cp "{zip_path}" .
!unzip -q deepfake-detection-challenge.zip
!rm deepfake-detection-challenge.zip

zip_path = '/content/drive/My Drive/Deepfake_Data/deepfakes-inference-demo.zip'
!cp "{zip_path}" .
!unzip -q deepfakes-inference-demo.zip
!rm deepfakes-inference-demo.zip

zip_path = '/content/drive/My Drive/Deepfake_Data/blazeface-pytorch.zip'
!cp "{zip_path}" .
!unzip -q blazeface-pytorch.zip
!rm blazeface-pytorch.zip

In [3]:
!pip install pytorchcv --quiet

[?25l[K     |▉                               | 10kB 23.4MB/s eta 0:00:01[K     |█▋                              | 20kB 3.1MB/s eta 0:00:01[K     |██▌                             | 30kB 4.5MB/s eta 0:00:01[K     |███▎                            | 40kB 2.9MB/s eta 0:00:01[K     |████▏                           | 51kB 3.6MB/s eta 0:00:01[K     |█████                           | 61kB 4.2MB/s eta 0:00:01[K     |█████▉                          | 71kB 4.9MB/s eta 0:00:01[K     |██████▋                         | 81kB 5.5MB/s eta 0:00:01[K     |███████▍                        | 92kB 6.1MB/s eta 0:00:01[K     |████████▎                       | 102kB 4.8MB/s eta 0:00:01[K     |█████████                       | 112kB 4.8MB/s eta 0:00:01[K     |██████████                      | 122kB 4.8MB/s eta 0:00:01[K     |██████████▊                     | 133kB 4.8MB/s eta 0:00:01[K     |███████████▋                    | 143kB 4.8MB/s eta 0:00:01[K     |████████████▍             

In [0]:
import os, sys, time
import cv2
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

%matplotlib inline
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [5]:
test_dir = "deepfake-detection-challenge/test_videos/"

test_videos = sorted([x for x in os.listdir(test_dir) if x[-4:] == ".mp4"])
len(test_videos)

400

In [0]:
gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [0]:
import sys
sys.path.insert(0, "blazeface-pytorch")
sys.path.insert(0, "deepfakes-inference-demo")

In [0]:
from blazeface import BlazeFace
facedet = BlazeFace().to(gpu)
facedet.load_weights("blazeface.pth")
facedet.load_anchors("anchors.npy")
_ = facedet.train(False)

In [0]:
from helpers.read_video_1 import VideoReader
from helpers.face_extract_1 import FaceExtractor

frames_per_video = 20

video_reader = VideoReader()
video_read_fn = lambda x: video_reader.read_frames(x, num_frames=frames_per_video)
face_extractor = FaceExtractor(video_read_fn, facedet)

In [0]:
input_size = 150

In [0]:
from torchvision.transforms import Normalize

mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
normalize_transform = Normalize(mean, std)

In [0]:
def isotropically_resize_image(img, size, resample=cv2.INTER_AREA):
    h, w = img.shape[:2]
    if w > h:
        h = h * size // w
        w = size
    else:
        w = w * size // h
        h = size

    resized = cv2.resize(img, (w, h), interpolation=resample)
    return resized


def make_square_image(img):
    h, w = img.shape[:2]
    size = max(h, w)
    t = 0
    b = size - h
    l = 0
    r = size - w
    return cv2.copyMakeBorder(img, t, b, l, r, cv2.BORDER_CONSTANT, value=0)

In [0]:
from pytorchcv.model_provider import get_model
model = get_model("xception", pretrained=False)
model = nn.Sequential(*list(model.children())[:-1]) # Remove original output layer

model[0].final_block.pool = nn.Sequential(nn.AdaptiveAvgPool2d(1))

class Head(torch.nn.Module):
  def __init__(self, in_f, out_f):
    super(Head, self).__init__()
    
    self.f = nn.Flatten()
    self.l = nn.Linear(in_f, 512)
    self.d = nn.Dropout(0.5)
    self.o = nn.Linear(512, out_f)
    self.b1 = nn.BatchNorm1d(in_f)
    self.b2 = nn.BatchNorm1d(512)
    self.r = nn.ReLU()

  def forward(self, x):
    x = self.f(x)
    x = self.b1(x)
    x = self.d(x)

    x = self.l(x)
    x = self.r(x)
    x = self.b2(x)
    x = self.d(x)

    out = self.o(x)
    return out

class FCN(torch.nn.Module):
  def __init__(self, base, in_f):
    super(FCN, self).__init__()
    self.base = base
    self.h1 = Head(in_f, 1)
  
  def forward(self, x):
    x = self.base(x)
    return self.h1(x)

net = []
model = FCN(model, 2048)
model = model.cuda()
model.load_state_dict(torch.load('/content/drive/My Drive/model.pth'))
net.append(model)

## Prediction loop

In [0]:
def predict_on_video(video_path, batch_size):
    try:
        # Find the faces for N frames in the video.
        faces = face_extractor.process_video(video_path)

        # Only look at one face per frame.
        face_extractor.keep_only_best_face(faces)
        
        if len(faces) > 0:
            # NOTE: When running on the CPU, the batch size must be fixed
            # or else memory usage will blow up. (Bug in PyTorch?)
            x = np.zeros((batch_size, input_size, input_size, 3), dtype=np.uint8)

            # If we found any faces, prepare them for the model.
            n = 0
            for frame_data in faces:
                for face in frame_data["faces"]:
                    # Resize to the model's required input size.
                    # We keep the aspect ratio intact and add zero
                    # padding if necessary.                    
                    resized_face = isotropically_resize_image(face, input_size)
                    resized_face = make_square_image(resized_face)

                    if n < batch_size:
                        x[n] = resized_face
                        n += 1
                    else:
                        print("WARNING: have %d faces but batch size is %d" % (n, batch_size))
                    
                    # Test time augmentation: horizontal flips.
                    # TODO: not sure yet if this helps or not
                    #x[n] = cv2.flip(resized_face, 1)
                    #n += 1

            if n > 0:
                x = torch.tensor(x, device=gpu).float()

                # Preprocess the images.
                x = x.permute((0, 3, 1, 2))

                for i in range(len(x)):
                    x[i] = normalize_transform(x[i] / 255.)
#                     x[i] = x[i] / 255.

                # Make a prediction, then take the average.
                with torch.no_grad():
                    y_pred = model(x)
                    y_pred = torch.sigmoid(y_pred.squeeze())
                    return y_pred[:n].mean().item()

    except Exception as e:
        print("Prediction error on video %s: %s" % (video_path, str(e)))

    return 0.5

In [0]:
from concurrent.futures import ThreadPoolExecutor

def predict_on_video_set(videos, num_workers):
    def process_file(i):
        filename = videos[i]
        y_pred = predict_on_video(os.path.join(test_dir, filename), batch_size=frames_per_video)
        return y_pred

    with ThreadPoolExecutor(max_workers=num_workers) as ex:
        predictions = ex.map(process_file, range(len(videos)))

    return list(predictions)

The leaderboard submission must finish within 9 hours. With 4000 test videos, that is `9*60*60/4000 = 8.1` seconds per video. So if the average time per video is greater than ~8 seconds, the kernel will be too slow!

In [0]:
speed_test = True

In [20]:
if speed_test:
    start_time = time.time()
    speedtest_videos = test_videos[:5]
    predictions = predict_on_video_set(speedtest_videos, num_workers=4)
    elapsed = time.time() - start_time
    print("Elapsed %f sec. Average per video: %f sec." % (elapsed, elapsed / len(speedtest_videos)))

Elapsed 11.105101 sec. Average per video: 2.221020 sec.


In [21]:
%%time
model.eval()
predictions = predict_on_video_set(test_videos, num_workers=4)

CPU times: user 44min 30s, sys: 50.4 s, total: 45min 20s
Wall time: 11min 32s


In [0]:
submission_df = pd.DataFrame({"filename": test_videos, "label": predictions})
submission_df.to_csv("submission.csv", index=False)

In [23]:
submission_df.head()

Unnamed: 0,filename,label
0,aassnaulhq.mp4,0.993989
1,aayfryxljh.mp4,0.020836
2,acazlolrpz.mp4,0.77954
3,adohdulfwb.mp4,0.032759
4,ahjnxtiamx.mp4,0.829322


In [24]:
predictions[:10]

[0.9939891695976257,
 0.02083555795252323,
 0.779539942741394,
 0.03275921568274498,
 0.8293218612670898,
 0.6453460454940796,
 0.7182217836380005,
 0.9944658279418945,
 0.803548276424408,
 0.2636297345161438]

In [0]:
!cp submission.csv '/content/drive/My Drive/submission.csv'