In [1]:
import os
import glob
import random
from tqdm import tqdm
from typing import Union, Tuple, List, Any

import numpy as np
import pandas as pd
import cv2 as cv

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.applications import ResNet50, VGG16, Xception
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), 'Physical GPUs,', len(logical_gpus), 'Logical GPUs')
    except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
# Experiment setup
DFDC_PATH = "../../../DeepFake_Detection/DFDC_ALL_DATA_224/"
DFDC_METADATA = "../../../DeepFake_Detection/DFDC_ALL_DATA/metadata/metadata.json"
WD_TRAIN_CSV_PATH = "../datasets/train/min/train_wasserstein_distance-min_length-3_dataset.csv"
DATA_PATH = "../../../DeepFake_Detection/WILDDEEP_DATA/"

In [4]:
def custom_path_generator(csv_path: str, data_path: str) -> Tuple[List[List[str]], List[str]]:
    df = pd.read_csv(csv_path)

    X_paths, y = [], []
    for i in range(len(df.index)):
        subset = df.iloc[i].subset
        video_type = df.iloc[i].type
        video_number = df.iloc[i].video
        sequence_number = df.iloc[i].sequence
        first_frame = df.iloc[i].first_frame
        subsequence_length = df.iloc[i].subsequence_length
        
        subsequence = []
        for j in range(first_frame, first_frame + subsequence_length):
            path =  data_path + f"{video_type}_{subset}/" + str(video_number) + "/" + video_type + "/" + str(sequence_number) + "/" + str(j) + ".png"
            subsequence.append(path)
        
        X_paths.append(subsequence)

        y.append(0 if video_type == "real" else 1)

    return X_paths, y

def ravel_and_match_lists(X: List[List], y: List) -> Tuple[List, List]:
    x_out = []
    y_out = []
    for i in range(len(X)):
        for j in range(len(X[i])):
            x_out.append(X[i][j])
            y_out.append(y[i])
    

    return x_out, y_out

In [5]:
def get_dfdc_path(data_path: str, file: str) -> str:
    path = data_path + file.replace(".mp4", ".jpg")

    if not os.path.exists(path):
        raise Exception
    
    return path

In [6]:

#TODO: At some point change to handle sequences of frames 
def load_dfdc_paths(metadata_path: str, data_path: str) -> Tuple[List[str], List[str]]:
    X_paths, y = [], []
    labels = ["REAL", "FAKE"]

    dfdc_paths = pd.read_json(DFDC_METADATA)
    dfdc_files = list(dfdc_paths.columns.values)

    for file in tqdm(dfdc_files):
        try:
            X_paths.append(get_dfdc_path(data_path, file))
            y.append(labels.index(dfdc_paths[file]['label']))
        except Exception as err:
            #print(err)
            pass
    
    return X_paths, y

In [7]:
def shuffle_lists(a: List, b: List, seed: int=0) -> Tuple[List, List]:
    lists = list(zip(a, b))
    random.seed(seed)
    random.shuffle(lists)

    a, b = zip(*lists)
    a = list(a)
    b = list(b)

    return (a, b)

def shuffle_arrays(a: np.ndarray, b: np.ndarray, seed: int=0) -> Tuple[np.ndarray, np.ndarray]:
    np.random.seed(seed)
    assert len(a) == len(b)
    permutation = np.random.permutation(len(a))
    return a[permutation], b[permutation]

In [8]:

#TODO: Add multi-feature resampling
def resample_dataset(X: np.ndarray, y: np.ndarray, resampling_type: str, final_ratio: float, seed: int) -> Tuple[List, List]:
    if resampling_type == "undersample":
        sampler = RandomUnderSampler(sampling_strategy = final_ratio, random_state = seed)
    elif resampling_type == "oversample":
        sampler = RandomOverSampler(sampling_strategy = final_ratio, random_state = seed)
    else:
        raise Exception("Unknown resampling type. Available types: 'undersample', 'oversample'.")
    
    X, y = sampler.fit_resample(X, y)

    return X, y

In [9]:
X_dfdc, y_dfdc = load_dfdc_paths(DFDC_METADATA, DFDC_PATH)
X_dfdc, y_dfdc = np.array(X_dfdc), np.array(y_dfdc)
X_dfdc, y_dfdc = X_dfdc.reshape(-1,1), y_dfdc.reshape(-1,1)

X_wd_train, y_wd_train = custom_path_generator(WD_TRAIN_CSV_PATH, DATA_PATH)
X_wd_train, y_wd_train = ravel_and_match_lists(X_wd_train, y_wd_train)
X_wd_train, y_wd_train = shuffle_lists(X_wd_train, y_wd_train)
X_wd_train, y_wd_train = np.array(X_wd_train), np.array(y_wd_train)
X_wd_train, y_wd_train = X_wd_train.reshape(-1,1), y_wd_train.reshape(-1,1)

100%|██████████| 119154/119154 [00:12<00:00, 9835.40it/s] 


In [10]:
X_dfdc.shape, y_dfdc.shape

((114344, 1), (114344, 1))

In [11]:
X_dfdc, y_dfdc = resample_dataset(X_dfdc, y_dfdc, "undersample", 1, 1)
X_dfdc.shape, y_dfdc.shape

((37116, 1), (37116,))

In [12]:
X_wd_train.shape, y_wd_train.shape

((19518, 1), (19518, 1))

In [13]:
X_dfdc_train, X_dfdc_test, y_dfdc_train, y_dfdc_test = train_test_split(X_dfdc, y_dfdc, test_size=0.3)
X_dfdc_train.shape, X_dfdc_test.shape, X_dfdc_test.shape, y_dfdc_test.shape

((25981, 1), (11135, 1), (11135, 1), (11135,))

In [14]:
X_train = np.vstack([X_dfdc_train, X_wd_train])
y_train = np.vstack([y_dfdc_train.reshape(-1,1), y_wd_train])

X_train, y_train = shuffle_arrays(X_train, y_train, 1)

In [15]:
X_train.shape, y_train.shape

((45499, 1), (45499, 1))

In [16]:
X_train[0:10]

array([['../../../DeepFake_Detection/WILDDEEP_DATA/real_train/461/real/238/3232.png'],
       ['../../../DeepFake_Detection/WILDDEEP_DATA/real_train/511/real/603/1604.png'],
       ['../../../DeepFake_Detection/DFDC_ALL_DATA_224/hnambobzhb.jpg'],
       ['../../../DeepFake_Detection/DFDC_ALL_DATA_224/ggzcegccry.jpg'],
       ['../../../DeepFake_Detection/DFDC_ALL_DATA_224/ejevdfyyzw.jpg'],
       ['../../../DeepFake_Detection/DFDC_ALL_DATA_224/oojvczbnse.jpg'],
       ['../../../DeepFake_Detection/WILDDEEP_DATA/real_train/189/real/92/1235.png'],
       ['../../../DeepFake_Detection/WILDDEEP_DATA/fake_train/498/fake/32/1152.png'],
       ['../../../DeepFake_Detection/WILDDEEP_DATA/real_train/336/real/23/454.png'],
       ['../../../DeepFake_Detection/DFDC_ALL_DATA_224/ldnbyjrhwx.jpg']],
      dtype='<U75')

In [17]:
y_train[0:10]

array([[0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0]])

In [18]:
def read_img(path: str) -> cv.Mat:
    return cv.cvtColor(cv.imread(path),cv.COLOR_BGR2RGB)

In [19]:
def load_dataset(dataset: np.ndarray) -> np.ndarray:
    output = []

    for data in tqdm(dataset):
        batch = []
        for path in data:
            batch.append(read_img(path))
        output.append(batch)

    return np.array(output)

In [20]:
X_train = load_dataset(X_train)
X_test = load_dataset(X_dfdc_test)
y_test = y_dfdc_test.reshape(-1,1)

del X_dfdc_test
del y_dfdc_test

100%|██████████| 45499/45499 [01:01<00:00, 741.56it/s]
100%|██████████| 11135/11135 [00:10<00:00, 1014.43it/s]


In [21]:
X_train.shape, y_train.shape

((45499, 1, 224, 224, 3), (45499, 1))

In [22]:
X_train = X_train.reshape((X_train.shape[0], 
                           X_train.shape[2], 
                           X_train.shape[3], 
                           X_train.shape[4]
))
X_test = X_test.reshape((X_test.shape[0], 
                         X_test.shape[2], 
                         X_test.shape[3], 
                         X_test.shape[4]
))

In [23]:
# train_ds = tf.data.Dataset.from_tensor_slices(
#     (X_train, y_train)).shuffle(50000).batch(4)

# test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(4)

In [24]:
model = Xception(include_top=False, weights = None)

In [25]:
def build_model(model: Model) -> Model:
    input_layer = Input(shape = X_train.shape[1::], name = "input_layer")
    x = model(input_layer)
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024, activation="relu", name="fc1")(x)
    x = Dense(1024, activation="relu", name="fc2")(x)
    x = Dense(1, activation="sigmoid", name="output")(x)
    result_model = Model(inputs = input_layer, outputs = x)

    return result_model

In [26]:
X_train.shape[1::]

(224, 224, 3)

In [27]:
model = build_model(model)

In [28]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     [(None, 224, 224, 3)]     0         
_________________________________________________________________
xception (Functional)        (None, None, None, 2048)  20861480  
_________________________________________________________________
global_average_pooling2d (Gl (None, 2048)              0         
_________________________________________________________________
fc1 (Dense)                  (None, 1024)              2098176   
_________________________________________________________________
fc2 (Dense)                  (None, 1024)              1049600   
_________________________________________________________________
output (Dense)               (None, 1)                 1025      
Total params: 24,010,281
Trainable params: 23,955,753
Non-trainable params: 54,528
____________________________________________

In [29]:
# optimizer = tf.keras.optimizers.Adam()

# train_loss = tf.keras.metrics.Mean(name='train_loss')
# train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

# test_loss = tf.keras.metrics.Mean(name='test_loss')
# test_accuracy = tf.keras.metrics.BinaryAccuracy(name='test_accuracy')

In [30]:
# @tf.function
# def train_step(images, labels):
#     with tf.GradientTape() as tape:
#         predictions = model(images, training=True)
#         loss = tf.keras.losses.binary_crossentropy(labels, predictions, from_logits=True)

#     gradients = tape.gradient(loss, model.trainable_variables)
#     optimizer.apply_gradients(zip(gradients, model.trainable_variables))

#     train_loss(loss)
#     train_accuracy(labels, predictions)

In [31]:
# @tf.function
# def test_step(images, labels):
#   predictions = model(images, training=False)
#   t_loss = tf.keras.losses.binary_crossentropy(labels, predictions, from_logits=True)

#   test_loss(t_loss)
#   test_accuracy(labels, predictions)

In [None]:

#! Currently commented out graph execution because of memory limit

# EPOCHS = 5

# for epoch in range(EPOCHS):
#   # Reset the metrics at the start of the next epoch
#   train_loss.reset_states()
#   train_accuracy.reset_states()
#   test_loss.reset_states()
#   test_accuracy.reset_states()

#   for images, labels in train_ds:
#     train_step(images, labels)

#   for test_images, test_labels in test_ds:
#     test_step(test_images, test_labels)

#   print(
#     f'Epoch {epoch + 1}, '
#     f'Loss: {train_loss.result()}, '
#     f'Accuracy: {train_accuracy.result() * 100}, '
#     f'Test Loss: {test_loss.result()}, '
#     f'Test Accuracy: {test_accuracy.result() * 100}'
#   )

model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics=["accuracy"])
history = model.fit(X_train, y_train, batch_size = 32, epochs=5, verbose=1, validation_data=(X_test, y_test))