In [101]:
import os
import glob
import random
from tqdm import tqdm
from typing import Union, Tuple, List, Any

import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Query the logical devices only after *every* GPU is configured:
        # listing them initializes the runtime, after which further
        # set_memory_growth calls raise RuntimeError.
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), 'Physical GPUs,', len(logical_gpus), 'Logical GPUs')
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

In [51]:
# Experiment setup
# All directory constants end with "/" because the loaders below build file
# paths by plain string concatenation.

# Directory holding the DFDC frames; passed as `data_path` to load_dfdc_paths.
DFDC_PATH = "../../../DeepFake_Detection/DFDC_ALL_DATA_224/"
# DFDC metadata JSON; passed as `metadata_path` to load_dfdc_paths.
DFDC_METADATA = "../../../DeepFake_Detection/DFDC_ALL_DATA/metadata/metadata.json"
# CSV describing the training subsequences; read by custom_path_generator.
WD_TRAIN_CSV_PATH = "../datasets/train/min/train_wasserstein_distance-min_length-3_dataset.csv"
# Root directory prepended to every frame path built by custom_path_generator.
DATA_PATH = "../../../DeepFake_Detection/WILDDEEP_DATA/"

In [130]:
def custom_path_generator(csv_path: str, data_path: str) -> Tuple[List[List[str]], List[str]]:
    """Build the frame paths and label of every subsequence listed in a CSV.

    Each CSV row describes one subsequence through the columns ``subset``,
    ``type``, ``video``, ``sequence``, ``first_frame`` and
    ``subsequence_length``. Frame ``j`` of a row resolves to
    ``{data_path}{subset}_{type}/{video}/{type}/{sequence}/{j}.png`` for
    ``j`` in ``[first_frame, first_frame + subsequence_length)``.

    Args:
        csv_path: Path of the CSV file to read with ``pd.read_csv``.
        data_path: Prefix prepended verbatim to every frame path, so it
            should end with a path separator.

    Returns:
        A tuple ``(X_paths, y)``. ``X_paths[i]`` is the list of frame paths
        of row ``i``; ``y[i]`` is ``"0"`` when the row's ``type`` equals
        ``"real"`` and ``"1"`` otherwise.
    """
    df = pd.read_csv(csv_path)

    # Walk the needed columns in lockstep instead of calling df.iloc[i] for
    # every field: each iloc call materializes a whole row Series, which
    # made the original loop needlessly slow on large CSVs.
    rows = zip(
        df["subset"],
        df["type"],
        df["video"],
        df["sequence"],
        df["first_frame"],
        df["subsequence_length"],
    )

    X_paths, y = [], []
    for subset, video_type, video_number, sequence_number, first_frame, subsequence_length in rows:
        # Everything but the frame index is constant within one subsequence.
        sequence_dir = f"{data_path}{subset}_{video_type}/{video_number}/{video_type}/{sequence_number}/"
        X_paths.append(
            [f"{sequence_dir}{j}.png" for j in range(first_frame, first_frame + subsequence_length)]
        )
        y.append("0" if video_type == "real" else "1")

    return X_paths, y

def ravel_and_match_lists(X: List[List], y: List) -> Tuple[List, List]:
    """Flatten a list of groups and repeat each group's label per element.

    Args:
        X: Nested list; ``X[i]`` is the group of items that share ``y[i]``.
        y: One label per group in ``X``.

    Returns:
        ``(x_out, y_out)``: every item of every group in original order, and
        a label list of the same length where each item carries its group's
        label.
    """
    # Pair every item with the label of the group it came from.
    matched = [
        (item, y[group_idx])
        for group_idx, group in enumerate(X)
        for item in group
    ]

    x_out = [item for item, _ in matched]
    y_out = [label for _, label in matched]
    return x_out, y_out

In [47]:
def get_dfdc_path(data_path: str, file: str) -> str:
    """Return the path of the ``.jpg`` frame matching a DFDC video file name.

    Args:
        data_path: Prefix prepended verbatim to the file name, so it should
            end with a path separator.
        file: Video file name; every ``".mp4"`` occurrence is replaced by
            ``".jpg"``.

    Returns:
        The concatenated path, guaranteed to exist on disk.

    Raises:
        FileNotFoundError: If the resulting path does not exist. This is a
            subclass of ``Exception``, so callers catching ``Exception``
            keep working, but the error now says which file is missing.
    """
    path = data_path + file.replace(".mp4", ".jpg")

    if not os.path.exists(path):
        raise FileNotFoundError(f"DFDC frame not found: {path}")

    return path

In [77]:

#TODO: At some point change to handle sequences of frames 
def load_dfdc_paths(metadata_path: str, data_path: str) -> Tuple[List[str], List[str]]:
    """Collect the existing DFDC frame paths and their labels.

    The metadata JSON is read with ``pd.read_json``; its columns are treated
    as video file names and each column's ``'label'`` entry as the class
    name (``"REAL"`` or ``"FAKE"``).

    Args:
        metadata_path: Path of the DFDC metadata JSON file.
        data_path: Prefix handed to ``get_dfdc_path`` for every file.

    Returns:
        ``(X_paths, y)`` of equal length: the frame paths that exist on disk
        and their labels as strings, ``"0"`` for REAL and ``"1"`` for FAKE.
        Entries whose frame or label cannot be resolved are skipped.
    """
    X_paths, y = [], []
    labels = ["REAL", "FAKE"]

    # Read the file named by the argument (the original ignored the parameter
    # and always read the DFDC_METADATA global).
    dfdc_paths = pd.read_json(metadata_path)
    dfdc_files = list(dfdc_paths.columns.values)

    for file in tqdm(dfdc_files):
        try:
            # Resolve both values before touching the output lists, so a
            # failure on either one can never leave X_paths and y misaligned.
            path = get_dfdc_path(data_path, file)
            label = str(labels.index(dfdc_paths[file]['label']))
        except Exception:
            # Best effort: skip entries with a missing frame or unknown label.
            continue

        X_paths.append(path)
        y.append(label)

    return X_paths, y

In [140]:
def shuffle_lists(a: List, b: List, seed: int=0) -> Tuple[List, List]:
    """Shuffle two lists with the same permutation.

    Args:
        a: First list.
        b: Second list; ``b[i]`` stays paired with ``a[i]``. If the lengths
            differ, ``zip`` drops the surplus elements of the longer list.
        seed: Seed for the ``random`` module. Note that this reseeds the
            module-level generator as a side effect.

    Returns:
        ``(a_shuffled, b_shuffled)`` as new lists; two empty lists when there
        is nothing to shuffle.
    """
    pairs = list(zip(a, b))
    random.seed(seed)
    random.shuffle(pairs)

    # zip(*[]) yields nothing, so unpacking it into two names would raise
    # ValueError; handle the empty case explicitly.
    if not pairs:
        return ([], [])

    a_shuffled = [first for first, _ in pairs]
    b_shuffled = [second for _, second in pairs]

    return (a_shuffled, b_shuffled)

def shuffle_arrays(a: np.ndarray, b: np.ndarray, seed: int=0) -> Tuple[np.ndarray, np.ndarray]:
    """Reorder two equally long arrays with one shared random permutation.

    Args:
        a: First array.
        b: Second array; must have the same length as ``a``.
        seed: Value passed to ``np.random.seed``. This reseeds NumPy's global
            generator as a side effect.

    Returns:
        ``(a[order], b[order])`` where ``order`` is the permutation drawn
        after seeding.
    """
    np.random.seed(seed)

    n_items = len(a)
    assert n_items == len(b)

    # One index permutation applied to both arrays keeps rows paired.
    order = np.random.permutation(n_items)
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b

In [99]:

#TODO: Add multi-feature resampling
def resample_dataset(X: np.ndarray, y: np.ndarray, resampling_type: str, final_ratio: float, seed: int) -> Tuple[np.ndarray, np.ndarray]:
    """Randomly under- or over-sample a dataset to a target class ratio.

    Args:
        X: Samples, forwarded unchanged to the sampler's ``fit_resample``.
        y: Labels, forwarded unchanged to the sampler's ``fit_resample``.
        resampling_type: ``"undersample"`` (RandomUnderSampler) or
            ``"oversample"`` (RandomOverSampler).
        final_ratio: Passed as the sampler's ``sampling_strategy``.
        seed: Passed as the sampler's ``random_state``.

    Returns:
        The ``(X, y)`` pair returned by ``fit_resample``. NOTE(review): the
        notebook output shows ``y`` coming back 1-D even when a column
        vector was passed in; reshape at the call site if needed.

    Raises:
        ValueError: If ``resampling_type`` is not one of the two supported
            values. ``ValueError`` subclasses ``Exception``, so existing
            ``except Exception`` handlers still catch it.
    """
    # Validate first so a bad argument fails with a precise exception type
    # before any sampler is constructed.
    if resampling_type not in ("undersample", "oversample"):
        raise ValueError("Unknown resampling type. Available types: 'undersample', 'oversample'.")

    sampler_cls = RandomUnderSampler if resampling_type == "undersample" else RandomOverSampler
    sampler = sampler_cls(sampling_strategy=final_ratio, random_state=seed)

    X, y = sampler.fit_resample(X, y)

    return X, y

In [136]:
# DFDC: one frame path per metadata entry, stored as (n, 1) column vectors.
X_dfdc, y_dfdc = load_dfdc_paths(DFDC_METADATA, DFDC_PATH)
X_dfdc = np.array(X_dfdc).reshape(-1, 1)
y_dfdc = np.array(y_dfdc).reshape(-1, 1)

# Subsequence dataset: build per-subsequence paths, flatten them to single
# frames (each inheriting its subsequence label), shuffle, then convert to
# (n, 1) column vectors like the DFDC arrays.
X_wd_train, y_wd_train = shuffle_lists(
    *ravel_and_match_lists(
        *custom_path_generator(WD_TRAIN_CSV_PATH, DATA_PATH)
    )
)
X_wd_train = np.array(X_wd_train).reshape(-1, 1)
y_wd_train = np.array(y_wd_train).reshape(-1, 1)

100%|██████████| 119154/119154 [00:07<00:00, 16319.92it/s]


In [117]:
# Display the shapes of the DFDC path and label arrays.
X_dfdc.shape, y_dfdc.shape

((114344, 1), (114344, 1))

In [138]:
# Undersample the DFDC data to the requested class ratio.
# NOTE: this overwrites X_dfdc / y_dfdc, so re-running the cell resamples the
# already-resampled arrays; re-run the loading cell first for a clean state.
X_dfdc, y_dfdc = resample_dataset(
    X_dfdc,
    y_dfdc,
    resampling_type="undersample",
    final_ratio=1,
    seed=1,
)
X_dfdc.shape, y_dfdc.shape

((37116, 1), (37116,))

In [137]:
# Display the shapes of the flattened subsequence path and label arrays.
X_wd_train.shape, y_wd_train.shape

((19518, 1), (19518, 1))

In [178]:
# Hold out 30% of the resampled DFDC samples for testing. A fixed
# random_state makes the split identical on every run; without it each
# execution of this cell produced a different train/test partition.
X_dfdc_train, X_dfdc_test, y_dfdc_train, y_dfdc_test = train_test_split(
    X_dfdc, y_dfdc, test_size=0.3, random_state=1
)
X_dfdc_train.shape, X_dfdc_test.shape

((25981, 1), (11135, 1))

In [186]:
# Stack the DFDC training split on top of the subsequence training frames.
X_train = np.vstack((X_dfdc_train, X_wd_train))
# y_dfdc_train is reshaped to a column so it stacks with the (n, 1)
# y_wd_train array.
y_train = np.vstack((y_dfdc_train.reshape(-1, 1), y_wd_train))

# Shuffle paths and labels with one shared permutation so the two sources
# are interleaved while rows stay paired.
X_train, y_train = shuffle_arrays(X_train, y_train, seed=1)

In [187]:
# Display the shapes of the combined training paths and labels.
X_train.shape, y_train.shape

((45499, 1), (45499, 1))

In [188]:
# Preview the first ten training paths.
X_train[0:10]

array([['../../../DeepFake_Detection/WILDDEEP_DATA/train_real/461/real/238/3232.png'],
       ['../../../DeepFake_Detection/WILDDEEP_DATA/train_real/511/real/603/1604.png'],
       ['../../../DeepFake_Detection/DFDC_ALL_DATA_224/ikkivspaft.jpg'],
       ['../../../DeepFake_Detection/DFDC_ALL_DATA_224/rortwhsuas.jpg'],
       ['../../../DeepFake_Detection/DFDC_ALL_DATA_224/npuplviqjf.jpg'],
       ['../../../DeepFake_Detection/DFDC_ALL_DATA_224/jcjvaypgbq.jpg'],
       ['../../../DeepFake_Detection/WILDDEEP_DATA/train_real/189/real/92/1235.png'],
       ['../../../DeepFake_Detection/WILDDEEP_DATA/train_fake/498/fake/32/1152.png'],
       ['../../../DeepFake_Detection/WILDDEEP_DATA/train_real/336/real/23/454.png'],
       ['../../../DeepFake_Detection/DFDC_ALL_DATA_224/cgefriykjw.jpg']],
      dtype='<U75')

In [189]:
# Preview the labels of the first ten training samples.
y_train[0:10]

array([['0'],
       ['0'],
       ['0'],
       ['1'],
       ['1'],
       ['0'],
       ['0'],
       ['1'],
       ['0'],
       ['0']], dtype='<U1')