In [66]:
import os
import glob
import random
from tqdm import tqdm
from typing import Union, Tuple, List, Any

import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.model_selection import train_test_split

In [2]:
# Enable on-demand GPU memory allocation instead of letting TensorFlow reserve
# all device memory up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Query logical devices only after every physical GPU is configured:
        # listing them initializes the runtime, and set_memory_growth raises
        # RuntimeError on any GPU configured after that point.
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), 'Physical GPUs,', len(logical_gpus), 'Logical GPUs')
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [51]:
# Experiment setup
# All paths are relative, so they resolve against the notebook's working directory.
# Directory prefix passed to load_dfdc_paths as `data_path`; the DFDC .jpg files are looked up under it.
DFDC_PATH = "../../../DeepFake_Detection/DFDC_ALL_DATA_224/"
# DFDC metadata JSON consumed by load_dfdc_paths (one column per video file, with a 'label' entry).
DFDC_METADATA = "../../../DeepFake_Detection/DFDC_ALL_DATA/metadata/metadata.json"
# CSV describing training subsequences; read by custom_path_generator.
WD_TRAIN_CSV_PATH = "../datasets/train/min/train_wasserstein_distance-min_length-3_dataset.csv"
# Root prefix under which custom_path_generator builds the .png frame paths.
DATA_PATH = "../../../DeepFake_Detection/WILDDEEP_DATA/"

In [78]:
def custom_path_generator(csv_path: str, data_path: str) -> Tuple[List[List[str]], List[str]]:
    """Build frame-path subsequences and binary labels from a dataset CSV.

    Each CSV row describes one subsequence through the columns ``subset``,
    ``type``, ``video``, ``sequence``, ``first_frame`` and
    ``subsequence_length``. For every row, one path is generated per frame
    index in ``[first_frame, first_frame + subsequence_length)`` using the
    layout ``<data_path><subset>_<type>/<video>/<type>/<sequence>/<frame>.png``.
    Paths are only constructed; their existence on disk is not checked.

    Args:
        csv_path: Path of the CSV file to read.
        data_path: Prefix prepended verbatim to every generated path (it is
            not joined with a separator, so it should end with one).

    Returns:
        ``(X_paths, y)`` where ``X_paths[i]`` is the list of frame paths of
        row ``i`` and ``y[i]`` is ``"0"`` when the row's ``type`` equals
        ``"real"`` and ``"1"`` otherwise.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    df = pd.read_csv(csv_path)

    # Walk the needed columns in lockstep rather than calling ``df.iloc[i]``
    # once per field, which materializes a full row Series on every access.
    rows = zip(
        df["subset"],
        df["type"],
        df["video"],
        df["sequence"],
        df["first_frame"],
        df["subsequence_length"],
    )

    X_paths, y = [], []
    for subset, video_type, video_number, sequence_number, first_frame, subsequence_length in rows:
        # Directory shared by every frame of this subsequence.
        sequence_dir = f"{data_path}{subset}_{video_type}/{video_number}/{video_type}/{sequence_number}/"
        X_paths.append([
            f"{sequence_dir}{frame}.png"
            for frame in range(first_frame, first_frame + subsequence_length)
        ])
        y.append("0" if video_type == "real" else "1")

    return X_paths, y

In [47]:
def get_dfdc_path(data_path: str, file: str) -> str:
    """Return the path of the .jpg image that corresponds to a DFDC video name.

    The image path is ``data_path`` concatenated with ``file`` after every
    ``".mp4"`` occurrence in it has been replaced by ``".jpg"``.

    Args:
        data_path: Prefix prepended verbatim to the image file name (it is
            not joined with a separator, so it should end with one).
        file: Video file name, e.g. ``"abc.mp4"``.

    Returns:
        The image path, guaranteed to exist at the time of the call.

    Raises:
        FileNotFoundError: If the computed path does not exist. This is a
            subclass of ``Exception``, so handlers written against the
            previous bare ``Exception`` keep working.
    """
    path = data_path + file.replace(".mp4", ".jpg")

    if not os.path.exists(path):
        # Name the missing path so skipped or failed files can be diagnosed.
        raise FileNotFoundError(f"DFDC image not found: {path}")

    return path

In [77]:

#TODO: At some point change to handle sequences of frames 
def load_dfdc_paths(metadata_path: str, data_path: str) -> Tuple[List[str], List[str]]:
    """Collect existing DFDC image paths and their labels from a metadata JSON.

    The metadata is read with ``pd.read_json``; each column name is treated
    as a video file name and its ``'label'`` entry is mapped to ``"0"`` for
    ``"REAL"`` and ``"1"`` for ``"FAKE"``.

    Args:
        metadata_path: Path of the metadata JSON file to read.
        data_path: Prefix under which the per-video image files are looked
            up (forwarded to ``get_dfdc_path``).

    Returns:
        ``(X_paths, y)``: parallel lists of image paths and string labels.
        Entries whose image cannot be resolved or whose label is not one of
        the two known values are skipped.
    """
    labels = ["REAL", "FAKE"]
    X_paths, y = [], []

    # Read the file the caller asked for, not the notebook-level constant.
    dfdc_paths = pd.read_json(metadata_path)
    dfdc_files = list(dfdc_paths.columns.values)

    for file in tqdm(dfdc_files):
        try:
            # Resolve both values before appending either one, so a failure
            # on the label can never leave X_paths and y with different lengths.
            path = get_dfdc_path(data_path, file)
            label = str(labels.index(dfdc_paths[file]['label']))
        except Exception:
            # Best-effort loading: skip entries with a missing image or an
            # unusable label instead of aborting the whole scan.
            continue

        X_paths.append(path)
        y.append(label)

    return X_paths, y

In [73]:
def shuffle_lists(a: List, b: List, seed: int=0) -> Tuple[List, List]:
    """Shuffle two lists with the same permutation so paired items stay aligned.

    Args:
        a: First list (e.g. samples).
        b: Second list (e.g. labels). If the lengths differ, the extra items
            of the longer list are dropped because the lists are zipped.
        seed: Seed of the random generator; the same seed always yields the
            same permutation.

    Returns:
        A tuple ``(a_shuffled, b_shuffled)`` of new lists; the inputs are not
        modified. Two empty lists are returned when there is nothing to
        shuffle.
    """
    pairs = list(zip(a, b))
    if not pairs:
        # zip(*[]) yields nothing to unpack, so handle the empty case directly.
        return ([], [])

    # A dedicated generator gives the same permutation as seeding the global
    # one, without resetting the process-wide ``random`` state as a side effect.
    random.Random(seed).shuffle(pairs)

    shuffled_a, shuffled_b = zip(*pairs)
    return (list(shuffled_a), list(shuffled_b))

In [42]:
# Collect DFDC image paths with their labels, and the per-subsequence frame
# paths with their labels for the dataset described by WD_TRAIN_CSV_PATH.
X_dfdc, y_dfdc = load_dfdc_paths(DFDC_METADATA, DFDC_PATH)
X_wd_train, y_wd_train = custom_path_generator(WD_TRAIN_CSV_PATH, DATA_PATH)

In [74]:
# Shuffle samples and labels together (default seed=0) so each pair stays aligned.
# NOTE(review): this rebinds its own inputs, so re-running the cell shuffles the
# already-shuffled lists again and yields a different order — confirm intended.
X_wd_train, y_wd_train = shuffle_lists(X_wd_train, y_wd_train)