In [4]:
import glob
import numpy as np
import pandas as pd
import os
import pickle

### 1. Initial dataset generation
---
We first set up constant values to load the desired dataset from files generated with `distance_generator.py`.  
The currently available options are:  
- DATASET_TYPE: 'train' or 'test'
- FEATURE: 'wasserstein_distance', 'rel_entr' or 'jensenshannon'
- FEATURE_SELECTION: 'min' or 'max'
- SEQUENCE_LENGTH: any number used in the feature generation process
  
After gathering all required data paths, the files are opened in a loop and each payload is stored in a list. We also perform sample selection to build the final dataset. Once the process has finished, the results are saved in CSV format for further data exploration and ease of use in the training process.

In [20]:
# Configuration: every combination of the values below is exported to its own CSV file.
DATASET_TYPE = ["test", "train"]
FEATURE = ["wasserstein_distance", "jensenshannon"]
FEATURE_SELECTION = ["min", "max"]
SEQUENCE_LENGTH = [3, 5, 10]

save_directory = "../datasets/"


def select_extreme_rows(data, feature, selection):
    """Return the single row of ``data`` where ``feature`` is at its extreme.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``feature`` column.
    feature : str
        Name of the column to inspect.
    selection : str
        ``"min"`` to keep the row with the smallest value, ``"max"`` for the largest.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``feature`` equals the extreme value, de-duplicated on
        ``feature`` and re-indexed from 0.

    Raises
    ------
    ValueError
        If ``selection`` is neither ``"min"`` nor ``"max"``.
    """
    if selection == "min":
        extreme = data[feature].min()
    elif selection == "max":
        extreme = data[feature].max()
    else:
        raise ValueError(f"Unsupported feature selection: {selection!r} (expected 'min' or 'max')")

    # Dropping duplicates accounts for ties, e.g. the feature equals 0 for more than one subsequence.
    return data[data[feature] == extreme].drop_duplicates(subset=[feature]).reset_index(drop=True)


for dt in DATASET_TYPE:
    fake_main_paths = glob.glob(f"../../../DeepFake_Detection/wilddeep_results/fake_{dt}/*/fake/*")
    real_main_paths = glob.glob(f"../../../DeepFake_Detection/wilddeep_results/real_{dt}/*/real/*")
    all_paths = fake_main_paths + real_main_paths

    if not all_paths:
        # Without this guard pd.concat([]) below fails with an unhelpful "No objects to concatenate".
        print(f"No result directories found for dataset type '{dt}'; skipping.")
        continue

    os.makedirs(save_directory, exist_ok=True)

    for seql in SEQUENCE_LENGTH:
        # Both containers are reset for every sequence length. Previously `subsets` was created
        # once per dataset type, so each "full dataset" CSV also contained the rows of all
        # previously processed sequence lengths.
        subsets = list()
        data_cache = {ft: {fts: list() for fts in FEATURE_SELECTION} for ft in FEATURE}

        for path in all_paths:
            # NOTE(security): pickle.load can execute arbitrary code; only load files you generated yourself.
            with open(os.path.join(path, f"subsequence_data-{seql}.pkl"), 'rb') as f:
                data = pickle.load(f)
            subsets.append(data)

            for ft in FEATURE:
                for fts in FEATURE_SELECTION:
                    data_cache[ft][fts].append(select_extreme_rows(data, ft, fts))

        full_dataset = pd.concat(subsets, axis=0)
        full_dataset.to_csv(f"{save_directory}{dt}_length-{seql}_full-dataset.csv")

        for ft in FEATURE:
            for fts in FEATURE_SELECTION:
                final_dataset = pd.concat(data_cache[ft][fts], axis=0)
                final_dataset.to_csv(f"{save_directory}{dt}_{ft}-{fts}_length-{seql}_dataset.csv")

In [19]:
# Single-configuration export: one feature, one selection rule, one sequence length.
save_directory = "../datasets/"
FEAUTRE_SELECTION = "min"
FEATURE = "jensenshannon"
DATASET_TYPE = "train"
SEQUENCE_LENGTH = 3

subsets = []
filtered_samples = []

# NOTE: `all_paths` is not defined in this cell; it must already exist in the kernel.
for path in all_paths:
    with open(f"{path}/subsequence_data-{SEQUENCE_LENGTH}.pkl", "rb") as pkl_file:
        data = pickle.load(pkl_file)
    subsets.append(data)

    # Keep one row at the requested extreme. Dropping duplicates accounts for ties,
    # e.g. the feature equals 0 for more than one subsequence.
    if FEAUTRE_SELECTION in ("min", "max"):
        feature_values = data[FEATURE]
        extreme = feature_values.min() if FEAUTRE_SELECTION == "min" else feature_values.max()
        matching = data[feature_values == extreme]
        filtered_samples.append(matching.drop_duplicates(subset=[FEATURE]).reset_index(drop=True))

# All loaded rows, unfiltered.
full_dataset = pd.concat(subsets, axis=0)
full_dataset.to_csv(
    f"{save_directory}{DATASET_TYPE}_{FEATURE}_length-{SEQUENCE_LENGTH}_full-dataset.csv"
)

# One selected row per loaded file.
final_dataset = pd.concat(filtered_samples, axis=0)
final_dataset.to_csv(
    f"{save_directory}{DATASET_TYPE}_{FEATURE}-{FEAUTRE_SELECTION}_length-{SEQUENCE_LENGTH}_dataset.csv"
)

### 2. Data exploration
---
In this section we will explore the properties of the WildDeepfake dataset based on the available features. We start with a descriptive analysis of each previously mentioned feature.

In [None]:
# Scratch literal: nested dict of feature name -> selection ("min"/"max") -> empty list,
# the same shape as the `data_cache` built in the dataset-generation cell.
{"wasserstein_distance": {"min": [], "max": []}}

In [43]:
# Group the full dataset by (video, sequence, first_frame).
# NOTE: `full_dataset` is not defined in this cell; it must already exist in the kernel.
t = full_dataset.groupby(["video", "sequence", "first_frame"])

In [84]:
def _image_path(row):
    """Build '<type>_<subset>/<video>/<type>/<sequence>/<first_frame>.png' from one row."""
    top_folder = f"{row['type']}_{row['subset']}"
    return f"{top_folder}/{row['video']}/{row['type']}/{row['sequence']}/{row['first_frame']}.png"


# Adds a "path" column to `final_dataset` in place, one formatted path per row.
final_dataset["path"] = final_dataset.apply(_image_path, axis=1)

In [85]:
# Display `test`.
# NOTE(review): `test` is not defined in any visible cell; it must already exist in the kernel.
test

Unnamed: 0,subset,type,video,sequence,first_frame,subsequence_length,wasserstein_distance,path
0,test,fake,1,1,83,3,0.000107,test_fake/1/fake/1/83.png
0,test,fake,1,17,479,3,0.000123,test_fake/1/fake/17/479.png
0,test,fake,1,33,961,3,0.000048,test_fake/1/fake/33/961.png
0,test,fake,1,40,1289,3,0.000058,test_fake/1/fake/40/1289.png
0,test,fake,10,64,1119,3,0.000086,test_fake/10/fake/64/1119.png
...,...,...,...,...,...,...,...,...
0,test,real,93,46,828,3,0.000076,test_real/93/real/46/828.png
0,test,real,93,50,1002,3,0.000095,test_real/93/real/50/1002.png
0,test,real,93,73,1279,3,0.000084,test_real/93/real/73/1279.png
0,test,real,93,74,1376,3,0.000086,test_real/93/real/74/1376.png


In [14]:
# Preview of an empty nested cache: one inner dict per entry of FEATURE, holding an
# empty list per entry of FEAUTRE_SELECTION.
# NOTE(review): both names come from other cells and must already exist in the kernel.
{f"{ft}": {f"{fts}": list() for fts in FEAUTRE_SELECTION} for ft in FEATURE}

{'wasserstein_distance': {'max': [], 'min': []},
 'rel_entr': {'max': [], 'min': []},
 'jensenshannon': {'max': [], 'min': []}}

In [12]:
# NOTE: the name is misspelled ("FEAUTRE"), but other cells reference this exact spelling,
# so it must not be renamed in isolation.
FEAUTRE_SELECTION = ["max", "min"]