In [1]:
import glob
import numpy as np
import pandas as pd
import os
import pickle

## 1. Initial dataset generation
---
We first set up the constants that select which datasets are loaded from the files generated with `distance_generator.py`.  
The following options are currently available:  
- DATASET_TYPE: 'train' or 'test'
- FEATURE: 'wasserstein_distance', 'rel_entr' or 'jensenshannon'
- FEATURE_SELECTION: 'min' or 'max'
- SEQUENCE_LENGTH: any sequence length used in the feature generation process
  
After gathering all required data paths, the files are opened in a loop and their payloads are stored in a list. We also perform sample selection for the final datasets: for every file, only the row with the minimum or maximum value of the selected feature is kept. Once the process is finished, the datasets are saved in CSV format for further data exploration and ease of use in the training process.

In [40]:
# Options iterated over by the dataset-generation loop; every combination is exported.
DATASET_TYPE = [
    "test",
    "train",
]
FEATURE = [
    "wasserstein_distance",
    "jensenshannon",
]
FEATURE_SELECTION = [
    "min",
    "max",
]
SEQUENCE_LENGTH = [
    3,
    5,
    10,
]

# Root folder that receives the exported CSV files.
save_directory = "../datasets/"

In [None]:
# Build one "full" CSV and one CSV per (feature, selection) pair for every
# subset and sequence length, from the pickled frames found under the result folders.
for dt in DATASET_TYPE:
    # glob returns matches in arbitrary order; sorting keeps the row order of
    # the exported CSV files reproducible between runs and machines.
    fake_main_paths = sorted(glob.glob(f"../../../DeepFake_Detection/wilddeep_results/fake_{dt}/*/fake/*"))
    real_main_paths = sorted(glob.glob(f"../../../DeepFake_Detection/wilddeep_results/real_{dt}/*/real/*"))
    all_paths = fake_main_paths + real_main_paths

    if not all_paths:
        # Fail early with an actionable message instead of the opaque
        # "No objects to concatenate" that pd.concat would raise further down.
        raise FileNotFoundError(f"No result directories found for the '{dt}' subset")

    # The output folder only depends on the subset, so create it once per subset.
    sub_directory = os.path.join(save_directory, dt)
    os.makedirs(sub_directory, exist_ok=True)

    for seql in SEQUENCE_LENGTH:
        subsets = []
        data_cache = {ft: {fts: [] for fts in FEATURE_SELECTION} for ft in FEATURE}

        for path in all_paths:
            # NOTE(security): pickle.load can execute arbitrary code - only run
            # this cell on files that come from a trusted source.
            with open(os.path.join(path, f"subsequence_data-{seql}.pkl"), 'rb') as f:
                data = pickle.load(f)
            subsets.append(data)

            for ft in FEATURE:
                for fts in FEATURE_SELECTION:
                    if fts == "min":
                        extreme_value = data[ft].min()
                    elif fts == "max":
                        extreme_value = data[ft].max()
                    else:
                        raise ValueError(f"Unsupported FEATURE_SELECTION value: {fts!r}")

                    # Several rows can tie on the extreme value (e.g. the feature equals 0
                    # for more than one subsequence); keep a single row per file.
                    selected = data[data[ft] == extreme_value].drop_duplicates(subset=[ft])
                    data_cache[ft][fts].append(selected.reset_index(drop=True))

        # The index is written as an unnamed first column of each CSV file.
        full_dataset = pd.concat(subsets, axis=0)
        full_dataset.to_csv(f"{sub_directory}/{dt}_length-{seql}_full-dataset.csv")

        for ft in FEATURE:
            for fts in FEATURE_SELECTION:
                final_dataset = pd.concat(data_cache[ft][fts], axis=0)
                final_dataset.to_csv(f"{sub_directory}/{dt}_{ft}-{fts}_length-{seql}_dataset.csv")

## 2. Data exploration
---
In this section we explore the properties of the WildDeepfake dataset based on the available features. We start with a descriptive analysis of each dataset, grouped by sequence length.  
For each sequence length we analyze the values of the available features.

In [3]:
# Full-dataset CSV exports of the train subset, one file per subsequence length.
train_data = [
    "../datasets/train/train_length-3_full-dataset.csv",
    "../datasets/train/train_length-5_full-dataset.csv",
    "../datasets/train/train_length-10_full-dataset.csv",
]

# Same layout for the test subset.
test_data = [
    "../datasets/test/test_length-3_full-dataset.csv",
    "../datasets/test/test_length-5_full-dataset.csv",
    "../datasets/test/test_length-10_full-dataset.csv",
]

# Read every export and stack the frames row-wise into a single frame per subset.
train_df = pd.concat(list(map(pd.read_csv, train_data)), axis=0)
test_df = pd.concat(list(map(pd.read_csv, test_data)), axis=0)

In [4]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,subset,type,video,sequence,first_frame,subsequence_length,wasserstein_distance,jensenshannon
0,0,train,fake,1,101,643,3,0.000141,0.051867
1,1,train,fake,1,101,644,3,0.000132,0.039173
2,2,train,fake,1,101,645,3,0.000125,0.034568
3,3,train,fake,1,101,646,3,0.000163,0.048897
4,4,train,fake,1,101,647,3,0.0002,0.057537


In [5]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,subset,type,video,sequence,first_frame,subsequence_length,wasserstein_distance,jensenshannon
0,0,test,fake,1,1,32,3,0.000451,0.105726
1,1,test,fake,1,1,33,3,0.000477,0.116158
2,2,test,fake,1,1,34,3,0.000434,0.11765
3,3,test,fake,1,1,35,3,0.000431,0.114568
4,4,test,fake,1,1,36,3,0.000323,0.094099


In [6]:
# The CSV files carry the exported index as an "Unnamed: 0" column. Drop it by
# name (ignoring its absence) so that re-running this cell is a no-op instead of
# removing the next real column each time.
train_df = train_df.drop(columns=["Unnamed: 0"], errors="ignore").reset_index(drop=True)
test_df = test_df.drop(columns=["Unnamed: 0"], errors="ignore").reset_index(drop=True)

In [7]:
train_df

Unnamed: 0,subset,type,video,sequence,first_frame,subsequence_length,wasserstein_distance,jensenshannon
0,train,fake,1,101,643,3,0.000141,0.051867
1,train,fake,1,101,644,3,0.000132,0.039173
2,train,fake,1,101,645,3,0.000125,0.034568
3,train,fake,1,101,646,3,0.000163,0.048897
4,train,fake,1,101,647,3,0.000200,0.057537
...,...,...,...,...,...,...,...,...
2843845,train,real,99,96,1297,10,0.000088,0.028989
2843846,train,real,99,96,1298,10,0.000088,0.029302
2843847,train,real,99,96,1299,10,0.000088,0.029307
2843848,train,real,99,96,1300,10,0.000087,0.029048


In [8]:
test_df

Unnamed: 0,subset,type,video,sequence,first_frame,subsequence_length,wasserstein_distance,jensenshannon
0,test,fake,1,1,32,3,0.000451,0.105726
1,test,fake,1,1,33,3,0.000477,0.116158
2,test,fake,1,1,34,3,0.000434,0.117650
3,test,fake,1,1,35,3,0.000431,0.114568
4,test,fake,1,1,36,3,0.000323,0.094099
...,...,...,...,...,...,...,...,...
484891,test,real,93,96,1883,10,0.000278,0.065789
484892,test,real,93,96,1884,10,0.000256,0.061115
484893,test,real,93,96,1885,10,0.000274,0.062869
484894,test,real,93,96,1886,10,0.000283,0.067241


In [79]:
from functools import reduce
from typing import Union

def generate_statistics(df: pd.DataFrame, feature: str, caption: Union[str, None]=None) -> "pd.io.formats.style.Styler":
    """Summarise ``feature`` separately for every subsequence length.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``subsequence_length`` column, the ``feature`` column and,
        when no ``caption`` is given, a ``subset`` column.
    feature : str
        Name of the column to describe.
    caption : str, optional
        Caption of the styled table. When omitted (or empty), a default caption
        is built from ``feature`` and the first value of ``df['subset']``.

    Returns
    -------
    pandas Styler
        Styled table with one ``{feature}_n{length}`` column per distinct
        subsequence length and the ``describe()`` statistics as rows.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if df.empty:
        raise ValueError("Cannot generate statistics for an empty DataFrame")

    lengths = np.unique(df["subsequence_length"])
    descriptions = [df.loc[df["subsequence_length"] == u, feature].describe() for u in lengths]

    # Place the per-length summaries side by side. Chaining pd.merge on Series that
    # share a name produces clashing suffixed columns beyond three lengths.
    final_df = pd.concat(descriptions, axis=1)
    final_df.columns = [f"{feature}_n{u}" for u in lengths]

    if not caption:
        # Positional access: the index is not guaranteed to contain the label 0.
        caption = f"Statistics for {feature} - {df['subset'].iloc[0]} subset"

    return final_df.style.set_caption(caption)

In [82]:
# Per-length summary tables of both features, computed for the train and the test subset.
train_wd_stats, test_wd_stats = (
    generate_statistics(frame, "wasserstein_distance") for frame in (train_df, test_df)
)

train_js_stats, test_js_stats = (
    generate_statistics(frame, "jensenshannon") for frame in (train_df, test_df)
)

In [83]:
train_wd_stats

Unnamed: 0,wasserstein_distance_n3,wasserstein_distance_n5,wasserstein_distance_n10
count,967468.0,954456.0,921926.0
mean,0.000151,0.00015,0.000149
std,9.4e-05,8.3e-05,7.4e-05
min,0.0,0.0,0.0
25%,9.6e-05,9.8e-05,0.0001
50%,0.000122,0.000124,0.000126
75%,0.000173,0.000174,0.000175
max,0.003485,0.001796,0.001072


In [84]:
test_wd_stats

Unnamed: 0,wasserstein_distance_n3,wasserstein_distance_n5,wasserstein_distance_n10
count,164050.0,162438.0,158408.0
mean,0.000156,0.000155,0.000154
std,0.000101,8.9e-05,7.8e-05
min,0.0,0.0,0.0
25%,9.9e-05,0.000102,0.000105
50%,0.000127,0.000129,0.000131
75%,0.000178,0.000179,0.000179
max,0.002775,0.00188,0.001048


In [85]:
train_js_stats

Unnamed: 0,jensenshannon_n3,jensenshannon_n5,jensenshannon_n10
count,967452.0,954424.0,921855.0
mean,0.048543,0.04838,0.048063
std,0.024829,0.022089,0.019424
min,0.0,0.0,0.0
25%,0.03483,0.035574,0.036137
50%,0.042034,0.042385,0.042834
75%,0.054605,0.054428,0.054437
max,0.631399,0.570817,0.521144


In [86]:
test_js_stats

Unnamed: 0,jensenshannon_n3,jensenshannon_n5,jensenshannon_n10
count,164048.0,162434.0,158399.0
mean,0.049007,0.048767,0.048406
std,0.025928,0.022987,0.020097
min,0.0,0.0,0.0
25%,0.034446,0.035269,0.035989
50%,0.042412,0.042962,0.043655
75%,0.055509,0.055376,0.055412
max,0.620712,0.521784,0.380797


In [15]:
import plotly.express as px

In [16]:
# Distribution of the Wasserstein distance for train subsequences of length 3.
fig = px.histogram(
    train_df[train_df['subsequence_length'] == 3],
    x="wasserstein_distance",
    title="Wasserstein distance - train subset, subsequence length 3",
)

In [None]:
fig.show()

In [18]:
# Distribution of the Jensen-Shannon feature for train subsequences of length 3.
fig = px.histogram(
    train_df[train_df['subsequence_length'] == 3],
    x="jensenshannon",
    title="Jensen-Shannon - train subset, subsequence length 3",
)

In [None]:
fig.show()

In [24]:
# Relationship between the two features for test subsequences of length 10.
fig = px.scatter(
    test_df[test_df['subsequence_length'] == 10],
    x="wasserstein_distance",
    y="jensenshannon",
    title="Wasserstein distance vs. Jensen-Shannon - test subset, subsequence length 10",
)

In [None]:
fig.show()