In [None]:
import numpy as np

# Debug library, very useful
from icecream import ic

In [2]:
import os

dir_fake = ["../dataset/fake"]
dir_real = ["../dataset/real"]


def list_files(root_dirs):
    """Return the path of every regular file found beneath the given directories.

    Args:
        root_dirs: Iterable of directory paths; each one is walked recursively
            with ``os.walk``.

    Returns:
        A list of paths (walked directory joined with the file name), in the
        order ``os.walk`` yields them. Entries for which ``os.path.isfile`` is
        false are skipped. A directory that does not exist contributes nothing.
    """
    paths = []
    for root_dir in root_dirs:
        for subdir, _, filenames in os.walk(root_dir):
            for filename in filenames:
                # Join once and reuse the result for both the check and the output.
                path = os.path.join(subdir, filename)
                if os.path.isfile(path):
                    paths.append(path)
    return paths


# Collect all file paths of each class and filter invalid files
fake_files = list_files(dir_fake)
real_files = list_files(dir_real)

In [None]:
from ImageForensics import FeatureExtraction

# Value passed to the extractor as its `features` argument.
N = 300
extract = FeatureExtraction(features=N)

# Run the FFT extraction for each class (fake first, then real) and keep only
# the entries that are not None; a None entry marks a file that failed to process.
psd1D_total_fake = [
    psd for psd in extract.multithread_fft(fake_files) if psd is not None
]
psd1D_total_real = [
    psd for psd in extract.multithread_fft(real_files) if psd is not None
]

In [4]:
# Label convention: 1.0 marks a fake sample, 0.0 marks a real one.
label_total_fake = np.full(len(psd1D_total_fake), 1.0)
label_total_real = np.full(len(psd1D_total_real), 0.0)

# Stack both classes, fake first, so that labels[i] describes features[i].
features = np.concatenate([psd1D_total_fake, psd1D_total_real])
labels = np.concatenate([label_total_fake, label_total_real])

In [5]:
# Benford's Law for the first digit: the expected relative frequency of a
# leading digit d in 1..9 is log10(1 + 1/d).
DIGITS = np.arange(start=1, stop=10)
BENFORD = np.log10(np.divide(1, DIGITS) + 1)

In [6]:
def first_significant_digit(value):
    """Return the first non-zero decimal digit of ``value``, or 0 if there is none.

    The digit is read from the textual form of ``abs(value)``, skipping leading
    zeros and the decimal point, so 0.0123 yields 1 (taking only the first
    character would yield 0). The sign is ignored. Zero, nan and inf have no
    leading digit and yield 0, which the 1..9 histogram below leaves uncounted.

    Args:
        value: A real number (Python or NumPy scalar).

    Returns:
        An int in 1..9, or 0 when ``value`` has no significant digit.
    """
    for char in str(abs(value)):
        if char in "123456789":
            return int(char)
        if char in "eE":
            # Reached the exponent without meeting a non-zero mantissa digit;
            # exponent digits must not be mistaken for the leading digit.
            break
    return 0


# Get first significant digit of each value (0 marks "no leading digit").
# NOTE: `features` is rebound from the raw values to their leading digits.
features = [[first_significant_digit(value) for value in array] for array in features]

# Count the occurrences of each first digit; bin edges 1..10 give one bin per
# digit 1..9, so the 0 placeholders are not counted.
first_digits_counts = [
    np.histogram(array, bins=np.arange(1, 11))[0] for array in features
]

In [7]:
import scipy.stats as stats


def test_results(alpha, first_digits_counts=first_digits_counts):
    goodness_of_fit = [
        stats.pearsonr(first_digits_count, BENFORD)
        for first_digits_count in first_digits_counts
    ]

    # calculate True Positive, False Positive, True Negative, False Negative
    results = [
        (1 - alpha >= p_value, labels[i])
        for i, (p_value, _) in enumerate(goodness_of_fit)
    ]

    # fake is 0, real is 1
    TP = sum(is_legitimate and (label == 1) for is_legitimate, label in results)
    FP = sum(is_legitimate and (label == 0) for is_legitimate, label in results)
    TN = sum(not is_legitimate and (label == 0) for is_legitimate, label in results)
    FN = sum(not is_legitimate and (label == 1) for is_legitimate, label in results)

    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1 = 2 * (precision * recall) / (precision + recall)
    accuracy = (TP + TN) / (TP + TN + FP + FN)

    return {
        "TP": TP,
        "FP": FP,
        "TN": TN,
        "FN": FN,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

In [None]:
# Evaluate the detector at three alpha settings (0.01, 0.05 and 0.10). Each
# call is wrapped in ic(), the icecream debug helper imported in the first
# cell, to display the metrics dict returned for that setting.
ic(test_results(alpha=1 / 100))
ic(test_results(alpha=5 / 100))
ic(test_results(alpha=10 / 100))