In [None]:
import numpy as np

# Debug library, very useful
from icecream import ic

In [None]:
import os

# Root directories holding the two image classes.
dir_fake = ["../dataset/fake"]
dir_real = ["../dataset/real"]


def collect_files(roots: list[str]) -> list[str]:
    """Return the path of every regular file found beneath ``roots``.

    Each root is walked recursively with ``os.walk``. Entries rejected by
    ``os.path.isfile`` are skipped. A root that does not exist contributes
    nothing, because ``os.walk`` ignores listing errors by default.

    Args:
        roots: Directories to search.

    Returns:
        File paths in the order ``os.walk`` produced them.
    """
    return [
        path
        for root in roots
        for subdir, _, files in os.walk(root)
        # Join once and reuse the result for both the check and the output.
        for path in (os.path.join(subdir, file) for file in files)
        if os.path.isfile(path)
    ]


# One shared helper for both classes instead of two copy-pasted comprehensions.
fake_files = collect_files(dir_fake)
real_files = collect_files(dir_real)

In [None]:
from ImageForensics import FeatureExtraction

# Value handed to FeatureExtraction as `features`
# (presumably the length of each extracted feature vector - TODO confirm).
N = 300
extract = FeatureExtraction(features=N)

# Run the extractor over each file list and keep only the non-None results
# (None presumably marks a file that failed to process).
psd1D_total_fake = [
    psd for psd in extract.multithread_fft(fake_files) if psd is not None
]
psd1D_total_real = [
    psd for psd in extract.multithread_fft(real_files) if psd is not None
]

In [None]:
# Boolean class labels: True marks a fake sample, False a real one.
label_total_fake = np.full(shape=len(psd1D_total_fake), fill_value=True)
label_total_real = np.full(shape=len(psd1D_total_real), fill_value=False)

# Stack both classes, fake first, so features[i] lines up with labels[i].
features = np.concatenate([psd1D_total_fake, psd1D_total_real])
labels = np.concatenate([label_total_fake, label_total_real])

In [None]:
# Benford's law: a leading digit d in 1..9 is expected with
# probability log10(1 + 1/d).
DIGITS = np.arange(9) + 1
BENFORD = np.log10(1.0 + 1.0 / DIGITS)

In [None]:
def get_first_digit(value: float) -> int:
    """Return the first significant (non-zero) decimal digit of ``value``.

    The digit is read from ``str(value)`` after discarding the sign, any
    leading zeros, the decimal point and the exponent, so ``0.0042`` and
    ``-4.2e-3`` both give 4.

    Args:
        value: Number to inspect.

    Returns:
        A digit in 1..9, or 0 when the value has no significant digit
        (zero, NaN or infinity).
    """
    # Drop the exponent of scientific notation ("1.5e-07" -> "1.5") so its
    # digits can never be mistaken for the leading digit.
    mantissa = str(value).lower().partition("e")[0]
    # Strip sign, leading zeros and the decimal point: "-0.052" -> "52".
    significant = mantissa.lstrip("+-0.")
    if significant and significant[0] in "123456789":
        return int(significant[0])
    # Empty (the value was zero) or non-numeric text such as "nan" / "inf".
    return 0


def get_digit_counts(array: list[int]) -> list[int]:
    """Count how often each digit listed in DIGITS occurs in ``array``.

    Args:
        array: Digits to tally; must offer a list-style ``count`` method.

    Returns:
        One count per entry of DIGITS, in the same order as DIGITS.
    """
    counts = []
    for digit in DIGITS:
        counts.append(array.count(digit))
    return counts


# Leading digit of every value, one inner list per row of `features`
first_digits = [list(map(get_first_digit, array)) for array in features]

# Per-row tally of how often each digit in DIGITS appears
first_digits_counts = list(map(get_digit_counts, first_digits))


In [None]:
import scipy.stats as stats
import sklearn.metrics as metrics


def test_results(
    alpha: float,
    first_digits_counts: list[list[int]] = first_digits_counts,
) -> dict:
    """Classify each sample against Benford's law and score the predictions.

    For every row of digit counts the Pearson correlation with BENFORD is
    computed. A sample is predicted fake (True) when that correlation is at
    most ``1 - alpha``. Predictions are scored against the module-level
    ``labels``.

    Args:
        alpha: Threshold offset; samples whose correlation is
            ``<= 1 - alpha`` are flagged as fake.
        first_digits_counts: One list of per-digit counts per sample, in the
            same order as ``labels``. The default is bound when this
            function is defined, so re-run this definition after
            recomputing the global of the same name.

    Returns:
        Dict with the confusion-matrix cells ("True Positive",
        "False Positive", "True Negative", "False Negative") plus
        "Precision", "Recall", "F1" and "Accuracy" from sklearn.metrics.
    """
    # pearsonr returns (correlation coefficient, p-value) for each sample.
    goodness_of_fit = [
        stats.pearsonr(first_digits_count, BENFORD)
        for first_digits_count in first_digits_counts
    ]

    # The decision uses the correlation coefficient (first element), not the
    # p-value. A NaN coefficient compares False, so such a sample is
    # predicted real.
    results = [1 - alpha >= correlation for correlation, _ in goodness_of_fit]

    # Labels: fake is True (positive class), real is False (negative class).
    # Naming both classes keeps the matrix 2x2 even when one class is absent
    # from the data, so the four-way unpacking below cannot fail.
    TN, FP, FN, TP = metrics.confusion_matrix(
        labels, results, labels=[False, True]
    ).ravel()

    return {
        "True Positive": TP,
        "False Positive": FP,
        "True Negative": TN,
        "False Negative": FN,
        "Precision": metrics.precision_score(labels, results),
        "Recall": metrics.recall_score(labels, results),
        "F1": metrics.f1_score(labels, results),
        "Accuracy": metrics.accuracy_score(labels, results),
    }

In [None]:
# Display the results table as a pandas DataFrame
import pandas as pd

# Threshold offsets to evaluate.
ALPHA = [0.01, 0.05, 0.1]

# One metrics dict per alpha, in the same order as ALPHA.
results = list(map(test_results, ALPHA))

# One row per alpha; the alpha symbol is used as the name of the column axis.
df = pd.DataFrame.from_records(results, index=ALPHA)
df.columns.name = "\N{GREEK SMALL LETTER ALPHA}"
df