### Setup - Preprocess

In [1]:
from config.configuration import RunDetails

# Name of this notebook; passed to notebookToPython() in a later cell.
notebookName = 'audio-deepfake-detection-preprocessing'

# Select which job from the config file this run uses.
# Alternative job id kept for quick switching: 'Compare-Sksmta-eval'
runDetail = RunDetails(
    'config.yml',
    'ASVspoof-2019_2025-03-24-1_large-batch',
)

In [2]:
# Pull the config file name and job id out of the run details so later
# cells can refer to them by short names.
configFilename, runJobId = runDetail.configFilename, runDetail.jobId

In [3]:
import json

import config.configuration as configuration
from notebook_utils import notebookToPython
from preprocessors.abstract_preprocessor import AbstractPreprocessor
from preprocessors.preprocess_persistance import PreprocessPersistance
from preprocessors.preprocessor_factory import PreprocessorFactory
from readers.label_reader import readLabelsWithJob

In [4]:
# Load the configuration file selected in the setup cell.
config = configuration.ConfigLoader(configFilename)

# NOTE(review): presumably exports this notebook as a Python script (the cell
# output shows "Write python file") - confirm in notebook_utils.
notebookToPython(notebookName)

# Resolve the settings for the selected job id.
job = config.getJobConfig(runJobId)

# Echo the resolved job attributes so the run is self-documenting.
prettyJson = json.dumps(job.__dict__, indent=4)
print(f"job: {prettyJson}")

# Guard: this notebook only makes sense for jobs that produce new
# preprocessed data. Any falsy flag (False, None, 0) is rejected; truthiness
# is used instead of an `== False` comparison (PEP 8).
if not job.newPreprocessData:
    raise ValueError("This notebook is meant for persisting preprocessed data. Select a job without a value for 'preprocessed-data' set.")

Write python file
Generating new preprocessed binary file: output/ASVspoof-2019_2025-03-24-1_large-batch_2025-04-11T11-31-39.918233.pp-bin
job: {
    "jobId": "ASVspoof-2019_2025-03-24-1_large-batch",
    "outputFolder": "output",
    "dataPathRootRaw": "$HOMEDRIVE$HOMEPATH/workspace/Deepfake/data/ASVspoof-2019",
    "dataPathRoot": "C:/Users/tubas/workspace/Deepfake/data/ASVspoof-2019",
    "dataPathSuffix": "LA/ASVspoof2019_LA_eval/flac",
    "dataExtension": ".flac",
    "trainingSplitRandomState": 3,
    "labelFilename": "LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.eval.trl.txt",
    "executeToCategoricalForLabels": true,
    "classes": [
        "spoof",
        "bonafide"
    ],
    "numClasses": 2,
    "sampleRate": 16000,
    "duration": 5,
    "numMels": 128,
    "maxTimeSteps": 109,
    "kernelSize": [
        2,
        2
    ],
    "poolSize": [
        2,
        2
    ],
    "optimizer": "adam",
    "loss": "categorical_crossentropy",
    "metrics": [
        "accu

### Preprocess

In [5]:
# Ask the factory for the preprocessor implementation named in the job config.
preproc_factory = PreprocessorFactory()
preprocessor: AbstractPreprocessor = preproc_factory.newPreprocessor(
    job.preprocessor,
)

MelSpectrogramPreprocessor


In [6]:
# Run the preprocessor over the job's data folder and unpack its four results.
# NOTE(review): presumably features, encoded labels, raw labels and the source
# file names, in that order - confirm against the preprocessor implementation.
(
    X_test,
    y_test,
    true_labels,
    source_filenames,
) = preprocessor.extract_features_jobSource(job, job.dataPathSuffix)

Loading C:/Users/tubas/workspace/Deepfake/data/ASVspoof-2019/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.eval.trl.txt...
fullDataPath: C:/Users/tubas/workspace/Deepfake/data/ASVspoof-2019/LA/ASVspoof2019_LA_eval/flac
Loading audio files: 7123
Loading audio files: 14246
Loading audio files: 21369
Loading audio files: 28492
Loading audio files: 35615
Loading audio files: 42738
Loading audio files: 49861
Loading audio files: 56984
Loading audio files: 64107
Loading audio files: 71230
Number of audio files loaded: 71237


### Save and validate

In [7]:
# Bundle the extracted data and write it to the job's preprocessed-data file.
persist = PreprocessPersistance(
    X_test,
    y_test,
    true_labels,
    source_filenames,
)
persist.save(job.preprocessDataFilename)

In [8]:
# Validation: load what was just written and compare it with the data held
# by `persist`, then report the outcome.
# NOTE(review): load() is invoked on the same instance that compare() is later
# called on - confirm load() returns a separate object rather than mutating
# `persist`, otherwise this comparison would be trivially true.
reloaded = persist.load(job.preprocessDataFilename)
if persist.compare(reloaded):
    print(f"Successfully saved preprocessed data: {job.preprocessDataFilename}")
else:
    # Message grammar corrected ("An problem" -> "A problem").
    print(f"A problem occurred while attempting to save preprocessed data: {job.preprocessDataFilename}")


Successfully saved preprocessed data: output/ASVspoof-2019_2025-03-24-1_large-batch_2025-04-11T11-31-39.918233.pp-bin
