# Data Inspection and Setup

This notebook handles:
1. Inspecting dataset structure and statistics
2. Finding image-ground truth pairs
3. Splitting dataset into train/val/test sets
4. Organizing processed data in workspace


## Setup and Imports

In [None]:
import json
import sys
from pathlib import Path

# Locate the project root so the helper modules in data_preparation/src can be
# imported. The notebook lives in data_preparation/notebooks/, i.e. two levels
# below the root; when the kernel was started anywhere else, the current
# working directory is taken to be the root.
current_dir = Path.cwd()
if current_dir.name == 'notebooks' and current_dir.parent.name == 'data_preparation':
    project_root = current_dir.parent.parent
else:
    project_root = current_dir

# Register the source directory only once: an unconditional append would add a
# duplicate sys.path entry every time this cell is re-run in the same kernel.
src_dir = str(project_root / 'data_preparation' / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Project-local import; it only resolves after the sys.path setup above.
from data_processor import DatasetProcessor


## Load Configuration

Configuration file contains data paths, split ratios, and processing parameters.


In [5]:
# Parse the JSON configuration file that lives under <project_root>/configs
config_path = project_root / 'configs' / 'data_config.json'
with config_path.open('r') as config_file:
    config = json.load(config_file)

# Echo the parsed settings so the notebook records what this run was configured with
print("Configuration loaded:", json.dumps(config, indent=2), sep="\n")


Configuration loaded:
{
  "data_paths": {
    "raw_data": "data/raw",
    "processed_data": "data/processed",
    "train_data": "data/processed/train",
    "val_data": "data/processed/val",
    "test_data": "data/processed/test"
  },
  "split_ratios": {
    "train": 0.7,
    "val": 0.15,
    "test": 0.15
  },
  "random_state": 42,
  "preprocessing": {
    "target_size": [
      512,
      512
    ],
    "resize_mode": "resize",
    "target_format": "png",
    "normalize": true,
    "grayscale": false
  },
  "augmentation": {
    "enabled": true,
    "rotation_range": 15,
    "horizontal_flip": true,
    "vertical_flip": false,
    "brightness_range": [
      0.8,
      1.2
    ],
    "contrast_range": [
      0.8,
      1.2
    ]
  },
  "datasets": {
    "CASIA2": {
      "enabled": true,
      "only_copy_move": true
    },
    "comofod_small": {
      "enabled": true,
      "use_all_variants": true
    },
    "COVERAGE": {
      "enabled": true
    },
    "MICC": {
      "enabled": tr

## Setup Data Directories

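Create the directory tree for processed data: `train`, `val`, and `test`, each with `images` and `masks` subdirectories. The raw data directory is not created here; its presence is checked in the next section.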


In [6]:
# Build the processed-data tree: <processed>/<split>/<images|masks>
processed_dir = project_root / config['data_paths']['processed_data']
processed_dir.mkdir(parents=True, exist_ok=True)

# Nested loops visit the folders in the order train, val, test and, within
# each split, images before masks.
for split_name in ('train', 'val', 'test'):
    for content_kind in ('images', 'masks'):
        (processed_dir / split_name / content_kind).mkdir(parents=True, exist_ok=True)

print("Data directories created successfully")


Data directories created successfully


## Verify Raw Data Location

Ensure the raw datasets are placed in the configured raw data directory (`data_paths.raw_data` in the configuration file).


In [7]:
# Check that the raw data directory exists and contains dataset folders
raw_data_path = project_root / config['data_paths']['raw_data']

if not raw_data_path.exists():
    print(f"Warning: Raw data directory '{raw_data_path}' does not exist.")
    print(f"Please place your raw dataset in: {raw_data_path.absolute()}")
else:
    # Every sub-directory counts as one dataset; __pycache__ is skipped.
    # Path.iterdir() yields entries in arbitrary order, so sort by name to get
    # the same listing on every run and on every machine.
    datasets = sorted(
        (d for d in raw_data_path.iterdir() if d.is_dir() and d.name != '__pycache__'),
        key=lambda d: d.name,
    )
    if datasets:
        print(f"Found {len(datasets)} datasets in raw data directory:")
        for dataset in datasets:
            print(f"  - {dataset.name}")
        print(f"Location: {raw_data_path.absolute()}")
    else:
        print(f"Warning: Raw data directory '{raw_data_path}' is empty.")
        print(f"Please place your dataset files in: {raw_data_path.absolute()}")


Found 5 datasets in raw data directory:
  - CASIA2
  - comofod_small
  - FAU
  - MICC
  - COVERAGE
Location: /Users/osamahshamsan/Desktop/Master/CV/TRACE/data/raw


## Inspect Dataset

Analyze dataset structure, count image-ground truth pairs, and categorize by manipulation type.


In [8]:
# Resolve the input and output locations from the configuration
data_paths = config['data_paths']
raw_data_path = project_root / data_paths['raw_data']
processed_data_path = project_root / data_paths['processed_data']

# Build the processor; the directories are handed over as plain strings
processor = DatasetProcessor(
    config=config,
    raw_data_dir=str(raw_data_path),
    processed_data_dir=str(processed_data_path),
)

# Report the locations first, then the settings the processor exposes
print(
    "Dataset processor initialized",
    f"Raw data: {raw_data_path}",
    f"Processed data: {processed_data_path}",
    sep="\n",
)
print(f"Target size: {processor.target_size}")
print(f"Target format: {processor.target_format}")


Dataset processor initialized
Raw data: /Users/osamahshamsan/Desktop/Master/CV/TRACE/data/raw
Processed data: /Users/osamahshamsan/Desktop/Master/CV/TRACE/data/processed
Target size: (512, 512)
Target format: png


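## Get Image-Ground Truth Pairs and Process Datasets

Find all pairs of tampered images and their corresponding ground truth masks, then run the full processing pipeline on them: resize the images, convert the masks to binary format, rename the files with consistent numbering, split them into train/val/test sets, and copy the results to the processed directory.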


In [9]:
# Announce what the pipeline is about to do before starting the run.
# NOTE: in the recorded run the log timestamps span roughly two hours.
announcement = (
    "Starting dataset processing...",
    "This will:",
    "  1. Find all image-mask pairs from all datasets",
    "  2. Resize images to target size (if specified)",
    "  3. Convert masks to binary format",
    "  4. Rename files with consistent numbering",
    "  5. Split into train/val/test sets",
    "  6. Copy to processed directory",
    "\nProcessing...",
)
for announcement_line in announcement:
    print(announcement_line)

# Run the whole pipeline; the returned mapping carries the counters and error list
stats = processor.process_all_datasets()

rule = "="*60
print("\n" + rule)
print("PROCESSING COMPLETE")
print(rule)
print(f"Total pairs found: {stats['total_pairs']}")
print(f"Successfully processed: {stats['successful_pairs']}")
print(f"Failed: {stats['failed_pairs']}")

# Show only the first few errors so the cell output stays short
reported_errors = stats['errors']
if reported_errors:
    print(f"\nErrors encountered: {len(reported_errors)}")
    print("First 5 errors:")
    for error in reported_errors[:5]:
        print(f"  - {error}")


2025-11-16 18:44:20,899 - INFO - Starting dataset processing pipeline
2025-11-16 18:44:20,899 - INFO - Processing CASIA2...
2025-11-16 18:44:21,038 - INFO - CASIA2: Found 3216 pairs
2025-11-16 18:44:21,038 - INFO - Processing comofod_small...


Starting dataset processing...
This will:
  1. Find all image-mask pairs from all datasets
  2. Resize images to target size (if specified)
  3. Convert masks to binary format
  4. Rename files with consistent numbering
  5. Split into train/val/test sets
  6. Copy to processed directory

Processing...


2025-11-16 18:44:21,158 - INFO - comofod_small: Found 5000 pairs
2025-11-16 18:44:21,158 - INFO - Processing COVERAGE...
2025-11-16 18:44:21,161 - INFO - COVERAGE: Found 100 pairs
2025-11-16 18:44:21,161 - INFO - Processing MICC...
2025-11-16 18:44:21,187 - INFO - MICC: Found 0 pairs
2025-11-16 18:44:21,187 - INFO - Processing FAU...
2025-11-16 18:44:21,204 - INFO - FAU: Found 54 pairs
2025-11-16 18:44:21,204 - INFO - Total pairs found: 8370
2025-11-16 18:44:21,207 - INFO - Split: Train=5858, Val=1256, Test=1256
2025-11-16 20:11:20,813 - INFO - train: 5858 successful, 0 failed
2025-11-16 20:29:30,152 - INFO - val: 1256 successful, 0 failed
2025-11-16 20:46:09,842 - INFO - test: 1256 successful, 0 failed
2025-11-16 20:46:09,843 - INFO - Processing report saved to /Users/osamahshamsan/Desktop/Master/CV/TRACE/data/processed/processing_report.json



PROCESSING COMPLETE
Total pairs found: 8370
Successfully processed: 8370
Failed: 0


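## Verify Dataset Split

The split into training, validation, and test sets according to the configured ratios is performed by the processing step above. This cell verifies the result by counting the images and masks in each split and checking that the counts match.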


In [10]:
# Locate the three split directories inside the processed-data folder
train_dir, val_dir, test_dir = (
    processed_data_path / split_name for split_name in ('train', 'val', 'test')
)


def _count_entries(folder):
    """Return the number of entries directly inside `folder` (non-recursive)."""
    return sum(1 for _ in folder.glob('*'))


def _match_mark(image_count, mask_count):
    """Return a check mark when both counts are equal, a cross otherwise."""
    if image_count == mask_count:
        return '✓'
    return '✗'


# Count images and masks per split
train_images = _count_entries(train_dir / 'images')
train_masks = _count_entries(train_dir / 'masks')
val_images = _count_entries(val_dir / 'images')
val_masks = _count_entries(val_dir / 'masks')
test_images = _count_entries(test_dir / 'images')
test_masks = _count_entries(test_dir / 'masks')

print("\n" + "="*60)
print("PROCESSED DATA VERIFICATION")
print("="*60)

# One report section per split; every heading after the first starts with a blank line
split_report = (
    ("Training set:", train_images, train_masks),
    ("\nValidation set:", val_images, val_masks),
    ("\nTest set:", test_images, test_masks),
)
for heading, image_count, mask_count in split_report:
    print(heading)
    print(f"  Images: {image_count}")
    print(f"  Masks: {mask_count}")
    print(f"  Match: {_match_mark(image_count, mask_count)}")

total = train_images + val_images + test_images
print(f"\nTotal processed images: {total}")



PROCESSED DATA VERIFICATION
Training set:
  Images: 5858
  Masks: 5858
  Match: ✓

Validation set:
  Images: 1256
  Masks: 1256
  Match: ✓

Test set:
  Images: 1256
  Masks: 1256
  Match: ✓

Total processed images: 8370


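## Inspect Processed Data in Workspace

The split datasets are organized into separate directories, each with `images` and `masks` subdirectories. This cell lists a few sample training image-mask pairs and prints where the processed data and the processing report are stored.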


In [11]:
# Show a handful of processed files and summarize where everything was written
divider = "="*60

print("\n" + divider)
print("SAMPLE FILES")
print(divider)

train_img_dir = train_dir / 'images'
train_mask_dir = train_dir / 'masks'

# Take the first five entries of each folder in sorted order; the two lists
# are paired up by position below, not by matching file names.
sample_images = sorted(train_img_dir.glob('*'))[:5]
sample_masks = sorted(train_mask_dir.glob('*'))[:5]

print("\nFirst 5 training image-mask pairs:")
for image_file, mask_file in zip(sample_images, sample_masks):
    print(f"  {image_file.name} <-> {mask_file.name}")

print("\n" + divider)
print("Data processing complete!")
print(divider)

# Output locations of the three splits and of the JSON processing report
print("\nProcessed data is available at:")
print(f"  Training: {train_dir}")
print(f"  Validation: {val_dir}")
print(f"  Test: {test_dir}")
print("\nProcessing report saved at:")
report_path = processed_data_path / 'processing_report.json'
print(f"  {report_path}")



SAMPLE FILES

First 5 training image-mask pairs:
  image_000000.png <-> mask_000000.png
  image_000001.png <-> mask_000001.png
  image_000002.png <-> mask_000002.png
  image_000003.png <-> mask_000003.png
  image_000004.png <-> mask_000004.png

Data processing complete!

Processed data is available at:
  Training: /Users/osamahshamsan/Desktop/Master/CV/TRACE/data/processed/train
  Validation: /Users/osamahshamsan/Desktop/Master/CV/TRACE/data/processed/val
  Test: /Users/osamahshamsan/Desktop/Master/CV/TRACE/data/processed/test

Processing report saved at:
  /Users/osamahshamsan/Desktop/Master/CV/TRACE/data/processed/processing_report.json


## Summary

Dataset processing complete. Processed data is organized in:
- `data/processed/train/` - Training images and masks
- `data/processed/val/` - Validation images and masks
- `data/processed/test/` - Test images and masks

Raw data remains in `data/raw/`. Processed data is available in `data/processed/` for experiments.


## FAU Dataset Diagnostic

Analyze which FAU pairs were found and which were missed to avoid duplication when fixing pairing logic.


In [12]:
from fau_diagnostic import diagnose_fau_dataset, print_fau_diagnostic_report

# Run the diagnostic on the FAU dataset. The raw data location is taken from
# the loaded configuration (as in the processing cells) instead of a
# hard-coded 'data/raw', so this cell follows the configuration if the raw
# data directory is ever moved.
fau_raw_path = project_root / config['data_paths']['raw_data'] / 'FAU'
fau_diagnostic = diagnose_fau_dataset(fau_raw_path)

# Print the detailed report for the collected diagnostic data
print_fau_diagnostic_report(fau_diagnostic)


FAU DATASET DIAGNOSTIC REPORT

SUMMARY:
  Found pairs: 105
  Images without masks: 4141
  Masks without images: 5704
  Total images found: 4246
  Total masks found: 5852

FOLDER STATISTICS:
  benchmark_data/C1_panasonic:
    Images: 96
    Masks: 15
    Pairs: 16
    Match rate: 16.7%
  benchmark_data/C2_canon:
    Images: 108
    Masks: 12
    Pairs: 12
    Match rate: 11.1%
  benchmark_data/C3_nikon:
    Images: 104
    Masks: 22
    Pairs: 12
    Match rate: 11.5%
  benchmark_data/C4_flickr:
    Images: 146
    Masks: 12
    Pairs: 14
    Match rate: 9.6%
  jpeg/1:
    Images: 18
    Masks: 27
    Pairs: 0
    Match rate: 0.0%
  jpeg/10:
    Images: 18
    Masks: 27
    Pairs: 0
    Match rate: 0.0%
  jpeg/11:
    Images: 18
    Masks: 27
    Pairs: 0
    Match rate: 0.0%
  jpeg/12:
    Images: 18
    Masks: 27
    Pairs: 0
    Match rate: 0.0%
  jpeg/13:
    Images: 18
    Masks: 27
    Pairs: 0
    Match rate: 0.0%
  jpeg/14:
    Images: 18
    Masks: 27
    Pairs: 0
    Match rat

In [13]:
# Persist the FAU diagnostic results as JSON for later reference
import json

fau_found_pairs_file = project_root / 'data' / 'processed' / 'fau_found_pairs.json'
with fau_found_pairs_file.open('w') as report_file:
    # The per-folder and per-pattern statistics are converted to plain dicts
    # before dumping; for the unmatched files only the counts are stored.
    report_payload = {
        'found_pairs': fau_diagnostic['found_pairs'],
        'folder_stats': dict(fau_diagnostic['folder_stats']),
        'pattern_stats': dict(fau_diagnostic['pattern_stats']),
        'missing_images_count': len(fau_diagnostic['missing_images']),
        'missing_masks_count': len(fau_diagnostic['missing_masks']),
    }
    json.dump(report_payload, report_file, indent=2)

print(f"\nFound pairs saved to: {fau_found_pairs_file}")
print("\nUse this file to check which pairs were already processed.")
found_total = len(fau_diagnostic['found_pairs'])
print(f"   When fixing FAU pairing logic, exclude these {found_total} pairs to avoid duplication.")

# Short recap of the three headline numbers
print("\nSummary:")
print(f"  - Found pairs: {found_total}")
unmatched_images = len(fau_diagnostic['missing_images'])
print(f"  - Images without masks: {unmatched_images}")
unmatched_masks = len(fau_diagnostic['missing_masks'])
print(f"  - Masks without images: {unmatched_masks}")



Found pairs saved to: /Users/osamahshamsan/Desktop/Master/CV/TRACE/data/processed/fau_found_pairs.json

Use this file to check which pairs were already processed.
   When fixing FAU pairing logic, exclude these 105 pairs to avoid duplication.

Summary:
  - Found pairs: 105
  - Images without masks: 4141
  - Masks without images: 5704


In [None]:
from add_fau_pairs import add_new_fau_pairs

# Take the raw and processed locations from the loaded configuration, as the
# processing cells do, instead of hard-coding 'data/raw' and 'data/processed'.
# A hard-coded path would keep pointing at the old folders if the
# configuration changed.
# NOTE(review): the return value is presumably the number of pairs added
# (judging by the message below) - confirm against add_fau_pairs.
new_count = add_new_fau_pairs(
    raw_data_dir=str(project_root / config['data_paths']['raw_data']),
    processed_data_dir=str(project_root / config['data_paths']['processed_data']),
    config=config
)
print(f"Added {new_count} new FAU pairs")

In [14]:
from dataset_coverage_check import check_all_datasets, print_coverage_report

# Compare what was processed against the raw datasets. The directories come
# from the loaded configuration (as in the processing cells) instead of
# hard-coded 'data/raw' and 'data/processed', so the coverage check always
# inspects the same folders the pipeline used.
results = check_all_datasets(
    raw_data_dir=str(project_root / config['data_paths']['raw_data']),
    processed_data_dir=str(project_root / config['data_paths']['processed_data']),
    config=config
)

# Print the per-dataset coverage report
print_coverage_report(results)

DATASET PROCESSING COVERAGE REPORT

✗ CASIA2:
  Found: 3216 pairs
  Expected: 3274 pairs
  Missing: 58 pairs
  Coverage: 98.2%
  Status: incomplete

✓ COVERAGE:
  Found: 100 pairs
  Expected: 100 pairs
  Missing: 0 pairs
  Coverage: 100.0%
  Status: complete

✗ FAU:
  Found: 54 pairs
  Expected: 105 pairs
  Missing: 51 pairs
  Coverage: 51.4%
  Status: incomplete

✗ MICC:
  Found: 0 pairs
  Expected: 160 pairs
  Missing: 160 pairs
  Coverage: 0.0%
  Status: incomplete

✓ comofod_small:
  Found: 5000 pairs
  Expected: 5000 pairs
  Missing: 0 pairs
  Coverage: 100.0%
  Status: complete

SUMMARY:
  Total found: 8370 pairs
  Total expected: 8639 pairs
  Total missing: 269 pairs
  Overall coverage: 96.9%

DATASETS NEEDING ATTENTION:
  - CASIA2: Missing 58 pairs (98.2% coverage)
  - MICC: Missing 160 pairs (0.0% coverage)
  - FAU: Missing 51 pairs (51.4% coverage)
