In [1]:
import os

# Define your dataset paths
output_real_faces = "E:/dataset_1/test_set/real"
output_fake_faces = "E:/dataset_1/test_set/fake"
output_optical_flow = "E:/dataset_1/test_set/optical_flow"
output_edges = "E:/dataset_1/test_set/edges"

# Check number of files in each folder
real_face_files = len([f for f in os.listdir(output_real_faces) if f.endswith(".jpg")])
fake_face_files = len([f for f in os.listdir(output_fake_faces) if f.endswith(".jpg")])
flow_files = len([f for f in os.listdir(output_optical_flow) if f.endswith(".jpg")])
edge_files = len([f for f in os.listdir(output_edges) if f.endswith(".jpg")])

print(f"‚úÖ Real images: {real_face_files}")
print(f"‚úÖ Fake images: {fake_face_files}")
print(f"‚úÖ Optical Flow images: {flow_files}")
print(f"‚úÖ Edge images: {edge_files}")


‚úÖ Real images: 28470
‚úÖ Fake images: 84574
‚úÖ Optical Flow images: 113044
‚úÖ Edge images: 113044


In [2]:
import os

# Define dataset paths
output_real_faces = "E:/dataset_1/test_set/real"
output_fake_faces = "E:/dataset_1/test_set/fake"
output_optical_flow = "E:/dataset_1/test_set/optical_flow"
output_edges = "E:/dataset_1/test_set/edges"

# Get face filenames (keep full names without modification)
real_face_files = {f for f in os.listdir(output_real_faces) if f.endswith(".jpg")}
# fake_face_files = {f for f in os.listdir(output_fake_faces) if f.endswith(".jpg")}

# Combine real and fake face filenames
total_faces = real_face_files

# Get optical flow and edge filenames (keep full names)
optical_flows = {f.replace("_flow.jpg", ".jpg") for f in os.listdir(output_optical_flow) if f.endswith(".jpg")}
edge_maps = {f.replace("_edges.jpg", ".jpg") for f in os.listdir(output_edges) if f.endswith(".jpg")}

# Find missing optical flow and edge maps
missing_flows = total_faces - optical_flows
missing_edges = total_faces - edge_maps

# Print dataset counts
print(f"‚úÖ Real face images: {len(real_face_files)}")
# print(f"‚úÖ Fake face images: {len(fake_face_files)}")
print(f"‚úÖ Optical Flow images: {len(optical_flows)} (Expected: {len(total_faces)})")
print(f"‚úÖ Edge images: {len(edge_maps)} (Expected: {len(total_faces)})")

# Print missing files (if any)
if missing_flows:
    print(f"\n‚ùå Missing Optical Flow for {len(missing_flows)} faces:")
    print("\n".join(list(missing_flows)[:5]))  # Show first 5 missing

if missing_edges:
    print(f"\n‚ùå Missing Edge Maps for {len(missing_edges)} faces:")
    print("\n".join(list(missing_edges)[:5]))  # Show first 5 missing


‚úÖ Real face images: 28470
‚úÖ Optical Flow images: 113044 (Expected: 28470)
‚úÖ Edge images: 113044 (Expected: 28470)


In [None]:
import os
import random
import numpy as np
import cv2
from tqdm import tqdm

# Define dataset paths
output_real_faces = "E:/dataset_1/test_set/real"
output_optical_flow = "E:/dataset_1/test_set/optical_flow"
output_edges = "E:/dataset_1/test_set/edges"

output_fake_faces = "E:/dataset_1/test_set/fake"
output_fake_optical_flow = "E:/dataset_1/test_set/optical_flow"
output_fake_edges = "E:/dataset_1/test_set/edges"

save_path = "E:/dataset_1/preprocessed_surprise.npz"
image_shape = (299, 299, 9)
batch_size = 1000  # Number of images to load/write in one batch

def get_valid_files(face_dir, flow_dir, edge_dir, max_samples=None):
    """Get a shuffled list of valid filenames (face, flow, edges) all exist."""
    face_files = [f for f in os.listdir(face_dir) if f.endswith(".jpg")]
    random.shuffle(face_files)
    if max_samples is not None:
        face_files = face_files[:max_samples]
    return face_files

def process_and_write_to_memmap(face_dir, flow_dir, edge_dir, face_files, 
                                x_memmap, y_memmap, label, start_idx=0, 
                                batch_size=1000):
    """
    Loads images in batches, writes directly to memmap to avoid storing in RAM.
    Returns the number of images actually written.
    """
    total_written = 0

    # Create a TQDM progress bar for the entire list of files
    pbar = tqdm(total=len(face_files), desc=f"Processing {face_dir}", unit="img")

    for i in range(0, len(face_files), batch_size):
        batch_files = face_files[i:i + batch_size]

        # We'll collect a batch in memory just for the current chunk
        batch_x = []
        batch_y = []

        for face_file in batch_files:
            base_name = face_file.replace(".jpg", "")

            face_path = os.path.join(face_dir, face_file)
            flow_path = os.path.join(flow_dir, base_name + "_flow.jpg")
            edge_path = os.path.join(edge_dir, base_name + "_edges.jpg")

            if (os.path.exists(face_path) and 
                os.path.exists(flow_path) and 
                os.path.exists(edge_path)):

                try:
                    face_img = cv2.imread(face_path)
                    flow_img = cv2.imread(flow_path)
                    edge_img = cv2.imread(edge_path)

                    # If any read fails, skip it
                    if face_img is None or flow_img is None or edge_img is None:
                        pbar.update(1)
                        continue

                    face_img = cv2.resize(face_img, (299, 299))
                    flow_img = cv2.resize(flow_img, (299, 299))
                    edge_img = cv2.resize(edge_img, (299, 299))

                    # Normalize
                    face_img = face_img.astype(np.float32) / 255.0
                    flow_img = flow_img.astype(np.float32) / 255.0
                    edge_img = edge_img.astype(np.float32) / 255.0

                    # Concatenate into 9 channels
                    combined_input = np.concatenate((face_img, flow_img, edge_img), axis=-1)
                    
                    batch_x.append(combined_input)
                    batch_y.append(label)

                except Exception as e:
                    print(f"‚ö†Ô∏è Error processing {face_file}: {e}")

            # Update TQDM for *every* file we attempt
            pbar.update(1)

        # Write the current batch to memmap
        if batch_x:
            current_batch_size = len(batch_x)
            x_memmap[start_idx : start_idx + current_batch_size] = batch_x
            y_memmap[start_idx : start_idx + current_batch_size] = batch_y
            start_idx += current_batch_size
            total_written += current_batch_size

    pbar.close()
    return total_written

# ---------------------------------------------------------------------------------
# 1. Gather file lists and determine total size
# ---------------------------------------------------------------------------------
real_face_files = get_valid_files(output_real_faces, output_optical_flow, output_edges)
fake_face_files = get_valid_files(output_fake_faces, output_fake_optical_flow, output_fake_edges)

num_real = len(real_face_files)
num_fake = len(fake_face_files)
total_samples = num_real + num_fake

print(f"Real samples: {num_real}, Fake samples: {num_fake}, Total: {total_samples}")
if total_samples == 0:
    print("‚ùå ERROR: No valid images found! Exiting.")
    exit()

# ---------------------------------------------------------------------------------
# 2. Create memmap arrays (one big array for real+fake)
# ---------------------------------------------------------------------------------
x_data_memmap_path = save_path.replace(".npz", "_x.dat")
y_data_memmap_path = save_path.replace(".npz", "_y.dat")

print("üî® Creating memmap files...")
x_data_memmap = np.memmap(
    x_data_memmap_path, 
    dtype=np.float32, 
    mode="w+", 
    shape=(total_samples, *image_shape)
)
y_data_memmap = np.memmap(
    y_data_memmap_path,
    dtype=np.int8,
    mode="w+",
    shape=(total_samples,)
)

# ---------------------------------------------------------------------------------
# 3. Process REAL samples in chunks
# ---------------------------------------------------------------------------------
print("\nüü¢ Processing REAL images...")
current_index = 0
written_real = process_and_write_to_memmap(
    output_real_faces, 
    output_optical_flow, 
    output_edges, 
    real_face_files, 
    x_data_memmap, 
    y_data_memmap, 
    label=0, 
    start_idx=current_index,
    batch_size=batch_size
)
current_index += written_real

# ---------------------------------------------------------------------------------
# 4. Process FAKE samples in chunks
# ---------------------------------------------------------------------------------
print("\nüî¥ Processing FAKE images...")
written_fake = process_and_write_to_memmap(
    output_fake_faces, 
    output_fake_optical_flow, 
    output_fake_edges, 
    fake_face_files, 
    x_data_memmap, 
    y_data_memmap, 
    label=1, 
    start_idx=current_index,
    batch_size=batch_size
)
current_index += written_fake

print(f"\n‚úÖ Done processing.\nReal written: {written_real}, Fake written: {written_fake}, Total written: {written_real + written_fake}")

# ---------------------------------------------------------------------------------
# 5. Flush memmap to disk and create compressed .npz
# ---------------------------------------------------------------------------------
del x_data_memmap
del y_data_memmap

print("üíæ Creating final compressed NPZ file...")

# Reopen in read mode and save as NPZ
x_data_memmap = np.memmap(
    x_data_memmap_path, 
    dtype=np.float32, 
    mode="r", 
    shape=(written_real + written_fake, *image_shape)
)
y_data_memmap = np.memmap(
    y_data_memmap_path, 
    dtype=np.int8, 
    mode="r", 
    shape=(written_real + written_fake,)
)

np.savez_compressed(
    save_path, 
    x=x_data_memmap, 
    y=y_data_memmap
)

print("‚úÖ Data saved successfully!")


Real samples: 28470, Fake samples: 84574, Total: 113044
üî® Creating memmap files...

üü¢ Processing REAL images...


Processing E:/dataset_1/test_set/real: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28470/28470 [24:43<00:00, 19.19img/s]  



üî¥ Processing FAKE images...


Processing E:/dataset_1/test_set/fake:   2%|‚ñè         | 2095/84574 [03:34<2:35:39,  8.83img/s] 