In [1]:
import os
import random
import numpy as np
import cv2
from tqdm import tqdm

# Input directories for the REAL samples, one (faces, optical flow, edges)
# triple per emotion category.
output_real_faces_sad, output_optical_flow_sad, output_edges_sad = (
    "E:/dataset_1/sad/real",
    "E:/dataset_1/sad/optical_flow",
    "E:/dataset_1/sad/edges",
)

output_real_faces_angry, output_optical_flow_angry, output_edges_angry = (
    "E:/dataset_1/angry/real",
    "E:/dataset_1/angry/optical_flow",
    "E:/dataset_1/angry/edges",
)

output_real_faces_contempt, output_optical_flow_contempt, output_edges_contempt = (
    "E:/dataset_1/contempt/real",
    "E:/dataset_1/contempt/optical_flow",
    "E:/dataset_1/contempt/edges",
)

output_real_faces_disgust, output_optical_flow_disgust, output_edges_disgust = (
    "E:/dataset_1/disgust/real",
    "E:/dataset_1/disgust/optical_flow",
    "E:/dataset_1/disgust/edges",
)

output_real_faces_fear, output_optical_flow_fear, output_edges_fear = (
    "E:/dataset_1/fear/real",
    "E:/dataset_1/fear/optical_flow",
    "E:/dataset_1/fear/edges",
)

output_real_faces_happy, output_optical_flow_happy, output_edges_happy = (
    "E:/dataset_1/happy/real",
    "E:/dataset_1/happy/optical_flow",
    "E:/dataset_1/happy/edges",
)

output_real_faces_neutral, output_optical_flow_neutral, output_edges_neutral = (
    "E:/dataset_1/neutral/real",
    "E:/dataset_1/neutral/optical_flow",
    "E:/dataset_1/neutral/edges",
)

output_real_faces_surprise, output_optical_flow_surprise, output_edges_surprise = (
    "E:/dataset_1/surprise/real",
    "E:/dataset_1/surprise/optical_flow",
    "E:/dataset_1/surprise/edges",
)

# Input directories for the FAKE samples.
# NOTE(review): the fake optical-flow and edge directories are the same paths
# as the real "happy" ones above — confirm that sharing them is intended.
output_fake_faces, output_fake_optical_flow, output_fake_edges = (
    "E:/dataset_1/happy/fake",
    "E:/dataset_1/happy/optical_flow",
    "E:/dataset_1/happy/edges",
)

# Output archive; the intermediate memmap file names are derived from it.
save_path = "E:/dataset_1/preprocessed_combined.npz"
# Per-sample layout: 299x299 pixels, 9 channels (three 3-channel images).
image_shape = (299, 299, 9)
# Number of images to load/write in one batch
batch_size = 1000

def get_valid_files(face_dir, flow_dir, edge_dir, max_samples=None):
    """Get a shuffled list of face filenames whose flow and edge images exist.

    A face file ``<name>.jpg`` is kept only when ``<name>_flow.jpg`` exists in
    ``flow_dir`` and ``<name>_edges.jpg`` exists in ``edge_dir`` (the same
    naming scheme process_and_write_to_memmap uses to locate them).

    Args:
        face_dir: Directory listed for ``.jpg`` face images.
        flow_dir: Directory expected to hold the matching ``_flow.jpg`` files.
        edge_dir: Directory expected to hold the matching ``_edges.jpg`` files.
        max_samples: If not None, the shuffled list is cut to this many entries.

    Returns:
        List of face filenames (not full paths) in shuffled order.
    """
    # Sort first: os.listdir order is arbitrary, so a seeded shuffle would
    # otherwise still depend on the filesystem's listing order.
    candidates = sorted(f for f in os.listdir(face_dir) if f.endswith(".jpg"))

    face_files = []
    for face_file in candidates:
        base_name = face_file.replace(".jpg", "")
        flow_path = os.path.join(flow_dir, base_name + "_flow.jpg")
        edge_path = os.path.join(edge_dir, base_name + "_edges.jpg")
        # Filter before truncating so incomplete triples do not eat into
        # the max_samples budget.
        if os.path.exists(flow_path) and os.path.exists(edge_path):
            face_files.append(face_file)

    random.shuffle(face_files)
    if max_samples is not None:
        face_files = face_files[:max_samples]
    return face_files

def _load_combined_sample(face_path, flow_path, edge_path, size=(299, 299)):
    """Read three images, resize and scale each to [0, 1], and stack their channels.

    Returns the float32 concatenation along the last axis, or None when
    cv2.imread fails to decode any of the three files.
    """
    layers = []
    for path in (face_path, flow_path, edge_path):
        img = cv2.imread(path)
        if img is None:
            return None
        img = cv2.resize(img, size)
        layers.append(img.astype(np.float32) / 255.0)
    return np.concatenate(layers, axis=-1)


def _flush_if_supported(*arrays):
    """Call flush() on every array that provides it (np.memmap does, ndarray does not)."""
    for arr in arrays:
        flush = getattr(arr, "flush", None)
        if callable(flush):
            flush()


def process_and_write_to_memmap(face_dir, flow_dir, edge_dir, face_files, 
                                x_memmap, y_memmap, label, start_idx=0, 
                                batch_size=1000):
    """
    Loads images one at a time and writes each sample straight into the
    output arrays, so no batch of decoded images is ever held in RAM.

    For every name in ``face_files`` the face image is read from ``face_dir``
    and its companions ``<name>_flow.jpg`` / ``<name>_edges.jpg`` from
    ``flow_dir`` / ``edge_dir``. Files that are missing, cannot be decoded, or
    raise while being processed are skipped (errors are printed).

    Args:
        face_dir, flow_dir, edge_dir: Input directories.
        face_files: Face filenames (relative to ``face_dir``) to process.
        x_memmap: Output array for samples; row ``i`` receives sample ``i``.
        y_memmap: Output array for labels, indexed like ``x_memmap``.
        label: Label value stored for every sample written by this call.
        start_idx: First output row to write to.
        batch_size: Number of input files between flushes of the output
            arrays to disk. Must be positive.

    Returns:
        The number of images actually written. Rows
        ``start_idx .. start_idx + returned - 1`` are filled contiguously.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Never write past the end of either output array.
    capacity = min(len(x_memmap), len(y_memmap))
    write_idx = start_idx

    # The context manager closes the progress bar even if an exception escapes.
    with tqdm(total=len(face_files), desc=f"Processing {face_dir}", unit="img") as pbar:
        for seen, face_file in enumerate(face_files, start=1):
            if write_idx >= capacity:
                print(f"⚠️ Output arrays are full ({capacity} samples); "
                      f"skipping the remaining files from {face_dir}")
                break

            base_name = face_file.replace(".jpg", "")
            face_path = os.path.join(face_dir, face_file)
            flow_path = os.path.join(flow_dir, base_name + "_flow.jpg")
            edge_path = os.path.join(edge_dir, base_name + "_edges.jpg")

            if (os.path.exists(face_path) and 
                os.path.exists(flow_path) and 
                os.path.exists(edge_path)):

                try:
                    combined_input = _load_combined_sample(face_path, flow_path, edge_path)
                    # None means at least one image failed to load: skip it.
                    if combined_input is not None:
                        x_memmap[write_idx] = combined_input
                        y_memmap[write_idx] = label
                        # Advance only after both writes succeeded so a failed
                        # sample's row is reused by the next one.
                        write_idx += 1

                except Exception as e:
                    print(f"⚠️ Error processing {face_file}: {e}")

            pbar.update(1)

            # Periodically push written rows to disk to bound unsaved work.
            if seen % batch_size == 0:
                _flush_if_supported(x_memmap, y_memmap)

    _flush_if_supported(x_memmap, y_memmap)
    return write_idx - start_idx

# ---------------------------------------------------------------------------------
# 1. Define real emotion categories and compute total samples
# ---------------------------------------------------------------------------------
# Seed the module-level RNG that get_valid_files shuffles with, so re-running
# this cell selects the same subset (for an unchanged directory listing order).
random.seed(42)

# One (face_dir, flow_dir, edge_dir) triple per real emotion category.
real_emotions = [
    (output_real_faces_sad, output_optical_flow_sad, output_edges_sad),
    (output_real_faces_angry, output_optical_flow_angry, output_edges_angry),
    (output_real_faces_contempt, output_optical_flow_contempt, output_edges_contempt),
    (output_real_faces_disgust, output_optical_flow_disgust, output_edges_disgust),
    (output_real_faces_fear, output_optical_flow_fear, output_edges_fear),
    (output_real_faces_happy, output_optical_flow_happy, output_edges_happy),
    (output_real_faces_neutral, output_optical_flow_neutral, output_edges_neutral),
    (output_real_faces_surprise, output_optical_flow_surprise, output_edges_surprise)
]

max_real_per_emotion = 10000
max_fake = 80000
max_real_total = max_real_per_emotion * len(real_emotions)
# Upper bound on rows; the memmap files are allocated for this many samples.
total_samples = max_real_total + max_fake

print(f"Planned real samples: {max_real_total}, Planned fake samples: {max_fake}, Total: {total_samples}")

# ---------------------------------------------------------------------------------
# 2. Create memmap arrays for all samples
# ---------------------------------------------------------------------------------
x_data_memmap_path = save_path.replace(".npz", "_x.dat")
y_data_memmap_path = save_path.replace(".npz", "_y.dat")

print("🔨 Creating memmap files...")
x_data_memmap = np.memmap(
    x_data_memmap_path, 
    dtype=np.float32, 
    mode="w+", 
    shape=(total_samples, *image_shape)
)
y_data_memmap = np.memmap(
    y_data_memmap_path,
    dtype=np.int8,
    mode="w+",
    shape=(total_samples,)
)

# ---------------------------------------------------------------------------------
# 3. Process REAL samples for each emotion category
# ---------------------------------------------------------------------------------
# current_index is the next free row; it advances by the number of samples
# actually written, so the filled rows stay contiguous even when files are skipped.
current_index = 0
total_real_written = 0
for face_dir, flow_dir, edge_dir in real_emotions:
    print(f"\n🟢 Processing REAL images from: {face_dir}")
    face_files = get_valid_files(face_dir, flow_dir, edge_dir, max_samples=max_real_per_emotion)
    written = process_and_write_to_memmap(
        face_dir, 
        flow_dir, 
        edge_dir, 
        face_files, 
        x_data_memmap, 
        y_data_memmap, 
        label=0,  # every real sample gets label 0, regardless of emotion
        start_idx=current_index,
        batch_size=batch_size
    )
    current_index += written
    total_real_written += written

print(f"\n✅ Finished processing REAL images. Total real images written: {total_real_written}")

# ---------------------------------------------------------------------------------
# 4. Process FAKE samples in one chunk
# ---------------------------------------------------------------------------------
print("\n🔴 Processing FAKE images...")
fake_face_files = get_valid_files(output_fake_faces, output_fake_optical_flow, output_fake_edges, max_samples=max_fake)
written_fake = process_and_write_to_memmap(
    output_fake_faces, 
    output_fake_optical_flow, 
    output_fake_edges, 
    fake_face_files, 
    x_data_memmap, 
    y_data_memmap, 
    label=1,  # fake samples get label 1
    start_idx=current_index,
    batch_size=batch_size
)
current_index += written_fake

print(f"\n✅ Finished processing FAKE images. Total fake images written: {written_fake}")

# Single source of truth for the number of filled rows.
total_written = total_real_written + written_fake
print(f"\n✅ Overall, images written: {total_written} out of planned {total_samples}")

if total_written == 0:
    raise RuntimeError("No samples were written; nothing to save.")

# ---------------------------------------------------------------------------------
# 5. Flush memmap to disk and create compressed .npz file
# ---------------------------------------------------------------------------------
# Flush explicitly rather than relying on object deletion alone, then drop
# the writable handles before reopening the files read-only.
x_data_memmap.flush()
y_data_memmap.flush()
del x_data_memmap
del y_data_memmap

print("💾 Creating final compressed NPZ file...")

# Reopen memmap arrays in read mode for the actual number of images written;
# rows beyond total_written were allocated but never filled and are left out.
x_data_memmap = np.memmap(
    x_data_memmap_path, 
    dtype=np.float32, 
    mode="r", 
    shape=(total_written, *image_shape)
)
y_data_memmap = np.memmap(
    y_data_memmap_path, 
    dtype=np.int8, 
    mode="r", 
    shape=(total_written,)
)

np.savez_compressed(
    save_path, 
    x=x_data_memmap, 
    y=y_data_memmap
)

print("✅ Data saved successfully!")


Planned real samples: 80000, Planned fake samples: 80000, Total: 160000
🔨 Creating memmap files...

🟢 Processing REAL images from: E:/dataset_1/sad/real


Processing E:/dataset_1/sad/real: 100%|██████████| 10000/10000 [18:18<00:00,  9.10img/s] 



🟢 Processing REAL images from: E:/dataset_1/angry/real


Processing E:/dataset_1/angry/real: 100%|██████████| 10000/10000 [14:36<00:00, 11.40img/s] 



🟢 Processing REAL images from: E:/dataset_1/contempt/real


Processing E:/dataset_1/contempt/real: 100%|██████████| 10000/10000 [15:32<00:00, 10.73img/s] 



🟢 Processing REAL images from: E:/dataset_1/disgust/real


Processing E:/dataset_1/disgust/real: 100%|██████████| 10000/10000 [13:35<00:00, 12.26img/s] 



🟢 Processing REAL images from: E:/dataset_1/fear/real


Processing E:/dataset_1/fear/real: 100%|██████████| 10000/10000 [17:55<00:00,  9.30img/s] 



🟢 Processing REAL images from: E:/dataset_1/happy/real


Processing E:/dataset_1/happy/real: 100%|██████████| 10000/10000 [34:17<00:00,  4.86img/s] 



🟢 Processing REAL images from: E:/dataset_1/neutral/real


Processing E:/dataset_1/neutral/real: 100%|██████████| 10000/10000 [22:14<00:00,  7.49img/s] 



🟢 Processing REAL images from: E:/dataset_1/surprise/real


Processing E:/dataset_1/surprise/real: 100%|██████████| 10000/10000 [15:36<00:00, 10.67img/s] 



✅ Finished processing REAL images. Total real images written: 79999

🔴 Processing FAKE images...


Processing E:/dataset_1/happy/fake: 100%|██████████| 80000/80000 [7:59:27<00:00,  2.78img/s]   



✅ Finished processing FAKE images. Total fake images written: 80000

✅ Overall, images written: 159999 out of planned 160000
💾 Creating final compressed NPZ file...
✅ Data saved successfully!


In [1]:
import numpy as np

# Paths to your existing .dat files (from your previous run)
x_dat_path = "E:/dataset_1/preprocessed_combined_x.dat"
y_dat_path = "E:/dataset_1/preprocessed_combined_y.dat"

# Define the shape of your data.
# num_samples must equal the number of samples actually written by the
# preprocessing run (total_real_written + written_fake). The recorded run
# reported "images written: 159999 out of planned 160000"; update this value
# if the preprocessing is re-run and reports a different count.
num_samples = 159999
image_shape = (299, 299, 9)

# The label file holds one int8 (one byte) per allocated row, so mapping it
# without an explicit shape yields the number of rows the files were created for.
y_capacity = np.memmap(y_dat_path, dtype=np.int8, mode="r").shape[0]
if not 0 < num_samples <= y_capacity:
    raise ValueError(
        f"num_samples={num_samples} is outside the valid range 1..{y_capacity} "
        f"for {y_dat_path}"
    )

print("Loading memmap arrays from existing .dat files...")
# Map only the first num_samples rows; rows past that were never filled.
x_memmap = np.memmap(x_dat_path, dtype=np.float32, mode="r", shape=(num_samples, *image_shape))
y_memmap = np.memmap(y_dat_path, dtype=np.int8, mode="r", shape=(num_samples,))

# Define paths for the uncompressed NPY files
x_save_path = "E:/dataset_1/combined_x_data.npy"
y_save_path = "E:/dataset_1/combined_y_data.npy"

print("Saving data as uncompressed NPY files...")
np.save(x_save_path, x_memmap)
np.save(y_save_path, y_memmap)

print("✅ Data saved successfully as uncompressed NPY files:")
print(f"    {x_save_path}")
print(f"    {y_save_path}")


Loading memmap arrays from existing .dat files...
Saving data as uncompressed NPY files...
✅ Data saved successfully as uncompressed NPY files:
    E:/dataset_1/combined_x_data.npy
    E:/dataset_1/combined_y_data.npy
