In [1]:
import cv2
import shutil
from sklearn.model_selection import train_test_split
import os


In [4]:

# Paths
# data_dir is scanned for one sub-folder per person; cropped faces are written
# under train_dir/<person> and test_dir/<person>.
data_dir = 'c:/Users/tarek/OneDrive/Desktop/Attendance/lfw_funneled'  # Root folder after extraction
train_dir = "c:/Users/tarek/OneDrive/Desktop/Attendance/dataset10/train_data"
test_dir = "c:/Users/tarek/OneDrive/Desktop/Attendance/dataset10/test_data"

# Create directories for train and test data (no error if they already exist)
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Counter of people written so far, and the cap at which processing stops
people_processed = 0
max_people = 10  # Limit to 10 people

# Valid image extensions (compared against the lower-cased file extension)
valid_extensions = {".jpg", ".jpeg", ".png"}

# Load OpenCV's pre-trained Haar Cascade for face detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# CascadeClassifier does not raise when the XML file cannot be loaded; it is
# simply left empty and fails later inside detectMultiScale. Fail early with a
# clear message instead.
if face_cascade.empty():
    raise IOError(
        "Could not load Haar cascade from "
        + cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    )

# Function to detect and crop the face from an image
def crop_face(image_path):
    """Detect a face in an image file and return the cropped face region.

    Parameters
    ----------
    image_path : str
        Path of the image file, read with ``cv2.imread``.

    Returns
    -------
    array or None
        The slice of the loaded image covering the first detection returned
        by the module-level ``face_cascade``. ``None`` when the image could
        not be read or when no face was detected.
    """
    # cv2.imread reports an unreadable/corrupt/unsupported file by returning
    # None instead of raising; without this guard cvtColor would raise and
    # abort the whole dataset build.
    img = cv2.imread(image_path)
    if img is None:
        return None

    # The cascade is run on a grayscale copy; the crop is taken from the
    # original colour image.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Positional arguments are scaleFactor=1.3 and minNeighbors=5
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)

    if len(faces) == 0:
        return None  # No face detected

    # Take the first face (you can modify this to select a specific one if needed)
    (x, y, w, h) = faces[0]

    # Rows are indexed by y and columns by x
    return img[y:y+h, x:x+w]

# Split images for each person into train and test sets
def _save_cropped_faces(image_names, src_dir, dest_dir):
    """Crop the face out of each named image in ``src_dir`` and write it to ``dest_dir``.

    ``dest_dir`` is created if needed. Each output keeps the source file name.

    Returns the list of file names that were NOT saved, either because
    ``crop_face`` returned ``None`` or because ``cv2.imwrite`` reported failure.
    """
    os.makedirs(dest_dir, exist_ok=True)
    not_saved = []
    for name in image_names:
        face = crop_face(os.path.join(src_dir, name))
        # cv2.imwrite returns False when the file could not be written
        if face is None or not cv2.imwrite(os.path.join(dest_dir, name), face):
            not_saved.append(name)
    return not_saved


# Number of images a person must have, and the number actually used per person.
# With test_size=0.2 this yields 8 training and 2 testing images.
images_per_person = 10
skipped_images = 0

# os.listdir returns entries in arbitrary order; sorting makes the selected
# people (and, below, the selected images) the same on every run and machine.
for person in sorted(os.listdir(data_dir)):
    if people_processed >= max_people:
        break  # Stop once max_people have been processed

    person_path = os.path.join(data_dir, person)
    if not os.path.isdir(person_path):
        continue

    # Keep only regular files with a valid image extension
    images = sorted(
        img for img in os.listdir(person_path)
        if os.path.splitext(img)[1].lower() in valid_extensions
        and os.path.isfile(os.path.join(person_path, img))
    )

    # Skip people with fewer than images_per_person valid images
    if len(images) < images_per_person:
        continue

    # Split the first images_per_person images; fixed random_state keeps the split repeatable
    train_images, test_images = train_test_split(
        images[:images_per_person], test_size=0.2, random_state=42
    )

    # Crop and save the training and testing images into per-person folders
    skipped_images += len(
        _save_cropped_faces(train_images, person_path, os.path.join(train_dir, person))
    )
    skipped_images += len(
        _save_cropped_faces(test_images, person_path, os.path.join(test_dir, person))
    )

    # Increment the counter
    people_processed += 1

# Surface images that were dropped instead of losing them silently
if skipped_images:
    print(f"Skipped {skipped_images} image(s): no face detected or the crop could not be written.")

print(f"Train and test datasets created with data for {people_processed} people!")


Train and test datasets created with data for 10 people!
