In [1]:
import os

dataset_path = 'dataset/lfw-deepfunneled'

folders_with_more_than_10_images = []

for folder_name in os.listdir(dataset_path):
    folder_full_path = os.path.join(dataset_path, folder_name)
    
    if os.path.isdir(folder_full_path):
        image_count = len([f for f in os.listdir(folder_full_path) if os.path.isfile(os.path.join(folder_full_path, f))])
        
        if image_count >= 10:
            folders_with_more_than_10_images.append(folder_name)

print(f"Found {len(folders_with_more_than_10_images)} folders with > 10 images.")
print(folders_with_more_than_10_images)

Found 158 folders with > 10 images.
['Abdullah_Gul', 'Adrien_Brody', 'Alejandro_Toledo', 'Alvaro_Uribe', 'Amelie_Mauresmo', 'Andre_Agassi', 'Andy_Roddick', 'Angelina_Jolie', 'Anna_Kournikova', 'Ann_Veneman', 'Ariel_Sharon', 'Ari_Fleischer', 'Arnold_Schwarzenegger', 'Atal_Bihari_Vajpayee', 'Bill_Clinton', 'Bill_Gates', 'Bill_McBride', 'Bill_Simon', 'Britney_Spears', 'Carlos_Menem', 'Carlos_Moya', 'Catherine_Zeta-Jones', 'Charles_Moose', 'Colin_Powell', 'Condoleezza_Rice', 'David_Beckham', 'David_Nalbandian', 'Dick_Cheney', 'Dominique_de_Villepin', 'Donald_Rumsfeld', 'Edmund_Stoiber', 'Eduardo_Duhalde', 'Fidel_Castro', 'George_HW_Bush', 'George_Robertson', 'George_W_Bush', 'Gerhard_Schroeder', 'Gloria_Macapagal_Arroyo', 'Gonzalo_Sanchez_de_Lozada', 'Gordon_Brown', 'Gray_Davis', 'Guillermo_Coria', 'Halle_Berry', 'Hamid_Karzai', 'Hans_Blix', 'Harrison_Ford', 'Hillary_Clinton', 'Howard_Dean', 'Hugo_Chavez', 'Hu_Jintao', 'Ian_Thorpe', 'Igor_Ivanov', 'Jackie_Chan', 'Jack_Straw', 'Jacques_Chir

In [2]:
import pandas as pd

dataset_dir = 'dataset'
output_dir = 'filtered_dataset'
os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(dataset_dir):
    if filename.endswith(".csv"):
        file_path = os.path.join(dataset_dir, filename)
        df = pd.read_csv(file_path)

        filtered_df = df[df['name'].isin(folders_with_more_than_10_images)]
        
        output_path = os.path.join(output_dir, f"filtered_{filename}")
        filtered_df.to_csv(output_path, index=False)
        print(f"Created {output_path} with {len(filtered_df)} rows.")

Created filtered_dataset\filtered_lfw_allnames.csv with 158 rows.
Created filtered_dataset\filtered_matchpairsDevTest.csv with 79 rows.
Created filtered_dataset\filtered_matchpairsDevTrain.csv with 183 rows.
Created filtered_dataset\filtered_mismatchpairsDevTest.csv with 8 rows.
Created filtered_dataset\filtered_mismatchpairsDevTrain.csv with 24 rows.
Created filtered_dataset\filtered_pairs.csv with 661 rows.
Created filtered_dataset\filtered_people.csv with 158 rows.
Created filtered_dataset\filtered_peopleDevTest.csv with 47 rows.
Created filtered_dataset\filtered_peopleDevTrain.csv with 111 rows.


In [3]:
import shutil

source_base_dir = 'dataset/lfw-deepfunneled'
destination_base_dir = 'filtered_dataset/filtered_lfw-deepfunneled' 
os.makedirs(destination_base_dir, exist_ok=True)

for folder_name in folders_with_more_than_10_images:
    source_path = os.path.join(source_base_dir, folder_name)
    new_folder_name = f"filtered_{folder_name}"
    destination_path = os.path.join(destination_base_dir, new_folder_name)

    if os.path.exists(source_path) and os.path.isdir(source_path):
        if os.path.exists(destination_path):
            shutil.rmtree(destination_path)
        shutil.copytree(source_path, destination_path)