In [16]:
import os
import glob
import pandas as pd
from sklearn.model_selection import train_test_split

In [14]:
dataset_path = 'Faceshape_dataset'
#the dataset folder itself is used as the base directory (no extra nesting level)
base_dir = dataset_path
print('using dataset folder: ', base_dir)
#list the subdirectories in base_dir:
# - only real directories are kept, so stray files are not reported as splits
# - sorted, because os.listdir returns entries in arbitrary, filesystem-dependent order
subdirs = sorted(
    entry for entry in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, entry))
)
print('subdirectories in base_dir: ', subdirs)

using dataset folder:  Faceshape_dataset
subdirectories in base_dir:  ['test', 'train']


In [20]:
#file extensions accepted as images; the '*.*' glob also matches unrelated files
#(e.g. Thumbs.db, desktop.ini) which must not end up as labelled samples
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp', '.tif', '.tiff'}

#prepare a list to collect file paths and labels
data = []

#iterate over the subdirectories
#sorted at every level: os.listdir/glob return entries in arbitrary order, and the
#row order of df determines which rows a seeded train_test_split selects
for split in sorted(subdirs):
    split_dir = os.path.join(base_dir, split)
    if not os.path.isdir(split_dir):
        print(f'warning: {split_dir} is not a directory, skipping.')
        continue
    #list class folders within each split directory
    class_folders = sorted(
        os.path.join(split_dir, folder)
        for folder in os.listdir(split_dir)
        if os.path.isdir(os.path.join(split_dir, folder))
    )
    print(f'found class folders in "{split}":', [os.path.basename(cf) for cf in class_folders])

    #iterate over each class folder
    for class_folder in class_folders:
        label = os.path.basename(class_folder) #folder name as label
        #everything with a dot in its name is a candidate ...
        candidates = sorted(glob.glob(os.path.join(class_folder, '*.*')))
        #... but only regular files with an image extension are kept
        img_files = [
            path for path in candidates
            if os.path.isfile(path)
            and os.path.splitext(path)[1].lower() in IMAGE_EXTENSIONS
        ]
        skipped = len(candidates) - len(img_files)
        if skipped:
            print(f'warning: skipped {skipped} non-image entries in "{label}" folder under "{split}".')
        print(f'found {len(img_files)} files in "{label}" folder under "{split}".')
        #'orginal_split' (sic): spelling kept as-is because it becomes a column name of df
        data.extend(
            {'filepath': img_path, 'label': label, 'orginal_split': split}
            for img_path in img_files
        )

#create a dataframe from the collected data
df = pd.DataFrame(data)
#an empty list of records yields a DataFrame without any columns, so a missing
#'label' column really means that no image files were collected
if df.empty or 'label' not in df.columns:
    raise KeyError(f'no image files were collected from "{base_dir}", so df has no label column.')

print('Total samples in the dataset: ', len(df))
print('class distribution:')
print(df['label'].value_counts())

#split dataset into train 70% test 10% validation 20%
#note: df holds every collected row regardless of its 'orginal_split' value, so the
#new test set is drawn from the pooled data, not from the original test folder only
TEST_FRACTION = 0.10   #share of the overall data held out for testing
VAL_FRACTION = 0.20    #share of the overall data used for validation
SPLIT_SEED = 42        #fixed seed so both splits are repeatable

#first split the test set 10%
train_val_df, test_df = train_test_split(
    df,
    test_size=TEST_FRACTION,
    stratify=df['label'],
    random_state=SPLIT_SEED)

#from the remaining 90% allocate 20% for validation
#test_size is relative to train_val_df, hence 0.20 / 0.90 (= 0.2222...) to get
#20% of the overall data
train_df, val_df = train_test_split(
    train_val_df,
    test_size=VAL_FRACTION / (1 - TEST_FRACTION),
    stratify=train_val_df['label'],
    random_state=SPLIT_SEED)

#sanity checks: the three splits must partition df without overlap
assert len(train_df) + len(val_df) + len(test_df) == len(df), 'splits do not add up to the full dataset'
assert train_df.index.append([val_df.index, test_df.index]).is_unique, 'a sample appears in more than one split'

#show the number of samples for each split
print('\ntrain samples: ', len(train_df))
print('\nvalidation samples: ', len(val_df))
print('\ntest samples: ', len(test_df))

#save the splits to csv for processing
train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('val_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

found class folders in "test": ['Heart', 'Oblong', 'Oval', 'Round', 'Square']
found 200 files in "Heart" folder under "test".
found 200 files in "Oblong" folder under "test".
found 199 files in "Oval" folder under "test".
found 199 files in "Round" folder under "test".
found 200 files in "Square" folder under "test".
found class folders in "train": ['Heart', 'Oblong', 'Oval', 'Round', 'Square']
found 798 files in "Heart" folder under "train".
found 798 files in "Oblong" folder under "train".
found 797 files in "Oval" folder under "train".
found 789 files in "Round" folder under "train".
found 799 files in "Square" folder under "train".
Total samples in the dataset:  4979
class distribution:
label
Square    999
Heart     998
Oblong    998
Oval      996
Round     988
Name: count, dtype: int64

train samples:  3485

validation samples:  996

test samples:  498


In [21]:
#preview the training split (the rendered table is truncated to its first and last rows)
train_df

Unnamed: 0,filepath,label,orginal_split
3461,Faceshape_dataset\train\Round\Round(161).jpg,Round,train
3700,Faceshape_dataset\train\Round\Round(377).jpg,Round,train
4368,Faceshape_dataset\train\Square\Square(268).jpg,Square,train
10,Faceshape_dataset\test\Heart\Heart(107).jpg,Heart,test
2931,Faceshape_dataset\train\Oval\Oval(401).jpg,Oval,train
...,...,...,...
4649,Faceshape_dataset\train\Square\Square(520).jpg,Square,train
4000,Faceshape_dataset\train\Round\Round(647).jpg,Round,train
3260,Faceshape_dataset\train\Oval\Oval(699).jpg,Oval,train
3481,Faceshape_dataset\train\Round\Round(18).jpg,Round,train


In [22]:
#preview the validation split (the rendered table is truncated to its first and last rows)
val_df

Unnamed: 0,filepath,label,orginal_split
589,Faceshape_dataset\test\Oval\Oval(90).jpg,Oval,test
1607,Faceshape_dataset\train\Heart\Heart(647).jpg,Heart,train
1452,Faceshape_dataset\train\Heart\Heart(507).jpg,Heart,train
2384,Faceshape_dataset\train\Oblong\Oblong(628).jpg,Oblong,train
854,Faceshape_dataset\test\Square\Square(149).jpg,Square,test
...,...,...,...
1090,Faceshape_dataset\train\Heart\Heart(181).jpg,Heart,train
2781,Faceshape_dataset\train\Oval\Oval(267).jpg,Oval,train
957,Faceshape_dataset\test\Square\Square(62).jpg,Square,test
3274,Faceshape_dataset\train\Oval\Oval(710).jpg,Oval,train


In [23]:
#preview the test split (the rendered table is truncated to its first and last rows)
test_df

Unnamed: 0,filepath,label,orginal_split
847,Faceshape_dataset\test\Square\Square(142).jpg,Square,test
3151,Faceshape_dataset\train\Oval\Oval(60).jpg,Oval,train
3346,Faceshape_dataset\train\Oval\Oval(776).jpg,Oval,train
1353,Faceshape_dataset\train\Heart\Heart(418).jpg,Heart,train
3861,Faceshape_dataset\train\Round\Round(521).jpg,Round,train
...,...,...,...
2167,Faceshape_dataset\train\Oblong\Oblong(432).jpg,Oblong,train
4471,Faceshape_dataset\train\Square\Square(360).jpg,Square,train
1211,Faceshape_dataset\train\Heart\Heart(290).jpg,Heart,train
4543,Faceshape_dataset\train\Square\Square(425).jpg,Square,train
