<a href="https://colab.research.google.com/github/Theflawlessone/Face_Detection/blob/main/Notebooks/Faces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Global Imports
import os, tarfile, re, glob, random
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split


## Importing Dataset

In [None]:
# Fix the seed of both Python's and NumPy's global RNGs so reruns match
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Local working directories (created on first run)
BASE_DIR = "./"
DATA_DIR = os.path.join(BASE_DIR, "data", "UTKFace")
os.makedirs(DATA_DIR, exist_ok=True)

# The dataset archives are read from Google Drive, so mount it first
from google.colab import drive
drive.mount('/content/drive')
GDRIVE_ARCHIVE_BASE_PATH = "/content/drive/MyDrive/UTKFaceDataset/UTKface_inthewild"

# Full paths of the archive parts, in extraction order
ARCHIVES = [
    os.path.join(GDRIVE_ARCHIVE_BASE_PATH, archive_name)
    for archive_name in ("part1.tar.gz", "part2.tar.gz", "part3.tar.gz")
]

# Extraction
def safe_extract(tar_path, extract_to):
    """Extract a tar archive into ``extract_to``, refusing path traversal.

    Each member's resolved destination must lie inside ``extract_to``. The
    containment test compares whole path components, so a sibling directory
    whose name merely starts with the destination's name (``dest_evil`` next
    to ``dest``) is rejected as well.

    Args:
        tar_path: Path to the archive; compression is auto-detected ("r:*").
        extract_to: Directory that receives the extracted files.

    Raises:
        Exception: If any member would resolve to a location outside
            ``extract_to``. All members are checked before extraction starts,
            so nothing is written when this check fails.
        tarfile.TarError: May additionally be raised by tarfile's own "data"
            extraction filter on interpreters that provide it.
    """
    # realpath (not abspath) so symlinks already present on disk are resolved
    dest_root = os.path.realpath(extract_to)
    with tarfile.open(tar_path, "r:*") as tar:
        for member in tar.getmembers():
            member_path = os.path.realpath(os.path.join(dest_root, member.name))
            # A plain str.startswith() would accept "<dest>_evil/..."; commonpath
            # compares path components instead of raw characters.
            if os.path.commonpath([dest_root, member_path]) != dest_root:
                raise Exception(f"Unsafe path in {tar_path}: {member.name}")
        if hasattr(tarfile, "data_filter"):
            # Also blocks link members pointing outside the destination.
            tar.extractall(extract_to, filter="data")
        else:
            tar.extractall(extract_to)
    print(f"Extracted {os.path.basename(tar_path)}")

# Extract each archive at most once: a hidden ".done_<archive>" marker file in
# DATA_DIR records that a given archive has already been unpacked.
for arc in ARCHIVES:
    arc_name = os.path.basename(arc)
    flag = os.path.join(DATA_DIR, f".done_{arc_name}")
    if os.path.exists(flag):
        print(f"Already extracted: {arc_name}")
    elif not os.path.exists(arc):
        # Report the missing archive and move on to the next one
        print(f"Missing archive: {arc}")
    else:
        safe_extract(arc, DATA_DIR)
        # Create the empty marker only after extraction returned normally
        with open(flag, "w"):
            pass

# Gather every .jpg below DATA_DIR (any depth), sorted for a stable order
jpg_pattern = os.path.join(DATA_DIR, "**", "*.jpg")
image_paths = sorted(glob.glob(jpg_pattern, recursive=True))
print(f"Found {len(image_paths)} images in total.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Already extracted: part1.tar.gz
Already extracted: part2.tar.gz
Already extracted: part3.tar.gz
Found 24106 images in total.


## Cleaning Dataset

In [None]:
# Clean unreadable images, if any
def is_valid_image(path):
    """Return True if Pillow can open ``path`` and ``verify()`` reports no problem.

    Args:
        path: Path of the image file to check.

    Returns:
        bool: False when opening or verifying the file raises an ordinary
        exception, True otherwise.
    """
    try:
        # The with-block releases the file handle even when verify() raises.
        with Image.open(path) as img:
            img.verify()
        return True
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit still propagate and a long scan can be interrupted.
        return False

def parse_label(path):
    """Read the age and race labels encoded at the start of a file name.

    The base name of ``path`` must begin with three integer fields, each
    followed by an underscore. The first field is returned as the age and the
    third as the race; the middle field is matched but not returned.

    Args:
        path: File path whose base name carries the labels.

    Returns:
        tuple: ``(age, race)`` as ints, or ``(None, None)`` when the base name
        does not start with the expected three fields.
    """
    filename = os.path.basename(path)
    fields = re.match(r"(\d+)_(\d+)_(\d+)_", filename)
    if fields is None:
        return None, None
    age_text, _, race_text = fields.groups()
    return int(age_text), int(race_text)

# Keep only images that are readable, carry parseable labels, and whose age
# falls in the inclusive range 0-100 (extreme outliers are dropped).
valid_records = []
for p in image_paths:
    if is_valid_image(p):
        age, race = parse_label(p)
        if age is not None and race is not None and 0 <= age <= 100:
            valid_records.append([p, age, race])

# One row per retained image
df = pd.DataFrame(valid_records, columns=["path", "age", "race"])
print("After cleaning:", df.shape[0])
df.head()


After cleaning: 24080


Unnamed: 0,path,age,race
0,./data/UTKFace/part1/100_1_0_20170110183726390...,100,0
1,./data/UTKFace/part1/100_1_2_20170105174847679...,100,2
2,./data/UTKFace/part1/100_1_2_20170110182836729...,100,2
3,./data/UTKFace/part1/10_0_0_20161220222308131.jpg,10,0
4,./data/UTKFace/part1/10_0_0_20170103200329407.jpg,10,0
