Step 1: Install Kaggle CLI + imports

In [1]:
!pip -q install kaggle

import os, json, shutil, random
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print("TensorFlow:", tf.__version__)
print("GPU:", tf.config.list_physical_devices("GPU"))

TensorFlow: 2.19.0
GPU: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


Step 2: Download CelebA from Kaggle

In [6]:
!pip -q install kaggle

import os, json
from pathlib import Path
from getpass import getpass

KAGGLE_USERNAME = input("Kaggle username: ")
KAGGLE_KEY = getpass("Kaggle API key (hidden): ")

kaggle_dir = Path.home() / ".kaggle"
kaggle_dir.mkdir(parents=True, exist_ok=True)

kaggle_json_path = kaggle_dir / "kaggle.json"
with open(kaggle_json_path, "w") as f:
    json.dump({"username": KAGGLE_USERNAME, "key": KAGGLE_KEY}, f)

os.chmod(kaggle_json_path, 0o600)
print("✅ kaggle.json created at:", kaggle_json_path)

data_root = Path("/content/celeba_kaggle")
data_root.mkdir(parents=True, exist_ok=True)

!kaggle datasets download -d jessicali9530/celeba-dataset -p {data_root} --unzip
!ls -lah {data_root}

Kaggle username: prithilasaha
Kaggle API key (hidden): ··········
✅ kaggle.json created at: /root/.kaggle/kaggle.json
Dataset URL: https://www.kaggle.com/datasets/jessicali9530/celeba-dataset
License(s): other
Downloading celeba-dataset.zip to /content/celeba_kaggle
 98% 1.30G/1.33G [00:32<00:00, 62.2MB/s]
100% 1.33G/1.33G [00:32<00:00, 44.2MB/s]
total 45M
drwxr-xr-x 3 root root 4.0K Jan  7 23:18 .
drwxr-xr-x 1 root root 4.0K Jan  7 23:13 ..
-rw-r--r-- 1 root root 3.3M Jan  7 23:13 identity_CelebA.txt
drwxr-xr-x 3 root root 4.0K Jan  7 23:11 img_align_celeba
-rw-r--r-- 1 root root  24M Jan  7 23:18 list_attr_celeba.csv
-rw-r--r-- 1 root root 5.2M Jan  7 23:18 list_bbox_celeba.csv
-rw-r--r-- 1 root root 2.8M Jan  7 23:18 list_eval_partition.csv
-rw-r--r-- 1 root root 9.5M Jan  7 23:18 list_landmarks_align_celeba.csv


Step 3: Locate image folder + identity labels

In [7]:
from pathlib import Path
import os

data_root = Path("/content/celeba_kaggle")

# Find image folder
img_dir = None
for cand in ["img_align_celeba", "img_align_celeba/img_align_celeba"]:
    hits = list(data_root.rglob(cand))
    if hits:
        img_dir = hits[0]
        break

print("img_dir:", img_dir)
if img_dir is None:
    raise FileNotFoundError("Could not find img_align_celeba folder inside your dataset.")

# Find identity file
identity_path = None
hits = list(data_root.rglob("identity_CelebA.txt"))
if hits:
    identity_path = hits[0]

if identity_path is None:
    identity_path = data_root / "identity_CelebA.txt"
    !wget -q -O "{identity_path}" https://raw.githubusercontent.com/Golbstein/keras-face-recognition/master/identity_CelebA.txt
    print("Downloaded identity file to:", identity_path)

# Sanity check
if not identity_path.exists() or identity_path.stat().st_size < 1000:
    raise FileNotFoundError("identity_CelebA.txt is missing or too small; download failed.")

print("identity_path:", identity_path)
print("identity file size (bytes):", identity_path.stat().st_size)

img_dir: /content/celeba_kaggle/img_align_celeba
identity_path: /content/celeba_kaggle/identity_CelebA.txt
identity file size (bytes): 3424458


Step 4: Dataset Splitting

In [11]:
import shutil, random, json
from pathlib import Path
import pandas as pd
import numpy as np

# Seed both RNGs used in this cell so sampling and shuffling are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Shape of the reduced dataset: how many identities to keep, how many images
# to sample for each, and how those images are divided between train and val.
TARGET_CLASSES = 10
IMAGES_PER_CLASS = 25
TRAIN_PER_CLASS = 20
VAL_PER_CLASS = 5

# Fail fast on an inconsistent configuration: the split can only draw from the
# images that were sampled per class, so it must not ask for more than that.
if TRAIN_PER_CLASS + VAL_PER_CLASS > IMAGES_PER_CLASS:
    raise ValueError(
        f"TRAIN_PER_CLASS + VAL_PER_CLASS ({TRAIN_PER_CLASS + VAL_PER_CLASS}) "
        f"exceeds IMAGES_PER_CLASS ({IMAGES_PER_CLASS})."
    )

data_root = Path("/content/celeba_kaggle")

# 1) Find the directory that DIRECTLY contains .jpg files (not just recursively)
# Start by collecting every sub-directory, at any depth, below data_root.
all_dirs = list(filter(Path.is_dir, data_root.rglob("*")))

def direct_jpg_count(d: Path) -> int:
    """Return how many ``*.jpg`` entries sit immediately inside ``d``.

    Only the directory's own children are matched; nested folders are not
    descended into.
    """
    return sum(1 for _ in d.glob("*.jpg"))

# Pair each candidate directory with its direct-jpg count, keeping only those
# that hold at least one image.
jpg_dirs = [(d, c) for d in all_dirs if (c := direct_jpg_count(d)) > 0]

if len(jpg_dirs) == 0:
    raise FileNotFoundError("❌ No directory with direct *.jpg files found under /content/celeba_kaggle")

# The folder with the most images is taken as the dataset's image directory.
img_dir, n_jpg = max(jpg_dirs, key=lambda pair: pair[1])
print("✅ img_dir (direct jpg folder):", img_dir)
print("✅ direct jpg count:", n_jpg)

# 2) Identity file must exist and be non-empty
identity_path = data_root / "identity_CelebA.txt"
# Anything under 1000 bytes is treated the same as a missing file.
identity_ok = identity_path.exists() and identity_path.stat().st_size >= 1000
if not identity_ok:
    print("\n⚠️ identity_CelebA.txt missing/empty -> upload it now")
    from google.colab import files
    uploaded = files.upload()  # upload identity_CelebA.txt
    # The upload is expected in the working directory under this exact name.
    src = Path("identity_CelebA.txt")
    if not src.exists():
        raise FileNotFoundError("Upload must be named exactly identity_CelebA.txt")
    shutil.copy2(src, identity_path)
    print("✅ Uploaded identity file bytes:", identity_path.stat().st_size)

# 3) Load identity file safely
# The file is headerless and whitespace-separated; only the first two columns
# are used and they are given explicit names and dtypes.
raw = pd.read_csv(identity_path, sep=r"\s+", header=None)
df = (
    raw.iloc[:, :2]
    .set_axis(["filename", "identity"], axis=1)
    .astype({"filename": str, "identity": int})
)

# 4) Fast match: keep only filenames that exist in img_dir
existing = {p.name for p in img_dir.glob("*.jpg")}
df = df.loc[df["filename"].isin(existing)].copy()
print("✅ rows matched with existing images:", len(df))

# Nothing matched: show a few names from each side to make the mismatch obvious.
if df.empty:
    print("Example filenames from identity file:", raw.iloc[:5,0].tolist())
    print("Example filenames in img_dir:", list(existing)[:5])
    raise ValueError("❌ Still zero matches. Your identity file doesn't match your image filenames.")

# 5) Pick identities with enough images
counts = df["identity"].value_counts()
eligible = counts.loc[counts >= IMAGES_PER_CLASS]
print("✅ eligible identities:", len(eligible), "with >=", IMAGES_PER_CLASS, "images")

if len(eligible) < TARGET_CLASSES:
    raise ValueError(f"❌ Not enough identities with >= {IMAGES_PER_CLASS} images. Try IMAGES_PER_CLASS=20")

# value_counts orders by frequency, so the leading entries are the identities
# with the most images. Each kept identity gets a dense class index 0..N-1.
top_ids = eligible.index[:TARGET_CLASSES].tolist()
id_map = dict(zip(top_ids, range(len(top_ids))))
df_small = df.loc[df["identity"].isin(top_ids)].assign(
    class_idx=lambda frame: frame["identity"].map(id_map)
)

# 6) Sample IMAGES_PER_CLASS per class and copy into reduced_celeba
# Each class is sampled independently with the same fixed seed.
samples = [
    group.sample(n=IMAGES_PER_CLASS, random_state=SEED)
    for _, group in df_small.groupby("class_idx")
]
df_sampled = pd.concat(samples, ignore_index=True)

# Start from a clean output tree so re-runs never mix old and new files.
out_root = Path("/content/reduced_celeba")
if out_root.exists():
    shutil.rmtree(out_root)
out_root.mkdir(parents=True, exist_ok=True)

def cname(i):
    """Return the folder name for class index ``i``, zero-padded to three digits."""
    return "person_{:03d}".format(i)
# One output folder per class index.
for class_idx in range(TARGET_CLASSES):
    (out_root / cname(class_idx)).mkdir(parents=True, exist_ok=True)

# Copy every sampled image into the folder of its class.
for row in df_sampled.itertuples(index=False):
    src = img_dir / row.filename
    dst = out_root / cname(int(row.class_idx)) / row.filename
    shutil.copy2(src, dst)

# Persist the folder-name -> original identity id mapping next to the data.
mapping = {cname(idx): int(identity) for identity, idx in id_map.items()}
with open("/content/class_mapping.json","w") as f:
    json.dump(mapping, f, indent=2)

n_copied = len(list(out_root.rglob("*.jpg")))
print("✅ reduced_celeba total jpg:", n_copied)

# 7) Build balanced split
split_root = Path("/content/reduced_celeba_split")
if split_root.exists():
    shutil.rmtree(split_root)
train_root = split_root / "train"
val_root = split_root / "val"
for root in (train_root, val_root):
    root.mkdir(parents=True, exist_ok=True)

# Number of images every class must provide to fill both partitions.
need = TRAIN_PER_CLASS + VAL_PER_CLASS

class_dirs = sorted(d for d in out_root.iterdir() if d.is_dir())
for cls_dir in class_dirs:
    # Sort before shuffling so the result depends only on the RNG state,
    # not on the order in which the filesystem lists the files.
    imgs = sorted(cls_dir.glob("*.jpg"))
    random.shuffle(imgs)
    if len(imgs) < need:
        raise ValueError(f"❌ {cls_dir.name} has only {len(imgs)} images. Need {need}.")

    train_dest = train_root / cls_dir.name
    val_dest = val_root / cls_dir.name
    train_dest.mkdir(parents=True, exist_ok=True)
    val_dest.mkdir(parents=True, exist_ok=True)

    # The first TRAIN_PER_CLASS shuffled images go to train, the next
    # VAL_PER_CLASS to val; the two slices never overlap.
    for p in imgs[:TRAIN_PER_CLASS]:
        shutil.copy2(p, train_dest / p.name)
    for p in imgs[TRAIN_PER_CLASS:need]:
        shutil.copy2(p, val_dest / p.name)

print("✅ split train images:", len(list(train_root.rglob("*.jpg"))))
print("✅ split val images:", len(list(val_root.rglob("*.jpg"))))
print("✅ READY for Task 2 at:", split_root)


✅ img_dir (direct jpg folder): /content/celeba_kaggle/img_align_celeba/img_align_celeba
✅ direct jpg count: 202599
✅ rows matched with existing images: 202599
✅ eligible identities: 3661 with >= 25 images
✅ reduced_celeba total jpg: 250
✅ split train images: 200
✅ split val images: 50
✅ READY for Task 2 at: /content/reduced_celeba_split


Step 5: VGG16 Model Training + Fine-Tuning

In [12]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path

TRAIN_DIR = Path("/content/reduced_celeba_split/train")
VAL_DIR   = Path("/content/reduced_celeba_split/val")

# quick checks
print("TRAIN_DIR exists:", TRAIN_DIR.exists())
print("VAL_DIR exists:", VAL_DIR.exists())
train_imgs = sum(1 for _ in TRAIN_DIR.rglob("*.jpg"))
val_imgs   = sum(1 for _ in VAL_DIR.rglob("*.jpg"))
print("Train jpg:", train_imgs, "| Val jpg:", val_imgs)
!find /content/reduced_celeba_split -maxdepth 2 -type d | head -n 30

if train_imgs == 0 or val_imgs == 0:
    raise ValueError("No images found in train/val. Rebuild reduced_celeba_split first.")

# VGG16 expects 224x224
IMG_SIZE = (224, 224)
BATCH_SIZE = 16
SEED = 42
# Shuffle buffer for the training pipeline; it must be at least as large as
# the training set (200 images in this notebook) for a uniform shuffle.
SHUFFLE_BUFFER = 1000

# Seed Python, NumPy and TensorFlow in one call so weight initialisation,
# dropout and augmentation are repeatable across runs.
keras.utils.set_random_seed(SEED)

# Training images are loaded UNBATCHED: batching happens after cache() below
# so that every epoch can draw a fresh per-sample shuffle.
train_ds = tf.keras.utils.image_dataset_from_directory(
    str(TRAIN_DIR),
    label_mode="categorical",
    image_size=IMG_SIZE,
    batch_size=None,
    shuffle=True,
    seed=SEED,
)

# Validation data keeps a fixed order; it is only used for metrics.
val_ds = tf.keras.utils.image_dataset_from_directory(
    str(VAL_DIR),
    label_mode="categorical",
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# class_names is only available on the dataset returned by the loader, so it
# is read before any further tf.data transformations are applied.
NUM_CLASSES = len(train_ds.class_names)
print("NUM_CLASSES:", NUM_CLASSES)
print("Class names:", train_ds.class_names)

AUTOTUNE = tf.data.AUTOTUNE
# cache() replays exactly the elements it saw on the first pass, in the same
# order. Shuffling therefore has to come AFTER cache(); otherwise every epoch
# would train on identical batches in an identical order.
train_ds = (
    train_ds.cache()
    .shuffle(SHUFFLE_BUFFER, seed=SEED, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)
val_ds = val_ds.cache().prefetch(AUTOTUNE)

# mild augmentation: mirror, slight rotation, slight zoom
aug = keras.Sequential(name="aug")
aug.add(layers.RandomFlip("horizontal"))
aug.add(layers.RandomRotation(0.02))
aug.add(layers.RandomZoom(0.05))

# VGG16 backbone: ImageNet weights, without the classification head
base = keras.applications.VGG16(
    weights="imagenet",
    include_top=False,
    input_shape=(*IMG_SIZE, 3),
)

# Layers before the cut-off index stay frozen; the last quarter is trainable.
n = len(base.layers)
start_unfreeze = int(n * 0.75)  # last 25%
for idx, layer in enumerate(base.layers):
    layer.trainable = idx >= start_unfreeze

print("VGG16 total layers:", n)
print("Unfrozen from index:", start_unfreeze)
print("Trainable layers:", sum(l.trainable for l in base.layers))

# Build model: augmentation -> VGG16 preprocessing -> backbone -> pooled head
inputs = keras.Input(shape=IMG_SIZE + (3,))
x = aug(inputs)
x = keras.applications.vgg16.preprocess_input(x)
# Do not hard-code training=True here. Keras passes the right training flag
# down from fit / evaluate / predict, so the backbone runs in inference mode
# whenever the model is evaluated or served. NOTE(review): VGG16 without its
# top presumably has no dropout/batch-norm, so today's results should not
# change; the fix matters if the backbone is ever swapped for one that does.
x = base(x)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(NUM_CLASSES, activation="softmax")(x)
model2 = keras.Model(inputs, outputs, name="task2_vgg16_finetune")

model2.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),  # small LR
    # Labels are one-hot (label_mode="categorical"), hence the categorical
    # loss and metrics; light label smoothing softens the targets.
    loss=keras.losses.CategoricalCrossentropy(label_smoothing=0.05),
    metrics=[
        keras.metrics.CategoricalAccuracy(name="accuracy"),
        keras.metrics.TopKCategoricalAccuracy(k=5, name="top5_acc"),
    ],
)

# Keep the weights of the epoch with the highest validation accuracy on disk.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    "/content/task2_vgg16_best.keras",
    save_best_only=True,
    monitor="val_accuracy",
    mode="max",
    verbose=1,
)
# Stop after 8 epochs without a validation-accuracy improvement.
early_stop_cb = keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    patience=8,
    restore_best_weights=True,
)
# Halve the learning rate when validation loss stalls for 3 epochs.
reduce_lr_cb = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    patience=3,
    factor=0.5,
    min_lr=1e-7,
    verbose=1,
)
callbacks = [checkpoint_cb, early_stop_cb, reduce_lr_cb]

history2 = model2.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    callbacks=callbacks,
    verbose=1,
)

# Score the trained model on the validation set and show the raw result.
eval_metrics = model2.evaluate(val_ds, verbose=1)
print("Task 2 Eval:", eval_metrics)

# Save the model as it stands at the end of training.
final_path = "/content/task2_vgg16_final.keras"
model2.save(final_path)
print("✅ Saved:", final_path)

# Colab only: send both the best checkpoint and the final model to the browser.
from google.colab import files
for artifact in ("/content/task2_vgg16_best.keras", final_path):
    files.download(artifact)


TRAIN_DIR exists: True
VAL_DIR exists: True
Train jpg: 200 | Val jpg: 50
/content/reduced_celeba_split
/content/reduced_celeba_split/train
/content/reduced_celeba_split/train/person_001
/content/reduced_celeba_split/train/person_006
/content/reduced_celeba_split/train/person_008
/content/reduced_celeba_split/train/person_002
/content/reduced_celeba_split/train/person_007
/content/reduced_celeba_split/train/person_004
/content/reduced_celeba_split/train/person_005
/content/reduced_celeba_split/train/person_000
/content/reduced_celeba_split/train/person_009
/content/reduced_celeba_split/train/person_003
/content/reduced_celeba_split/val
/content/reduced_celeba_split/val/person_001
/content/reduced_celeba_split/val/person_006
/content/reduced_celeba_split/val/person_008
/content/reduced_celeba_split/val/person_002
/content/reduced_celeba_split/val/person_007
/content/reduced_celeba_split/val/person_004
/content/reduced_celeba_split/val/person_005
/content/reduced_celeba_split/val/person_0

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>