Cell: 1 (Install packages)

In [1]:
!pip -q install kaggle mtcnn opencv-python-headless tqdm tensorflow

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25h

Cell: 2 (Create)

In [2]:
import os, json, getpass

username = input("Kaggle username (NOT secret): ").strip()
key = getpass.getpass("Kaggle key (SECRET, hidden): ").strip()

os.makedirs("/root/.kaggle", exist_ok=True)
with open("/root/.kaggle/kaggle.json", "w") as f:
    json.dump({"username": username, "key": key}, f)

!chmod 600 /root/.kaggle/kaggle.json
print("✅ kaggle.json saved.")

Kaggle username (NOT secret): KAGGLE_API_TOKEN
Kaggle key (SECRET, hidden): ··········
✅ kaggle.json saved.


Cell: 3 (Download + unzip CASIA-WebFace)

In [3]:
!mkdir -p data/celeba
!kaggle datasets download -d jessicali9530/celeba-dataset -p data/celeba --unzip
!ls -lah data/celeba | head -n 50

Dataset URL: https://www.kaggle.com/datasets/jessicali9530/celeba-dataset
License(s): other
Downloading celeba-dataset.zip to data/celeba
100% 1.33G/1.33G [00:08<00:00, 119MB/s] 
100% 1.33G/1.33G [00:08<00:00, 170MB/s]
total 42M
drwxr-xr-x 3 root root 4.0K Jan  7 17:55 .
drwxr-xr-x 3 root root 4.0K Jan  7 17:54 ..
drwxr-xr-x 3 root root 4.0K Jan  7 17:55 img_align_celeba
-rw-r--r-- 1 root root  24M Jan  7 17:55 list_attr_celeba.csv
-rw-r--r-- 1 root root 5.2M Jan  7 17:55 list_bbox_celeba.csv
-rw-r--r-- 1 root root 2.8M Jan  7 17:55 list_eval_partition.csv
-rw-r--r-- 1 root root 9.5M Jan  7 17:55 list_landmarks_align_celeba.csv


In [4]:
!kaggle datasets download -d kymo9890/identity-celeba -p data/celeba --unzip
!find data/celeba -maxdepth 3 -type f -iname "*identity*" -o -iname "*ident*"

Dataset URL: https://www.kaggle.com/datasets/kymo9890/identity-celeba
License(s): unknown
Downloading identity-celeba.zip to data/celeba
  0% 0.00/943k [00:00<?, ?B/s]
100% 943k/943k [00:00<00:00, 1.16GB/s]
data/celeba/identity_CelebA.txt


Cell: 4 (Auto-find identity root folder)

In [5]:
import os, glob

img_dir = "data/celeba/img_align_celeba/img_align_celeba"
id_file = "data/celeba/identity_CelebA.txt"

print("Image dir exists:", os.path.isdir(img_dir), img_dir)
print("ID file exists:", os.path.isfile(id_file), id_file)
print("Num images:", len(glob.glob(img_dir + "/*.jpg")))

if not os.path.isfile(id_file):
    raise RuntimeError("❌ identity_CelebA.txt not found. Re-run the identity-celeba download cell.")

Image dir exists: True data/celeba/img_align_celeba/img_align_celeba
ID file exists: True data/celeba/identity_CelebA.txt
Num images: 202599


Cell: 5 (Select 3000 Identities + Build image_paths, labels)

In [6]:
import random
from collections import defaultdict
from tqdm import tqdm

random.seed(42)

# Build mapping: person_id -> list of filenames
id_to_imgs = defaultdict(list)
with open(id_file, "r") as f:
    for line in f:
        fn, pid = line.strip().split()
        id_to_imgs[int(pid)].append(fn)

all_ids = sorted(id_to_imgs.keys())
print("Total identities:", len(all_ids))

N_IDS = 3000
MAX_IMGS_PER_ID = 20

selected_ids = sorted(random.sample(all_ids, N_IDS))
id_to_label = {pid: i for i, pid in enumerate(selected_ids)}

image_paths, labels = [], []
for pid in tqdm(selected_ids):
    files = id_to_imgs[pid]
    random.shuffle(files)
    files = files[:MAX_IMGS_PER_ID]
    for fn in files:
        image_paths.append(os.path.join(img_dir, fn))
        labels.append(id_to_label[pid])

print("Collected images:", len(image_paths))
print("Example:", image_paths[0], labels[0])

Total identities: 10177


100%|██████████| 3000/3000 [00:00<00:00, 25041.57it/s]

Collected images: 48785
Example: data/celeba/img_align_celeba/img_align_celeba/011904.jpg 0





Cell: 6 (Fast Cropping (Center Crop) → data/celeba_crops)

In [7]:
import cv2, os
from tqdm import tqdm

OUT_DIR = "data/celeba_crops"
os.makedirs(OUT_DIR, exist_ok=True)

IMG_SIZE = 160
kept, skipped, already = 0, 0, 0

for path, lab in tqdm(list(zip(image_paths, labels))):
    lab_dir = os.path.join(OUT_DIR, str(lab))
    os.makedirs(lab_dir, exist_ok=True)
    out_path = os.path.join(lab_dir, os.path.basename(path))

    if os.path.exists(out_path):
        already += 1
        continue

    img = cv2.imread(path)
    if img is None:
        skipped += 1
        continue

    h, w = img.shape[:2]
    s = min(h, w)
    y0 = (h - s) // 2
    x0 = (w - s) // 2

    crop = img[y0:y0+s, x0:x0+s]
    crop = cv2.resize(crop, (IMG_SIZE, IMG_SIZE), interpolation=cv2.INTER_AREA)
    cv2.imwrite(out_path, crop)
    kept += 1

print("✅ Kept:", kept, "⚠️ Skipped:", skipped, "↩️ Already:", already)
print("Crops folder:", os.path.abspath(OUT_DIR))

100%|██████████| 48785/48785 [00:49<00:00, 984.69it/s] 

✅ Kept: 48785 ⚠️ Skipped: 0 ↩️ Already: 0
Crops folder: /content/data/celeba_crops





Cell: 7 (Train Model (MobileNetV2 + Fine-tune))

In [10]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

# =========================
# 1) DATASET
# =========================
OUT_DIR = "data/celeba_crops"   # your cropped dataset folder
IMG_SIZE = 160
BATCH_SIZE = 64
SEED = 42

if not os.path.isdir(OUT_DIR):
    raise RuntimeError(f"❌ Crops folder not found: {OUT_DIR}. Run cropping first.")

train_ds = tf.keras.utils.image_dataset_from_directory(
    OUT_DIR,
    labels="inferred",
    label_mode="int",
    image_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    subset="training",
    seed=SEED
)
val_ds = tf.keras.utils.image_dataset_from_directory(
    OUT_DIR,
    labels="inferred",
    label_mode="int",
    image_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    subset="validation",
    seed=SEED
)

NUM_CLASSES = len(train_ds.class_names)   # ✅ must be BEFORE prefetch
print("✅ NUM_CLASSES:", NUM_CLASSES)

train_ds = train_ds.prefetch(tf.data.AUTOTUNE)
val_ds   = val_ds.prefetch(tf.data.AUTOTUNE)

# =========================
# 2) CLASS WEIGHTS
# =========================
counts = np.zeros(NUM_CLASSES, dtype=np.int64)
for _, y in train_ds.unbatch().batch(4096):
    yy = y.numpy()
    for v in yy:
        counts[int(v)] += 1
counts = np.maximum(counts, 1)
weights = (counts.sum() / (NUM_CLASSES * counts)).astype(np.float32)
weights = np.clip(weights, 0.2, 5.0)
class_weight = {i: float(weights[i]) for i in range(NUM_CLASSES)}
print("✅ class_weight min/max:", weights.min(), weights.max())

# =========================
# 3) FOCAL LOSS (sparse)
# =========================
def sparse_focal_loss(gamma=2.0, alpha=0.25):
    def loss_fn(y_true, y_pred):
        y_true = tf.cast(tf.reshape(y_true, [-1]), tf.int32)
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0 - 1e-7)
        idx = tf.stack([tf.range(tf.shape(y_true)[0]), y_true], axis=1)
        p_t = tf.gather_nd(y_pred, idx)
        return tf.reduce_mean(-alpha * tf.pow(1.0 - p_t, gamma) * tf.math.log(p_t))
    return loss_fn

# =========================
# 4) MODEL
# =========================
data_aug = keras.Sequential([
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.08),
    layers.RandomZoom(0.15),
    layers.RandomContrast(0.20),
    layers.RandomBrightness(0.15),
], name="data_aug")

base_model = MobileNetV2(
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    include_top=False,
    weights="imagenet"
)
base_model.trainable = False

inputs = keras.Input(shape=(IMG_SIZE, IMG_SIZE, 3))
x = data_aug(inputs)
x = preprocess_input(x)
x = base_model(x, training=False)
x = layers.GlobalAveragePooling2D()(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.4)(x)
outputs = layers.Dense(NUM_CLASSES, activation="softmax")(x)
model = keras.Model(inputs, outputs)

callbacks = [
    keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=1, verbose=1),
    keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True, verbose=1),
]

# =========================
# 5) TRAIN: PHASE 1
# =========================
EPOCHS_HEAD = 6
EPOCHS_FT = 6

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=5e-4),
    loss=sparse_focal_loss(gamma=2.0, alpha=0.25),
    metrics=["accuracy"]
)

print("✅ Phase 1: training head...")
h1 = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS_HEAD,
               callbacks=callbacks, class_weight=class_weight)

# =========================
# 6) TRAIN: PHASE 2 (fine-tune)
# =========================
base_model.trainable = True
for layer in base_model.layers[:-40]:
    layer.trainable = False

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),
    loss=sparse_focal_loss(gamma=2.0, alpha=0.25),
    metrics=["accuracy"]
)

print("✅ Phase 2: fine-tuning...")
h2 = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS_FT,
               callbacks=callbacks, class_weight=class_weight)

print("✅ Done.")
print("Final val_accuracy:", h2.history["val_accuracy"][-1])
print("Final val_loss:", h2.history["val_loss"][-1])

Found 48785 files belonging to 3000 classes.
Using 39028 files for training.
Found 48785 files belonging to 3000 classes.
Using 9757 files for validation.
✅ NUM_CLASSES: 3000
✅ class_weight min/max: 0.65065 5.0
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_160_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
✅ Phase 1: training head...
Epoch 1/6
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 80ms/step - accuracy: 0.0081 - loss: 1.9132 - val_accuracy: 0.0493 - val_loss: 1.6172 - learning_rate: 5.0000e-04
Epoch 2/6
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 77ms/step - accuracy: 0.0991 - loss: 1.3859 - val_accuracy: 0.0872 - val_loss: 1.4823 - learning_rate: 5.0000e-04
Epoch 3/6
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 78ms/step - accuracy: 0.1997 - loss: 1.0905 - val_accurac

Cell: 8 (Save Weights + Create attendance.csv)

In [15]:
import numpy as np, pandas as pd
from google.colab import files
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

model.save_weights("attendance_weights.weights.h5")
print("✅ Saved: attendance_weights.weights.h5")

images, true_labels = next(iter(val_ds))
x = preprocess_input(images.numpy().astype("float32"))

pred = model.predict(x, verbose=0)
pred_ids = np.argmax(pred, axis=1)
conf = np.max(pred, axis=1)

THRESH = 0.70
rows = []
for i in range(len(pred_ids)):
    predicted = f"ID_{int(pred_ids[i])}" if conf[i] >= THRESH else "UNKNOWN"
    rows.append({
        "sample_index_in_batch": i,
        "true_id": f"ID_{int(true_labels[i])}",
        "predicted_id": predicted,
        "confidence": float(conf[i]),
        "present": "YES" if predicted != "UNKNOWN" else "NO"
    })

df = pd.DataFrame(rows)
df.to_csv("attendance.csv", index=False)
print("✅ Saved: attendance.csv")
display(df.head(20))

files.download("attendance_weights.weights.h5")
files.download("attendance.csv")

✅ Saved: attendance_weights.weights.h5
✅ Saved: attendance.csv


Unnamed: 0,sample_index_in_batch,true_id,predicted_id,confidence,present
0,0,ID_1886,UNKNOWN,0.023807,NO
1,1,ID_1781,UNKNOWN,0.020052,NO
2,2,ID_2839,UNKNOWN,0.019551,NO
3,3,ID_311,UNKNOWN,0.020305,NO
4,4,ID_1898,UNKNOWN,0.019794,NO
5,5,ID_627,UNKNOWN,0.019516,NO
6,6,ID_1308,UNKNOWN,0.02085,NO
7,7,ID_2019,UNKNOWN,0.026406,NO
8,8,ID_170,UNKNOWN,0.020704,NO
9,9,ID_1408,UNKNOWN,0.020883,NO


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Cell: 9 (Face Detection Demo (MTCNN) → detection_demo.csv + Download)

In [18]:
import os, glob, cv2
import pandas as pd
from mtcnn import MTCNN
from google.colab import files

detector = MTCNN()

img_dir = "data/celeba/img_align_celeba/img_align_celeba"
if not os.path.isdir(img_dir):
    raise RuntimeError(f"❌ Image folder not found: {img_dir}")

sample_paths = glob.glob(os.path.join(img_dir, "*.jpg"))[:20]
if not sample_paths:
    raise RuntimeError("❌ No images found in CelebA folder.")

rows = []
for p in sample_paths:
    img_bgr = cv2.imread(p)
    if img_bgr is None:
        continue
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    faces = detector.detect_faces(img_rgb)

    rows.append({
        "file": os.path.basename(p),
        "faces_detected": int(len(faces)),
        "boxes_first3": [f["box"] for f in faces][:3]  # proof
    })

df_det = pd.DataFrame(rows)
df_det.to_csv("detection_demo.csv", index=False)
print("✅ Saved: detection_demo.csv")
display(df_det.head(10))

files.download("detection_demo.csv")

✅ Saved: detection_demo.csv


Unnamed: 0,file,faces_detected,boxes_first3
0,185434.jpg,1,"[[56, 61, 93, 126]]"
1,072200.jpg,1,"[[43, 67, 79, 110]]"
2,157870.jpg,1,"[[45, 69, 81, 109]]"
3,006341.jpg,1,"[[32, 51, 90, 131]]"
4,183683.jpg,1,"[[48, 66, 86, 115]]"
5,001485.jpg,1,"[[37, 77, 89, 108]]"
6,053054.jpg,1,"[[62, 59, 98, 137]]"
7,113729.jpg,1,"[[49, 67, 81, 113]]"
8,100666.jpg,1,"[[39, 65, 90, 121]]"
9,059305.jpg,1,"[[61, 73, 82, 111]]"


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

