In [6]:
# Kaggle sometimes has protobuf–TensorFlow incompatibility
# This forces protobuf to use Python implementation and avoids crashes
# Force protobuf to use Python implementation (Kaggle stability fix)
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"


# Face Recognition using FaceNet with INT8 Quantization for FPGA Deployment

This project implements a face recognition pipeline using pretrained
FaceNet embeddings and evaluates the impact of INT8 quantization on
classification accuracy. The objective is to design an FPGA-friendly
face recognition system that achieves high accuracy while reducing
computational and memory requirements.

The system separates feature extraction and classification:
- FaceNet is used offline to extract face embeddings.
- Quantized embeddings and linear SVM classification are designed for
  efficient hardware deployment.


In [1]:
# Version-pinned install for Kaggle + Python 3.11 stability
!pip install \
numpy==1.26.4 \
scipy==1.11.4 \
protobuf==3.20.3 \
keras-facenet \
scikit-learn \
opencv-python





Collecting scipy==1.11.4
  Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.15.3
    Uninstalling scipy-1.15.3:
      Successfully uninstalled scipy-1.15.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= 

In [3]:
# Memory cleanup
import gc

# Numerical computation
import numpy as np

# Save and load trained models
import pickle

# Count images per identity
from collections import Counter

# Image loading and preprocessing
import cv2

# FaceNet embedding extractor (TensorFlow backend)
from keras_facenet import FaceNet

# Linear SVM classifier
from sklearn.svm import SVC

# Convert labels to integers
from sklearn.preprocessing import LabelEncoder

# Accuracy metric
from sklearn.metrics import accuracy_score




In [8]:
# Path to LFW deepfunneled dataset on Kaggle
lfw_dir = "/kaggle/input/lfw-dataset/lfw-deepfunneled/lfw-deepfunneled"


## Dataset Description

The Labeled Faces in the Wild (LFW) dataset is used in this project.
It contains face images of different individuals collected under
unconstrained conditions.

To ensure fair evaluation, only identities with at least five images
are considered. This avoids trivial train–test splits and improves the
reliability of accuracy comparisons.


In [27]:
counts = Counter()

# Count number of images for each identity
for person in os.listdir(lfw_dir):
    p_dir = os.path.join(lfw_dir, person)
    if os.path.isdir(p_dir):
        counts[person] = len(os.listdir(p_dir))

# Keep identities with at least 2 images
valid_ids = {p for p, c in counts.items() if c >= 5}



print("Valid identities (>=5 images):", len(valid_ids))

# Limit number of identities to avoid RAM issues
valid_ids = list(valid_ids)[:1000]


Valid identities (>=5 images): 423


In [28]:
def build_dataset_paths(root, valid_persons):
    """
    Returns:
    - img_paths   : list of image file paths
    - labels      : numeric label for each image
    - num_classes : total number of identities
    """
    img_paths = []
    labels = []

    # Sort identities for stable label assignment
    persons = sorted(list(valid_persons))
    person_to_idx = {p: i for i, p in enumerate(persons)}

    for p in persons:
        p_dir = os.path.join(root, p)
        for img in os.listdir(p_dir):
            img_path = os.path.join(p_dir, img)
            if os.path.isfile(img_path):
                img_paths.append(img_path)
                labels.append(person_to_idx[p])

    return img_paths, labels, len(persons)


img_paths, labels, num_classes = build_dataset_paths(lfw_dir, valid_ids)

print("Total images:", len(img_paths))
print("Number of classes:", num_classes)



Total images: 5985
Number of classes: 423


In [11]:
# Loads pretrained FaceNet model (128-D embeddings)
facenet = FaceNet()


In [13]:
def quantize_embedding(e):
    """
    Quantize FP32 embedding to INT8:
    - Scale values to [-127, 127]
    - Cast to int8
    """
    scale = 127.0 / np.max(np.abs(e))
    return (e * scale).astype(np.int8)


In [14]:
# FP32 embeddings
def load_and_preprocess_image(img_path):
    """
    Load image from disk and prepare for FaceNet:
    - OpenCV reads BGR → convert to RGB
    - Resize to 160×160
    """
    img = cv2.imread(img_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (160, 160))
    return img



## Face Embedding Extraction (GPU Stage)

FaceNet is used to extract fixed-length face embeddings from each image.
These embeddings capture discriminative facial features and are used
as inputs to the classifier.

This stage is the only GPU-intensive part of the pipeline and is executed
only once. The extracted embeddings are cached and reused for all
subsequent experiments.


In [29]:
X_fp32 = []   # Full-precision embeddings
X_int8 = []   # Quantized embeddings
y = []        # Labels

for i, img_path in enumerate(img_paths):
    # Load and preprocess image
    img = load_and_preprocess_image(img_path)

    # Extract 128-D FaceNet embedding
    emb = facenet.embeddings([img])[0]

    X_fp32.append(emb)
    X_int8.append(quantize_embedding(emb))
    y.append(labels[i])

    if i % 500 == 0:
        print(f"Processed {i}/{len(img_paths)} images")

# Convert lists to NumPy arrays
X_fp32 = np.array(X_fp32, dtype=np.float32)

# SVM expects float input (values already INT8-constrained)
X_int8 = np.array(X_int8, dtype=np.float32)

y = np.array(y)

gc.collect()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Processed 0/5985 images
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0

25779

## Embedding Representation

Each face image is represented using a 512-dimensional FaceNet
embedding. Two versions of the embeddings are maintained:

- FP32 embeddings for baseline accuracy
- INT8 embeddings obtained through symmetric quantization

The INT8 representation significantly reduces memory usage and is more
suitable for FPGA implementation.


In [30]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
#To evaluate generalization performance, the dataset is split into training and testing subsets using an 80–20 split.

## Train–Test Evaluation Strategy

To evaluate generalization performance, the dataset is split into
training and testing subsets using an 80–20 split. Classifiers are
trained only on the training data and evaluated on unseen test samples.

This ensures a fair comparison between FP32 and INT8 representations.


In [31]:
from sklearn.model_selection import train_test_split

# 80% train, 20% test
X_fp32_tr, X_fp32_te, X_int8_tr, X_int8_te, y_tr, y_te = train_test_split(
    X_fp32, X_int8, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

print("Train samples:", len(y_tr))
print("Test samples :", len(y_te))


Train samples: 4788
Test samples : 1197


In [32]:
from sklearn.svm import SVC

svm_fp32 = SVC(kernel="linear")
svm_fp32.fit(X_fp32_tr, y_tr)

print("FP32 SVM trained")


FP32 SVM trained


In [33]:
# Train linear SVM on INT8-quantized embeddings
svm_int8 = SVC(kernel="linear")
svm_int8.fit(X_int8_tr, y_tr)

print("INT8 SVM trained")


INT8 SVM trained


In [34]:
from sklearn.metrics import accuracy_score

pred_fp32 = svm_fp32.predict(X_fp32_te)
pred_int8 = svm_int8.predict(X_int8_te)

acc_fp32 = accuracy_score(y_te, pred_fp32)
acc_int8 = accuracy_score(y_te, pred_int8)

print("FP32 Test Accuracy :", acc_fp32)
print("INT8 Test Accuracy :", acc_int8)
print("Accuracy Drop      :", acc_fp32 - acc_int8)


FP32 Test Accuracy : 0.9766081871345029
INT8 Test Accuracy : 0.9741019214703425
Accuracy Drop      : 0.0025062656641604564


## FP32 vs INT8 Accuracy Results

The classification accuracy obtained using FP32 and INT8 embeddings is
compared on the test set. The results demonstrate that INT8 quantization
causes only a minimal loss in accuracy while offering significant
benefits in terms of computational efficiency.

This validates the robustness of FaceNet embeddings to reduced numerical
precision.


In [35]:
import pickle
import numpy as np

# Save embeddings (so you never rerun Block 9 again)
np.save("/kaggle/working/X_fp32.npy", X_fp32)
np.save("/kaggle/working/X_int8.npy", X_int8)
np.save("/kaggle/working/y.npy", y_encoded)

# Save trained models
pickle.dump(svm_fp32, open("/kaggle/working/svm_fp32.pkl", "wb"))
pickle.dump(svm_int8, open("/kaggle/working/svm_int8_qat.pkl", "wb"))
pickle.dump(encoder, open("/kaggle/working/label_encoder.pkl", "wb"))

print("Models and embeddings saved")


Models and embeddings saved


In [36]:
import numpy as np

X_fp32 = np.load("/kaggle/working/X_fp32.npy")
X_int8 = np.load("/kaggle/working/X_int8.npy")
y_encoded = np.load("/kaggle/working/y.npy")

print("Embeddings loaded:", X_fp32.shape, X_int8.shape)


Embeddings loaded: (5985, 512) (5985, 512)


## 📌 Notebook Execution Guide (Important)

This notebook is divided into two phases:
1. **One-time embedding extraction (GPU-heavy)**
2. **Reusable training and evaluation (CPU-only)**

Once embeddings are saved, the GPU stage never needs to be rerun.

---

## 🔢 Blocks to Run on First Execution (One-Time)

Run these blocks **only once** to generate and save FaceNet embeddings.

1. **Block 0 — Environment Fix**
   - Sets TensorFlow–protobuf compatibility.

2. **Block 1 — Install Dependencies**
   - Required only once per Kaggle session.

3. **Block 2 — Imports**

4. **Block 3 — Dataset Path**

5. **Block 4 — Filter Valid Identities**
   - Uses ≥5 images per identity.

6. **Block 5 — Build Dataset**

7. **Block 6 — Load FaceNet**

8. **Block 7 — INT8 Quantization Function**

9. **Block 8 — Image Preprocessing**

10. **Block 9 — FaceNet Embedding Extraction (GPU)**
    - Extracts FP32 and INT8 embeddings.
    - **Most expensive step.**
    - Run only once.

11. **Block A5 — Save Embeddings and Models**
    - Saves `.npy` and `.pkl` files for reuse.

---

## 🔁 Blocks to Run on Subsequent Executions (Recommended)

When reopening the notebook **do NOT rerun GPU blocks**.

1. **Block 0 — Environment Fix**
2. **Block 2 — Imports**
3. **Block A6 — Load Cached Embeddings**
   - Loads `X_fp32.npy`, `X_int8.npy`, and `y.npy`
   - No GPU required.

4. **Block A1 — Train/Test Split**
5. **Block A2 — Train FP32 SVM**
6. **Block A3 — Train INT8 SVM**
7. **Block A4 — Accuracy Comparison**
8. *(Optional)* **Block A5 — Save Updated Models**

---

## 🚫 Blocks to Skip After First Run

- **Block 1** — Install dependencies (unless environment reset)
- **Blocks 3–8** — Dataset & preprocessing
- **Block 9** — Embedding extraction (GPU)

---

## 🧠 Key Rule to Remember

> **If embeddings are already saved, skip Block 9 forever.**

---

## ✅ Summary

- FaceNet embedding extraction is a **one-time GPU operation**
- All later experiments reuse cached embeddings
- FP32 vs INT8 comparison runs entirely on CPU
- This design minimizes runtime and GPU usage and is FPGA-friendly



## FPGA-Oriented Design

To enable deployment on FPGA hardware, only the post-embedding
classification stage is mapped to hardware. FaceNet embedding extraction
is performed offline.

The FPGA design includes:
- INT8 quantization
- Linear SVM inference using fixed-point arithmetic
- One-vs-rest decision logic
- Threshold-based unknown face rejection

This design minimizes hardware complexity while maintaining high
recognition accuracy.


## Conclusion and Future Work

This project demonstrates that INT8 quantization of FaceNet embeddings
results in only a negligible accuracy drop (~0.25%) compared to FP32
representations. The proposed pipeline is well suited for FPGA-based
deployment due to its reliance on simple fixed-point operations.

Future work includes implementing the SVM inference logic using HLS or
RTL on an FPGA and evaluating latency and resource utilization on
hardware.
