In [1]:
import numpy as np
from sklearn.decomposition import PCA
import pickle
import json

# Load original 512D embeddings
with open("face_embeddings.json", "r") as f:
    data = json.load(f)

embeddings = np.array(data["embeddings"])

# Create and fit PCA model to reduce from 512 to 128 dimensions
pca = PCA(n_components=128)
pca.fit(embeddings)

# Save using your current environment
with open("pca_512_to_128.pkl", "wb") as f:
    pickle.dump(pca, f)

print("✅ PCA model successfully saved!")


✅ PCA model successfully saved!


In [None]:
from modules.pca_reducer import load_pca_model

# Load PCA
pca = load_pca_model("pca_512_to_128.pkl")

# Assume you have: `embedding_512` from InsightFace
embedding_128 = pca.transform([embedding_512])[0]


In [2]:
import json
import numpy as np
from modules.pca_reducer import load_pca_model

# File paths
input_file = "face_embeddings.json"
output_file = "face_embeddings_128.json"
pca_model_path = "pca_512_to_128.pkl"

# Load 512D embeddings
with open(input_file, "r") as f:
    data = json.load(f)
    embeddings_512 = np.array(data["embeddings"])
    labels = data["labels"]

# Load trained PCA model
pca = load_pca_model(pca_model_path)

# Convert to 128D
embeddings_128 = pca.transform(embeddings_512)

# Save to new JSON
output_data = {
    "embeddings": embeddings_128.tolist(),
    "labels": labels
}

with open(output_file, "w") as f:
    json.dump(output_data, f)

print(f"[DONE] Converted {len(embeddings_128)} embeddings to 128D and saved to {output_file}")


[INFO] Loaded trained PCA for 128D reduction.
[DONE] Converted 5004 embeddings to 128D and saved to face_embeddings_128.json


In [5]:
import numpy as np

# Load all float32 values from the .dat file
data = np.fromfile("neuralhash_128x96_seed1.dat", dtype=np.float32)

# Save to text file
np.savetxt("projection_matrix.txt", data, fmt="%.8f")

print(f"✅ Saved {data.size} values to 'projection_matrix.txt'")


✅ Saved 12320 values to 'projection_matrix.txt'


In [6]:
import numpy as np

# Load float32 values
data = np.fromfile("neuralhash_128x96_seed1.dat", dtype=np.float32)

# Reshape to 128×96 matrix
matrix = data.reshape((128, 96))

# Save in matrix format to text
np.savetxt("projection_matrix.txt", matrix, fmt="%.8f")

print(f"✅ Saved {matrix.shape} matrix to 'projection_matrix.txt'")


ValueError: cannot reshape array of size 12320 into shape (128,96)

Problem solved the first 32 values are metadata

In [7]:
import numpy as np

# Load all float32 values
data = np.fromfile("neuralhash_128x96_seed1.dat", dtype=np.float32)

# Skip first 32 values (metadata)
matrix_data = data[32:32 + 128 * 96]

# Reshape into 128×96 matrix
proj_matrix = matrix_data.reshape((128, 96))

# Save to file
np.savetxt("projection_matrix_cleaned.txt", proj_matrix, fmt="%.8f")

print(f"✅ Loaded and saved clean projection matrix of shape {proj_matrix.shape}")


✅ Loaded and saved clean projection matrix of shape (128, 96)


In [2]:
import os
import random
import csv
from glob import glob

dataset_path = "C:\\Users\\ASUS\Desktop\\VGGFace_Dataset" # 🔁 CHANGE this to your dataset folder
output_csv = "test_pairs.csv"

# Get all people folders (non-empty)
people_dirs = [os.path.join(dataset_path, d) for d in os.listdir(dataset_path)
               if os.path.isdir(os.path.join(dataset_path, d)) and len(os.listdir(os.path.join(dataset_path, d))) >= 2]

pairs = []

# Generate Genuine Pairs (same identity)
for person_dir in people_dirs:
    images = glob(os.path.join(person_dir, "*.jpg"))
    if len(images) >= 2:
        img1, img2 = random.sample(images, 2)
        pairs.append((img1, img2, 1))  # label 1 = same person

# Generate Impostor Pairs (different identities)
for _ in range(len(pairs)):
    p1, p2 = random.sample(people_dirs, 2)
    img1_list = glob(os.path.join(p1, "*.jpg"))
    img2_list = glob(os.path.join(p2, "*.jpg"))
    if img1_list and img2_list:
        img1 = random.choice(img1_list)
        img2 = random.choice(img2_list)
        pairs.append((img1, img2, 0))  # label 0 = different people

# Save to CSV
with open(output_csv, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["img1", "img2", "label"])
    writer.writerows(pairs)

print(f"✅ Generated {len(pairs)} pairs and saved to '{output_csv}'")


  dataset_path = "C:\\Users\\ASUS\Desktop\\VGGFace_Dataset" # 🔁 CHANGE this to your dataset folder


✅ Generated 1080 pairs and saved to 'test_pairs.csv'


[INFO] Loaded trained PCA for 128D reduction.


TypeError: Can't convert object to 'str' for 'filename'

In [2]:
import csv
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from modules.embedding_extractor import extract_embedding
from modules.pca_reducer import load_pca_model, reduce_embedding
from modules.hashing import load_projection_matrix, compute_hash_from_embedding
from scipy.spatial.distance import cosine
import cv2

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load PCA and projection matrix
pca = load_pca_model("pca_512_to_128.pkl")
proj = load_projection_matrix("neuralhash_128x96_seed1.dat")

# Load CSV
pairs = []
with open("test_pairs.csv", "r") as f:
    reader = csv.DictReader(f)
    for row in reader:
        pairs.append((row["img1"], row["img2"], int(row["label"])))

cosine_similarities = []
hamming_distances = []
true_labels = []

for img1_path, img2_path, label in pairs:
    img1 = cv2.imread(img1_path)
    img2 = cv2.imread(img2_path)
    if img1 is None or img2 is None:
        print(f"[WARNING] Skipping missing image(s): {img1_path}, {img2_path}")
        continue

    emb1 = extract_embedding(img1)
    emb2 = extract_embedding(img2)

    if emb1 is None or emb2 is None:
        print(f"[WARNING] Skipping undetected face(s): {img1_path}, {img2_path}")
        continue

    # Cosine similarity (higher = more similar)
    cos_sim = 1 - cosine(emb1, emb2)
    cosine_similarities.append(cos_sim)

    # Hash comparison (lower = more similar)
    hash1 = compute_hash_from_embedding(reduce_embedding(pca, emb1), proj)
    hash2 = compute_hash_from_embedding(reduce_embedding(pca, emb2), proj)
    hamming = np.sum(hash1 != hash2)
    hamming_distances.append(hamming)

    true_labels.append(label)

# -----------------------------
# Evaluate cosine similarity
# -----------------------------
cosine_preds = [1 if sim > 0.5 else 0 for sim in cosine_similarities]  # Adjust threshold as needed
print("\n📊 Cosine Similarity Metrics:")
print("Accuracy:", accuracy_score(true_labels, cosine_preds))
print("Precision:", precision_score(true_labels, cosine_preds))
print("Recall:", recall_score(true_labels, cosine_preds))
print("F1 Score:", f1_score(true_labels, cosine_preds))
print("Confusion Matrix:\n", confusion_matrix(true_labels, cosine_preds))


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


[INFO] Loaded trained PCA for 128D reduction.

📊 Cosine Similarity Metrics:
Accuracy: 0.7920978363123237
Precision: 1.0
Recall: 0.5861423220973783
F1 Score: 0.7390791027154664
Confusion Matrix:
 [[529   0]
 [221 313]]


In [4]:
import csv
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from modules.embedding_extractor import extract_embedding
from modules.pca_reducer import load_pca_model, reduce_embedding
from modules.hashing import load_projection_matrix, compute_hash_from_embedding
from scipy.spatial.distance import cosine
import cv2

# Load PCA and projection matrix
pca = load_pca_model("pca_512_to_128.pkl")
proj = load_projection_matrix("neuralhash_128x96_seed1.dat")

# Load CSV
pairs = []
with open("test_pairs.csv", "r") as f:
    reader = csv.DictReader(f)
    for row in reader:
        pairs.append((row["img1"], row["img2"], int(row["label"])))

cosine_similarities = []
hamming_distances = []
true_labels = []

for img1_path, img2_path, label in pairs:
    img1 = cv2.imread(img1_path)
    img2 = cv2.imread(img2_path)
    if img1 is None or img2 is None:
        print(f"[WARNING] Skipping missing image(s): {img1_path}, {img2_path}")
        continue

    emb1 = extract_embedding(img1)
    emb2 = extract_embedding(img2)

    if emb1 is None or emb2 is None:
        print(f"[WARNING] Skipping undetected face(s): {img1_path}, {img2_path}")
        continue

    # Cosine similarity (higher = more similar)
    cos_sim = 1 - cosine(emb1, emb2)
    cosine_similarities.append(cos_sim)

    # Hash comparison (lower = more similar)
    hash1 = compute_hash_from_embedding(reduce_embedding(pca, emb1), proj)
    hash2 = compute_hash_from_embedding(reduce_embedding(pca, emb2), proj)
    hamming = np.sum(hash1 != hash2)
    hamming_distances.append(hamming)

    true_labels.append(label)

# -----------------------------
# Evaluate cosine similarity
# -----------------------------
cosine_preds = [1 if sim > 0.65 else 0 for sim in cosine_similarities]  # Adjust threshold as needed
print("\n📊 Cosine Similarity Metrics:")
print("Accuracy:", accuracy_score(true_labels, cosine_preds))
print("Precision:", precision_score(true_labels, cosine_preds))
print("Recall:", recall_score(true_labels, cosine_preds))
print("F1 Score:", f1_score(true_labels, cosine_preds))
print("Confusion Matrix:\n", confusion_matrix(true_labels, cosine_preds))

# -----------------------------
# Evaluate Hamming distance
# -----------------------------
hamming_preds = [1 if dist < 30 else 0 for dist in hamming_distances]  # Adjust threshold as needed
print("\n📊 Hamming Distance Metrics (96-bit Hashes):")
print("Accuracy:", accuracy_score(true_labels, hamming_preds))
print("Precision:", precision_score(true_labels, hamming_preds))
print("Recall:", recall_score(true_labels, hamming_preds))
print("F1 Score:", f1_score(true_labels, hamming_preds))
print("Confusion Matrix:\n", confusion_matrix(true_labels, hamming_preds))

# -----------------------------
# Optional: ROC AUC for cosine similarity
# -----------------------------
try:
    print("ROC AUC (Cosine):", roc_auc_score(true_labels, cosine_similarities))
except:
    print("Not enough class variance for ROC AUC.")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


[INFO] Loaded trained PCA for 128D reduction.

📊 Cosine Similarity Metrics:
Accuracy: 0.6077140169332079
Precision: 1.0
Recall: 0.21910112359550563
F1 Score: 0.35944700460829493
Confusion Matrix:
 [[529   0]
 [417 117]]

📊 Hamming Distance Metrics (96-bit Hashes):
Accuracy: 0.735653809971778
Precision: 0.996078431372549
Recall: 0.4756554307116105
F1 Score: 0.6438529784537389
Confusion Matrix:
 [[528   1]
 [280 254]]
ROC AUC (Cosine): 0.9780484696586734
