## Part 1:  Preparing the CelebA Dataset for a Known vs. Unknown Face Recognition Task

In this part of the project, we will prepare the CelebA face dataset for a classification task in which the model must decide if the face belongs to a known or unknown individual. Given that the dataset contains over 200 thousand faces with each face showing up anywhere from only a few times to 30+, we will be selecting identities of "known" individuals based on a list that only contains the IDs of individuals with 30 or more appearances.

1. We read from the `identity_CelebA.txt` file to map each image filename to an identity ID. This gives us the necessary labels for determining which images correspond to which person.

2. We create a subset of identities where each chosen ID appears at least 30 times within the dataset. The rest of the identities are excluded from the known-class pool.



In [1]:
import pandas as pd
import random
import numpy as np

random.seed(2)

df = pd.read_csv("data/identity_CelebA.txt", sep = " ", header = None, names=["filename", "id"])
counts = df["id"].value_counts()
possible_celebs = counts[counts >= 30].index.tolist()
# print(possible_celebs)

#Uncomment the print statement below to see a list of celebrity IDs that appear 30 or more times within the dataset.
#print("Possible celebrity IDs: ", possible_celebs) 

known_celebs = random.sample(possible_celebs, 10)
# print(known_celebs)

# creating a list of possible celebs with the filename still associated
celebs_with_rows = df[df["id"].isin(possible_celebs)]
# print(celebs_with_rows)

# known celebs with row data
known_with_rows = df[df["id"].isin(known_celebs)]
print(known_with_rows)

# One-hot encode the id column
one_hot_labels = pd.get_dummies(known_with_rows["id"], prefix="id")
print(one_hot_labels)

known_with_onehot = pd.concat([known_with_rows, one_hot_labels], axis=1)
print(known_with_onehot)

          filename    id
1440    001441.jpg  8693
5186    005187.jpg  8693
8578    008579.jpg  4865
9014    009015.jpg  6658
14664   014665.jpg  6658
...            ...   ...
202239  202240.jpg  8774
202320  202321.jpg  8495
202336  202337.jpg  8774
202423  202424.jpg  8774
202552  202553.jpg  8774

[300 rows x 2 columns]
        id_1899  id_2987  id_4865  id_5218  id_6658  id_6992  id_8495  \
1440      False    False    False    False    False    False    False   
5186      False    False    False    False    False    False    False   
8578      False    False     True    False    False    False    False   
9014      False    False    False    False     True    False    False   
14664     False    False    False    False     True    False    False   
...         ...      ...      ...      ...      ...      ...      ...   
202239    False    False    False    False    False    False    False   
202320    False    False    False    False    False    False     True   
202336    False    

In [2]:
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
from PIL import Image
import os

# pretrained model
model = resnet50(weights="DEFAULT")
model = torch.nn.Sequential(*list(model.children())[:-1])  
model.eval()

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

def extract_feature(img_path):
    img = Image.open(img_path).convert("RGB")
    x = transform(img).unsqueeze(0)
    with torch.no_grad():
        feat = model(x).squeeze().numpy()  
    return feat

features = []

for _, row in known_with_onehot.iterrows():
    filename = row["filename"]
    img_path = os.path.join("data/img_align_celeba", filename)

    feature_vec = extract_feature(img_path)

    features.append({
        "filename": filename,
        "id": row["id"],
        "feature": feature_vec,
        **{col: row[col] for col in one_hot_labels.columns}
    })

feature_df = pd.DataFrame(features)
print(feature_df.head())


     filename    id                                            feature  \
0  001441.jpg  8693  [0.05619888, 0.0003232664, 0.07154604, 0.0, 0....   
1  005187.jpg  8693  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.15506756...   
2  008579.jpg  4865  [0.0, 0.0, 0.04593757, 0.0, 0.15271096, 0.0039...   
3  009015.jpg  6658  [0.0057405573, 0.004871996, 0.2739114, 0.34247...   
4  014665.jpg  6658  [0.0024639198, 0.061482284, 1.4203888, 0.26132...   

   id_1899  id_2987  id_4865  id_5218  id_6658  id_6992  id_8495  id_8693  \
0    False    False    False    False    False    False    False     True   
1    False    False    False    False    False    False    False     True   
2    False    False     True    False    False    False    False    False   
3    False    False    False    False     True    False    False    False   
4    False    False    False    False     True    False    False    False   

   id_8774  id_9755  
0    False    False  
1    False    False  
2    False    False  
3   

In [3]:
from sklearn.model_selection import train_test_split

X = np.stack(feature_df["feature"].values)

y = feature_df["id"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)



In [4]:
from sklearn.svm import SVC

clf = SVC(kernel="linear", probability=True)
clf.fit(X_train, y_train)


0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [5]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = clf.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Test Accuracy: 0.7666666666666667

Classification Report:
               precision    recall  f1-score   support

        1899       0.67      1.00      0.80         6
        2987       0.86      1.00      0.92         6
        4865       1.00      0.67      0.80         6
        5218       0.83      0.83      0.83         6
        6658       0.71      0.83      0.77         6
        6992       0.60      0.50      0.55         6
        8495       0.80      0.67      0.73         6
        8693       1.00      0.67      0.80         6
        8774       0.56      0.83      0.67         6
        9755       1.00      0.67      0.80         6

    accuracy                           0.77        60
   macro avg       0.80      0.77      0.77        60
weighted avg       0.80      0.77      0.77        60



In [6]:
import numpy as np

probs = clf.predict_proba(X_test)

def top_k_accuracy(probs, y_true, k=5):
    top_k = np.argsort(probs, axis=1)[:, -k:]
    return np.mean([y in top_k_row for y, top_k_row in zip(y_true, top_k)])

print("Top-5 Accuracy:", top_k_accuracy(probs, y_test, k=5))


Top-5 Accuracy: 0.0


In [7]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

y_raw = feature_df["id"].values

unique_ids = np.unique(y_raw)
id_to_idx = {old:i for i, old in enumerate(unique_ids)}
y = np.array([id_to_idx[x] for x in y_raw])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

X_train = torch.tensor(X_train, dtype=torch.float32)
X_test  = torch.tensor(X_test,  dtype=torch.float32)

y_train_t = torch.tensor(y_train, dtype=torch.long)
y_test_t  = torch.tensor(y_test,  dtype=torch.long)

num_classes = len(np.unique(y))
input_dim = X_train.shape[1]     # 2048-dim

class PerceptronClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        return self.fc(x)

model = PerceptronClassifier(input_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

epochs = 20
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train_t)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 5 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

with torch.no_grad():
    preds = model(X_test).argmax(dim=1)
    accuracy = (preds == y_test_t).float().mean().item()

print("Perceptron Accuracy:", accuracy)


Epoch 5/20, Loss: 2.0100
Epoch 10/20, Loss: 1.6799
Epoch 15/20, Loss: 1.4004
Epoch 20/20, Loss: 1.1701
Perceptron Accuracy: 0.5666666626930237


In [8]:
y_train_onehot = torch.zeros(len(y_train), num_classes)
y_train_onehot[torch.arange(len(y_train)), y_train] = 1.0

y_test_onehot = torch.zeros(len(y_test), num_classes)
y_test_onehot[torch.arange(len(y_test)), y_test] = 1.0

class LinearRegressionClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        return self.fc(x)

model_lr = LinearRegressionClassifier(input_dim, num_classes)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_lr.parameters(), lr=1e-3)

epochs = 50
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model_lr(X_train)
    loss = criterion(outputs, y_train_onehot)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

with torch.no_grad():
    preds = model_lr(X_test).argmax(dim=1)
    accuracy = (preds == y_test_t).float().mean().item()

print("Linear Regression Classifier Accuracy:", accuracy)


Epoch 10/50, Loss: 0.0522
Epoch 20/50, Loss: 0.0285
Epoch 30/50, Loss: 0.0173
Epoch 40/50, Loss: 0.0114
Epoch 50/50, Loss: 0.0078
Linear Regression Classifier Accuracy: 0.6166666746139526
