# Import Dependencies

In [490]:
import os
import cv2

import insightface
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt

from tqdm import tqdm
from insightface.app import FaceAnalysis
from insightface.model_zoo import model_zoo
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Create Reference Data

In [491]:
main_dir = "dataset"

train_dir = os.path.join(main_dir, "train")
train_labels_csv = os.path.join(train_dir, "labels.csv")

test_dir = os.path.join(main_dir, "test")
ref_dir = os.path.join(main_dir, "reference_faces")

In [492]:
train_labels_df = pd.read_csv(train_labels_csv)
train_labels_df.head()

Unnamed: 0,filename,emp_id
0,face_0568.jpg,emp016
1,face_0433.jpg,emp014
2,face_1751.jpg,emp004
3,face_0675.jpg,emp028
4,face_0112.jpg,emp001


In [494]:
app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(320, 320), det_thresh=0.2)

Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/seang/.insightface/models/buffalo_l/1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/seang/.insightface/models/buffalo_l/2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/seang/.insightface/models/buffalo_l/det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/seang/.insightface/models/buffalo_l/genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/seang/.insightface/models/buffalo_l/w600k_r50.onnx recognition ['None', 3, 112, 112] 127.5 127.5
set det

In [495]:
def robust_face_detection(img, app, attempts=3):
    """Tries multiple detection strategies"""
    if img is None:
        return None
        
    strategies = [
        lambda x: x,  # Original
        lambda x: cv2.convertScaleAbs(x, alpha=1.5, beta=40),  # Brighten
        lambda x: cv2.equalizeHist(cv2.cvtColor(x, cv2.COLOR_BGR2GRAY))[:,:,np.newaxis].repeat(3,2),  # Hist equal
        lambda x: cv2.GaussianBlur(x, (5,5), 0),  # De-noise
        lambda x: cv2.medianBlur(x, 3),  # Alternative de-noise
    ]
    
    for i in range(attempts):
        try:
            modified = strategies[i](img) if i < len(strategies) else img
            faces = app.get(modified)
            if len(faces) > 0:
                return faces
        except Exception as e:
            print(f"Detection attempt {i+1} failed: {str(e)}")
            continue
    return None

In [496]:
def create_reference_embeddings(ref_dir, ref_labels):
    gallery = {emp_id: [] for emp_id in ref_labels}

    for emp_id in tqdm(gallery.keys(), desc="Creating Embeddings"):
        emp_dir = os.path.join(ref_dir, emp_id)
        if not os.path.isdir(emp_dir):
            continue

        for file in tqdm(sorted(os.listdir(emp_dir)), desc=f"Embedding {emp_id}"):
            if file.lower().endswith('.mp4'):
                continue
            filepath = os.path.join(emp_dir, file)
            try:
                img = cv2.imread(filepath)
                if img is None:
                    continue
                faces = robust_face_detection(img, app)
                if faces:
                    embedding = faces[0].embedding / np.linalg.norm(faces[0].embedding)
                    gallery[emp_id].append(embedding)
                else:
                    print(f"Warning: {filepath} - {len(faces)} faces detected")
            except Exception as e:
                print(f"Error processing {filepath}: {str(e)}")
                continue
        if gallery[emp_id]:
            gallery[emp_id] = np.array(gallery[emp_id])
        else:
            print(f"Warning: No valid faces found for {emp_id}")
            del gallery[emp_id]
    return gallery

ref_embed = create_reference_embeddings(ref_dir, sorted(os.listdir(ref_dir)))

  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
Embedding emp001: 100%|██████████| 13/13 [00:03<00:00,  4.29it/s]
Embedding emp002: 100%|██████████| 13/13 [00:02<00:00,  4.75it/s]t]
Embedding emp003: 100%|██████████| 11/11 [00:02<00:00,  4.45it/s]t]
Embedding emp004: 100%|██████████| 15/15 [00:03<00:00,  4.84it/s]t]
Embedding emp005: 100%|██████████| 14/14 [00:03<00:00,  4.46it/s]t]
Embedding emp006: 100%|██████████| 8/8 [00:01<00:00,  5.28it/s]/it]
Embedding emp007: 100%|██████████| 14/14 [00:03<00:00,  4.53it/s]t]
Embedding emp008: 100%|██████████| 15/15 [00:03<00:00,  4.26it/s]t]
Embedding emp009: 100%|██████████| 17/17 [00:03<00:00,  4.39it/s]t]
Embedding emp010: 100%|██████████| 15/15 [00:04<00:00,  3.75it/s]t]
Embedding emp011: 100%|██████████| 11/11 [00:02<00:00,  4.16it/s]it]
Embedding emp012: 100%|██████████| 14/14 [00:03<00:00,  4.12it/s]it]
Embedding emp013: 100%|██████████| 16/16 [00:04<00:00,  3.80it/s]it]
Embedding emp014: 100%|██████████| 15/15 [00:03<00:00,

In [497]:

def create_train_embeddings(train_dir, labels_df, normalize=True):
    train_data = {"embedding": [], "label": []}
    missing = []
    img_path = os.path.join(train_dir, "images")
    for _, row in tqdm(labels_df.iterrows(), desc="Creating Train Embeddings", total=len(labels_df)):
        filename, emp_id = row
        filepath = os.path.join(img_path, filename)
        img = cv2.imread(filepath)
        faces = robust_face_detection(img, app)
        if faces:
            train_data['embedding'].append(faces[0].embedding)
            train_data['label'].append(emp_id)
        else:
            train_data['embedding'].append(np.random.normal(0, 0.01, 512))
            train_data['label'].append('UNKNOWN')
            missing.append((emp_id, filepath))
    return train_data, missing

train_dict, missing_train = create_train_embeddings(train_dir, train_labels_df)

Creating Train Embeddings: 100%|██████████| 1179/1179 [05:00<00:00,  3.92it/s]


In [498]:
((len(train_labels_df) - len(missing_train)) / len(train_labels_df)) * 100

94.40203562340967

In [499]:
gallery_embed = np.vstack(list(ref_embed.values()))
gallery_labels = []

for id, embedding in ref_embed.items():
    for n in range(len(ref_embed[id])):
        gallery_labels.append(id)

len(gallery_embed), len(gallery_labels)

(452, 452)

In [500]:
all_classes = np.unique(list(ref_embed.keys()) + list(train_dict['label']))
le = LabelEncoder()
le.fit(all_classes)

In [501]:
all_classes

array(['UNKNOWN', 'emp001', 'emp002', 'emp003', 'emp004', 'emp005',
       'emp006', 'emp007', 'emp008', 'emp009', 'emp010', 'emp011',
       'emp012', 'emp013', 'emp014', 'emp015', 'emp016', 'emp017',
       'emp018', 'emp019', 'emp020', 'emp021', 'emp022', 'emp023',
       'emp024', 'emp025', 'emp026', 'emp027', 'emp028', 'emp029',
       'emp030', 'emp031', 'emp032', 'emp034', 'emp035'], dtype='<U7')

In [504]:
train_embed = np.array([x/np.linalg.norm(x) for x in train_dict['embedding']])  # Normalized
train_labels = le.transform(train_dict['label'])

X_train, X_valid, y_train, y_valid = train_test_split(
    train_embed, train_labels,
    test_size=0.2,
    random_state=42
)

gallery_embed = np.vstack([emb/np.linalg.norm(emb) for emb in ref_embed.values()])
gallery_labels = le.transform([id for id in ref_embed for _ in range(len(ref_embed[id]))])

In [505]:
X_train = np.vstack([X_train, gallery_embed])
y_train = np.hstack([y_train, gallery_labels])

In [None]:
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.svm import SVC

clf = xgb.XGBClassifier(
    objective="multi:softmax",
    eval_metric="mlogloss",  # Better for probabilities
    num_class=len(le.classes_),
    n_estimators=500,  # Reduced for faster training
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    early_stopping_rounds=20
)

clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

[0]	validation_0-mlogloss:3.38355
[1]	validation_0-mlogloss:3.24045
[2]	validation_0-mlogloss:3.10802
[3]	validation_0-mlogloss:3.00479
[4]	validation_0-mlogloss:2.91282
[5]	validation_0-mlogloss:2.81814
[6]	validation_0-mlogloss:2.74687
[7]	validation_0-mlogloss:2.67309
[8]	validation_0-mlogloss:2.60718
[9]	validation_0-mlogloss:2.55553
[10]	validation_0-mlogloss:2.50529
[11]	validation_0-mlogloss:2.45737
[12]	validation_0-mlogloss:2.41289
[13]	validation_0-mlogloss:2.37233
[14]	validation_0-mlogloss:2.33291
[15]	validation_0-mlogloss:2.29415
[16]	validation_0-mlogloss:2.26067
[17]	validation_0-mlogloss:2.22482
[18]	validation_0-mlogloss:2.19397
[19]	validation_0-mlogloss:2.16191
[20]	validation_0-mlogloss:2.13087
[21]	validation_0-mlogloss:2.09923
[22]	validation_0-mlogloss:2.07275
[23]	validation_0-mlogloss:2.05016
[24]	validation_0-mlogloss:2.02872
[25]	validation_0-mlogloss:2.00799
[26]	validation_0-mlogloss:1.98412
[27]	validation_0-mlogloss:1.95722
[28]	validation_0-mlogloss:1.9

InvalidParameterError: The 'kernel' parameter of SVC must be a str among {'precomputed', 'linear', 'sigmoid', 'poly', 'rbf'} or a callable. Got 'cosine' instead.

In [508]:
svm = SVC(kernel='linear', probability=True, class_weight="balanced")
svm.fit(X_train, y_train)

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [509]:
from sklearn.metrics import accuracy_score

def macro_accuracy(y_true, y_pred):
    classes = np.unique(y_true)
    acc_per_class = []
    for c in classes:
        mask = y_true == c
        acc = accuracy_score(y_true[mask], y_pred[mask])
        acc_per_class.append(acc)
    return np.mean(acc_per_class)

In [522]:
from sklearn.metrics import balanced_accuracy_score, classification_report

def evaluate(model, X, y, name="", threshold=0.1):
    # Get prediction probabilities
    y_probs = model.predict_proba(X)
    
    # Apply threshold to create predictions
    confident_mask = np.max(y_probs, axis=1) >= threshold
    y_pred = np.where(confident_mask,
                     model.classes_[np.argmax(y_probs, axis=1)],
                     le.transform(['UNKNOWN'])[0])  # Map low-confidence to 'unknown'
    
    # Get only classes present in y_true or confident predictions
    present_labels = np.unique(np.concatenate([
        y,
        y_pred[confident_mask]  # Only consider confident predictions for class list
    ]))
    present_classes = le.classes_[present_labels]
    
    print(f"\n{name} Classification Report (Threshold={threshold}):")
    print(classification_report(
        y, y_pred,
        labels=present_labels,
        target_names=present_classes,
        zero_division=0,
        digits=4
    ))
    
    # Calculate macro accuracy considering unknowns as a separate class
    print(f"Macro Accuracy: {macro_accuracy(y, y_pred):.4f}")
    print(f"Unknown Ratio: {1 - confident_mask.mean():.2%}")
    

In [523]:
evaluate(clf, X_valid, y_valid, "XGBoost")


XGBoost Classification Report (Threshold=0.1):
              precision    recall  f1-score   support

     UNKNOWN     0.7222    0.6500    0.6842        20
      emp001     0.6000    1.0000    0.7500         9
      emp002     0.9375    0.9375    0.9375        16
      emp003     0.8000    1.0000    0.8889         4
      emp004     0.8000    0.8889    0.8421         9
      emp006     1.0000    1.0000    1.0000         5
      emp007     1.0000    1.0000    1.0000         1
      emp008     0.0000    0.0000    0.0000         0
      emp009     1.0000    0.3333    0.5000         3
      emp010     0.9375    0.8824    0.9091        17
      emp011     1.0000    1.0000    1.0000         9
      emp012     1.0000    0.8333    0.9091         6
      emp013     0.4000    0.4000    0.4000         5
      emp014     0.9375    0.9375    0.9375        16
      emp015     0.6667    0.6667    0.6667         3
      emp016     0.9474    0.9474    0.9474        19
      emp017     0.0000    0.0000

In [524]:
evaluate(svm, X_valid, y_valid, "SVM")


SVM Classification Report (Threshold=0.1):
              precision    recall  f1-score   support

     UNKNOWN     1.0000    0.8000    0.8889        20
      emp001     0.6154    0.8889    0.7273         9
      emp002     0.7895    0.9375    0.8571        16
      emp003     1.0000    1.0000    1.0000         4
      emp004     1.0000    0.8889    0.9412         9
      emp006     1.0000    1.0000    1.0000         5
      emp007     1.0000    1.0000    1.0000         1
      emp009     0.6667    0.6667    0.6667         3
      emp010     0.9412    0.9412    0.9412        17
      emp011     1.0000    1.0000    1.0000         9
      emp012     0.8333    0.8333    0.8333         6
      emp013     1.0000    0.4000    0.5714         5
      emp014     0.9412    1.0000    0.9697        16
      emp015     0.6667    0.6667    0.6667         3
      emp016     0.9474    0.9474    0.9474        19
      emp017     1.0000    1.0000    1.0000         1
      emp018     0.6667    0.8889    

In [515]:
test_embed, test_filenames, missing_test = [], [], []
test_img_path = os.path.join(test_dir, "images")

for filename in tqdm(sorted(os.listdir(test_img_path)), desc="Creating Test Embeddings"):
    img_path = os.path.join(test_img_path, filename)
    img = cv2.imread(img_path)
    faces = robust_face_detection(img, app)
    test_filenames.append(img_path)
    if faces:
        test_embed.append(faces[0].embedding)
    else:
        test_embed.append(np.random.normal(0, 0.01, 512))
        missing_test.append(img_path)

  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
Creating Test Embeddings: 100%|██████████| 636/636 [02:51<00:00,  3.70it/s]


In [525]:
def thresholded_predict(model, X, classes, threshold=0.1, unknown_label='UNKNOWN'):
    """
    Make predictions using a confidence threshold
    
    Args:
        model: Trained classifier with predict_proba() method
        X: Input features (embeddings)
        classes: Array of class names (e.g., ['emp001', 'emp002', ...])
        threshold: Confidence threshold (0-1)
        unknown_label: Label for low-confidence predictions
    """
    # Get class probabilities
    try:
        probs = model.predict_proba(X)
    except AttributeError:
        raise ValueError("Model must have predict_proba() method")
    
    # Get max probability and corresponding class index
    max_probs = np.max(probs, axis=1)
    pred_indices = np.argmax(probs, axis=1)
    
    # Apply threshold
    predictions = np.where(max_probs >= threshold,
                         classes[pred_indices],
                         unknown_label)
    
    return predictions

test_data = np.array(test_embed)
svm_test_labels = thresholded_predict(svm, test_data, le.classes_)
xgb_test_labels = thresholded_predict(clf, test_data, le.classes_)

In [531]:
svm_test_labels[:10], xgb_test_labels[:10]

(array(['emp019', 'emp014', 'emp016', 'emp016', 'emp002', 'emp019',
        'emp014', 'emp001', 'emp001', 'emp004'], dtype='<U7'),
 array(['emp019', 'emp014', 'emp016', 'emp016', 'emp002', 'emp019',
        'emp019', 'emp010', 'emp001', 'emp004'], dtype='<U7'))

In [533]:
submission = pd.DataFrame()
submission['filename'] = sorted(os.listdir(os.path.join(test_dir, "images")))
submission['employee_id'] = xgb_test_labels
submission['employee_id'] = submission['employee_id'].str.lower()

submission.head()

Unnamed: 0,filename,employee_id
0,face_0002.jpg,emp019
1,face_0005.jpg,emp014
2,face_0009.jpg,emp016
3,face_0012.jpg,emp016
4,face_0014.jpg,emp002


In [534]:
submission.tail()

Unnamed: 0,filename,employee_id
631,face_1802.jpg,emp023
632,face_1807.jpg,emp010
633,face_1812.jpg,emp020
634,face_1813.jpg,emp019
635,face_1815.jpg,emp009


In [None]:
def plot_test_predictions(submission, n=5):
    plt.figure(figsize=(15, 3))
    for i in range(n):
        idx = np.random.randint(len(submission))
        img = cv2.cvtColor(cv2.imread(test_filenames[idx]), cv2.COLOR_BGR2RGB)
        plt.subplot(1, n, i+1)
        plt.imshow(img)
        plt.title(f"{test_filenames[idx].split('/')[-1]} | Pred: {submission.iloc[idx]['employee_id']}")
        plt.axis('off')
    plt.tight_layout()

plot_test_predictions(submission)

In [530]:
submission.to_csv('submission.csv', index=False)