In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import io # Input/Output Module
import os # OS interfaces
import cv2 # OpenCV package
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from urllib import request # module for opening HTTP requests
from matplotlib import pyplot as plt # Plotting library
from sklearn.decomposition import PCA
from typing import List, Tuple

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Input data files are available in the read-only "../input/" directory.
# The commented-out variants below are the Kaggle paths; the active code reads
# the same files from a local 'Dataset/' folder.

# train = pd.read_csv(
#     '/kaggle/input/kul-computer-vision-ga-1-2025/train_set.csv', index_col = 0)
# train.index = train.index.rename('id')

# test = pd.read_csv(
#     '/kaggle/input/kul-computer-vision-ga-1-2025/test_set.csv', index_col = 0)
# test.index = test.index.rename('id')

# Metadata tables: the first CSV column becomes the index, which is renamed to 'id'.
train = pd.read_csv('Dataset/train_set.csv', index_col=0)
train.index = train.index.rename('id')

test = pd.read_csv('Dataset/test_set.csv', index_col=0)
test.index = test.index.rename('id')

# read the images as numpy arrays and store in "img" column
# train['img'] = [cv2.cvtColor(np.load('/kaggle/input/kul-computer-vision-ga-1-2025/train/train_{}.npy'.format(index), allow_pickle=False), cv2.COLOR_BGR2RGB) 
#                 for index, row in train.iterrows()]

# test['img'] = [cv2.cvtColor(np.load('/kaggle/input/kul-computer-vision-ga-1-2025/test/test_{}.npy'.format(index), allow_pickle=False), cv2.COLOR_BGR2RGB) 
#                 for index, row in test.iterrows()]

# One .npy file per row id; each array is run through a BGR -> RGB conversion
# (assumes the stored arrays are in OpenCV's BGR order — TODO confirm).
train['img'] = [
    cv2.cvtColor(
        np.load('Dataset/train/train_{}.npy'.format(image_id), allow_pickle=False),
        cv2.COLOR_BGR2RGB)
    for image_id in train.index
]

test['img'] = [
    cv2.cvtColor(
        np.load('Dataset/test/test_{}.npy'.format(image_id), allow_pickle=False),
        cv2.COLOR_BGR2RGB)
    for image_id in test.index
]

train_size = len(train)
test_size = len(test)

"The training set contains {} examples, the test set contains {} examples.".format(train_size, test_size)

*Note: this dataset is a subset of the* [*VGG face dataset*](https://www.robots.ox.ac.uk/~vgg/data/vgg_face/).

## 0.2. A first look
Let's have a look at the data columns and class distribution.

In [None]:
# First training row: identifier, name, image array and class label.
train.head(n=1)

In [None]:
# First test row: only the identifier and the image array (no label).
test.head(n=1)

In [None]:
# Class distribution of the training set: per name, the number of images
# and the (maximum) class label attached to that name.
per_name_summary = {'img': 'count', 'class': 'max'}
train.groupby('name').agg(per_name_summary)

Note that **Jesse is assigned the classification label 1**, and **Mila is assigned the classification label 2**. The dataset also contains 20 images of **look alikes (assigned classification label 0)** and the raw images. 

## 0.3. Preprocess data
### 0.3.1 Example: HAAR face detector
In this example we use the [HAAR feature based cascade classifiers](https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_objdetect/py_face_detection/py_face_detection.html) to detect faces, then the faces are resized so that they all have the same shape. If there are multiple faces in an image, we only take the first one. 

<div class="alert alert-block alert-info"> <b>NOTE:</b> You can write temporary files to <code>/kaggle/temp/</code> or <code>../../tmp</code>, but they won't be saved outside of the current session
</div>


In [None]:
class HAARPreprocessor():
    """Preprocessing pipeline built around HAAR feature based cascade classifiers.

    For every row of a DataFrame, the first face detected in ``row['img']`` is
    cropped and resized to ``face_size``. Rows in which no face is detected
    yield an all-zero (black) placeholder image of the same shape.

    Parameters
    ----------
    path : str
        Directory that holds ``haarcascade_frontalface_default.xml``. The
        directory is created and the cascade downloaded when missing.
    face_size : tuple of int
        Target size handed to ``cv2.resize``, i.e. ``(width, height)``.

    Raises
    ------
    IOError
        If the cascade file cannot be loaded by OpenCV.
    """

    def __init__(self, path, face_size):
        self.face_size = tuple(face_size)
        file_path = os.path.join(path, "haarcascade_frontalface_default.xml")
        if not os.path.exists(file_path):
            # makedirs also creates missing parent directories (os.mkdir would fail)
            if path and not os.path.isdir(path):
                os.makedirs(path, exist_ok=True)
            self.download_model(file_path)

        self.classifier = cv2.CascadeClassifier(file_path)
        # A missing/corrupt XML file produces an empty classifier that only
        # fails later, inside detectMultiScale; fail here with a clear message.
        if self.classifier.empty():
            raise IOError("Could not load HAAR cascade from '{}'".format(file_path))

    def download_model(self, path):
        """Download the frontal-face cascade from the OpenCV GitHub repository to ``path``."""
        url = "https://raw.githubusercontent.com/opencv/opencv/master/data/"\
            "haarcascades/haarcascade_frontalface_default.xml"

        # Read the whole payload before opening the target file, so a failed
        # download does not leave an empty file that looks like a cached model.
        with request.urlopen(url) as r:
            payload = r.read()
        with open(path, 'wb') as f:
            f.write(payload)

    def detect_faces(self, img):
        """Detect all faces in an image.

        Returns the rectangles found by ``detectMultiScale``; each one is
        unpacked by ``extract_faces`` as ``(x, y, w, h)``.
        """
        # The images in this notebook are converted to RGB when they are loaded,
        # so the grayscale conversion has to use the RGB channel order.
        img_gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        return self.classifier.detectMultiScale(
            img_gray,
            scaleFactor=1.2,
            minNeighbors=5,
            minSize=(30, 30),
            flags=cv2.CASCADE_SCALE_IMAGE
        )

    def extract_faces(self, img):
        """Returns all faces (cropped) in an image."""
        faces = self.detect_faces(img)

        return [img[y:y+h, x:x+w] for (x, y, w, h) in faces]

    def preprocess(self, data_row):
        """Return the first face of ``data_row['img']`` resized to ``face_size``.

        When no face is found an all-zero image is returned. (A NaN-filled
        placeholder cannot be used here: ``__call__`` casts to ``int`` and the
        result of casting NaN to an integer is undefined.)
        """
        faces = self.extract_faces(data_row['img'])

        if len(faces) == 0:
            # cv2.resize takes (width, height) but returns (height, width, 3),
            # so the placeholder must use the swapped order to stack cleanly.
            width, height = self.face_size
            return np.zeros((height, width, 3), dtype=np.uint8)

        # only return the first face
        return cv2.resize(faces[0], self.face_size, interpolation = cv2.INTER_AREA)

    def __call__(self, data):
        """Preprocess every row of ``data``; returns an int array of shape (n, height, width, 3)."""
        processed = [self.preprocess(row) for _, row in data.iterrows()]
        if not processed:
            # np.stack raises on an empty list; return an empty batch instead
            width, height = self.face_size
            return np.zeros((0, height, width, 3), dtype=int)
        return np.stack(processed).astype(int)

**Visualise**

Let's plot a few examples.

In [None]:
# parameter to play with: target face size, passed as `face_size` to
# HAARPreprocessor (which hands it to cv2.resize)
FACE_SIZE = (100, 100)

def plot_image_sequence(data, n, imgs_per_row=7):
    """Show the first ``n`` images of ``data`` in a grid.

    Parameters
    ----------
    data : sequence of images
        Anything indexable whose items can be passed to ``imshow``.
    n : int
        Number of images to show; clamped to ``len(data)``.
    imgs_per_row : int
        Maximum number of images per grid row.

    Nothing is drawn when there are no images to show.
    """
    n = min(n, len(data))
    if n <= 0:
        return

    n_cols = max(1, min(imgs_per_row, n))
    n_rows = -(-n // n_cols)  # ceiling division: enough rows for every image

    # squeeze=False always yields a 2-D axes array, so one indexing scheme
    # covers the single-image, single-row and single-column cases.
    f, ax = plt.subplots(n_rows, n_cols, figsize=(10*n_cols, 10*n_rows), squeeze=False)
    for i in range(n):
        ax[i // n_cols, i % n_cols].imshow(data[i])
    plt.show()

    
# Run the HAAR pipeline on both splits; the cascade file is looked up
# (and downloaded if missing) under '../../tmp'.
preprocessor = HAARPreprocessor(path='../../tmp', face_size=FACE_SIZE)

train_X = preprocessor(train)
train_y = train['class'].values
test_X = preprocessor(test)



In [None]:
def show_faces(faces: List[np.ndarray], images_per_row: int = 5):
    """
    Plot a list of images in a grid, titled "Image 1", "Image 2", ...

    Parameters
    ----------
    faces : list of np.ndarray
        Images accepted by ``plt.imshow``.
    images_per_row : int
        Number of grid columns (default 5).
    """
    num_images = len(faces)
    # Ceiling division avoids an extra blank row when the count is an exact
    # multiple of images_per_row; keep at least one row so the figure height
    # stays positive for an empty list.
    num_rows = max(1, -(-num_images // images_per_row))

    plt.figure(figsize=(15, 3 * num_rows))

    for i, img in enumerate(faces):
        plt.subplot(num_rows, images_per_row, i + 1)
        plt.imshow(img)
        plt.axis('off')
        plt.title(f"Image {i + 1}")

    plt.tight_layout()
    plt.show()

In [None]:
# Faces with class label 0 (the look-alikes, Michael and Sarah): show 20 of them, 10 per row.
lookalike_mask = train_y == 0
plot_image_sequence(train_X[lookalike_mask], n=20, imgs_per_row=10)

In [None]:
# Faces with class label 1 (Jesse): show 30 of them, 10 per row.
jesse_mask = train_y == 1
plot_image_sequence(train_X[jesse_mask], n=30, imgs_per_row=10)

In [None]:
# Faces with class label 2 (Mila): show 30 of them, 10 per row.
mila_mask = train_y == 2
plot_image_sequence(train_X[mila_mask], n=30, imgs_per_row=10)

## 0.4. Store Preprocessed data (optional)
<div class="alert alert-block alert-info">
<b>NOTE:</b> You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All". Feel free to use this to store intermediary results.
</div>

In [None]:
# save preprocessed data
# prep_path = '/kaggle/working/prepped_data/'
# if not os.path.exists(prep_path):
#     os.mkdir(prep_path)
    
# np.save(os.path.join(prep_path, 'train_X.npy'), train_X)
# np.save(os.path.join(prep_path, 'train_y.npy'), train_y)
# np.save(os.path.join(prep_path, 'test_X.npy'), test_X)

# load preprocessed data
# prep_path = '/kaggle/working/prepped_data/'
# if not os.path.exists(prep_path):
#     os.mkdir(prep_path)
# train_X = np.load(os.path.join(prep_path, 'train_X.npy'))
# train_y = np.load(os.path.join(prep_path, 'train_y.npy'))
# test_X = np.load(os.path.join(prep_path, 'test_X.npy'))

Now we are ready to rock!

# 1. Feature Representations
## 1.0. Example: Identity feature extractor
Our example feature extractor doesn't actually do anything... It just returns the input:
$$
\forall x : f(x) = x.
$$

It does make for a good placeholder and base class ;).

In [None]:
class IdentityFeatureExtractor:
    """No-op feature extractor: the features are the input itself.

    Calling an instance is the same as calling ``transform``, so subclasses
    only need to override ``transform``.
    """

    def transform(self, X):
        """Return ``X`` unchanged."""
        return X

    def __call__(self, X):
        """Delegate to ``transform``."""
        features = self.transform(X)
        return features

## 1.1. Baseline 1: HOG feature extractor/Scale Invariant Feature Transform
...

In [None]:
class HOGFeatureExtractor(IdentityFeatureExtractor):
    """TODO: this feature extractor is under construction.

    Placeholder for a HOG-based extractor. It only stores its keyword
    arguments; ``transform`` is not implemented yet.
    """

    def __init__(self, **params):
        # Keep the configuration around for the future implementation.
        self.params = params

    def transform(self, X):
        """Not implemented yet; always raises ``NotImplementedError``."""
        raise NotImplementedError

### 1.1.1. t-SNE Plots
...

### 1.1.2. Discussion
...

## 1.2. Baseline 2: PCA feature extractor
...

In [None]:
def plot_variance_explained(faces: List[np.ndarray], whiten: bool) -> None:
    """
    Plot the cumulative explained-variance ratio against the number of PCs.

    A PCA with the maximum possible number of components
    (min(n_samples, n_features)) is fitted on the flattened faces, and the
    running sum of ``explained_variance_ratio_`` is drawn as a line plot.

    faces: images of identical shape; each one is flattened to a row vector.
    whiten: forwarded to the PCA constructor.
    """
    flat_faces = np.array([face.reshape(-1) for face in faces])
    n_samples, n_features = flat_faces.shape[0], flat_faces.shape[1]

    full_pca = PCA(n_components=min(n_samples, n_features), whiten=whiten)
    full_pca.fit(flat_faces)

    running_total = np.cumsum(full_pca.explained_variance_ratio_)
    component_counts = np.arange(1, len(running_total) + 1)

    plt.figure(figsize=(8, 6))
    plt.plot(component_counts, running_total, marker='o', linestyle='-')
    plt.xlabel("Number of Principal Components")
    plt.ylabel("Cumulative Explained Variance")
    plt.title("Explained Variance vs. Number of Principal Components")
    plt.grid(True)
    plt.show()

In [None]:
class PCAFeatureExtractor(IdentityFeatureExtractor):
    """PCA feature extractor wrapping ``sklearn.decomposition.PCA``.

    Every face is flattened to a 1D vector before it is handed to PCA.
    After ``fit`` the mean face and the principal axes are exposed as
    ``mean_face`` and ``eigenvectors``.

    Parameters
    ----------
    n_components : int
        Number of principal components to keep.
    whiten : bool
        Forwarded to ``PCA(whiten=...)``.
    """

    def __init__(self, n_components: int, whiten: bool):
        self.pca = PCA(n_components=n_components, whiten=whiten)
        self.mean_face = None      # set by fit(): pca.mean_
        self.eigenvectors = None   # set by fit(): pca.components_

    @staticmethod
    def _flatten(faces) -> np.ndarray:
        """Stack the faces into a 2D array with one flattened face per row."""
        return np.array([face.flatten() for face in faces])

    def fit(self, faces: List[np.ndarray]):
        """
        Fit PCA on the flattened faces and cache the mean face and eigenvectors.
        """
        self.pca.fit(self._flatten(faces))
        self.mean_face = self.pca.mean_
        self.eigenvectors = self.pca.components_

    def transform(self, faces: List[np.ndarray]) -> np.ndarray:
        """
        Project face images onto the fitted principal components.
        """
        return self.pca.transform(self._flatten(faces))

    def fit_transform(self, faces: List[np.ndarray]) -> np.ndarray:
        """
        Fit PCA on the faces, then return their projections.
        """
        self.fit(faces)
        return self.transform(faces)

    def inverse_transform(self, projections: np.ndarray) -> np.ndarray:
        """
        Map projections back to (flattened) image space.

        Delegates to ``PCA.inverse_transform``, which also undoes the whitening
        scale when ``whiten=True``; a plain ``projections @ eigenvectors + mean``
        is only correct without whitening.
        """
        return self.pca.inverse_transform(projections)

In [None]:
def vectors_to_images(vectors: List[np.ndarray], std_shape: tuple) -> List[np.ndarray]:
    """
    Turn 1D vectors into displayable uint8 images.

    Each vector is reshaped to ``std_shape`` and min-max scaled to [0, 255].

    vectors: iterable of 1D arrays, each with prod(std_shape) elements.
    std_shape: tuple(H, W, C)
        Shape of the target image.

    Returns a list of uint8 arrays of shape ``std_shape``. A vector with no
    value range (all entries equal) becomes an all-zero image.
    """
    images = []
    for vector in vectors:
        image = np.asarray(vector, dtype=float).reshape(tuple(std_shape))
        image_shifted = image - np.min(image)
        peak = np.max(image_shifted)
        if peak > 0:
            image_scaled = 255 * (image_shifted / peak)
        else:
            # Constant input: dividing by a zero range would give NaN, which
            # cannot be cast to uint8 meaningfully.
            image_scaled = np.zeros_like(image_shifted)
        images.append(np.round(image_scaled).astype(np.uint8))
    return images

In [None]:
# Alias for the preprocessed training faces, plus the sizes of the two axes
# that follow the sample axis of train_X.
train_faces = train_X
std_shape = train_X.shape[1:3]

In [None]:
# Cumulative explained variance of the training faces, without whitening.
plot_variance_explained(train_faces, whiten=False)

In [None]:
# Fit a 50-component PCA (no whitening) on all training faces, then map the
# projections back to pixel space.
pca = PCAFeatureExtractor(n_components=50, whiten=False)
projections = pca.fit_transform(train_X)
reconstructed_vectors = pca.inverse_transform(projections)

### 1.2.1. Eigenface Plots
...

In [None]:
# Reshape every principal axis into a 3-channel image and display the grid.
eigenface_shape = (std_shape[0], std_shape[1], 3)
eigenfaces = vectors_to_images(pca.eigenvectors, eigenface_shape)
show_faces(eigenfaces)

### 1.2.2. Feature Space Plots
...

In [None]:
# Reshape the PCA reconstructions into 3-channel images and display them.
reconstruction_shape = (std_shape[0], std_shape[1], 3)
reconstructed_faces = vectors_to_images(reconstructed_vectors, reconstruction_shape)
show_faces(reconstructed_faces)

### 1.2.3. Discussion
...

# 2. Evaluation Metrics
## 2.0. Example: Accuracy
As an example metric we take accuracy. Informally, accuracy is the proportion of correct predictions out of the total number of predictions. It is used a lot in classification, but it certainly has its disadvantages...

In [None]:
from sklearn.metrics import accuracy_score

# 3. Classifiers
## 3.0. Example: The *'not so smart'* classifier
This random classifier is not very complicated. It makes predictions at random, based on the distribution observed in the training set. **It thus assumes** that the class labels of the test set will be distributed similarly to the training set.

In [None]:
class RandomClassificationModel:
    """Baseline classifier that ignores the features and samples labels from
    the class frequencies observed during training."""

    def fit(self, X, y):
        """Record the distinct labels in ``y`` and their relative frequencies.

        Parameters
        ----------
        X : tensor
            Training set (not used)
        y : array
            Training set labels

        Returns
        -------
        self : RandomClassificationModel
        """
        labels, counts = np.unique(y, return_counts=True)
        self.classes = labels
        self.class_ratio = counts / counts.sum()
        return self

    def predict(self, X):
        """Draw one random label per sample in ``X``.

        Parameters
        ----------
        X : tensor
            dataset; only ``X.shape[0]`` is used

        Returns
        -------
        y_star : array
            'Predicted' labels
        """
        # Re-seeding NumPy's global generator on every call makes the draw
        # repeatable; it also resets the global random state for later code.
        np.random.seed(0)
        n_samples = X.shape[0]
        return np.random.choice(self.classes, size=n_samples, p=self.class_ratio)

    def __call__(self, X):
        """Shorthand for ``predict``."""
        return self.predict(X)
    

## 3.1. Decision Tree

In [None]:
class DecisionTreeModel:
    """Wrapper around ``DecisionTreeClassifier`` that flattens every sample
    to a 1D vector before fitting or predicting."""

    def __init__(self, max_depth: int = None, random_state: int = None):
        """
        Store the hyper-parameters and build the underlying classifier.
        """
        self.max_depth = max_depth
        self.random_state = random_state
        self.model = DecisionTreeClassifier(
            max_depth=max_depth,
            random_state=random_state,
        )

    @staticmethod
    def _flatten(faces) -> np.ndarray:
        """Return a 2D array with one flattened sample per row."""
        return np.array([sample.flatten() for sample in faces])

    def fit(self, faces: np.array, labels: np.array):
        """
        Train the decision tree.

        Parameters:
        - faces:
            Array-like of samples; each sample is flattened, so the classifier
            receives an array of shape [n_samples, n_features].
        - labels:
            1D np.array containing the class labels.
        """
        self.model.fit(self._flatten(faces), labels)

    def predict(self, faces: np.array) -> np.array:
        """
        Predict a class label for every sample.

        Parameters:
        - faces:
            Array-like of samples, flattened the same way as in ``fit``.

        Returns:
        - np.array: The predicted labels for all input faces.
        """
        flat_faces = self._flatten(faces)
        return self.model.predict(flat_faces)

## 3.2. Baseline 1: RandomForest

In [None]:
class RandomForestModel:
    """Wrapper around ``RandomForestClassifier`` that flattens every sample
    to a 1D vector before fitting or predicting."""

    def __init__(self, n_estimators: int = 100, max_depth: int = None, random_state: int = None):
        """
        Store the hyper-parameters and build the underlying classifier.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=random_state,
        )

    @staticmethod
    def _flatten(faces) -> np.ndarray:
        """Return a 2D array with one flattened sample per row."""
        return np.array([sample.flatten() for sample in faces])

    def fit(self, faces: np.array, labels: np.array):
        """
        Train the random forest.

        Parameters:
        faces:
            Array-like of samples; each sample is flattened, so the classifier
            receives an array of shape [n_sample, n_feature].
        labels:
            1d np.array containing labels.
        """
        self.model.fit(self._flatten(faces), labels)

    def predict(self, faces: np.array) -> np.array:
        """
        Predict a class label for every sample.

        Parameters:
        faces:
            Array-like of samples, flattened the same way as in ``fit``.

        Return:
            The predicted labels of all the input faces.
        """
        flat_faces = self._flatten(faces)
        return self.model.predict(flat_faces)

# 4. Experiments
<div class="alert alert-block alert-info"> <b>NOTE:</b> Do <i>NOT</i> use this section to keep track of every little change you make in your code! Instead, highlight the most important findings and the major (best) pipelines that you've discovered.  
</div>
<br>

## 4.0. Example: basic pipeline
The basic pipeline takes any input and samples a label based on the class label distribution of the training set. As expected the performance is very poor, predicting approximately 1/4 correctly on the training set. There is a lot of room for improvement but this is left to you ;). 

In [None]:
# Baseline pipeline: identity features fed into the random classifier.
feature_extractor = IdentityFeatureExtractor()
classifier = RandomClassificationModel()

# train the model on the features
classifier.fit(feature_extractor(train_X), train_y)


# model/final pipeline
def model(X):
    """Extract features from ``X`` and return the classifier's labels for them."""
    return classifier(feature_extractor(X))

In [None]:
# Score the pipeline on the same data it was fitted on.
train_y_star = model(train_X)
train_accuracy = accuracy_score(train_y, train_y_star)

"The performance on the training set is {:.2f}. This however, does not tell us much about the actual performance (generalisability).".format(
    train_accuracy)

In [None]:
# predict the labels for the test set with the baseline pipeline
# (`model` wraps the identity feature extractor and the random classifier)
test_y_star = model(test_X)

## 4.1 PCA + DecisionTree

In [None]:
# Simple validation separation: hold out the last N_VAL training faces.
# NOTE(review): the split is positional (not shuffled or stratified) — confirm
# the rows are not ordered by class, otherwise the hold-out is unrepresentative.
N_VAL = 20
val_X, val_y = train_X[-N_VAL:], train_y[-N_VAL:]
train_X_new, train_y_new = train_X[:-N_VAL], train_y[:-N_VAL]

# PCA is fitted on the training part only; the validation faces are just projected.
pca = PCAFeatureExtractor(50, False)
projections = pca.fit_transform(train_X_new)

# Fixed random_state so the score does not depend on the global RNG state
# left behind by whichever cells ran before this one.
cls_dt = DecisionTreeModel(random_state=0)
cls_dt.fit(projections, train_y_new)

# Keep val_X as images; the projected features get their own name.
val_projections = pca.transform(val_X)
predictions = cls_dt.predict(val_projections)
accuracy_score(val_y, predictions)

## 4.2 PCA + RandomForest

In [None]:
# Simple validation separation: hold out the last N_VAL training faces.
# NOTE(review): the split is positional (not shuffled or stratified) — confirm
# the rows are not ordered by class, otherwise the hold-out is unrepresentative.
N_VAL = 20
val_X, val_y = train_X[-N_VAL:], train_y[-N_VAL:]
train_X_new, train_y_new = train_X[:-N_VAL], train_y[:-N_VAL]

# PCA is fitted on the training part only; the validation faces are just projected.
pca = PCAFeatureExtractor(50, False)
projections = pca.fit_transform(train_X_new)

# Fixed random_state so the score does not depend on the global RNG state
# left behind by whichever cells ran before this one.
cls_rf = RandomForestModel(random_state=0)
cls_rf.fit(projections, train_y_new)

# Keep val_X as images; the projected features get their own name.
val_projections = pca.transform(val_X)
predictions = cls_rf.predict(val_projections)
accuracy_score(val_y, predictions)

# 5. Publishing best results

In [None]:
# Submission frame: the test table without its image column, plus the predicted class.
# NOTE(review): test_y_star is produced by the random baseline pipeline, not by
# the PCA + tree/forest experiments — confirm this is the intended submission.
submission = test.copy().drop(columns='img')
submission['class'] = test_y_star

submission

In [None]:
# Write the submission to 'submission.csv' in the current working directory;
# the 'id' index is written as the first column.
submission.to_csv('submission.csv')

# 6. Discussion
...

In summary we contributed the following: 
* 
