# Facenet recreation
A recreation of Google's FaceNet face-embedding model in PyTorch.

In [12]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import os
import time
import itertools
from matplotlib import image
import glob as glob
from PIL import Image

import torch
import torchvision
from torchvision import datasets, models, transforms
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchinfo import summary

# Log the library versions this notebook is running against.
for _label, _lib in (("PyTorch Version: ", torch), ("Torchvision Version: ", torchvision)):
    print(_label, _lib.__version__)
# Detect the best available accelerator and expose it as `device`.
# CUDA is probed first: the warning below points at a hosted-notebook
# "Hardware Accelerator" setting, and such GPUs are exposed through CUDA,
# not MPS. Apple's MPS backend is tried next, with the CPU as the fallback.
# `getattr` guards against PyTorch builds that do not ship `torch.backends.mps`.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

if device.type != "cpu":
    print("Using the GPU!")
else:
    print("WARNING: Could not find GPU! Using CPU only. If you want to enable GPU, please to go Edit > Notebook Settings > Hardware Accelerator and select GPU.")


PyTorch Version:  2.1.1
Torchvision Version:  0.16.1
Using the GPU!


# Build Dataset from facenet
Built following the custom-dataset instructions in the PyTorch docs: https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
Pairs are read to ensure that multiple faces of a single person are included.


In [13]:
# class LabledFacesWild(Dataset):
#     def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
#         label_dict = []
#         with open(annotations_file) as f:
#             for line in f:
#                 print(line)
#                 name, i1, i2 = line.split('\t')
#                 label_dict.append({
#                     "label": name,
#                     "img1": f"{name}/{int(i1):04d}.jpg",
#                     "img2": f"{name}/{int(i2):04d}.jpg"
#                 })
#         self.img_labels = pd.DataFrame.from_records(label_dict)
#         self.img_dir = img_dir
#         self.transform = transform
#         self.target_transform = target_transform

#     def __len__(self):
#         return len(self.img_labels) * 2

#     def __getitem__(self, idx):
#         pair_num = idx // 2
#         img_path = os.path.join(self.img_dir, self.img_labels.iloc[pair_num, (idx % 2) + 1])
#         image = read_image(img_path)
#         label = self.img_labels.iloc[pair_num, 0]
#         if self.transform:
#             image = self.transform(image)
#         if self.target_transform:
#             label = self.target_transform(label)
#         return image, label

In [16]:
class MSFT_Faces(Dataset):
    """Dataset in which one item holds every image of a single identity.

    Images are read from ``{img_dir}/{person}/{i}.png`` for
    ``i in range(img_per_id)``, where ``person`` covers the inclusive range
    ``[start_idx, end_idx]``.
    """

    def __init__(self, img_dir, start_idx, end_idx, img_per_id, transform=None):
        """Store the dataset layout; no files are touched here.

        Args:
            img_dir: Root directory containing one sub-directory per identity.
            start_idx: First identity number served by this dataset (inclusive).
            end_idx: Last identity number served by this dataset (inclusive).
            img_per_id: Number of images loaded for each identity
                (files ``0.png`` .. ``img_per_id - 1.png``).
            transform: Optional callable applied to every image tensor.
        """
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.img_per_id = img_per_id
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of identities (both range ends are inclusive)."""
        return self.end_idx - self.start_idx + 1

    def __getitem__(self, idx):
        """Load all images of the ``idx``-th identity.

        Args:
            idx: Zero-based position inside ``[0, len(self))``.

        Returns:
            A tuple ``(images, label)``: ``images`` stacks the ``img_per_id``
            (optionally transformed) image tensors along a new leading
            dimension, and ``label`` is an int64 tensor of length
            ``img_per_id`` filled with the identity number.

        Raises:
            IndexError: If ``idx`` is outside ``[0, len(self))``.
        """
        # Reject out-of-range positions instead of silently reading folders
        # that belong to identities outside [start_idx, end_idx].
        if not 0 <= idx < len(self):
            raise IndexError(
                f"index {idx} is out of range for a dataset of {len(self)} identities"
            )

        person = idx + self.start_idx
        images = []
        for i in range(self.img_per_id):
            img_path = os.path.join(self.img_dir, f"{person}/{i}.png")
            image = torchvision.io.read_image(img_path)
            # remove alpha channel
            image = image[:3, :, :]
            if self.transform:
                image = self.transform(image)
            images.append(image)

        # One label per image, all equal to the identity number.
        label = torch.full((self.img_per_id,), person, dtype=torch.long)
        # torch.tensor() cannot build a tensor from a list of multi-element
        # tensors; torch.stack joins them along a new first dimension.
        # All images of an identity must therefore share the same shape.
        return torch.stack(images), label

In [17]:
# actually load data

# The dataset reads images with torchvision.io.read_image, which yields uint8
# tensors, while Normalize only accepts floating-point tensors. Convert to
# float first (this also rescales pixel values to [0, 1]); with mean = std = 0.5
# the normalized values then lie in [-1, 1].
transform = transforms.Compose([
        transforms.ConvertImageDtype(torch.float),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
])

# construct the dataloader
# Identities 0-3999 are used for training and 8000-9999 for testing, with
# 40 images per identity; each batch groups 20 identities.
train_data = MSFT_Faces('./MSFT_1M_FACES/', 0, 3999, 40, transform)
test_data = MSFT_Faces('./MSFT_1M_FACES/', 8000, 9999, 40, transform)
train_loader = DataLoader(train_data, batch_size=20, shuffle=False)
test_loader = DataLoader(test_data, batch_size=20, shuffle=False)


# Create Facenet Model
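FaceNet paper: https://arxiv.org/pdf/1503.03832.pdf
GoogLeNet (22-layer Inception network) paper: https://arxiv.org/pdf/1409.4842.pdf
input size = 250x250x3 (note: the NN1 implementation below is sized for 112x112x3 inputs, because its first fully connected layer expects a 256x4x4 feature map)
use ReLU activations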


In [15]:
class Facenet_NN1(nn.Module):
    """Convolutional network that maps image batches to unit-length 128-d embeddings.

    The network is six convolutional stages (each a 1x1 convolution followed
    by a 3x3 convolution, except the first stage, which is a single 7x7
    convolution) followed by three fully connected layers. The first fully
    connected layer expects a flattened 256x4x4 feature map; a 3x112x112
    input produces exactly that size.
    """

    def __init__(self):
        """Create all layers.

        Attribute names and creation order are kept stable so that
        ``state_dict`` keys and seeded weight initialisation do not change.
        """
        super().__init__()

        # Stage 1: 7x7 convolution, max-pool, local response normalisation.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.rnorm1 = nn.LocalResponseNorm(size=64)

        # Stage 2: 1x1 then 3x3 convolution, normalisation, max-pool.
        self.conv2a = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=1, stride=1)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=192, kernel_size=3, stride=1, padding=1)
        self.rnorm2 = nn.LocalResponseNorm(size=192)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Stage 3: 1x1 then 3x3 convolution, max-pool.
        self.conv3a = nn.Conv2d(in_channels=192, out_channels=192, kernel_size=1, stride=1)
        self.conv3 = nn.Conv2d(in_channels=192, out_channels=384, kernel_size=3, stride=1, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Stages 4-6: 1x1 then 3x3 convolution; only stage 6 is pooled.
        self.conv4a = nn.Conv2d(in_channels=384, out_channels=384, kernel_size=1, stride=1)
        self.conv4 = nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, stride=1, padding=1)

        self.conv5a = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=1, stride=1)
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)

        self.conv6a = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=1, stride=1)
        self.conv6 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.pool4 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # TODO: what is the concat layer?
        # TODO: maxout after fc1/fc2 is not implemented yet (earlier attempts
        # used nn.AdaptiveMaxPool2d((32, 1)) and nn.MaxPool2d(2)).
        self.fc1 = nn.Linear(in_features=256 * 4 * 4, out_features=128 * 4 * 4)
        self.fc2 = nn.Linear(in_features=128 * 4 * 4, out_features=128 * 4)

        # Final projection to the 128-dimensional embedding.
        self.fc7128 = nn.Linear(in_features=128 * 4, out_features=128)

    def forward(self, x):
        """Compute embeddings for a batch of images.

        Args:
            x: Image batch with 3 channels; the spatial size must reduce to
                4x4 after the four stride-2 poolings and the stride-2 first
                convolution (e.g. 112x112).

        Returns:
            Tensor with 128 features per sample, L2-normalised along dim 1.
        """
        # Stage 1: conv -> ReLU -> pool -> LRN
        h = F.relu(self.conv1(x))
        h = self.rnorm1(self.pool1(h))

        # Stage 2: 1x1 conv -> 3x3 conv -> ReLU -> LRN -> pool
        h = F.relu(self.conv2(self.conv2a(h)))
        h = self.pool2(self.rnorm2(h))

        # Stage 3: 1x1 conv -> 3x3 conv -> ReLU -> pool
        h = F.relu(self.conv3(self.conv3a(h)))
        h = self.pool3(h)

        # Stages 4-6: 1x1 conv -> 3x3 conv -> ReLU, pooling only after stage 6
        h = F.relu(self.conv4(self.conv4a(h)))
        h = F.relu(self.conv5(self.conv5a(h)))
        h = F.relu(self.conv6(self.conv6a(h)))
        h = self.pool4(h)

        # Flatten everything except the batch dimension for the linear layers.
        flat = torch.flatten(h, start_dim=1)
        embedding = self.fc7128(self.fc2(self.fc1(flat)))

        # normalize the output to a unit vector
        return F.normalize(embedding, p=2.0, dim=1)

In [60]:
# # create inception module
# # https://www.kaggle.com/code/mohamedmustafa/10-implement-inceptionnet-from-scratch-pytorch
# class ConvBlock(nn.Module):
#     def __init__(self, In_Channels, Out_Channels, Kernel_Size, Stride, Padding):
#         super(ConvBlock, self).__init__()
#         self.Conv = nn.Conv2d(in_channels=In_Channels, out_channels=Out_Channels, kernel_size=Kernel_Size, stride=Stride, padding=Padding)
#         self.Batch_Norm = nn.BatchNorm2d(num_features=Out_Channels)
#         self.Activ_Func = nn.ReLU()
    
#     """
#     Now we'll build the forward function which defines the path to input tensor
#     meaning that we tell the tensor the sequence of layers you're going through 
#     Takecare the name of forward function is sensitive so you have to name forward not any thing else
#     """
#     def forward(self, Tensor_Path):
#         Tensor_Path = self.Conv(Tensor_Path)
#         Tensor_Path = self.Batch_Norm(Tensor_Path)
#         Tensor_Path = self.Activ_Func(Tensor_Path)
        
#         return Tensor_Path

# class InceptionModule(nn.Module):
#     def __init__(self, num1x1, num3x3reduce, num3x3, num5x5reduce, num5x5, ) -> None:
#         super().__init__()


# # NN2
# class Facenet_NN2(nn.Module):
#     def __init__(self):
#         super().__init__()
    
#     def forward(self, x):
#         return x

In [11]:
# Instantiate NN1 and print its layer-by-layer summary (output shapes and
# parameter counts) for a batch of ten 3x112x112 inputs.
model = Facenet_NN1()
summary(model, verbose=2, input_size=(10, 3, 112, 112))

Layer (type:depth-idx)                   Output Shape              Param #
Facenet_NN1                              [10, 128]                 --
├─Conv2d: 1-1                            [10, 64, 56, 56]          9,472
│    └─weight                                                      ├─9,408
│    └─bias                                                        └─64
├─MaxPool2d: 1-2                         [10, 64, 28, 28]          --
├─LocalResponseNorm: 1-3                 [10, 64, 28, 28]          --
├─Conv2d: 1-4                            [10, 64, 28, 28]          4,160
│    └─weight                                                      ├─4,096
│    └─bias                                                        └─64
├─Conv2d: 1-5                            [10, 192, 28, 28]         110,784
│    └─weight                                                      ├─110,592
│    └─bias                                                        └─192
├─LocalResponseNorm: 1-6                 [10, 192,

Layer (type:depth-idx)                   Output Shape              Param #
Facenet_NN1                              [10, 128]                 --
├─Conv2d: 1-1                            [10, 64, 56, 56]          9,472
│    └─weight                                                      ├─9,408
│    └─bias                                                        └─64
├─MaxPool2d: 1-2                         [10, 64, 28, 28]          --
├─LocalResponseNorm: 1-3                 [10, 64, 28, 28]          --
├─Conv2d: 1-4                            [10, 64, 28, 28]          4,160
│    └─weight                                                      ├─4,096
│    └─bias                                                        └─64
├─Conv2d: 1-5                            [10, 192, 28, 28]         110,784
│    └─weight                                                      ├─110,592
│    └─bias                                                        └─192
├─LocalResponseNorm: 1-6                 [10, 192,

# Train Facenet Model
Training setup (quoted from the FaceNet paper):
In all our experiments we train the CNN using Stochastic
Gradient Descent (SGD) with standard backprop [8, 11] and
AdaGrad [5]. In most experiments we start with a learning
rate of 0.05 which we lower to finalize the model. The models are initialized from random, similar to [16], and trained
on a CPU cluster for 1,000 to 2,000 hours. The decrease in
the loss (and increase in accuracy) slows down drastically
after 500h of training, but additional training can still significantly improve performance. The margin α is set to 0.2.

In [None]:
# online triplet selection
# https://github.com/adambielski/siamese-triplet
# https://github.com/adambielski/siamese-triplet/blob/master/utils.py



# Test Facenet Model
testing

In [None]:
# todo test