<a href="https://colab.research.google.com/github/RishitSingh10/CV_Projects/blob/main/SiameseNetwork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Face Recognition System using Siamese Network

In [None]:
# Mount Google Drive at /content/drive; the working directory used by this
# notebook (set in the next cell) lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Project folder on the mounted Drive. After the chdir, the relative paths used
# later in the notebook ('./Face_database/', 'embeddings.pkl') resolve against it.
path = "/content/drive/MyDrive/face_recognition/"
os.chdir(path)

In [None]:
%%capture
!pip uninstall -y numpy torchvision torch pillow
!pip install facenet-pytorch torchvision torch pillow numpy==1.26.4
!pip install mtcnn

In [None]:
# Restart session in case of an error on Colab

from facenet_pytorch import InceptionResnetV1

# Create an Inception-ResNet v1 model initialised with the 'vggface2' pretrained
# weights. .eval() puts the module in inference mode before it is used to
# compute face embeddings further below.
resnet = InceptionResnetV1(pretrained='vggface2').eval()

In [None]:
import numpy as np

# Display the NumPy version that is actually loaded (the install cell pins numpy==1.26.4).
np.__version__

### Detecting Faces using MTCNN and Creating Embeddings using InceptionResnetV1

In [None]:
import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt
from mtcnn import MTCNN
from mtcnn.utils.images import load_image
from collections import defaultdict
from tqdm import tqdm
import time

In [None]:
# `device` mirrors PyTorch's CUDA availability; in this notebook it is only displayed.
# NOTE(review): the detector below is pinned to 'CPU:0' and does NOT use `device` —
# confirm that running MTCNN on the CPU is intentional.
device = 'GPU:0' if torch.cuda.is_available() else 'CPU:0'
detector = MTCNN(device='CPU:0')  # MTCNN face detector, explicitly placed on the CPU

In [None]:
# Show the device string selected above.
device

### Calculate Embeddings

In [None]:

def detect_crop(img, detector):
  """Detect every face in `img` and return the cropped face regions.

  Params:
          img: image array indexed as [row, column, ...]
          detector: object exposing `detect_faces(img)` that returns a list of
                    dicts whose 'box' entry is (x, y, width, height)
  Returns:
          List of (crop, bbox) tuples, one per usable detection, or None when
          no usable face is found. `bbox` is the box exactly as reported by
          the detector; `crop` is that box clipped to the image.
  """
  detections = detector.detect_faces(img)

  img_h, img_w = img.shape[:2]
  crops = []
  for detection in detections:
    bbox = detection['box']
    x, y, w, h = bbox[:4]

    # A detector may report a box that extends past the image border. A
    # negative start index would wrap around in NumPy slicing and produce an
    # empty or wrong crop, so clip the box to the image before slicing.
    x1, y1 = max(0, x), max(0, y)
    x2, y2 = min(img_w, x + w), min(img_h, y + h)
    if x2 <= x1 or y2 <= y1:
      continue  # nothing of the box lies inside the image

    crops.append((img[y1:y2, x1:x2], bbox))

  return crops if crops else None


def embed(img1):
  """Compute the InceptionResnetV1 embedding of a single face crop.

  Note: a later cell redefines `embed` to return an L2-normalised NumPy vector.

  Params:
          img1: face crop as an (H, W, C) array; channels beyond the third
                (e.g. alpha) are dropped
  Returns:
          torch.Tensor produced by the global `resnet` for a batch of one image
          (no autograd graph attached)
  """
  img1 = cv2.resize(img1, (160, 160))
  img1 = img1[:,:,:3]  # keep the three colour channels only

  # Per-image whitening. The std is bounded from below so that a constant
  # (e.g. fully black) crop does not cause a division by zero and NaN embeddings.
  img1 = np.asarray(img1, dtype=np.float64)
  mean = img1.mean()
  std = max(float(img1.std()), 1.0 / np.sqrt(img1.size))
  img1 = (img1 - mean) / std

  img1 = torch.tensor(img1, dtype=torch.float32)
  img1 = img1.permute(2, 0, 1)  # (H, W, C) -> (C, H, W)

  # Inference only: without no_grad every stored embedding would keep its
  # autograd graph alive and waste memory.
  with torch.no_grad():
    img_embedding = resnet(img1.unsqueeze(0))

  return img_embedding

# Build the reference embeddings with MTCNN detection and time the whole pass
# over the database (one sub-folder per person under ./Face_database/).

start = time.time()

# List of known people: sub-folders only, stray files in the database are ignored
known_people = [
    entry for entry in os.listdir('./Face_database/')
    if os.path.isdir(os.path.join('./Face_database/', entry))
]


known_embeddings = defaultdict(list)

for people in tqdm(known_people, 'Processing Faces'):
  # Dedicated names so the notebook-level `path` (working directory) is not
  # overwritten and the file name is not confused with the loaded image.
  person_dir = f'./Face_database/{people}/'
  for img_name in os.listdir(person_dir):
    pth = os.path.join(person_dir, img_name)
    img = load_image(pth)
    imgs = detect_crop(img, detector)
    if not imgs:
      # detect_crop returns None when no face is found; indexing it would raise
      tqdm.write(f"No face detected in {pth}; skipping")
      continue
    # Use the first detected face of each reference image
    img_embedding = embed(imgs[0][0])
    known_embeddings[people].append(img_embedding)

end = time.time()

print(f"Time taken = {end - start}")

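### Recalculating Embeddings and Setting Up a Haar Cascade Detector

The Haar cascade detector is faster than MTCNN, so it is used for the live webcam demo further below. Note that the reference embeddings in the next cell are still computed from MTCNN detections.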

In [None]:
# OPTIMIZING CODE USING VECTORIZATION
# Haar cascade frontal-face classifier loaded from the XML file shipped with
# OpenCV (cv2.data.haarcascades); it is the detector used by detect_crop_haar.
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

def detect_crop_mtcnn(img):
  """Run the global MTCNN `detector` on `img`.

  Despite the name, no cropping happens here: the raw detection list is
  returned, or None when the detector finds nothing.
  """
  detections = detector.detect_faces(img)
  return detections if len(detections) > 0 else None



def detect_crop_haar(img):
    """Detect faces in a BGR image with the global Haar cascade.

    Returns the detected rectangles, one (x, y, w, h) entry per face, or None
    when no face is found.
    """
    # The cascade works on a single-channel image, so convert to grayscale first
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    rectangles = face_cascade.detectMultiScale(grayscale, scaleFactor=1.1, minNeighbors=5)

    return rectangles if len(rectangles) != 0 else None

def embed(img1):
  """Compute an L2-normalised InceptionResnetV1 embedding of a face crop.

  Params:
          img1: face crop as an (H, W, C) array; channels beyond the third
                (e.g. the alpha channel of a PNG) are dropped
  Returns:
          1-D NumPy vector with unit L2 norm, so the dot product of two
          embeddings equals their cosine similarity
  """
  img1 = cv2.resize(img1, (160, 160))
  img1 = img1[:,:,:3] # Ignore the transparency channel in case of a PNG image

  # Per-image whitening. The std is bounded from below so that a constant
  # (e.g. fully black) crop does not cause a division by zero and NaN embeddings.
  img1 = np.asarray(img1, dtype=np.float64)
  mean = img1.mean()
  std = max(float(img1.std()), 1.0 / np.sqrt(img1.size))
  img1 = (img1 - mean) / std

  img1 = torch.tensor(img1, dtype=torch.float32)
  img1 = img1.permute(2, 0, 1) # Permute to (C, H, W)

  # Inference only: skip autograd bookkeeping; .cpu() keeps the NumPy
  # conversion valid even if the model is ever moved to a GPU.
  with torch.no_grad():
    img_embedding = resnet(img1.unsqueeze(0)).cpu().numpy()

  # Normalize embedding (guarded so an all-zero output is returned unchanged)
  norm = np.linalg.norm(img_embedding)
  if norm > 0:
    img_embedding = img_embedding / norm

  return img_embedding.squeeze()


start = time.time()

# List of known people: sub-folders only, stray files in the database are ignored
known_people = [
    entry for entry in os.listdir('./Face_database/')
    if os.path.isdir(os.path.join('./Face_database/', entry))
]

# Embeddings of known people: name -> array with one embedding row per image
known_dict = defaultdict(list)


for name in known_people:
  person_dir = f'./Face_database/{name}/'
  person_embeddings = []
  for img_name in os.listdir(person_dir):
    pth = os.path.join(person_dir, img_name)
    img = load_image(pth)
    result = detect_crop_mtcnn(img)
    if not result:
      continue  # no face found in this reference image

    # Use the first detection. Clip the box to the image: a negative start
    # index would wrap around in NumPy slicing and give an empty/wrong crop.
    x, y, w, h = result[0]['box'][:4]
    x1, y1 = max(0, x), max(0, y)
    crop = img[y1:max(0, y + h), x1:max(0, x + w)]
    if crop.size == 0:
      continue
    person_embeddings.append(embed(crop))

  if person_embeddings:
    known_dict[name] = np.array(person_embeddings)
  else:
    # An empty array cannot be matrix-multiplied with an embedding later on,
    # so people without any usable reference image are left out entirely.
    print(f"No usable face found for '{name}'; skipping")

# Keep the name list consistent with the dictionary that is used for matching
known_people = [name for name in known_people if name in known_dict]

end = time.time()

print(f"Time taken = {end - start}")

In [None]:
# Persist the embeddings dictionary to 'embeddings.pkl' with pickle
import pickle

with open('embeddings.pkl', 'wb') as out_file:
  pickle.dump(known_dict, out_file)


# Read the embeddings back from the file written above.
# (pickle can execute code while loading: only load files you created yourself.)
with open('embeddings.pkl', 'rb') as in_file:
    known_dict = pickle.load(in_file)


In [None]:
# import dependencies
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import cv2
import numpy as np
import PIL
import io
import html
import time

In [None]:
# function to convert the JavaScript object into an OpenCV image
def js_to_image(js_reply):
  """
  Turn the data-URL string received from JavaScript into an OpenCV image.

  Params:
          js_reply: string whose base64 payload is the segment after the first comma
  Returns:
          img: OpenCV BGR image
  """
  # everything before the comma is the "data:...;base64" header
  encoded_payload = js_reply.split(',')[1]
  # raw JPEG bytes viewed as a uint8 buffer, as expected by cv2.imdecode
  raw_buffer = np.frombuffer(b64decode(encoded_payload), dtype=np.uint8)
  # flags=1 requests a 3-channel colour decode
  return cv2.imdecode(raw_buffer, flags=1)

In [None]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  """Inject the JavaScript that drives the live webcam stream into the notebook output.

  The script defines `stream_frame(label, imgData)`, which is what `video_frame`
  calls through `eval_js`. On each call it:
    * lazily builds the DOM (status label, <video>, overlay <img>, stop text)
      and requests webcam access on first use,
    * shows `label` in the status line and `imgData` in the overlay image
      (each is skipped when it is an empty string),
    * waits for the next animation frame, draws the video onto a 640x480
      canvas and resolves with a dict holding timing fields ('create', 'show',
      'capture') and 'img', a JPEG data URL of the captured frame.
  Clicking the video, the overlay or the red instruction text sets `shutdown`;
  the next `stream_frame` call then removes the DOM and returns ''.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;

    var pendingResolve = null;
    var shutdown = false;

    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }

    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }

    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);

      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);

      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);

      const instruction = document.createElement('div');
      instruction.innerHTML =
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };

      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);

      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();

      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }

      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }

      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;

      return {'create': preShow - preCreate,
              'show': preCapture - preShow,
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  # Displaying the Javascript object runs it in the cell's output frame
  display(js)

def video_frame(label, bbox):
  """Push the label/overlay to the browser and fetch the next webcam frame.

  Params:
          label: text shown in the status line ('' leaves it unchanged)
          bbox: data URL of the overlay image ('' leaves it unchanged)
  Returns:
          Whatever the JavaScript `stream_frame` resolves to: a dict with the
          captured frame under 'img', or '' once the user stopped the stream.
  """
  import json  # local import: only needed to build the JavaScript call

  # json.dumps yields correctly quoted and escaped JavaScript string literals.
  # Splicing the raw text between double quotes broke (or injected script)
  # as soon as a label contained a quote, backslash or newline.
  call = 'stream_frame({}, {})'.format(json.dumps(str(label)), json.dumps(str(bbox)))
  data = eval_js(call)
  return data

In [None]:
# Function to recognize a face (if it is in known_faces)

def recognize(img, known_embeddings, threshold = 0.3):
  """Identify the person shown in a face crop.

  Note: a later cell redefines `recognize` with a different signature.

  Params:
          img: face crop passed to `embed`
          known_embeddings: mapping name -> sequence of reference embeddings
                            (torch tensors or NumPy arrays)
          threshold: minimum mean cosine similarity required for a match
  Returns:
          (name, score) where score is the mean cosine similarity formatted
          with two decimals, or ('UNKNOWN', 0) when nobody exceeds `threshold`
  """
  scores = {}

  # torch.as_tensor accepts both return types `embed` has in this notebook
  # (torch tensor in the first definition, NumPy vector in the later one).
  with torch.no_grad():
    enc = torch.as_tensor(embed(img), dtype=torch.float32).reshape(1, -1)

    for person, emb_lst in known_embeddings.items():
      # People without reference embeddings cannot be scored; averaging over
      # an empty list previously raised ZeroDivisionError.
      if len(emb_lst) == 0:
        continue
      refs = torch.stack(
          [torch.as_tensor(emb, dtype=torch.float32).reshape(-1) for emb in emb_lst])
      # (1, D) against (N, D) broadcasts to one similarity per reference image
      scores[person] = torch.nn.functional.cosine_similarity(enc, refs).mean().item()

  if not scores:
    return ('UNKNOWN', 0)

  match = max(scores, key=scores.get)
  score = scores[match]

  # Honour the caller-supplied threshold (it used to be hard-coded to 0.3)
  if score > threshold:
    return (match, f'{score:.2f}')

  return ('UNKNOWN', 0)

# Function to perform real-time face recognition through a webcam

def face_recognition(known_embeddings, detector = detector, threshold = 0.75):
  """Run real-time face recognition on the webcam stream (MTCNN variant).

  Note: a later cell redefines `face_recognition` using the Haar cascade.

  Params:
          known_embeddings: mapping name -> reference embeddings, as expected
                            by `recognize`
          detector: face detector handed to `detect_crop`
          threshold: minimum similarity for a match, forwarded to `recognize`
  Returns:
          None; loops until the user stops the stream in the browser.
  """
  # start streaming video from webcam
  video_stream()
  label_html = 'Capturing...'
  bbox_data = ''  # overlay (annotated previous frame) shown on top of the video

  while True:
    # One round trip per iteration: show the previous label/overlay and
    # receive the next frame. An empty reply means the user stopped the demo.
    js_reply = video_frame(label_html, bbox_data)
    if not js_reply:
        break

    # convert JS response to OpenCV Image (BGR)
    img = js_to_image(js_reply["img"])

    # detect_crop yields (crop, bbox) pairs, which is what the loop below
    # unpacks (the Haar helper returns bare rectangles and cannot be used here).
    # Detection runs on an RGB copy so crops use the same channel order as the
    # reference images — presumably RGB from load_image; verify.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    faces = detect_crop(img_rgb, detector)

    if faces is None:
      label_html = 'No Face Detected'
      cv2.putText(img, 'No Face Detected', (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    else:
      labels = []
      for (face, bbox) in faces:
        label, score = recognize(face, known_embeddings, threshold)

        # Draw bounding box and text for this face
        x, y, w, h = bbox[:4]
        cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)
        cv2.putText(img, f'{label} {score}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
        labels.append(f'{label} (Score: {score})')
      label_html = ', '.join(labels)

    # Encode the annotated frame once per iteration; it becomes the overlay
    # sent with the next video_frame call.
    img_bytes = cv2.imencode('.jpg', img)[1].tobytes()
    img_base64 = b64encode(img_bytes).decode('utf-8')
    bbox_data = 'data:image/jpeg;base64,' + img_base64 # Format for imgElement.src

In [None]:
# Optimized Function to recognize a face (if it is in known_faces)

INT_MIN = -1e30  # sentinel lower than any possible similarity score

def recognize(enc, known_dict, threshold = 0.5):
  """Find the known person whose reference embeddings best match `enc`.

  Params:
          enc: 1-D embedding of the query face
          known_dict: mapping name -> 2-D array with one reference embedding per row
          threshold: minimum mean similarity required to accept the best match
  Returns:
          (name, score) with the score formatted to two decimals, or
          ('UNKNOWN', 0) when no person reaches `threshold`
  """
  maxi = INT_MIN
  rec_name = None

  # Iterate over the dictionary that was passed in rather than a notebook
  # global, so the function also works with any other embeddings mapping.
  for name, embeddings in known_dict.items():
    embeddings = np.asarray(embeddings)
    if embeddings.size == 0:
      continue  # no reference images for this person: nothing to compare

    # One dot product per reference row, averaged over the person's images
    score = float(np.mean(embeddings @ enc))

    if score > maxi:
      maxi = score
      rec_name = name

  # Without this check every face, including a stranger's, would be labelled
  # with the closest known name.
  if rec_name is None or maxi < threshold:
    return ('UNKNOWN', 0)

  return (rec_name, f'{maxi:.2f}')

In [None]:

# Function to perform real-time face recognition through a webcam
def face_recognition(known_dict, detector = detector, threshold = 0.75):
  """Run real-time face recognition on the webcam stream (Haar cascade variant).

  Params:
          known_dict: mapping name -> array of reference embeddings, as
                      expected by `recognize`
          detector: unused here (faces are located with the Haar cascade);
                    kept so existing calls keep working
          threshold: minimum similarity for a match, forwarded to `recognize`;
                     the 0.75 default is stricter than `recognize`'s own and
                     may need tuning
  Returns:
          None; loops until the user stops the stream in the browser.
  """
  # start streaming video from webcam
  video_stream()
  label_html = 'Capturing...'
  bbox_data = ''  # overlay (annotated previous frame) shown on top of the video

  while True:
    # One round trip per iteration: show the previous label/overlay and
    # receive the next frame. An empty reply means the user stopped the demo.
    js_reply = video_frame(label_html, bbox_data)
    if not js_reply:
        break

    # convert JS response to OpenCV Image (BGR)
    img = js_to_image(js_reply["img"])

    # Untouched RGB copy used for cropping: annotations are drawn on `img`
    # only, and the crops use the same channel order as the reference images
    # (presumably RGB from load_image — verify).
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    result = detect_crop_haar(img)

    if result is None:
      label_html = 'No Face Detected'
      cv2.putText(img, 'No Face Detected', (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    else:
      labels = []
      for bbox in result: # Iterate through detected faces if multiple
        x, y, w, h = (int(v) for v in bbox)
        crop = img_rgb[y:y+h, x:x+w]
        enc = embed(crop)
        label, score = recognize(enc, known_dict, threshold)

        # Draw bounding box and text for this face
        cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)
        cv2.putText(img, f'{label} {score}', (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
        labels.append(f'{label} (Score: {score})')
      label_html = ', '.join(labels)

    # Encode the annotated frame once per iteration; it becomes the overlay
    # sent with the next video_frame call.
    img_bytes = cv2.imencode('.jpg', img)[1].tobytes()
    img_base64 = b64encode(img_bytes).decode('utf-8')
    bbox_data = 'data:image/jpeg;base64,' + img_base64 # Format for imgElement.src

In [None]:
# Start the live webcam demo with the saved embeddings; it keeps running until
# the user clicks the video or the red stop text in the output.
face_recognition(known_dict)

In [None]:
# NOTE(review): at notebook level `result` is only assigned inside the loop that
# builds `known_dict` (the last MTCNN detection), so this looks like a leftover
# debugging cell — confirm whether it can be removed.
result