In [66]:
pip install face_recognition



In [67]:
pip install pyheif



In [68]:
!pip install scikit-learn



In [69]:
import os

from google.colab import drive
# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')

# Base folder on the mounted drive; the path helpers below join 'training', 'test'
# and 'trained_results' onto it.
# root_path = '/content/drive/MyDrive/SCU/ML/Project/'
root_path = '/content/drive/MyDrive/COEN240/'

def get_training_set_path():
  """Return the 'training' folder located directly under root_path."""
  training_dir = os.path.join(root_path, 'training')
  return training_dir

def get_testing_set_path():
  """Return the 'test' folder located directly under root_path."""
  testing_dir = os.path.join(root_path, 'test')
  return testing_dir

def get_test_labels_path():
  """Return the path of 'labels.txt' inside the testing-set folder."""
  testing_dir = get_testing_set_path()
  return os.path.join(testing_dir, 'labels.txt')

def get_trained_results_path(trained):
  """Pick the location of the saved encodings file.

  Args:
    trained: truthy selects the local '/content/trained_results' file; falsy
      selects 'trained_results' under root_path.

  Returns:
    The chosen path as a string.
  """
  if trained:
    return '/content/trained_results'
  return os.path.join(root_path, 'trained_results')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
# KNN training and inferencing

import numpy as np
from collections import Counter

def cosine_similarity(point1, point2):
    """Return the cosine of the angle between two vectors.

    Computed as dot(point1, point2) divided by the product of the two
    Euclidean norms. No guard is applied for zero-length vectors.
    """
    magnitude_product = np.linalg.norm(point1) * np.linalg.norm(point2)
    return np.dot(point1, point2) / magnitude_product

def cosine_distance(point1, point2):
    """Return the cosine distance, i.e. one minus the cosine similarity."""
    return 1 - cosine_similarity(point1, point2)

def euclidean_distance(point1, point2):
  """Return the Euclidean (L2) norm of point1 - point2."""
  difference = point1 - point2
  return np.linalg.norm(difference)

def knn_inferencing(dataset, dataset_labels, testing_point, k=1):
  """Classify testing_point by majority vote among its k nearest samples.

  Args:
    dataset: indexable collection of training vectors.
    dataset_labels: labels aligned index-by-index with dataset.
    testing_point: the vector to classify.
    k: number of nearest neighbours that vote (default 1).

  Returns:
    The most common label among the k samples with the smallest cosine
    distance to testing_point.
  """
  scored = []
  for index, sample in enumerate(dataset):
    label = dataset_labels[index]
    scored.append((cosine_distance(sample, testing_point), label))

  # list.sort is stable, so equally distant samples keep their dataset order.
  scored.sort(key=lambda pair: pair[0])
  votes = Counter(label for _, label in scored[:k])

  return votes.most_common(1)[0][0]


In [71]:
# SVM Classifier

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

def get_svm_classifier(data_path='/content/trained_data'):
  """Fit a linear SVM on face encodings read from a comma-delimited text file.

  Args:
    data_path: file with one sample per row; every column except the last is a
      numeric encoding value and the last column is the label. Defaults to the
      location that used to be hard-coded here.
      NOTE(review): train() in this notebook saves its encodings to
      get_trained_results_path(...), not to '/content/trained_data' -- pass
      that path explicitly if that file is the intended input.

  Returns:
    (svm_classifier, label_encoder): the fitted classifier and the LabelEncoder
    that maps its numeric predictions back to label strings.
  """
  data = np.loadtxt(data_path, delimiter=',', dtype=str)
  encodings = data[:, :-1].astype(float).tolist()
  labels = data[:, -1].tolist()

  label_encoder = LabelEncoder()
  numeric_labels = label_encoder.fit_transform(labels)

  # Only the training portion of the split is fitted; the held-out 20% is not used here.
  X_train, _, y_train, _ = train_test_split(encodings, numeric_labels, test_size=0.2, random_state=42)

  svm_classifier = SVC(kernel='linear', decision_function_shape='ovr', random_state=42)

  svm_classifier.fit(X_train, y_train)

  return svm_classifier, label_encoder


In [72]:
# Naive Bayesian Classifier

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

def get_nb_classifier(data_path='/content/trained_data'):
  """Fit a Gaussian naive Bayes classifier on face encodings read from a text file.

  Args:
    data_path: comma-delimited file with one sample per row; every column
      except the last is a numeric encoding value and the last column is the
      label. Defaults to the location that used to be hard-coded here.
      NOTE(review): train() in this notebook saves its encodings to
      get_trained_results_path(...), not to '/content/trained_data' -- pass
      that path explicitly if that file is the intended input.

  Returns:
    (nb_classifier, label_encoder): the fitted classifier and the LabelEncoder
    that maps its numeric predictions back to label strings.
  """
  data = np.loadtxt(data_path, delimiter=',', dtype=str)
  encodings = data[:, :-1].astype(float).tolist()
  labels = data[:, -1].tolist()

  label_encoder = LabelEncoder()
  numeric_labels = label_encoder.fit_transform(labels)

  # Only the training portion of the split is fitted; the held-out 10% is not used here.
  X_train, _, y_train, _ = train_test_split(encodings, numeric_labels, test_size=0.1, random_state=42)

  nb_classifier = GaussianNB()

  nb_classifier.fit(X_train, y_train)

  return nb_classifier, label_encoder


In [73]:
# Random Forest Classifier

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

def get_rf_classifier(data_path='/content/trained_data'):
  """Fit a random forest classifier on face encodings read from a text file.

  Args:
    data_path: comma-delimited file with one sample per row; every column
      except the last is a numeric encoding value and the last column is the
      label. Defaults to the location that used to be hard-coded here.
      NOTE(review): train() in this notebook saves its encodings to
      get_trained_results_path(...), not to '/content/trained_data' -- pass
      that path explicitly if that file is the intended input.

  Returns:
    (rf_classifier, label_encoder): the fitted classifier and the LabelEncoder
    that maps its numeric predictions back to label strings.
  """
  data = np.loadtxt(data_path, delimiter=',', dtype=str)
  encodings = data[:, :-1].astype(float).tolist()
  labels = data[:, -1].tolist()

  label_encoder = LabelEncoder()
  numeric_labels = label_encoder.fit_transform(labels)

  # Only the training portion of the split is fitted; the held-out 10% is not used here.
  X_train, _, y_train, _ = train_test_split(encodings, numeric_labels, test_size=0.1, random_state=42)

  rf_classifier = RandomForestClassifier(random_state=42)

  rf_classifier.fit(X_train, y_train)

  return rf_classifier, label_encoder


In [74]:
def train_model(model = 'knn'):
  """Build and fit the classifier selected by name.

  Args:
    model: one of 'knn', 'svm', 'nb' or 'rf'.

  Returns:
    Whatever the selected factory returns (a (classifier, label_encoder) pair)
    for 'svm', 'nb' and 'rf'; the empty string for 'knn' (which has no factory
    in the table) and for unknown names.
  """
  # Map names to the factory functions themselves. The table previously held
  # their names as strings, so calling the looked-up value raised TypeError.
  function_dict = {
    'knn': None,
    'svm': get_svm_classifier,
    'nb': get_nb_classifier,
    'rf': get_rf_classifier
  }

  selected_model = function_dict.get(model)
  if selected_model is None:
    return ""
  return selected_model()

def get_label(trained_model, label_encoder, encoding):
  """Predict the label string for a single encoding.

  Args:
    trained_model: object exposing predict().
    label_encoder: object exposing inverse_transform().
    encoding: one feature vector; it is reshaped into a single-row 2-D array.

  Returns:
    The first element of label_encoder.inverse_transform applied to the
    model's prediction for that row.
  """
  sample = np.array(encoding).reshape(1, -1)
  numeric_prediction = trained_model.predict(sample)[0]
  decoded = label_encoder.inverse_transform([numeric_prediction])
  return decoded[0]

In [75]:
import face_recognition
import cv2

def get_normalized_image(image):
  """Return image converted with cv2.COLOR_BGR2GRAY and resized to 224x224.

  NOTE(review): the conversion code assumes BGR channel order -- confirm
  against how callers load their images.
  """
  target_size = (224, 224)
  return cv2.resize(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), target_size)

def locations_exists(path):
  """Return the result of face_recognition.face_locations for the image at path.

  The detection result itself is returned (not a bool); callers rely on its
  truthiness to decide whether any face was found.
  """
  image = face_recognition.load_image_file(path)
  return face_recognition.face_locations(image)

def get_face_encoding(path):
  """Return the encoding of the first face detected in the image at path.

  Returns:
    The first item of face_recognition.face_encodings for the detected
    locations, or the sentinel list [False] when no face location is found.
  """
  image = face_recognition.load_image_file(path)
  face_locations = face_recognition.face_locations(image)
  if not face_locations:
    return [False]
  return face_recognition.face_encodings(image, face_locations)[0]


In [76]:
import os
import shutil
import pyheif
from PIL import Image

def is_an_image_file(path):
  """Tell whether path is a '.jpeg' file in which a face can be located.

  Returns:
    False when the name does not end in '.jpeg' (case-insensitive); otherwise
    the result of locations_exists(path), which callers use for its truthiness.
  """
  is_jpeg = path.lower().endswith('.jpeg')
  if not is_jpeg:
    return False
  return locations_exists(path)

def convert_heic_to_jpeg(heic_file_path):
  """Replace a HEIC file with a real JPEG stored next to it.

  The image is decoded with pyheif, written as '<same name>.jpeg' in the same
  folder, and the original HEIC file is then deleted. The previous version
  wrote the decoded JPEG to '/content/img' and merely renamed the untouched
  HEIC file to '.jpeg', leaving HEIC bytes behind a JPEG extension.

  Args:
    heic_file_path: path of the HEIC file to convert.
  """
  heif_file = pyheif.read(heic_file_path)

  pil_image = Image.frombytes(
            heif_file.mode,
            heif_file.size,
            heif_file.data,
            "raw",
            heif_file.mode,
            heif_file.stride,
  )
  # JPEG cannot store an alpha channel, so normalise anything that is not RGB.
  if pil_image.mode != "RGB":
    pil_image = pil_image.convert("RGB")

  jpg_file_path = os.path.splitext(heic_file_path)[0] + '.jpeg'
  pil_image.save(jpg_file_path, format="JPEG")

  # Remove the source only after the JPEG was written, and never the file just saved.
  if jpg_file_path != heic_file_path:
    os.remove(heic_file_path)

def correct_image_formats(path):
  """Walk path recursively and hand every '.heic' file to convert_heic_to_jpeg.

  Directories are descended into entry by entry; any non-directory whose name
  ends in '.heic' (case-insensitive) is converted; everything else is ignored.
  """
  if not os.path.isdir(path):
    if path.lower().endswith('.heic'):
      convert_heic_to_jpeg(path)
    return

  for entry in os.listdir(path):
    correct_image_formats(os.path.join(path, entry))

def get_encodings_and_labels(path, train_percentage = 80, *, for_validation = False):
  """Collect face encodings and labels from a folder-per-person dataset.

  Each sub-folder of path is treated as one person; its name becomes the label
  of every image inside it. The folder's file listing is cut at
  train_percentage: the first part is the training portion, the rest the
  validation portion.

  Args:
    path: dataset root containing one folder per person.
    train_percentage: share (0-100) of each folder's listing used for training.
    for_validation: when True return the remaining (validation) portion
      instead of the training portion.

  Returns:
    (encodings, labels): two parallel lists.
  """
  encodings = []
  labels = []
  # Sorted listings make the split reproducible: os.listdir gives no ordering
  # guarantee, so separate training / validation calls could otherwise overlap.
  for name in sorted(os.listdir(path)):
    path_with_name = os.path.join(path, name)
    # Skip dotted entries (files with extensions, hidden folders) and anything
    # that is not a directory. The dot test is on the entry name only: testing
    # the joined path skipped every person whenever a parent folder had a dot.
    if "." in name or not os.path.isdir(path_with_name):
      continue
    file_list = sorted(os.listdir(path_with_name))
    train_until = int((train_percentage / 100) * len(file_list))
    file_list = file_list[train_until:] if for_validation else file_list[:train_until]

    for image_file in file_list:
      path_with_image = os.path.join(path_with_name, image_file)
      if os.path.isdir(path_with_image) or path_with_image.lower().endswith('.heic'):
        continue
      encoding = get_face_encoding(path_with_image)
      # get_face_encoding signals "no face" with the sentinel [False]. Check for
      # exactly that; all(encoding) also rejected encodings holding a 0.0 value.
      no_face_found = len(encoding) == 1 and encoding[0] is False
      if not no_face_found:
        encodings.append(encoding)
        labels.append(name)

  return encodings, labels

def save_encodings_and_labels_to_file(encodings, labels, file_path):
  """Write encodings and labels to file_path as comma-separated text.

  Each row holds one encoding's values followed by its label in the last column.
  """
  table = np.column_stack((encodings, labels))
  np.savetxt(file_path, table, delimiter=',', fmt='%s')

def read_encodings_and_labels_from_file(file_path):
  """Load encodings and labels from a comma-separated text file.

  Args:
    file_path: file whose rows hold numeric encoding values followed by the
      label in the last column.

  Returns:
    (encodings, labels): a list of float lists and a parallel list of label
    strings.
  """
  # ndmin=2 keeps a one-row file two-dimensional; without it loadtxt returns a
  # 1-D array and the column slicing below raises IndexError.
  data = np.loadtxt(file_path, delimiter=',', dtype=str, ndmin=2)
  encodings = data[:, :-1].astype(float).tolist()
  labels = data[:, -1].tolist()
  return encodings, labels

In [77]:
import re

# Module-level flag: train() sets it to True once encodings have been computed, and it is
# passed to get_trained_results_path() to choose which saved-results file is used.
trained = False

def train(train_percentage = 80):
  """Encode the training portion of the dataset and save it to disk.

  Sets the module-level `trained` flag to True, writes the encodings and
  labels to get_trained_results_path(trained) and prints both list lengths.

  Args:
    train_percentage: share (0-100) of each person's images used for training.

  Returns:
    dict with keys 'train_percentage', 'training_set_encodings' and
    'training_set_labels'.
  """
  global trained
  encodings, labels = get_encodings_and_labels(get_training_set_path(), train_percentage)
  trained = True
  results_path = get_trained_results_path(trained)
  save_encodings_and_labels_to_file(encodings, labels, results_path)
  print("training set encodings length: ", len(encodings))
  print("training set labels length: ", len(labels))
  return {
    'train_percentage': train_percentage,
    'training_set_encodings': encodings,
    'training_set_labels': labels,
  }

def validate(train_details):
  """Measure KNN accuracy on the validation portion of the training folders.

  Args:
    train_details: dict as returned by train(), with keys 'train_percentage',
      'training_set_encodings' and 'training_set_labels'.

  Returns:
    Percentage (0-100) of validation encodings whose KNN label equals the
    folder label.

  Raises:
    ValueError: if the validation portion yields no encodings (for example
      train_percentage=100); this used to surface as a bare ZeroDivisionError.
  """
  train_percentage = train_details['train_percentage']
  training_set_encodings = train_details['training_set_encodings']
  training_set_labels = train_details['training_set_labels']
  validation_set_encodings, validation_set_labels = get_encodings_and_labels(get_training_set_path(), train_percentage, for_validation = True)

  if len(validation_set_encodings) == 0:
    raise ValueError("validation set is empty; accuracy is undefined (lower train_percentage or add images)")

  correctly_classified_count = 0
  for test_encoding, expected_label in zip(validation_set_encodings, validation_set_labels):
    knn_label = knn_inferencing(training_set_encodings, training_set_labels, test_encoding)
    if knn_label == expected_label:
      correctly_classified_count += 1

  accuracy = (correctly_classified_count / len(validation_set_encodings)) * 100
  return accuracy

def extract_number_from_path(path):
    """Return the zero-based number encoded in a '<number>_<number>.jpeg' name.

    Args:
        path: string containing a file name such as '12_3.jpeg'.

    Returns:
        The first number of the matched pattern, minus one.

    Raises:
        ValueError: if path contains no '<number>_<number>.jpeg' pattern
            (previously an AttributeError on the failed match).
    """
    # The dot is escaped so it only matches a literal '.', not any character.
    match = re.search(r'(\d+)_\d+\.jpeg', path)
    if match is None:
        raise ValueError("no '<number>_<number>.jpeg' pattern found in path: %r" % (path,))
    return int(match.group(1)) - 1

def extract_names_from_labels(path):
  """Read the person names from a labels file.

  Each non-blank line is split once on whitespace: the first token is ignored
  and the rest of the line (which may itself contain spaces) is the name.

  Args:
    path: path of the labels text file.

  Returns:
    List of names in file order.
  """
  names = []
  with open(path, 'r') as file:
    for line in file:
        stripped = line.strip()
        # Blank lines (e.g. a trailing empty line) used to crash the unpacking below.
        if not stripped:
          continue
        _, name = stripped.split(maxsplit=1)
        names.append(name)
  return names

def get_labels_dict(path):
  """Map each test image's base name to the person name from a labels file.

  Each non-blank line is split once on whitespace: the first token is the image
  file name, the rest of the line is the person name. The image name is cut at
  its first '.' to form the dictionary key.

  Args:
    path: path of the labels text file.

  Returns:
    dict of image base name -> person name.
  """
  labels_dict = {}
  with open(path, 'r') as file:
    for line in file:
      stripped = line.strip()
      # Blank lines carry no label.
      if not stripped:
        continue
      # maxsplit=1 keeps names with spaces intact; a plain split() raised
      # ValueError on them even though test() compares names space-insensitively.
      image_name, person_name = stripped.split(maxsplit=1)
      image_name = image_name.split('.')[0]
      labels_dict[image_name] = person_name

  return labels_dict

def test():
  """Measure KNN accuracy on the labelled test folder.

  Loads the saved training encodings from get_trained_results_path(trained),
  classifies every file in the testing folder that is_an_image_file accepts,
  and compares the prediction with the entry from the labels file, ignoring
  case and spaces.

  Returns:
    Percentage (0-100) of accepted test images that were classified correctly.

  Raises:
    ValueError: if no file in the testing folder is accepted as a test image
      (this used to surface as a bare ZeroDivisionError).
    KeyError: if an accepted image has no entry in the labels file.
  """
  correctly_classified_count = 0
  test_set_count = 0
  training_set_encodings, training_set_labels = read_encodings_and_labels_from_file(get_trained_results_path(trained))
  # The labels file is parsed once; the separate, unused list of names that was
  # also read here has been dropped.
  labels_dict = get_labels_dict(get_test_labels_path())
  testing_set_path = get_testing_set_path()

  for image_file in os.listdir(testing_set_path):
    path_with_image = os.path.join(testing_set_path, image_file)
    if os.path.isdir(path_with_image) or not is_an_image_file(path_with_image):
      continue

    test_set_count += 1
    encoding = get_face_encoding(path_with_image)
    knn_label = knn_inferencing(training_set_encodings, training_set_labels, encoding)
    actual_label = labels_dict[image_file.split('.')[0]]

    if actual_label.lower().replace(" ", "") == knn_label.lower().replace(" ", ""):
      correctly_classified_count += 1

  if test_set_count == 0:
    raise ValueError("no usable test images found in %s; accuracy is undefined" % (testing_set_path,))

  accuracy = (correctly_classified_count / test_set_count) * 100
  return accuracy

# Training runs for about 3-5 minutes.
# To test directly from previously saved encodings, comment out the three lines between
# the dashed markers below. `trained` then stays False, so test() reads the file at
# os.path.join(root_path, 'trained_results'), i.e.
# /content/drive/MyDrive/COEN240/trained_results -- upload the 'trained_results' file
# (zipped along with the code) to that location on the drive first.
# ---------------------- Comment below lines to directly test --------------------------
# --------- MAKE SURE TO UPLOAD THE 'trained_results' FILE TO THE DRIVE ----------------
train_details = train()
accuracy = validate(train_details)
print("validation accuracy: ", accuracy)
# ---------------------- Comment above lines to directly test --------------------------

print("testing accuracy: ", test())

training set encodings length:  166
training set labels length:  166
validation accuracy:  100.0
testing accuracy:  100.0


# New Section