Using face + voiceprint + secret spoken phrase (which is used for both voice print and text) we can make a private key unique to you and deterministic.

In [1]:
!pip install deterministic-rsa-keygen mtcnn matplotlib scipy librosa numpy pocketsphinx SpeechRecognition pydub

import urllib.request
data_dir="https://raw.githubusercontent.com/TBD54566975/experimental-face-voice-key/main/data"
def download_file(filename):
    urllib.request.urlretrieve(data_dir + "/" + filename, filename)

download_file("test1.jpg")
download_file("test2.jpg")
download_file("voice_mic1.m4a")
download_file("voice_mic2.m4a")
download_file("voice_mic3.m4a")

download_file("voice_mic4.m4a")
download_file("voice_jo1.m4a")
download_file("oli_mic1.m4a")
download_file("oli_mic2.m4a")


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deterministic-rsa-keygen
  Downloading deterministic_rsa_keygen-0.0.1-py3-none-any.whl (6.8 kB)
Collecting mtcnn
  Downloading mtcnn-0.1.1-py3-none-any.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 28.6 MB/s 
Collecting pocketsphinx
  Downloading pocketsphinx-5.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.1 MB)
[K     |████████████████████████████████| 29.1 MB 1.5 MB/s 
[?25hCollecting SpeechRecognition
  Downloading SpeechRecognition-3.9.0-py2.py3-none-any.whl (32.8 MB)
[K     |████████████████████████████████| 32.8 MB 156 kB/s 
[?25hCollecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting pycryptodome>=3.10
  Downloading pycryptodome-3.16.0-cp35-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 61.2 MB/s 
Colle

# Face metrics

Make a rudimentary face detector, which looks for keypoints in a frame of a detected face (as a set of numbers). Scale these to a large enough Multi Dimensional Space and convert it into one value which can be used as part of a seed for a key, which is unique to a face.

What we really want to do is put the face dimensions in a large vector space and get one number we can use as the seed - we can round things down on the way so that it is "close" even with differing values from the picture (borrowed from https://stackabuse.com/guide-to-multidimensional-scaling-in-python-with-scikit-learn/)

In [11]:
def face_to_stress(face_file):

  from matplotlib import pyplot
  from mtcnn.mtcnn import MTCNN


  # load image from file
  pixels = pyplot.imread(face_file)

  # create the detector, using default weights
  detector = MTCNN()

  # detect faces in the image
  faces = detector.detect_faces(pixels)
  face = faces[0]


  from sklearn.manifold import MDS
  from matplotlib import pyplot as plt
  import sklearn.datasets as dt
  import seaborn as sns         
  import numpy as np
  from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances
  from matplotlib.offsetbox import OffsetImage, AnnotationBbox

  x_offset = face['box'][0]
  y_offset = face['box'][1]


  def round_ten(num):    
    return round(num/10)*10

  (left_eye_1, left_eye_2) = face['keypoints']['left_eye'] 
  (right_eye_1, right_eye_2) = face['keypoints']['right_eye']
  (nose_1, nose_2) = face['keypoints']['nose']
  (mouth_left_1, mouth_left_2) = face['keypoints']['mouth_left']
  (mouth_right_1, mouth_right_2) = face['keypoints']['mouth_right']

  X = np.array([[round_ten(left_eye_1-x_offset), round_ten(left_eye_2-y_offset)], 
                [round_ten(right_eye_1-x_offset), round_ten(right_eye_2-y_offset)], 
                [round_ten(nose_1-x_offset), round_ten(nose_2-y_offset)], 
                [round_ten(mouth_left_1-x_offset), round_ten(mouth_left_2-y_offset)], 
                [round_ten(mouth_right_1-x_offset), round_ten(mouth_right_2-y_offset)]])
  mds = MDS(random_state=0)
  X_transform = mds.fit_transform(X)

  stress = mds.stress_
  print(stress)



# Voice print

Everyone in the world can have a reasonably unique voice print which is hard to spoof, especially if combined with a secret phrase. librosa provides some simple utilities to calculate this. Using https://en.wikipedia.org/wiki/Linear_predictive_coding to provide utterance tolerant fingerprint (not secure enough to be non replayable - needs to be combined with a spoken secret)

In [2]:
import librosa
import numpy as np

def calculate_voiceprint(audio_file, num_coeffs=200):


  # Calculate the linear predictive coefficients (LPCs) for the audio signal
  audio, sr = librosa.load(audio_file)
  lpcs = librosa.lpc(audio, num_coeffs)

  def round_vector(vector, precision):
    rounded_vector = []
    for i in range(len(vector)):
      element = vector[i]
      rounded_element = round(element / precision) * precision
      rounded_vector.append(rounded_element)
      precision += 0.2  # Increase precision by a small amount after each iteration, so we are less sensitive to future predictions
    return rounded_vector
  
  return round_vector(lpcs, 0.5)[:6]


Lets try it out on a few voice files



In [7]:

mic1 = calculate_voiceprint("voice_mic1.m4a")
mic2 = calculate_voiceprint("voice_mic2.m4a")

mic3  = calculate_voiceprint("voice_mic3.m4a")
oli_mic1 = calculate_voiceprint("oli_mic1.m4a")
oli_mic2 = calculate_voiceprint("oli_mic2.m4a")

not_mic = calculate_voiceprint("voice_mic4.m4a")

jo = calculate_voiceprint("voice_jo1.m4a")

print("                    mic1", mic1)
print("                    mic2", mic2)
print("                    mic3", mic3)
print("Mic but different phrase", not_mic)
print("  Oli speaking like mic1", oli_mic1)
print("  Oli speaking like mic2", oli_mic2)
print("                      jo", jo)




                    mic1 [1.0, -2.0999999999999996, 2.6999999999999997, -3.3, 5.199999999999999, -5.999999999999999]
                    mic2 [1.0, -2.0999999999999996, 2.6999999999999997, -3.3, 5.199999999999999, -5.999999999999999]
                    mic3 [1.0, -2.0999999999999996, 2.6999999999999997, -3.3, 5.199999999999999, -5.999999999999999]
Mic but different phrase [1.0, -1.4, 1.7999999999999998, -2.1999999999999997, 2.5999999999999996, -2.9999999999999996]
  Oli speaking like mic1 [1.0, -2.0999999999999996, 1.7999999999999998, -2.1999999999999997, 2.5999999999999996, -2.9999999999999996]
  Oli speaking like mic2 [1.0, -1.4, 0.8999999999999999, -1.0999999999999999, 1.2999999999999998, -2.9999999999999996]
                      jo [1.0, -2.0999999999999996, 2.6999999999999997, -4.3999999999999995, 5.199999999999999, -7.499999999999999]




# Voice to text

Here is some rudimentary voice to text to provide some extra signal

In [8]:
def voice_text(audio_file):
  import speech_recognition as sr
  from pydub import AudioSegment

  audio = AudioSegment.from_file(audio_file, format="m4a")
  raw_data = audio.raw_data
  audio_data = sr.AudioData(raw_data, audio.frame_rate, audio.sample_width)


  r = sr.Recognizer()
  text = r.recognize_sphinx(audio_data)
  print("text detected: " + text)
  return text

# Combine into deterministic seed

In [9]:
def make_seed(face_file, voice_file):
  return str(face_to_stress(face_file)) + str(calculate_voiceprint(voice_file)) + voice_text(voice_file)

# Encrypt from face and voice

Use the determinisic seed to create a private key

In [12]:
from rsa import generate_key, encrypt, decrypt

secret_key = generate_key(make_seed("test1.jpg", "voice_mic1.m4a"))

public_key = secret_key.publickey().exportKey("PEM")

# eg round trip:
secret = encrypt("Hello World using face as key", public_key)

print(secret)


0.07707175558959707




text detected: if my voice is my passport
b'bjlIQDj0URzpm4JgsKWdKvdC5ifHjZKH+wghJgnmxZDkymmsxNf/8oLipEm+V7kAdpHCF1etCSrF3zSQfEOSEvc3QBTSHVXRj0vQpkcHyKi8THRCn3UhhXhvIFX6ltFpsVK3nufPSs4C8ACD9fzANJPk/AQBQjlYZrx3QzqJmJxY4xG3j0lId5OVV40AyQYUB2WaLTCY48vW8CbVdDw25rUxS/R4AvuGE5Ow/zh/0MsDLRF7KRNI84oR0PFpw3N/SgOoSYnZvHoG18JUGVIST5tdz0/+taUU3tAzxkcExmJMb9+24iWhyM+MOt2n08xiv+IIAm1pDlYZSbA+uxAGrQ=='


Now will use a different photo and voice to ensure we can make the same key and then decrypt

In [13]:

# using the other photo we can make the same key
secret_key = generate_key(make_seed("test2.jpg", "voice_mic1.m4a"))

private_key = secret_key.exportKey("PEM")

# and we get the secret back (and can use alternative audio if we are clear enough)
decrypt(secret, private_key)















0.12541588643659957
text detected: if my voice is my passport


b'Hello World using face as key'