## **Unzip the File**

In [1]:
import zipfile

# Archive holding the models and sample data. The path lives under
# /content/drive, so Google Drive is presumably mounted before this cell runs
# (no mount call is visible in this notebook -- TODO confirm).
zip_file_name = '/content/drive/MyDrive/Capstone(Face, Speech, Recorder).zip'

# Destination directory for the extracted contents; later cells read model and
# sample files from paths under this directory.
extract_dir = '/content/Models'

# Unpack every member of the archive into extract_dir. The context manager
# closes the archive once extraction finishes (or fails).
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Only reached when the extraction above raised no exception.
print("Extraction complete.")

Extraction complete.


## **Libraries**

In [1]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adamax
from PIL import Image
import cv2
import numpy as np
import librosa
import pickle
from base64 import b64decode
from IPython.display import display, Javascript, Image as IPythonImage

## **Hybrid Integration of the models**

### **Face Recognition model**

In [2]:
# Load and prepare the face recognition model
def load_face_model():
    """Load the saved face-emotion network and compile it.

    The model file is read with ``compile=False`` and then compiled here with
    an Adamax optimizer (learning rate 0.001), categorical cross-entropy loss
    and an accuracy metric.

    Returns:
        The compiled Keras model.
    """
    face_net = load_model('/content/drive/MyDrive/face_recognition.h5', compile=False)
    optimizer = Adamax(learning_rate=0.001)
    face_net.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return face_net

# Predict emotion from an image
def predict_emotion(image_path, model):
    """Predict the facial emotion shown in an image file.

    The image is converted to the colour mode matching the model's input
    channel count (1 -> grayscale, 3 -> RGB) so that grayscale, RGBA or palette
    images do not reach the network with the wrong number of channels. It is
    then resized to 224x224 and passed to the model as a batch of one.

    NOTE(review): pixel values are passed as produced by ``img_to_array`` with
    no rescaling -- confirm the saved model performs its own normalisation.

    Args:
        image_path: Path of the image to classify.
        model: Model exposing ``predict``; its ``input_shape`` attribute, when
            present, is used to pick the colour mode.

    Returns:
        One of 'Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad',
        'Surprise' -- the label at the index of the highest model output.
    """
    class_labels = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']

    # Derive the expected channel count from the model; leave the image mode
    # untouched when the model does not expose a usable single input shape.
    input_shape = getattr(model, 'input_shape', None)
    channels = input_shape[-1] if isinstance(input_shape, tuple) and input_shape else None
    target_mode = {1: 'L', 3: 'RGB'}.get(channels)

    # The context manager releases the underlying file handle; convert/resize
    # return independent in-memory images, so ``resized`` stays valid afterwards.
    with Image.open(image_path) as source:
        prepared = source
        if target_mode is not None and source.mode != target_mode:
            prepared = source.convert(target_mode)
        resized = prepared.resize((224, 224))

    img_array = tf.keras.preprocessing.image.img_to_array(resized)
    img_array = np.expand_dims(img_array, 0)  # add the batch dimension
    predictions = model.predict(img_array)

    # Softmax is strictly increasing, so taking argmax of the raw outputs
    # selects the same class as argmax of their softmax.
    return class_labels[int(np.argmax(predictions[0]))]

### **Voice Recognition model**

In [3]:
# Load the voice recognition model from JSON file
def load_voice_model(json_path, weights_path):
    """Rebuild the voice model from a JSON architecture file and load its weights.

    Args:
        json_path: Path of the JSON file describing the model architecture.
        weights_path: Path of the weights file loaded into the rebuilt model.

    Returns:
        The Keras model with the weights loaded (not compiled here).
    """
    with open(json_path, 'r') as architecture_file:
        architecture_json = architecture_file.read()

    voice_net = tf.keras.models.model_from_json(architecture_json)
    voice_net.load_weights(weights_path)
    print("Loaded voice model from disk")
    return voice_net

# Load data using pickle (for scalers and encoders)
def load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    Unpickling can execute arbitrary code, so only pass files that come from a
    trusted source.
    """
    with open(path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

# Feature extraction functions
def zcr(data, frame_length=2048, hop_length=512):
    """Compute the zero-crossing rate of an audio signal, frame by frame.

    The frame parameters default to the same values as the sibling ``rmse``
    and ``mfcc`` helpers, and the signal is passed by keyword like they do.

    Args:
        data: Audio samples handed to librosa as ``y``.
        frame_length: Analysis frame length passed to librosa.
        hop_length: Hop between successive frames passed to librosa.

    Returns:
        librosa's zero-crossing-rate output with singleton dimensions removed.
    """
    rates = librosa.feature.zero_crossing_rate(y=data, frame_length=frame_length, hop_length=hop_length)
    return np.squeeze(rates)

def rmse(data, frame_length=2048, hop_length=512):
    """Compute the frame-wise RMS value of an audio signal via ``librosa.feature.rms``.

    Args:
        data: Audio samples handed to librosa as ``y``.
        frame_length: Analysis frame length passed to librosa.
        hop_length: Hop between successive frames passed to librosa.

    Returns:
        librosa's RMS output with singleton dimensions removed.
    """
    return np.squeeze(
        librosa.feature.rms(y=data, frame_length=frame_length, hop_length=hop_length)
    )

def mfcc(data, sr, frame_length=2048, hop_length=512, flatten: bool = True):
    """Compute 20 MFCCs for an audio signal.

    Args:
        data: Audio samples handed to librosa as ``y``.
        sr: Sampling rate of ``data``.
        frame_length: Used as the FFT size (``n_fft``).
        hop_length: Hop between successive frames.
        flatten: When true (the default) the transposed coefficient matrix is
            flattened to one dimension; otherwise it is returned with
            singleton dimensions removed.

    Returns:
        The transposed MFCC matrix, flattened or squeezed as described above.
    """
    coefficients = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=20, hop_length=hop_length, n_fft=frame_length)
    transposed = coefficients.T
    if flatten:
        return np.ravel(transposed)
    return np.squeeze(transposed)

def extract_features(data, sr=22050, frame_length=2048, hop_length=512):
    """Build the feature vector for one audio signal.

    The vector is the horizontal concatenation, in this order, of the
    zero-crossing rates, the RMS values and the flattened MFCCs.

    Args:
        data: Audio samples.
        sr: Sampling rate, used for the MFCC computation.
        frame_length: Frame length shared by all three feature extractors.
        hop_length: Hop length shared by all three feature extractors.

    Returns:
        The concatenated feature array.
    """
    zcr_part = zcr(data, frame_length, hop_length)
    rms_part = rmse(data, frame_length, hop_length)
    mfcc_part = mfcc(data, sr, frame_length, hop_length)
    return np.hstack((zcr_part, rms_part, mfcc_part))

def get_predict_feat(path, scaler, duration=2.5, offset=0.6):
    """Load an audio file and turn it into a scaled, model-ready feature tensor.

    The loaded signal is zero-padded (or truncated) to exactly
    ``duration * sample_rate`` samples before feature extraction. Without
    this, a clip shorter than ``offset + duration`` seconds produces a shorter
    feature vector and ``scaler.transform`` fails on the width mismatch.
    Clips that already have the full length are left unchanged.

    NOTE(review): the padding is applied to the waveform; confirm this matches
    how short clips were handled when the scaler and model were fitted.

    Args:
        path: Audio file to load.
        scaler: Fitted object exposing ``transform`` for a ``(1, n_features)``
            array.
        duration: Seconds of audio to load (default 2.5, as before). ``None``
            disables the fixed-length adjustment.
        offset: Seconds to skip at the start of the file (default 0.6, as
            before).

    Returns:
        The scaled features with a trailing axis added, i.e. shape
        ``(1, n_features, 1)``.

    Raises:
        ValueError: If no samples could be read from ``path`` at ``offset``.
    """
    data, s_rate = librosa.load(path, duration=duration, offset=offset)
    if data.size == 0:
        raise ValueError(
            f"No audio samples could be read from {path!r} at offset {offset}s."
        )

    if duration is not None:
        target_len = int(duration * s_rate)
        current_len = data.shape[-1]
        if current_len < target_len:
            # Pad the end with silence so the frame count is always the same.
            data = np.pad(data, (0, target_len - current_len))
        elif current_len > target_len:
            data = data[:target_len]

    features = extract_features(data, s_rate)
    # ndarray.reshape avoids the ``newshape`` keyword of np.reshape, which is
    # deprecated in recent NumPy releases.
    features = features.reshape(1, -1)
    scaled_features = scaler.transform(features)
    return np.expand_dims(scaled_features, axis=2)

# Predict emotion from voice using extracted features
def voice_prediction(model, audio_path, scaler):
    """Predict the emotion label for an audio file.

    NOTE(review): the index-to-label table below is hard-coded; verify that
    its order matches the label encoding used when the model was trained.

    Args:
        model: Model exposing ``predict`` for the tensor produced by
            ``get_predict_feat``.
        audio_path: Audio file to analyse.
        scaler: Fitted scaler forwarded to ``get_predict_feat``.

    Returns:
        The label mapped to the highest-scoring class of the first (only)
        sample in the batch.
    """
    model_input = get_predict_feat(audio_path, scaler)
    class_scores = model.predict(model_input)
    best_per_sample = np.argmax(class_scores, axis=1)

    emotions = {0: 'Neutral', 1: 'Calm', 2: 'Happy', 3: 'Sad', 4: 'Angry', 5: 'Fear', 6: 'Disgust', 7: 'Surprise'}
    top_index = best_per_sample[0]
    return emotions[top_index]

### **Hybrid Integration part**

In [4]:
# Main processing function to analyze both image and audio
def process_image_and_audio(image_path, audio_path):
    """Detect the emotion in a face image and in a voice clip, then compare them.

    Loads both models (and the feature scaler) on every call, prints the two
    detected emotions, and prints a verdict on whether they agree. Agreement
    is decided by a fixed table of voice labels accepted for each face label.
    Only a matching 'Happy' face is reported as "positive"; other matches are
    reported as consistent without that qualifier, since the original message
    wrongly labelled sad/angry/fearful/disgusted matches as positive.

    Args:
        image_path: Image file passed to ``predict_emotion``.
        audio_path: Audio file passed to ``voice_prediction``.

    Returns:
        None; all results are printed.
    """
    # Voice labels treated as agreeing with each face label.
    compatible_voice = {
        'Happy': ('Happy', 'Calm'),
        'Sad': ('Sad', 'Calm'),
        'Angry': ('Angry', 'Fear'),
        'Fear': ('Fear', 'Surprise'),
        'Surprise': ('Surprise', 'Happy'),
        'Disgust': ('Disgust', 'Angry'),
        'Neutral': ('Neutral', 'Calm'),
    }

    face_model = load_face_model()
    face_emotion = predict_emotion(image_path, face_model)
    print(f"Detected Face Emotion: {face_emotion}")

    voice_model = load_voice_model('/content/Models/Speech Recognition/Model/CNN_model.json', '/content/Models/Speech Recognition/Model/CNN_model_weights.h5')
    # SECURITY: the scaler is unpickled; unpickling can run arbitrary code, so
    # this file must come from a trusted source.
    scaler2 = load_pickle('/content/drive/MyDrive/scaler2.pickle')

    voice_emotion = voice_prediction(voice_model, audio_path, scaler2)
    print(f"Detected Voice Emotion: {voice_emotion}")

    if voice_emotion in compatible_voice.get(face_emotion, ()):
        if face_emotion == 'Happy':
            print("Both face and voice are consistent and positive.")
        else:
            print("Both face and voice are consistent.")
    else:
        print("Face and voice emotions do not align well or are not consistent.")

if __name__ == "__main__":
    # Sample inputs located under /content/Models, the directory the archive
    # was extracted into at the top of the notebook.
    image_path = '/content/Models/Face Recognition/Face photos/test/happy/im0.png'
    audio_path = '/content/Models/Speech Recognition/Audio Data/Actor_07/03-01-06-02-01-01-07.wav'
    # Prints the detected face emotion, the detected voice emotion and the
    # consistency verdict.
    process_image_and_audio(image_path, audio_path)

Detected Face Emotion: Happy
Loaded voice model from disk


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Detected Voice Emotion: Happy
Both face and voice are consistent and positive.


## **Testing with Live Photo and Voice Recording**

### **Camera Function**

In [16]:
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode
from IPython.display import Image as IPythonImage
import cv2
import numpy as np

In [None]:
def take_photo(filename='photo.jpg', quality=0.8):
    """Capture one webcam frame in the browser and save it as a JPEG file.

    Displays a JavaScript snippet that shows a live video preview with a
    "Capture" button, waits for the click, stops the video track and hands the
    frame back as a JPEG data URL. The decoded bytes are written to
    ``filename``.

    Args:
        filename: Path the JPEG bytes are written to.
        quality: Value forwarded to ``canvas.toDataURL('image/jpeg', quality)``.

    Returns:
        ``filename``, unchanged.
    """
    capture_script = Javascript('''
    async function takePhoto(quality) {
        const div = document.createElement('div');
        const capture = document.createElement('button');
        capture.textContent = 'Capture';
        div.appendChild(capture);

        const video = document.createElement('video');
        video.style.display = 'block';
        const stream = await navigator.mediaDevices.getUserMedia({video: true});

        document.body.appendChild(div);
        div.appendChild(video);
        video.srcObject = stream;
        await video.play();

        // Resize the output to fit the video element.
        google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

        // Wait for Capture to be clicked.
        await new Promise((resolve) => capture.onclick = resolve);

        const canvas = document.createElement('canvas');
        canvas.width = video.videoWidth;
        canvas.height = video.videoHeight;
        canvas.getContext('2d').drawImage(video, 0, 0);
        stream.getVideoTracks()[0].stop();
        div.remove();
        return canvas.toDataURL('image/jpeg', quality);
    }
    ''')
    display(capture_script)

    # Blocks until the JavaScript promise resolves with the data URL.
    data_url = eval_js(f'takePhoto({quality})')

    # A data URL is "<header>,<base64 payload>"; only the payload is decoded.
    jpeg_bytes = b64decode(data_url.split(',')[1])
    with open(filename, 'wb') as photo_file:
        photo_file.write(jpeg_bytes)
    return filename

# Capture a frame from the webcam; with the default arguments take_photo
# writes it to 'photo.jpg' and returns that name.
filename = take_photo()

# Read the capture back with OpenCV and mirror it. The explicit None check
# turns an unreadable/missing file into a clear error before flipping.
image = cv2.imread(filename)
if image is None:
    raise ValueError("No image captured or read. Please check camera access and file path.")
flipped_image = cv2.flip(image, 1)  # Flip horizontally

# Overwrite the captured file with the mirrored version (same filename).
cv2.imwrite(filename, flipped_image)

# Show the mirrored image inline, loaded from the file just written.
display(IPythonImage(filename))

### **Voice Recording Function**

In [19]:
from IPython.display import HTML, Audio
from google.colab import output
from base64 import b64decode
import io

In [20]:
# Record from the browser microphone for a fixed number of seconds (recording
# starts automatically; there are no Record/Stop buttons) and save the result.
def record_audio(filename='audio.wav', seconds=5):
    """Start a timed microphone recording in the browser and save it to disk.

    Injects a script that requests microphone access, records with
    ``MediaRecorder`` for ``seconds`` seconds, releases the microphone, and
    sends the recording back to Python as a base64 data URL through the
    'notebook.SaveAudio' callback, which writes it to ``filename``.

    This function returns immediately; the file is written later, when the
    browser invokes the callback.

    Note: the recorder is configured with mimeType 'audio/webm' and the bytes
    are written unchanged, so the saved file holds WebM data regardless of the
    extension in ``filename``.

    Args:
        filename: Path the recorded bytes are written to.
        seconds: Recording length in seconds.
    """
    RECORD_AUDIO_HTML = f"""
    <script>
    function recordAudio(recorder, resolve) {{
        recorder.start();
        setTimeout(() => {{
            recorder.stop();
        }}, {seconds * 1000});
    }}

    function handleSuccess(stream, resolve) {{
        const options = {{ mimeType: 'audio/webm' }};
        const recordedChunks = [];
        const mediaRecorder = new MediaRecorder(stream, options);

        mediaRecorder.addEventListener('dataavailable', function(e) {{
            if (e.data.size > 0) {{
                recordedChunks.push(e.data);
            }}
        }});

        mediaRecorder.addEventListener('stop', function() {{
            // Release the microphone now that recording has finished.
            stream.getTracks().forEach((track) => track.stop());
            const blob = new Blob(recordedChunks);
            const reader = new FileReader();
            reader.onloadend = function() {{
                const base64data = reader.result;
                resolve(base64data);
            }};
            reader.readAsDataURL(blob);
        }});

        recordAudio(mediaRecorder, resolve);
    }}

    const handleReject = (e) => {{
        console.log('navigator.MediaDevices.getUserMedia error: ', e);
    }}

    async function startRecording() {{
        let stream;
        try {{
            stream = await navigator.mediaDevices.getUserMedia({{ audio: true }});
        }} catch (e) {{
            handleReject(e);
            return;
        }}
        new Promise((resolve) => handleSuccess(stream, resolve))
        .then((base64data) => {{
            google.colab.kernel.invokeFunction('notebook.SaveAudio', [base64data], {{}});
        }});
    }}

    startRecording();
    </script>
    """
    # Register the callback before injecting the script so it is guaranteed to
    # exist when the browser calls back; the closure carries ``filename`` so
    # the parameter is actually honoured.
    output.register_callback(
        'notebook.SaveAudio',
        lambda base64_audio: save_audio(base64_audio, filename),
    )
    display(HTML(RECORD_AUDIO_HTML))

def save_audio(base64_audio, filename='audio.wav'):
    """Decode a base64 data URL and write its payload to ``filename``.

    Args:
        base64_audio: Data URL of the form "<header>,<base64 payload>".
        filename: Destination path (defaults to 'audio.wav').

    Raises:
        ValueError: If the data URL has no payload after the comma (for
            example when nothing was recorded).
    """
    header, separator, data = base64_audio.partition(',')
    if not separator or not data:
        raise ValueError("No audio data received from the browser recording.")
    binary_data = b64decode(data)
    with open(filename, 'wb') as f:
        f.write(binary_data)
    print(f"Audio recording saved as '{filename}'")

In [21]:
# Start a 5-second recording. record_audio only injects the browser script and
# registers the save callback; the audio file is written afterwards, when the
# browser calls back with the recorded data.
record_audio(seconds=5)

Audio recording saved as 'audio.wav'


In [22]:
Audio('audio.wav')  # Play back the file written by save_audio

### **Testing**

In [23]:
if __name__ == "__main__":
    # 'photo.jpg' and 'audio.wav' are the default output names of take_photo
    # and save_audio; the absolute paths assume the notebook's working
    # directory is /content -- TODO confirm.
    image_path = '/content/photo.jpg'
    audio_path = '/content/audio.wav'
    # Prints both detected emotions and the consistency verdict for the live capture.
    process_image_and_audio(image_path, audio_path)

Detected Face Emotion: Happy
Loaded voice model from disk


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  data, s_rate = librosa.load(path, duration=2.5, offset=0.6)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Detected Voice Emotion: Calm
Both face and voice are consistent and positive.
