In [1]:
import warnings
from collections import Counter

import numpy as np
import tensorflow as tf
from keras.models import model_from_json, load_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Assuming you have TensorFlow and Keras installed


In [4]:
# Load the speech emotion recognition model.
# The architecture is stored as JSON and the weights in a separate HDF5 file,
# so the model is rebuilt from the JSON first and the weights are loaded into it.
with open('/content/drive/MyDrive/model1.json', 'r') as json_file:
    speech_model_json = json_file.read()
speech_model = model_from_json(speech_model_json)
speech_model.load_weights('/content/drive/MyDrive/Emotion_Voice_Detection_Model1 (1).h5')

# Load the face emotion recognition model (architecture and weights in one file).
face_model = load_model('/content/drive/MyDrive/face_cnn.h5')

# Define emotion labels.
# NOTE(review): presumably the list index corresponds to the class index the
# models output - confirm against the training code. The list is not referenced
# by any of the visible cells.
emotion_labels = ["neutral", "happy", "sad", "angry"]




## 1. Feature-Level Fusion
#### 1. Extract features for the speech and face models
#### 2. Combine the features
#### 3. Train a meta-classifier
#### 4. Predict and evaluate

In [65]:
import numpy as np
import librosa
import soundfile as sf
import cv2
from tensorflow.keras.models import load_model

# Extract features from audio (same as your original function)
def extract_speech_features(file_name, print_flag=False, **kwargs):
    """Build a 1-D feature vector describing one audio file.

    Each enabled feature family is averaged over time and the resulting
    vectors are concatenated in a fixed order: MFCC, chroma, mel, contrast,
    tonnetz.

    Args:
        file_name: Path of an audio file readable by ``soundfile``.
        print_flag: When true, print the shape of the MFCC vector.
        **kwargs: Boolean switches ``mfcc``, ``chroma``, ``mel``, ``contrast``
            and ``tonnetz``. A missing or falsy switch disables that family.

    Returns:
        A 1-D float array with the concatenated features. It is empty when no
        feature family is enabled.
    """
    use_mfcc = kwargs.get("mfcc")
    use_chroma = kwargs.get("chroma")
    use_mel = kwargs.get("mel")
    use_contrast = kwargs.get("contrast")
    use_tonnetz = kwargs.get("tonnetz")

    # Only the read needs the file handle; release it before the (slower)
    # feature computation.
    with sf.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # Multi-channel audio is mixed down to mono by averaging the channels.
    if X.ndim >= 2:
        X = np.mean(X, 1)

    # Start from an empty float64 array so the result dtype stays float64
    # regardless of which families are enabled.
    parts = [np.array([])]

    # The magnitude spectrogram is shared by the chroma and contrast features.
    stft = np.abs(librosa.stft(X)) if (use_chroma or use_contrast) else None

    if use_mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        if print_flag:
            print(f"MFCC shape: {mfccs.shape}")
        parts.append(mfccs)
    if use_chroma:
        parts.append(np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0))
    if use_mel:
        # The audio signal must be passed as ``y=``: recent librosa releases
        # accept it by keyword only and raise TypeError on a positional call.
        parts.append(np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0))
    if use_contrast:
        parts.append(np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0))
    if use_tonnetz:
        harmonic = librosa.effects.harmonic(X)
        parts.append(np.mean(librosa.feature.tonnetz(y=harmonic, sr=sample_rate).T, axis=0))

    return np.hstack(parts)

# Extract features from face image using face model
def extract_face_features(image_file, face_model):
    """Run the face model on one grayscale image and return its flat output.

    Args:
        image_file: Path of the image to load.
        face_model: Object exposing ``predict(batch)``; it receives a batch of
            shape ``(1, 128, 128, 1)``.

    Returns:
        The model output flattened to a 1-D array.

    Raises:
        OSError: If OpenCV cannot read ``image_file``.
    """
    image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals a missing or undecodable file by returning None instead
    # of raising; fail here with a clear message rather than inside cv2.resize.
    if image is None:
        raise OSError(f"Could not read image file: {image_file!r}")

    image = cv2.resize(image, (128, 128))  # Adjusted based on your input size
    image = np.expand_dims(image, axis=-1)  # add the channel axis
    image = np.expand_dims(image, axis=0)   # add the batch axis
    face_features = face_model.predict(image)
    return face_features.flatten()

# Load pre-trained face model.
# NOTE(review): this rebinds `face_model` from the same file path that the
# model-loading cell near the top of the notebook already loaded, this time via
# the `load_model` imported from `tensorflow.keras.models` in this cell.
face_model = load_model('/content/drive/MyDrive/face_cnn.h5')




### Feature-level fusion logic


In [58]:
def feature_level_fusion(audio_file, image_file, speech_model, face_model):
    """Concatenate the speech and face feature vectors of one sample.

    Args:
        audio_file: Path passed to ``extract_speech_features`` (MFCC only).
        image_file: Path passed to ``extract_face_features``.
        speech_model: Accepted for signature symmetry; not used here.
        face_model: Model forwarded to ``extract_face_features``.

    Returns:
        A 1-D array holding the speech features followed by the face features.
    """
    modality_vectors = (
        extract_speech_features(audio_file, mfcc=True),
        extract_face_features(image_file, face_model),
    )
    # Speech first, face second: the order defines the fused layout.
    return np.hstack(modality_vectors)


### Meta-classifier training

In [68]:
def extract_face_features(image_file, face_model):
    """Run the face model on one colour image and return its flat output.

    The image is loaded as 3-channel colour, resized to 128x128, scaled to
    ``[0, 1]`` and passed to the model as a batch of one.

    Args:
        image_file: Path of the image to load.
        face_model: Object exposing ``predict(batch)``; it receives a float32
            batch of shape ``(1, 128, 128, 3)``.

    Returns:
        The model output flattened to a 1-D array, or an empty array when the
        image cannot be read. Callers in this notebook test ``size == 0`` to
        detect that case.
    """
    image = cv2.imread(image_file, cv2.IMREAD_COLOR)
    # cv2.imread returns None (it does not raise) for a missing or undecodable
    # file. Report it and return an empty vector so the callers' ``size == 0``
    # guards can take effect instead of crashing inside cv2.resize.
    if image is None:
        warnings.warn(f"Could not read image {image_file!r}; returning empty face features.")
        return np.array([])

    image = cv2.resize(image, (128, 128))  # Resize to match model input
    image = image.astype('float32') / 255.0

    # IMREAD_COLOR always yields three channels, so no grayscale-to-RGB
    # expansion is needed here.
    # NOTE(review): OpenCV returns channels in BGR order - confirm this matches
    # what the face model was trained on.
    image = np.expand_dims(image, axis=0)
    face_features = face_model.predict(image)
    return face_features.flatten()

In [77]:
def predict_face_emotion(image_file, face_model):
    """Return the index of the highest face-model score, or None if there is none.

    Args:
        image_file: Path forwarded to ``extract_face_features``.
        face_model: Model forwarded to ``extract_face_features``.

    Returns:
        The arg-max index of the flattened model output, or ``None`` when that
        output is empty.
    """
    class_scores = extract_face_features(image_file, face_model)
    # An empty score vector means there is nothing to rank.
    return np.argmax(class_scores) if class_scores.size else None

def predict_speech_emotion(audio_file, speech_model):
    """Return the index of the highest speech-model score for one audio file.

    Args:
        audio_file: Path forwarded to ``extract_speech_features`` (MFCC only).
        speech_model: Object exposing ``predict(batch)``.

    Returns:
        The arg-max index over the model output.
    """
    mfcc_vector = extract_speech_features(audio_file, mfcc=True)
    # The model is fed a batch holding this single feature row.
    single_row_batch = mfcc_vector.reshape(1, -1)
    return np.argmax(speech_model.predict(single_row_batch))

def feature_level_fusion(audio_file, image_file, speech_model, face_model):
    """Return the two per-modality predictions as a length-2 array.

    Despite the name, this variant fuses the predicted class indices rather
    than the raw feature vectors.

    Args:
        audio_file: Path forwarded to ``predict_speech_emotion``.
        image_file: Path forwarded to ``predict_face_emotion``.
        speech_model: Model forwarded to ``predict_speech_emotion``.
        face_model: Model forwarded to ``predict_face_emotion``.

    Returns:
        ``[speech_prediction, face_prediction]`` as a 1-D array, with ``-1``
        standing in for any prediction that is ``None``.
    """
    predictions = (
        predict_speech_emotion(audio_file, speech_model),
        predict_face_emotion(image_file, face_model),
    )
    # A missing prediction is encoded as -1 so the array stays numeric.
    return np.concatenate(
        [np.array([-1 if prediction is None else prediction]) for prediction in predictions]
    )

# Example data: each audio clip is paired with the face image at the same
# position, and `labels` holds the true emotion of each pair.
audio_files = ['/content/drive/MyDrive/03-02-05-01-02-02-05.wav',
               '/content/drive/MyDrive/03-02-05-02-02-01-05.wav',
               '/content/drive/MyDrive/03-02-06-01-01-01-05.wav']
image_files = ['/content/drive/MyDrive/front.jpg',
               '/content/drive/MyDrive/gowf.jpg',
               '/content/drive/MyDrive/satvikf.jpg']
labels = ['happy', 'sad', 'neutral']  # Corresponding labels

# zip() silently drops unmatched trailing items, which would misalign features
# and labels; fail early instead.
if not (len(audio_files) == len(image_files) == len(labels)):
    raise ValueError("audio_files, image_files and labels must have the same length")

# One fused vector per (audio, image) pair.
fused_features = []
for audio_file, image_file in zip(audio_files, image_files):
    fused_features.append(feature_level_fusion(audio_file, image_file, speech_model, face_model))

# Convert to numpy array
X_fused = np.array(fused_features)

# Encode labels into numerical format
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
print("Shape of X_fused:", X_fused.shape)
print("Shape of y:", len(y))
print("Labels:", y)

# NOTE: with only three samples the 20% hold-out is a single sample, so the
# metrics printed below are a smoke test of the pipeline, not an evaluation.
X_train, X_test, y_train, y_test = train_test_split(X_fused, y, test_size=0.2, random_state=42)

meta_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
meta_classifier.fit(X_train, y_train)

y_pred = meta_classifier.predict(X_test)

unique_y_test = np.unique(y_test)
unique_y_pred = np.unique(y_pred)

print("Unique classes in y_test:", unique_y_test)
print("Unique classes in y_pred:", unique_y_pred)

print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 reports undefined precision/recall as 0 without emitting a
# warning for every averaged metric.
print(classification_report(y_test, y_pred, labels=unique_y_test,
                            target_names=label_encoder.inverse_transform(unique_y_test),
                            zero_division=0))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
Shape of X_fused: (3, 2)
Shape of y: 3
Labels: [0 2 1]
Unique classes in y_test: [0]
Unique classes in y_pred: [2]
Accuracy: 0.0
              precision    recall  f1-score   support

       happy       0.00      0.00      0.00       1.0

   micro avg       0.00      0.00      0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
from collections import Counter
# Show how many samples carry each encoded label in `y`.
print(Counter(y))


Counter({0: 1, 2: 1, 1: 1})


## Prediction

In [84]:
def predict_emotion(audio_file, image_file, speech_model,face_model):
    """Classify one audio/image pair with the module-level ``meta_classifier``.

    Args:
        audio_file: Path forwarded to ``feature_level_fusion``.
        image_file: Path forwarded to ``feature_level_fusion``.
        speech_model: Model forwarded to ``feature_level_fusion``.
        face_model: Model forwarded to ``feature_level_fusion``.

    Returns:
        Whatever ``meta_classifier.predict`` returns for the single fused
        sample.
    """
    fused_sample = feature_level_fusion(audio_file, image_file, speech_model, face_model)
    # The classifier expects a batch, so the sample is wrapped in a list.
    return meta_classifier.predict([fused_sample])

# Example usage: classify one audio/image pair with the trained meta-classifier.
# The printed value is the raw return of `meta_classifier.predict`, i.e. an array
# of encoded class indices; `label_encoder.inverse_transform` maps those back to
# the emotion names used for training.
audio_file = '/content/drive/MyDrive/03-02-01-01-01-01-05.wav'
image_file = '/content/drive/MyDrive/front.jpg'
predicted_emotion = predict_emotion(audio_file, image_file, speech_model,face_model)
print("Predicted Emotion:", predicted_emotion)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
Predicted Emotion: [1]



### Label-level fusion implementation
#### 1. Predict labels separately with both models
#### 2. Combine the labels using a voting mechanism
#### 3. Predict the final label

### Label-level fusion using input files from the mounted Google Drive

In [85]:
# Assuming you have separate models for audio and face:
def predict_speech_emotion(audio_file):
    """Return the raw speech-model output for one audio file.

    Uses the module-level ``speech_model``. Unlike the arg-max variants defined
    elsewhere in this notebook, the model output is returned as-is, not reduced
    to a class index.

    Args:
        audio_file: Path forwarded to ``extract_speech_features`` (MFCC only).

    Returns:
        The value returned by ``speech_model.predict`` for a one-row batch.
    """
    mfcc_vector = extract_speech_features(audio_file, mfcc=True)
    # The model is fed a batch holding this single feature row.
    single_row_batch = mfcc_vector.reshape(1, -1)
    return speech_model.predict(single_row_batch)

def predict_face_emotion(image_file, face_model):
    """Return the index of the highest face-model score for one image.

    Args:
        image_file: Path forwarded to ``extract_face_features``.
        face_model: Model forwarded to ``extract_face_features``.

    Returns:
        The arg-max index of the flattened model output.
    """
    class_scores = extract_face_features(image_file, face_model)
    return np.argmax(class_scores)


In [92]:

def extract_face_features(image_file, face_model):
    """Run the face model on one colour image and return its flat output.

    The image is loaded as 3-channel colour, resized to 128x128, scaled to
    ``[0, 1]`` and passed to the model as a batch of one.

    Args:
        image_file: Path of the image to load.
        face_model: Object exposing ``predict(batch)``; it receives a float32
            batch of shape ``(1, 128, 128, 3)``.

    Returns:
        The model output flattened to a 1-D array, or an empty array when the
        image cannot be read. Callers in this notebook test ``size == 0`` to
        detect that case.
    """
    image = cv2.imread(image_file, cv2.IMREAD_COLOR)
    # cv2.imread returns None (it does not raise) for a missing or undecodable
    # file. Report it and return an empty vector so the callers' ``size == 0``
    # guards can take effect instead of crashing inside cv2.resize.
    if image is None:
        warnings.warn(f"Could not read image {image_file!r}; returning empty face features.")
        return np.array([])

    image = cv2.resize(image, (128, 128))
    image = image.astype('float32') / 255.0

    # IMREAD_COLOR always yields three channels, so no grayscale-to-RGB
    # expansion is needed here.
    # NOTE(review): OpenCV returns channels in BGR order - confirm this matches
    # what the face model was trained on.
    image = np.expand_dims(image, axis=0)
    face_features = face_model.predict(image)
    return face_features.flatten()

def predict_speech_emotion(audio_file, speech_model):
    """Return the index of the highest speech-model score for one audio file.

    Args:
        audio_file: Path forwarded to ``extract_speech_features`` (MFCC only).
        speech_model: Object exposing ``predict(batch)``.

    Returns:
        The arg-max index over the model output.
    """
    mfcc_vector = extract_speech_features(audio_file, mfcc=True)
    model_output = speech_model.predict(mfcc_vector.reshape(1, -1))
    return np.argmax(model_output)

def predict_face_emotion(image_file, face_model):
    """Return the index of the highest face-model score, or "Unknown".

    Args:
        image_file: Path forwarded to ``extract_face_features``.
        face_model: Model forwarded to ``extract_face_features``.

    Returns:
        The arg-max index of the flattened model output, or the string
        ``"Unknown"`` when that output is empty.
    """
    class_scores = extract_face_features(image_file, face_model)
    if class_scores.size:
        return np.argmax(class_scores)
    # No scores to rank: fall back to the sentinel label.
    return "Unknown"  # Or any default value you prefer

def label_level_fusion(audio_file, image_file, speech_model, face_model):
    """Combine the speech and face predictions into one label by voting.

    Args:
        audio_file: Path forwarded to ``predict_speech_emotion``.
        image_file: Path forwarded to ``predict_face_emotion``.
        speech_model: Model forwarded to ``predict_speech_emotion``.
        face_model: Model forwarded to ``predict_face_emotion``.

    Returns:
        The label with the most votes. When one prediction is the ``"Unknown"``
        sentinel, the other prediction is returned; when both are, the result
        is ``"Unknown"``. A tie between two different labels resolves to the
        smaller one, the same result the recorded run of this cell shows
        (speech 2, face 0 -> 0).
    """
    speech_emotion = predict_speech_emotion(audio_file, speech_model)
    face_emotion = predict_face_emotion(image_file, face_model)

    # Debugging: Print the emotions
    print(f"Speech Emotion: {speech_emotion}, Face Emotion: {face_emotion}")

    def _is_unknown(label):
        # Check the type first so numeric (NumPy) labels are never compared
        # against a string.
        return isinstance(label, str) and label == "Unknown"

    # Only real predictions take part in the vote.
    votes = [label for label in (speech_emotion, face_emotion) if not _is_unknown(label)]
    if not votes:
        return "Unknown"

    # Majority vote with a deterministic tie-break: highest count first, then
    # the smallest label. Uses only the standard library, so it needs no
    # `mode` import and does not depend on the SciPy version.
    counts = Counter(votes)
    final_emotion = min(counts, key=lambda label: (-counts[label], label))
    return final_emotion

# Example usage: fuse the speech and face predictions for one audio/image pair.
# The value printed is the voted class index (or "Unknown"), not an emotion name.
audio_file = '/content/drive/MyDrive/03-02-01-01-01-01-05.wav'
image_file = '/content/drive/MyDrive/front.jpg'


final_emotion = label_level_fusion(audio_file, image_file, speech_model, face_model)
print("Final Predicted Emotion:", final_emotion)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
Speech Emotion: 2, Face Emotion: 0
Final Predicted Emotion: 0


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

In [88]:
import numpy as np
from collections import Counter

# Dummy functions to simulate predictions
def predict_speech_emotion(audio_features):
    """Simulate a speech prediction by drawing a random emotion label.

    Args:
        audio_features: Ignored; present only to mimic a real predictor.

    Returns:
        One of 'neutral', 'happy', 'sad' or 'angry', drawn with
        ``np.random.choice``.
    """
    candidate_labels = ['neutral', 'happy', 'sad', 'angry']
    picked = np.random.choice(candidate_labels)
    return picked

def predict_face_emotion(image_data, face_model):
    """Simulate a face prediction by drawing a random emotion label.

    Args:
        image_data: Ignored; present only to mimic a real predictor.
        face_model: Ignored.

    Returns:
        One of 'neutral', 'happy', 'sad' or 'angry', drawn with
        ``np.random.choice``.
    """
    candidate_labels = ['neutral', 'happy', 'sad', 'angry']
    picked = np.random.choice(candidate_labels)
    return picked

# Label-level fusion function
def label_level_fusion(audio_features, image_data, speech_model=None, face_model=None):
    """Majority-vote the simulated speech and face predictions.

    Args:
        audio_features: Forwarded to ``predict_speech_emotion``.
        image_data: Forwarded to ``predict_face_emotion``.
        speech_model: Unused.
        face_model: Forwarded to ``predict_face_emotion``.

    Returns:
        The most common of the two predicted labels; when they differ, the
        speech prediction (the first one counted) is returned.
    """
    votes = [
        predict_speech_emotion(audio_features),
        predict_face_emotion(image_data, face_model),
    ]
    # most_common(1) yields a single (label, count) pair.
    (winning_label, _vote_count), = Counter(votes).most_common(1)
    return winning_label

# Simulate random audio features and image data.
# Both inputs are placeholders: the dummy predictors defined in this cell ignore
# them and draw a random label, so the printed result varies between runs unless
# NumPy's global random state is seeded.
audio_features = np.random.rand(100)  # Simulate a 100-dimensional audio feature vector
image_data = np.random.rand(128, 128, 3)  # Simulate a 128x128 RGB image

# Run label-level fusion
final_emotion = label_level_fusion(audio_features, image_data, None, face_model=None)
print("Final Predicted Emotion:", final_emotion)


Final Predicted Emotion: neutral
