In [17]:
import pickle
from predictor_all import AudioPredictor
from predictor_all import VoteClassifier
from predictor_all import ImagePredictor
from keras.models import load_model
import speech_recognition as sr
from collections import Counter

import numpy as np
import os

import librosa
from keras.preprocessing import sequence
from keras.preprocessing import image
import warnings

In [2]:
image_path = "./Image/6 Emotions for image classification/anger.jpg/gv232bwvvkb8m27mh1.jpg"
text_input = "i am very angry of u i will burn u"
audio_path = "./AUDIO/TESS Toronto emotional speech set data/OAF_angry/OAF_burn_angry.wav"


In [43]:
import pickle
import numpy as np

# Load the pre-trained models
with open('audio_classifier_model.pkl', 'rb') as f:
    audio_classifier = pickle.load(f)

with open('text_classifier_model.pkl', 'rb') as f:
    text_classifier = pickle.load(f)

with open('image_predictor.pkl', 'rb') as f:
    image_predictor = pickle.load(f)

# Emotion Labels for different models
emotion_labels_img = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise"]
emotion_labels_audio = ["Happiness", "Neutral", "Sadness", "Anger", "Fear", "Disgust"]
emotion_labels_text = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]

class MultiModalSentimentAnalysiss:
    def __init__(self, audio_model, text_model, image_model):
        self.audio_model = audio_model
        self.text_model = text_model
        self.image_model = image_model

    def predict(self, text=None, image_path=None, audio_path=None):
        predictions = []

        # Predict from text if provided
        if text is not None:
            try:
                text_prediction = self.text_model.text_classify(text)
                text_emotion = emotion_labels_text[int(text_prediction)]
                predictions.append(text_emotion)
                print(f"Predicted emotion from text: {text_emotion}")
            except Exception as e:
                print(f"Error in text prediction: {e}")
        
        # Predict from image if provided
        if image_path is not None:
            try:
                image_prediction = self.image_model.image_classify(image_path)
                image_emotion = emotion_labels_img[int(image_prediction)]
                predictions.append(image_emotion)
                print(f"Predicted emotion from image: {image_emotion}")
            except Exception as e:
                print(f"Error in image prediction: {e}")
        
        # Predict from audio if provided
        if audio_path is not None:
            try:
                audio_prediction = self.audio_model.audio_classify(audio_path)
                audio_emotion = emotion_labels_audio[int(audio_prediction)]
                predictions.append(audio_emotion)
                print(f"Predicted emotion from audio: {audio_emotion}")
            except Exception as e:
                print(f"Error in audio prediction: {e}")
        
        # Combine predictions (if more than one prediction is available)
        if len(predictions) > 0:
            # Majority voting logic or other combining logic can be applied here
            most_common_emotion = max(set(predictions), key=predictions.count)
            print(f"Final predicted emotion: {most_common_emotion}")
            return most_common_emotion
        else:
            print("No data provided for prediction.")
            return None

# Instantiate the multi-modal sentiment analysis model
multi_modal_model = MultiModalSentimentAnalysiss(
    audio_model=audio_classifier,
    text_model=text_classifier,
    image_model=image_predictor
)

# Example usage:

# Test with all data present (text, image, and audio)

image_input_path = "./Image/6 Emotions for image classification/anger.jpg/gv232bwvvkb8m27mh1.jpg"
audio_input_path = "./AUDIO/TESS Toronto emotional speech set data/OAF_angry/OAF_burn_angry.wav"
text_input = extract_text_from_audio(audio_input_path)

final_prediction = multi_modal_model.predict(text=text_input, image_path=image_input_path, audio_path=audio_input_path)

# Test with missing data (for example, only text and audio)
final_prediction = multi_modal_model.predict(text=text_input, audio_path=audio_input_path)

# Test with only one modality (image)
final_prediction = multi_modal_model.predict(image_path=image_input_path)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted emotion from text: Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted emotion from image: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275ms/step
Predicted emotion from audio: Anger
Final predicted emotion: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 711ms/step
Predicted emotion from text: Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Predicted emotion from audio: Anger
Final predicted emotion: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
Predicted emotion from image: Anger
Final predicted emotion: Anger


In [29]:
def extract_text_from_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_data = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio_data)
        return text
    except (sr.UnknownValueError, sr.RequestError) as e:
        print(f"Error in audio text extraction: {e}")
        return None

In [47]:
class MultiModalSentimentAnalysis:
    def __init__(self, audio_model, text_model, image_model):
        self.audio_model = audio_model
        self.text_model = text_model
        self.image_model = image_model

    def priority_majority_fusion(self, predictions, text_prediction=None):
        # Priority-majority fusion logic:

        # If all predictions are different, prioritize image only if all are unequal and text is "Neutral"
        if len(set(predictions)) == 3:
            if text_prediction == "Neutral":
                return "Neutral"  # If text is Neutral, return Neutral
            return predictions[2]  # Otherwise, return the image prediction if all are different
        
        # Majority voting logic
        most_common_emotion, _ = Counter(predictions).most_common(1)[0]
        
        # If text is "Neutral" and the majority is different, return majority, not neutral
        if text_prediction == "Neutral" and most_common_emotion != "Neutral":
            return most_common_emotion

        return most_common_emotion

    def predict(self, text=None, image_path=None, audio_path=None):
        predictions = []
        text_prediction = None

        # Step 1: Predict using audio model if audio is provided
        if audio_path is not None:
            try:
                audio_prediction = self.audio_model.audio_classify(audio_path)
                audio_emotion = emotion_labels_audio[int(audio_prediction)]
                predictions.append(audio_emotion)
                print(f"Predicted emotion from audio: {audio_emotion}")
            except Exception as e:
                print(f"Error in audio prediction: {e}")

            # Step 2: Extract text from audio and predict using text model
            extracted_text = extract_text_from_audio(audio_path)
            if extracted_text:
                text_prediction = self.text_model.text_classify(extracted_text)
                text_emotion = emotion_labels_text[int(text_prediction)]
                predictions.append(text_emotion)
                print(f"Predicted emotion from extracted text: {text_emotion}")
            else:
                print("No text extracted from audio.")

        # Step 3: Predict from image if image path is provided
        if image_path is not None:
            try:
                image_prediction = self.image_model.image_classify(image_path)
                image_emotion = emotion_labels_img[int(image_prediction)]
                predictions.append(image_emotion)
                print(f"Predicted emotion from image: {image_emotion}")
            except Exception as e:
                print(f"Error in image prediction: {e}")
        
        # If predictions are available, apply fusion logic
        if len(predictions) > 0:
            final_emotion = self.priority_majority_fusion(predictions, text_prediction)
            print(f"Final predicted emotion: {final_emotion}")
            return final_emotion
        else:
            print("No data provided for prediction.")
            return None

In [49]:
# Instantiate the multi-modal sentiment analysis model
multi_modal_model = MultiModalSentimentAnalysis(
    audio_model=audio_classifier,
    text_model=text_classifier,
    image_model=image_predictor
)

In [51]:
# Test with all data present (text, image, and audio)
image_input_path = "./Image/6 Emotions for image classification/anger.jpg/gv232bwvvkb8m27mh1.jpg"
audio_input_path = "./AUDIO/TESS Toronto emotional speech set data/OAF_angry/OAF_burn_angry.wav"
text_input = extract_text_from_audio(audio_input_path)

final_prediction = multi_modal_model.predict(text=text_input, image_path=image_input_path, audio_path=audio_input_path)

# Test with missing data (for example, only text and audio)
final_prediction = multi_modal_model.predict(text=text_input, audio_path=audio_input_path)

# Test with only one modality (image)
final_prediction = multi_modal_model.predict(image_path=image_input_path)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Predicted emotion from audio: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 685ms/step
Predicted emotion from extracted text: Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
Predicted emotion from image: Anger
Final predicted emotion: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Predicted emotion from audio: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 726ms/step
Predicted emotion from extracted text: Neutral
Final predicted emotion: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
Predicted emotion from image: Anger
Final predicted emotion: Anger


In [58]:
final_prediction = multi_modal_model.predict(text=text_input, image_path=image_input_path, audio_path=audio_input_path)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Predicted emotion from audio: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 674ms/step
Predicted emotion from extracted text: Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
Predicted emotion from image: Anger
Final predicted emotion: Anger


# _________________________________________________________

In [13]:
import pickle
import numpy as np
import speech_recognition as sr
from collections import Counter

# Load the pre-trained models
with open('audio_classifier_model.pkl', 'rb') as f:
    audio_classifier = pickle.load(f)

with open('text_classifier_model.pkl', 'rb') as f:
    text_classifier = pickle.load(f)

with open('image_predictor.pkl', 'rb') as f:
    image_predictor = pickle.load(f)

# Emotion Labels for different models
emotion_labels_img = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise"]
emotion_labels_audio = ["Happiness", "Neutral", "Sadness", "Anger", "Fear", "Disgust"]
emotion_labels_text = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]

# Function to extract text from audio using SpeechRecognition
def extract_text_from_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_data = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio_data)
        return text
    except (sr.UnknownValueError, sr.RequestError) as e:
        print(f"Error in audio text extraction: {e}")
        return None

class MultiModalSentimentAnalysis:
    def __init__(self, audio_model, text_model, image_model):
        self.audio_model = audio_model
        self.text_model = text_model
        self.image_model = image_model

    def priority_majority_fusion(self, predictions, text_prediction=None):
        # Priority-majority fusion logic:

        # If all predictions are different, prioritize image only if all are unequal and text is "Neutral"
        if len(set(predictions)) == 3:
            if text_prediction == "Neutral":
                return "Neutral"  # If text is Neutral, return Neutral
            return predictions[2]  # Otherwise, return the image prediction if all are different
        
        # Majority voting logic
        most_common_emotion, _ = Counter(predictions).most_common(1)[0]
        
        # If text is "Neutral" and the majority is different, return majority, not neutral
        if text_prediction == "Neutral" and most_common_emotion != "Neutral":
            return most_common_emotion

        return most_common_emotion

    def predict(self, text=None, image_path=None, audio_path=None):
        predictions = []
        text_prediction = None

        # Step 1: Try extracting text from audio if audio is provided
        if audio_path is not None:
            extracted_text = extract_text_from_audio(audio_path)
            
            if extracted_text:  # If text was successfully extracted from audio
                text_prediction = self.text_model.text_classify(extracted_text)
                text_emotion = emotion_labels_text[int(text_prediction)]
                predictions.append(text_emotion)
                print(f"Predicted emotion from extracted text: {text_emotion}")
            else:
                print("No text extracted from audio. Proceeding with audio and image prediction.")

                # Step 2: Predict from audio if no text is extracted
                try:
                    audio_prediction = self.audio_model.audio_classify(audio_path)
                    audio_emotion = emotion_labels_audio[int(audio_prediction)]
                    predictions.append(audio_emotion)
                    print(f"Predicted emotion from audio: {audio_emotion}")
                except Exception as e:
                    print(f"Error in audio prediction: {e}")

        # Step 3: Predict from image if image path is provided
        if image_path is not None:
            try:
                image_prediction = self.image_model.image_classify(image_path)
                image_emotion = emotion_labels_img[int(image_prediction)]
                predictions.append(image_emotion)
                print(f"Predicted emotion from image: {image_emotion}")
            except Exception as e:
                print(f"Error in image prediction: {e}")
        
        # If predictions are available, apply fusion logic
        if len(predictions) > 0:
            final_emotion = self.priority_majority_fusion(predictions, text_prediction)
            print(f"Final predicted emotion: {final_emotion}")
            return final_emotion
        else:
            print("No data provided for prediction.")
            return None

# Instantiate the multi-modal sentiment analysis model
multi_modal_model = MultiModalSentimentAnalysis(
    audio_model=audio_classifier,
    text_model=text_classifier,
    image_model=image_predictor
)

# Example usage:

# Test with all data present (text, image, and audio)
image_input_path = "./Image/6 Emotions for image classification/anger.jpg/gv232bwvvkb8m27mh1.jpg"
audio_input_path = "./AUDIO/TESS Toronto emotional speech set data/OAF_angry/OAF_burn_angry.wav"
text_input = extract_text_from_audio(audio_input_path)

final_prediction = multi_modal_model.predict(text=text_input, image_path=image_input_path, audio_path=audio_input_path)

# Test with missing data (for example, only text and audio)
final_prediction = multi_modal_model.predict(text=text_input, audio_path=audio_input_path)

# Test with only one modality (image)
final_prediction = multi_modal_model.predict(image_path=image_input_path)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted emotion from extracted text: Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted emotion from image: Anger
Final predicted emotion: Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 732ms/step
Predicted emotion from extracted text: Neutral
Final predicted emotion: Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
Predicted emotion from image: Anger
Final predicted emotion: Anger


In [45]:
import pickle
import numpy as np
import speech_recognition as sr
from collections import Counter

# Load the pre-trained models
with open('audio_classifier_model.pkl', 'rb') as f:
    audio_classifier = pickle.load(f)

with open('text_classifier_model.pkl', 'rb') as f:
    text_classifier = pickle.load(f)

with open('image_predictor.pkl', 'rb') as f:
    image_predictor = pickle.load(f)

# Emotion Labels for different models
emotion_labels_img = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise"]
emotion_labels_audio = ["Happiness", "Neutral", "Sadness", "Anger", "Fear", "Disgust"]
emotion_labels_text = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]

# Function to extract text from audio using SpeechRecognition
def extract_text_from_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_data = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio_data)
        return text
    except (sr.UnknownValueError, sr.RequestError) as e:
        print(f"Error in audio text extraction: {e}")
        return None

class MultiModalSentimentAnalysis:
    def __init__(self, audio_model, text_model, image_model):
        self.audio_model = audio_model
        self.text_model = text_model
        self.image_model = image_model

    def priority_majority_fusion(self, predictions, text_prediction=None):
        # Priority-majority fusion logic:

        # If all predictions are different, prioritize image only if all are unequal and text is "Neutral"
        if len(set(predictions)) == 3:
            if text_prediction == "Neutral":
                return "Neutral"  # If text is Neutral, return Neutral
            return predictions[2]  # Otherwise, return the image prediction if all are different
        
        # Majority voting logic
        most_common_emotion, _ = Counter(predictions).most_common(1)[0]
        
        # If text is "Neutral" and the majority is different, return majority, not neutral
        if text_prediction == "Neutral" and most_common_emotion != "Neutral":
            return most_common_emotion

        return most_common_emotion

    def predict(self, text=None, image_path=None, audio_path=None):
        predictions = []
        text_prediction = None

        # Step 1: Predict using audio model if audio is provided
        if audio_path is not None:
            try:
                audio_prediction = self.audio_model.audio_classify(audio_path)
                audio_emotion = emotion_labels_audio[int(audio_prediction)]
                predictions.append(audio_emotion)
                print(f"Predicted emotion from audio: {audio_emotion}")
            except Exception as e:
                print(f"Error in audio prediction: {e}")

            # Step 2: Extract text from audio and predict using text model
            extracted_text = extract_text_from_audio(audio_path)
            if extracted_text:
                text_prediction = self.text_model.text_classify(extracted_text)
                text_emotion = emotion_labels_text[int(text_prediction)]
                predictions.append(text_emotion)
                print(f"Predicted emotion from extracted text: {text_emotion}")
            else:
                print("No text extracted from audio.")

        # Step 3: Predict from image if image path is provided
        if image_path is not None:
            try:
                image_prediction = self.image_model.image_classify(image_path)
                image_emotion = emotion_labels_img[int(image_prediction)]
                predictions.append(image_emotion)
                print(f"Predicted emotion from image: {image_emotion}")
            except Exception as e:
                print(f"Error in image prediction: {e}")
        
        # If predictions are available, apply fusion logic
        if len(predictions) > 0:
            final_emotion = self.priority_majority_fusion(predictions, text_prediction)
            print(f"Final predicted emotion: {final_emotion}")
            return final_emotion
        else:
            print("No data provided for prediction.")
            return None

# Instantiate the multi-modal sentiment analysis model
multi_modal_model = MultiModalSentimentAnalysis(
    audio_model=audio_classifier,
    text_model=text_classifier,
    image_model=image_predictor
)

# Example usage:

# Test with all data present (text, image, and audio)
image_input_path = "./Image/6 Emotions for image classification/anger.jpg/gv232bwvvkb8m27mh1.jpg"
audio_input_path = "./AUDIO/TESS Toronto emotional speech set data/OAF_angry/OAF_burn_angry.wav"
text_input = extract_text_from_audio(audio_input_path)

final_prediction = multi_modal_model.predict(text=text_input, image_path=image_input_path, audio_path=audio_input_path)

# Test with missing data (for example, only text and audio)
final_prediction = multi_modal_model.predict(text=text_input, audio_path=audio_input_path)

# Test with only one modality (image)
final_prediction = multi_modal_model.predict(image_path=image_input_path)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279ms/step
Predicted emotion from audio: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted emotion from extracted text: Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted emotion from image: Anger
Final predicted emotion: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Predicted emotion from audio: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 969ms/step
Predicted emotion from extracted text: Neutral
Final predicted emotion: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
Predicted emotion from image: Anger
Final predicted emotion: Anger
