In [2]:
import pickle
from predictor_all import AudioPredictor
from predictor_all import VoteClassifier
from predictor_all import ImagePredictor
from keras.models import load_model
import speech_recognition as sr
from collections import Counter
import cv2

import pandas as pd
import numpy as np
import os

import threading
import speech_recognition as sr
from collections import Counter
from keras.preprocessing import image
import librosa
from moviepy.editor import VideoFileClip
import tempfile
import os
from queue import Queue
from keras.preprocessing import sequence
from keras.preprocessing import image
import warnings



In [3]:
# Load the pre-trained models
with open('audio_classifier_model.pkl', 'rb') as f:
    audio_classifier = pickle.load(f)

with open('text_classifier_model.pkl', 'rb') as f:
    text_classifier = pickle.load(f)

with open('image_predictor.pkl', 'rb') as f:
    image_predictor = pickle.load(f)

In [4]:
# Emotion Labels for different models
emotion_labels_img = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise"]
emotion_labels_audio = ["Happiness", "Neutral", "Sadness", "Anger", "Fear", "Disgust"]
emotion_labels_text = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]


In [5]:
# Function to extract text from audio using SpeechRecognition
def extract_text_from_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_data = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio_data)
        return text
    except (sr.UnknownValueError, sr.RequestError) as e:
        print(f"Error in audio text extraction: {e}")
        return None

In [44]:
class MultiModalSentimentAnalysis:
    def __init__(self, audio_model, text_model, image_model):
        self.audio_model = audio_model
        self.text_model = text_model
        self.image_model = image_model

    def predict(self, text=None, image_path=None, audio_path=None):
        predictions = []
        audio_emotion = None
        image_emotion = None
        text_emotion = None

        # Predict from text if provided
        if text is not None:
            try:
                text_prediction = self.text_model.text_classify(text)
                text_emotion = emotion_labels_text[int(text_prediction)]
                predictions.append(text_emotion)
                print(f"Predicted emotion from text: {text_emotion}")
            except Exception as e:
                print(f"Error in text prediction: {e}")
        
        # Predict from image if provided
        if image_path is not None:
            try:
                image_prediction = self.image_model.image_classify(image_path)
                image_emotion = emotion_labels_img[int(image_prediction)]
                predictions.append(image_emotion)
                print(f"Predicted emotion from image: {image_emotion}")
            except Exception as e:
                print(f"Error in image prediction: {e}")
        
        # Predict from audio if provided
        if audio_path is not None:
            try:
                audio_prediction = self.audio_model.audio_classify(audio_path)
                audio_emotion = emotion_labels_audio[int(audio_prediction)]
                predictions.append(audio_emotion)
                print(f"Predicted emotion from audio: {audio_emotion}")
            except Exception as e:
                print(f"Error in audio prediction: {e}")
        
        # Handle Neutral audio or text emotion: if "Neutral", shift priority to Neutral
        if audio_emotion == "Neutral" and text_emotion == "Neutral":
            print("Audio and Text are both 'Neutral'. Prioritizing Neutral prediction.")
            predictions = ["Neutral"]  # Prioritize Neutral regardless of image prediction

        # Handle Neutral audio emotion: if "Neutral", shift priority to image
        elif audio_emotion == "Neutral" and image_emotion is not None:
            print(f"Audio prediction is 'Neutral'. Shifting priority to image prediction.")
            predictions = [image_emotion]  # Ignore the Neutral audio emotion and use the image emotion
        
        # Majority voting logic: check all predictions
        if predictions:
            most_common_emotion = max(set(predictions), key=predictions.count)
            print(f"Final predicted emotion (after majority check): {most_common_emotion}")
            return most_common_emotion
        else:
            print("No data provided for prediction.")
            return None

    def process_video(self, video_path):
        # Extract audio from video
        video = VideoFileClip(video_path)
        audio_path = "temp_audio.wav"
        video.audio.write_audiofile(audio_path)
        
        # Extract frames from video
        cap = cv2.VideoCapture(video_path)
        frame_count = 0
        image_path = None
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            # Save only the first frame as an image for simplicity
            if frame_count == 1:
                image_path = "temp_image.jpg"
                cv2.imwrite(image_path, frame)
                break
        cap.release()

        # Predict sentiment using the extracted audio, text, and image
        text_input = extract_text_from_audio(audio_path)  # Extract text from the audio
        final_emotion = self.predict(text=text_input, image_path=image_path, audio_path=audio_path)

        # Clean up temporary files
        if os.path.exists(audio_path):
            os.remove(audio_path)
        if image_path and os.path.exists(image_path):
            os.remove(image_path)

        return final_emotion

# Instantiate the multi-modal sentiment analysis model
multi_modal_model = MultiModalSentimentAnalysis(
    audio_model=audio_classifier,
    text_model=text_classifier,
    image_model=image_predictor
)

# Example usage:

# Test with a video input
video_input_path = "./video_data/Anger/Angry Man Free Stock Footage _ Man yells at woman Angry _ No Copyright _.mp4"  # Replace with your video file path
final_prediction = multi_modal_model.process_video(video_input_path)

# Check the final prediction
if isinstance(final_prediction, list):
    print(f"Final predicted emotions from image, audio, and text: {final_prediction}")
else:
    print(f"Final predicted emotion from video: {final_prediction}")


MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.




Error in audio text extraction: 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
Predicted emotion from image: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
Predicted emotion from audio: Neutral
Audio prediction is 'Neutral'. Shifting priority to image prediction.
Final predicted emotion (after majority check): Anger
Final predicted emotion from video: Anger


In [50]:
# Test with a video input
video_input_path = "./video_data/Anger/Just an Act👿👿😤#newshorts#art#english#dialogue#angry.mp4"  # Replace with your video file path
final_prediction = multi_modal_model.process_video(video_input_path)

# Check if the final prediction contains two different emotions
if isinstance(final_prediction, list):
    print(f"Final predicted emotions from image, audio, and text: {final_prediction}")
else:
    print(f"Final predicted emotion from video: {final_prediction}")

MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.




Error in audio text extraction: 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
Predicted emotion from image: Sadness
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step
Predicted emotion from audio: Anger
Final predicted emotion (after majority check): Anger
Final predicted emotion from video: Anger


In [46]:
# Test with a video input
video_input_path = "./video_data/Disgust/disgust.mp4"  # Replace with your video file path
final_prediction = multi_modal_model.process_video(video_input_path)

# Check if the final prediction contains two different emotions
if isinstance(final_prediction, list):
    print(f"Final predicted emotions from image, audio, and text: {final_prediction}")
else:
    print(f"Final predicted emotion from video: {final_prediction}")

MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 770ms/step
Predicted emotion from text: Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
Predicted emotion from image: Disgust
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Predicted emotion from audio: Neutral
Audio and Text are both 'Neutral'. Prioritizing Neutral prediction.
Final predicted emotion (after majority check): Neutral
Final predicted emotion from video: Neutral


In [52]:
   # Test with a video input
video_input_path = "./video_data/Sadness/I am problem - sad  English status.mp4"  # Replace with your video file path
final_prediction = multi_modal_model.process_video(video_input_path)

# Check if the final prediction contains two different emotions
if isinstance(final_prediction, list):
    print(f"Final predicted emotions from image, audio, and text: {final_prediction}")
else:
    print(f"Final predicted emotion from video: {final_prediction}") 

MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 755ms/step
Predicted emotion from text: Sadness
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
Predicted emotion from image: Disgust
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step
Predicted emotion from audio: Sadness
Final predicted emotion (after majority check): Sadness
Final predicted emotion from video: Sadness


In [60]:
   # Test with a video input
video_input_path = "./video_data/Happiness/videoplayback (2).mp4"  # Replace with your video file path
final_prediction = multi_modal_model.process_video(video_input_path)

# Check if the final prediction contains two different emotions
if isinstance(final_prediction, list):
    print(f"Final predicted emotions from image, audio, and text: {final_prediction}")
else:
    print(f"Final predicted emotion from video: {final_prediction}") 

MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 800ms/step
Predicted emotion from text: Fear
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
Predicted emotion from image: Happiness
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
Predicted emotion from audio: Anger
Final predicted emotion (after majority check): Happiness
Final predicted emotion from video: Happiness


In [58]:
   # Test with a video input
video_input_path = "./video_data/Happiness/The Easiest Monologue in the World! All Emotions in 1 Minute!.mp4"  # Replace with your video file path
final_prediction = multi_modal_model.process_video(video_input_path)

# Check if the final prediction contains two different emotions
if isinstance(final_prediction, list):
    print(f"Final predicted emotions from image, audio, and text: {final_prediction}")
else:
    print(f"Final predicted emotion from video: {final_prediction}") 

MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.
Error in text prediction: {{function_node __wrapped__IteratorGetNext_output_types_1_device_/job:localhost/replica:0/task:0/device:CPU:0}} Error in user-defined function passed to ParallelMapDatasetV2:832 transformation with iterator: Iterator::Root::Prefetch::ParallelMapV2: indices[0] = 0 is not in [0, 0)
	 [[{{node RaggedGather/RaggedGather}}]] [Op:IteratorGetNext] name: 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
Predicted emotion from image: Happiness
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 395ms/step
Predicted emotion from audio: Sadness
Final predicted emotion (after majority check): Happiness
Final predicted emotion from video: Happiness


In [62]:
   # Test with a video input
video_input_path = "./video_data/Happiness/videoplayback.mp4"  # Replace with your video file path
final_prediction = multi_modal_model.process_video(video_input_path)

# Check if the final prediction contains two different emotions
if isinstance(final_prediction, list):
    print(f"Final predicted emotions from image, audio, and text: {final_prediction}")
else:
    print(f"Final predicted emotion from video: {final_prediction}") 

MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.
Error in audio text extraction: 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
Predicted emotion from image: Surprise
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step
Predicted emotion from audio: Neutral
Audio prediction is 'Neutral'. Shifting priority to image prediction.
Final predicted emotion (after majority check): Surprise
Final predicted emotion from video: Surprise


# ___________________________________________________________________________________________________________

In [None]:
import threading
import sounddevice as sd
import wave
import os

# Global flag for stopping audio recording
recording_active = True

# Function to record audio in the background
def record_audio():
    global recording_active
    fs = 16000  # Sample rate
    duration = 5  # Record for 5 seconds at a time
    while recording_active:  # Loop while recording_active is True
        print("Recording audio...")
        recording = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='int16')
        sd.wait()

        # Save recorded audio to a WAV file
        with wave.open("temp_audio.wav", 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(fs)
            wf.writeframes(recording.tobytes())
        print("Audio recorded and saved.")

# Function to stop all recording processes
def stop_all_recording():
    global recording_active
    recording_active = False  # This will stop the audio recording thread
    print("Audio recording stopped.")

# Start the audio recording in a separate thread
audio_thread = threading.Thread(target=record_audio, daemon=True)
audio_thread.start()

# Wait for the user to stop the recording
input("Press Enter to stop all recording...")

# Call the function to stop recording
stop_all_recording()

# Wait for the audio recording thread to finish
audio_thread.join()

print("All processes stopped.")


In [None]:
import threading
import sounddevice as sd
import wave
import os

# Global flag for stopping audio recording
recording_active = True

# Function to record audio in the background
def record_audio():
    global recording_active
    fs = 16000  # Sample rate
    duration = 5  # Record for 5 seconds at a time
    while recording_active:  # Loop while recording_active is True
        print("Recording audio...")
        recording = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='int16')
        sd.wait()

        # Save recorded audio to a WAV file
        with wave.open("temp_audio.wav", 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(fs)
            wf.writeframes(recording.tobytes())
        print("Audio recorded and saved.")

# Function to stop all recording processes
def stop_all_recording():
    global recording_active
    recording_active = False  # This will stop the audio recording thread
    print("Audio recording stopped.")

# Start the audio recording in a separate thread
audio_thread = threading.Thread(target=record_audio, daemon=True)
audio_thread.start()

# Wait for the user to stop the recording
input("Press Enter to stop all recording...")

# Call the function to stop recording
stop_all_recording()

# Wait for the audio recording thread to finish
audio_thread.join()

print("All processes stopped.")


In [84]:
import pickle
import cv2
import numpy as np
import threading
import speech_recognition as sr
from collections import Counter
from keras.preprocessing import image
import librosa
from moviepy.editor import VideoFileClip
import tempfile
import os
from queue import Queue
from tensorflow.keras.models import load_model  # Ensure Keras models are imported correctly

# Load the pre-trained models
with open('audio_classifier_model.pkl', 'rb') as f:
    audio_classifier = pickle.load(f)

with open('text_classifier_model.pkl', 'rb') as f:
    text_classifier = pickle.load(f)

with open('image_predictor.pkl', 'rb') as f:
    image_predictor = pickle.load(f)  # Ensure this loads your image model

# Emotion Labels for different models
emotion_labels_img = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise"]
emotion_labels_audio = ["Happiness", "Neutral", "Sadness", "Anger", "Fear", "Disgust"]
emotion_labels_text = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]

# Function to extract text from audio using SpeechRecognition
def extract_text_from_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_data = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio_data)
        return text
    except (sr.UnknownValueError, sr.RequestError) as e:
        print(f"Error in audio text extraction: {e}")
        return None


class AudioPredictor:
    def audio_classify(self, audio_path):
        # Implement actual audio classification logic here using your model
        # Example using librosa to extract features
        audio, sr = librosa.load(audio_path)
        features = librosa.feature.mfcc(audio, sr=sr)
        features = np.mean(features, axis=1).reshape(1, -1)  # Feature extraction
        prediction = audio_classifier.predict(features)
        return prediction[0]  # Return the predicted emotion class


class ImagePredictor:
    def __init__(self, model):
        self.model = model

    def preprocess_image(self, image_array):
        # Resize the image to match the input size of your model (e.g., 299x299 for InceptionV3)
        img = cv2.cvtColor(image_array, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB if using OpenCV
        img = cv2.resize(img, (299, 299))  # Resize to the desired shape
        img_array = image.img_to_array(img)  # Convert image to array
        img_array = np.expand_dims(img_array, axis=0)  # Add batch dimension
        img_array /= 255.0  # Normalize the image
        return img_array

    def image_classify(self, image_array):
        # Preprocess the image
        processed_image = self.preprocess_image(image_array)
        
        # Predict the emotion
        predictions = self.model.predict(processed_image)  # Ensure this works for your model
        predicted_class = np.argmax(predictions, axis=1)
        
        return predicted_class[0]  # Return the predicted class index


class MultiModalSentimentAnalysis:
    def __init__(self, audio_model, text_model, image_model):
        self.audio_model = audio_model
        self.text_model = text_model
        self.image_model = image_model

    def predict(self, text=None, image_path=None, audio_path=None):
        predictions = []

        # Predict from text if provided
        if text is not None:
            try:
                text_prediction = self.text_model.text_classify(text)
                text_emotion = emotion_labels_text[int(text_prediction)]
                predictions.append(text_emotion)
                print(f"Predicted emotion from text: {text_emotion}")
            except Exception as e:
                print(f"Error in text prediction: {e}")
        
        # Predict from image if provided
        if image_path is not None:
            try:
                image_prediction = self.image_model.image_classify(image_path)
                image_emotion = emotion_labels_img[int(image_prediction)]
                predictions.append(image_emotion)
                print(f"Predicted emotion from image: {image_emotion}")
            except Exception as e:
                print(f"Error in image prediction: {e}")
        
        # Predict from audio if provided
        if audio_path is not None:
            try:
                audio_prediction = self.audio_model.audio_classify(audio_path)
                audio_emotion = emotion_labels_audio[int(audio_prediction)]
                predictions.append(audio_emotion)
                print(f"Predicted emotion from audio: {audio_emotion}")
            except Exception as e:
                print(f"Error in audio prediction: {e}")
        
        # If all predictions are different, prioritize the image prediction
        if len(predictions) == 3 and len(set(predictions)) == 3:  # All predictions are different
            print(f"Different emotions predicted from all models. Prioritizing image prediction.")
            return predictions[1]  # Image prediction is in the second position (index 1)
        
        # Majority voting logic if predictions are the same or not all different
        if len(predictions) > 0:
            most_common_emotion = max(set(predictions), key=predictions.count)
            print(f"Final predicted emotion: {most_common_emotion}")
            return most_common_emotion
        else:
            print("No data provided for prediction.")
            return None

    def process_audio_data(self, audio_queue):
        # Initialize an empty byte string to hold the audio data
        audio_data = b''

        # Continue to process the audio queue until there's data
        while not audio_queue.empty():
            audio_data += audio_queue.get()

        # Save the audio data to a temporary .wav file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            tmp_file.write(audio_data)
            audio_path = tmp_file.name
    
        # Extract text from the audio
        text_input = extract_text_from_audio(audio_path)
    
        # Clean up temporary audio file
        if os.path.exists(audio_path):
            os.remove(audio_path)

        return text_input

    def live_stream(self):
        # Create video capture and audio stream
        cap = cv2.VideoCapture(0)
        if not cap.isOpened():
            print("Error: Could not access the webcam.")
            return

        # Create a queue for audio data
        audio_queue = Queue()
        audio_thread = threading.Thread(target=self.capture_audio, args=(audio_queue,))
        audio_thread.daemon = True
        audio_thread.start()

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Process image (live face frame)
            image_prediction = self.image_model.image_classify(frame)
            image_emotion = emotion_labels_img[int(image_prediction)]

            # Process audio data
            text_input = self.process_audio_data(audio_queue)  # Convert audio to text
            final_emotion = self.predict(text=text_input, image=frame, audio_data=None)

            # Display the predicted emotion
            cv2.putText(frame, f"Predicted Emotion: {final_emotion}", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.imshow("Live Sentiment Analysis", frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

        cap.release()
        cv2.destroyAllWindows()

    def capture_audio(self, audio_queue):
        recognizer = sr.Recognizer()
        microphone = sr.Microphone()

        while True:
            with microphone as source:
                audio_data = recognizer.listen(source)
                audio_queue.put(audio_data.get_wav_data())  # Store audio data for processing


# Instantiate the multi-modal sentiment analysis model
multi_modal_model = MultiModalSentimentAnalysis(
    audio_model=AudioPredictor(),
    text_model=text_classifier,
    image_model=image_predictor
)

# Start live stream for real-time sentiment prediction
multi_modal_model.live_stream()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step


ValueError: Audio file could not be read as PCM WAV, AIFF/AIFF-C, or Native FLAC; check if file is corrupted or in another format

In [64]:
import cv2
import numpy as np
import speech_recognition as sr
import tempfile
import os
from moviepy.editor import VideoFileClip
import threading
from queue import Queue

# Assuming emotion labels are predefined
emotion_labels_text = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]
emotion_labels_img = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]
emotion_labels_audio = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]

class MultiModalSentimentAnalysis:
    def __init__(self, audio_model, text_model, image_model):
        self.audio_model = audio_model
        self.text_model = text_model
        self.image_model = image_model

    def predict(self, text=None, image=None, audio_data=None):
        predictions = []

        # Predict from text if provided
        if text is not None:
            try:
                text_prediction = self.text_model.text_classify(text)
                text_emotion = emotion_labels_text[int(text_prediction)]
                predictions.append(text_emotion)
                print(f"Predicted emotion from text: {text_emotion}")
            except Exception as e:
                print(f"Error in text prediction: {e}")
        
        # Predict from image if provided
        if image is not None:
            try:
                image_prediction = self.image_model.image_classify(image)
                image_emotion = emotion_labels_img[int(image_prediction)]
                predictions.append(image_emotion)
                print(f"Predicted emotion from image: {image_emotion}")
            except Exception as e:
                print(f"Error in image prediction: {e}")
        
        # Predict from audio if provided
        if audio_data is not None:
            try:
                audio_prediction = self.audio_model.audio_classify(audio_data)
                audio_emotion = emotion_labels_audio[int(audio_prediction)]
                predictions.append(audio_emotion)
                print(f"Predicted emotion from audio: {audio_emotion}")
            except Exception as e:
                print(f"Error in audio prediction: {e}")
        
        # Majority voting logic if predictions are the same or not all different
        if len(predictions) > 0:
            most_common_emotion = max(set(predictions), key=predictions.count)
            print(f"Final predicted emotion: {most_common_emotion}")
            return most_common_emotion
        else:
            print("No data provided for prediction.")
            return None

    def process_audio_data(self, audio_queue):
        # Process captured audio data for sentiment prediction
        audio_data = b''
        while not audio_queue.empty():
            audio_data += audio_queue.get()  # Get the audio data from the queue

        if not audio_data:
            return ""  # Return empty if no audio data is available

        # Save the audio data to a temporary .wav file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            tmp_file.write(audio_data)
            tmp_filename = tmp_file.name

        # Use speech recognition to convert audio to text
        recognizer = sr.Recognizer()
        with sr.AudioFile(tmp_filename) as source:
            audio = recognizer.record(source)

        try:
            text = recognizer.recognize_google(audio)
            os.remove(tmp_filename)  # Clean up the temporary file
            return text
        except Exception as e:
            print(f"Error in speech recognition: {e}")
            os.remove(tmp_filename)  # Clean up the temporary file
            return ""

    def live_stream(self):
        # Create video capture and audio stream
        cap = cv2.VideoCapture(0)
        if not cap.isOpened():
            print("Error: Could not access the webcam.")
            return
        
        # Create a queue for audio data
        audio_queue = Queue()
        audio_thread = threading.Thread(target=self.capture_audio, args=(audio_queue,))
        audio_thread.daemon = True
        audio_thread.start()

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Process image (live face frame)
            image_prediction = self.image_model.image_classify(frame)
            image_emotion = emotion_labels_img[int(image_prediction)]

            # Process audio data
            text_input = self.process_audio_data(audio_queue)  # Convert audio to text
            final_emotion = self.predict(text=text_input, image=frame, audio_data=None)

            # Display the predicted emotion
            cv2.putText(frame, f"Predicted Emotion: {final_emotion}", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.imshow("Live Sentiment Analysis", frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
        
        cap.release()
        cv2.destroyAllWindows()

    def capture_audio(self, audio_queue):
        # Use SpeechRecognition to capture live audio from the microphone
        recognizer = sr.Recognizer()
        mic = sr.Microphone()

        with mic as source:
            while True:
                audio = recognizer.listen(source)
                audio_data = audio.get_wav_data()
                audio_queue.put(audio_data)

# Instantiate the multi-modal sentiment analysis model with dummy models
multi_modal_model = MultiModalSentimentAnalysis(
    audio_model=DummyAudioModel(),
    text_model=DummyTextModel(),
    image_model=DummyImageModel()
)

# Start live stream for real-time sentiment prediction
multi_modal_model.live_stream()


NameError: name 'DummyAudioModel' is not defined

In [66]:
import cv2

# Load the pre-trained Haar Cascade Classifier for face detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# Start the webcam
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Error: Could not access the webcam.")
else:
    print("Press 'q' to quit.")

# Frame count for naming captured images
frame_count = 0

while cap.isOpened():
    # Capture frame-by-frame
    ret, frame = cap.read()
    if not ret:
        print("Failed to capture frame.")
        break
    
    # Convert to grayscale for face detection
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    
    # Detect faces in the frame
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
    
    # Draw rectangles around the detected faces and capture the face image
    for (x, y, w, h) in faces:
        cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
        face = frame[y:y + h, x:x + w]
        frame_count += 1
        
        # Save the captured face image
        cv2.imwrite(f'face_{frame_count}.jpg', face)

    # Display the frame with rectangles drawn
    cv2.imshow('Face Capture', frame)

    # Break the loop when 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the webcam and close OpenCV windows
cap.release()
cv2.destroyAllWindows()

Press 'q' to quit.


# ____________________________________________________________________________________

In [26]:
class MultiModalSentimentAnalysis:
    def __init__(self, audio_model, text_model, image_model):
        self.audio_model = audio_model
        self.text_model = text_model
        self.image_model = image_model

    def predict(self, text=None, image_path=None, audio_path=None):
        predictions = []

        # Predict from text if provided
        if text is not None:
            try:
                text_prediction = self.text_model.text_classify(text)
                text_emotion = emotion_labels_text[int(text_prediction)]
                predictions.append(text_emotion)
                print(f"Predicted emotion from text: {text_emotion}")
            except Exception as e:
                print(f"Error in text prediction: {e}")
        
        # Predict from image if provided
        if image_path is not None:
            try:
                image_prediction = self.image_model.image_classify(image_path)
                image_emotion = emotion_labels_img[int(image_prediction)]
                predictions.append(image_emotion)
                print(f"Predicted emotion from image: {image_emotion}")
            except Exception as e:
                print(f"Error in image prediction: {e}")
        
        # Predict from audio if provided
        if audio_path is not None:
            try:
                audio_prediction = self.audio_model.audio_classify(audio_path)
                audio_emotion = emotion_labels_audio[int(audio_prediction)]
                predictions.append(audio_emotion)
                print(f"Predicted emotion from audio: {audio_emotion}")
            except Exception as e:
                print(f"Error in audio prediction: {e}")
        
        # If all predictions are different, prioritize the image prediction
        if len(predictions) == 3 and len(set(predictions)) == 3:  # All predictions are different
            print(f"Different emotions predicted from all models. Prioritizing image prediction.")
            return predictions[0]  # Image prediction is in the second position (index 1)
        
        # Majority voting logic if predictions are the same or not all different
        if len(predictions) > 0:
            most_common_emotion = max(set(predictions), key=predictions.count)
            print(f"Final predicted emotion: {most_common_emotion}")
            return most_common_emotion
        else:
            print("No data provided for prediction.")
            return None

    def process_video(self, video_path):
        # Extract audio from video
        video = VideoFileClip(video_path)
        audio_path = "temp_audio.wav"
        video.audio.write_audiofile(audio_path)
        
        # Extract frames from video
        cap = cv2.VideoCapture(video_path)
        frame_count = 0
        image_path = None
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            # Save only the first frame as an image for simplicity
            if frame_count == 1:
                image_path = "temp_image.jpg"
                cv2.imwrite(image_path, frame)
                break
        cap.release()

        # Predict sentiment using the extracted audio, text, and image
        text_input = extract_text_from_audio(audio_path)  # Extract text from the audio
        final_emotion = self.predict(text=text_input, image_path=image_path, audio_path=audio_path)

        # Clean up temporary files
        if os.path.exists(audio_path):
            os.remove(audio_path)
        if image_path and os.path.exists(image_path):
            os.remove(image_path)

        return final_emotion

# Instantiate the multi-modal sentiment analysis model
multi_modal_model = MultiModalSentimentAnalysis(
    audio_model=audio_classifier,
    text_model=text_classifier,
    image_model=image_predictor
)

# Example usage:

# Test with a video input
video_input_path = "./video_data/Anger/Angry Man Free Stock Footage _ Man yells at woman Angry _ No Copyright _.mp4"  # Replace with your video file path
final_prediction = multi_modal_model.process_video(video_input_path)

# Check if the final prediction contains two different emotions
if isinstance(final_prediction, list):
    print(f"Final predicted emotions from image, audio, and text: {final_prediction}")
else:
    print(f"Final predicted emotion from video: {final_prediction}")


MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.




Error in audio text extraction: 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
Predicted emotion from image: Anger
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
Predicted emotion from audio: Neutral
Final predicted emotion: Neutral
Final predicted emotion from video: Neutral


In [13]:
import cv2
import numpy as np
import pickle
import os
import speech_recognition as sr
from moviepy.editor import VideoFileClip
from collections import Counter

# Load the pre-trained models
with open('audio_classifier_model.pkl', 'rb') as f:
    audio_classifier = pickle.load(f)

with open('text_classifier_model.pkl', 'rb') as f:
    text_classifier = pickle.load(f)

with open('image_predictor.pkl', 'rb') as f:
    image_predictor = pickle.load(f)

# Emotion Labels for different models
emotion_labels_img = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise"]
emotion_labels_audio = ["Happiness", "Neutral", "Sadness", "Anger", "Fear", "Disgust"]
emotion_labels_text = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]

# Function to extract text from audio using SpeechRecognition
def extract_text_from_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_data = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio_data)
        return text
    except (sr.UnknownValueError, sr.RequestError) as e:
        print(f"Error in audio text extraction: {e}")
        return None

class MultiModalSentimentAnalysis:
    def __init__(self, audio_model, text_model, image_model):
        self.audio_model = audio_model
        self.text_model = text_model
        self.image_model = image_model

    def priority_majority_fusion(self, predictions, text_prediction=None):
        # Priority-majority fusion logic
        if len(set(predictions)) == 3:
            if text_prediction == "Neutral":
                return "Neutral"
            return predictions[2]
        
        most_common_emotion, _ = Counter(predictions).most_common(1)[0]
        
        if text_prediction == "Neutral" and most_common_emotion != "Neutral":
            return most_common_emotion

        return most_common_emotion

    def predict(self, text=None, image_path=None, audio_path=None):
        predictions = []
        text_prediction = None

        # Extract and predict from text if audio is provided
        if audio_path is not None:
            extracted_text = extract_text_from_audio(audio_path)
            
            if extracted_text:
                text_prediction = self.text_model.text_classify(extracted_text)
                text_emotion = emotion_labels_text[int(text_prediction)]
                predictions.append(text_emotion)
                print(f"Predicted emotion from extracted text: {text_emotion}")
            else:
                print("No text extracted from audio. Proceeding with audio and image prediction.")
                try:
                    audio_prediction = self.audio_model.audio_classify(audio_path)
                    audio_emotion = emotion_labels_audio[int(audio_prediction)]
                    predictions.append(audio_emotion)
                    print(f"Predicted emotion from audio: {audio_emotion}")
                except Exception as e:
                    print(f"Error in audio prediction: {e}")

        # Predict from image if image path is provided
        if image_path is not None:
            try:
                image_prediction = self.image_model.image_classify(image_path)
                image_emotion = emotion_labels_img[int(image_prediction)]
                predictions.append(image_emotion)
                print(f"Predicted emotion from image: {image_emotion}")
            except Exception as e:
                print(f"Error in image prediction: {e}")
        
        # Apply fusion logic if predictions are available
        if len(predictions) > 0:
            final_emotion = self.priority_majority_fusion(predictions, text_prediction)
            print(f"Final predicted emotion: {final_emotion}")
            return final_emotion
        else:
            print("No data provided for prediction.")
            return None

    def process_video(self, video_path):
        # Extract audio from video
        video = VideoFileClip(video_path)
        audio_path = "temp_audio.wav"
        video.audio.write_audiofile(audio_path)
        
        # Extract frames from video
        cap = cv2.VideoCapture(video_path)
        frame_count = 0
        image_path = None
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            # Save only the first frame as an image for simplicity
            if frame_count == 1:
                image_path = "temp_image.jpg"
                cv2.imwrite(image_path, frame)
                break
        cap.release()

        # Predict sentiment using the extracted audio, text, and image
        final_emotion = self.predict(image_path=image_path, audio_path=audio_path)

        # Clean up temporary files
        if os.path.exists(audio_path):
            os.remove(audio_path)
        if image_path and os.path.exists(image_path):
            os.remove(image_path)

        return final_emotion

# Instantiate the multi-modal sentiment analysis model
multi_modal_model = MultiModalSentimentAnalysis(
    audio_model=audio_classifier,
    text_model=text_classifier,
    image_model=image_predictor
)

# Example usage
video_input_path = "./Angry.mp4"
final_prediction = multi_modal_model.process_video(video_input_path)
print(f"Final predicted emotion from video: {final_prediction}")


MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Predicted emotion from extracted text: Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted emotion from image: Disgust
Final predicted emotion: Neutral
Final predicted emotion from video: Neutral


In [30]:
import pickle
import numpy as np
import cv2
import os
from moviepy.editor import VideoFileClip
import speech_recognition as sr

# Load the pre-trained models
with open('audio_classifier_model.pkl', 'rb') as f:
    audio_classifier = pickle.load(f)

with open('text_classifier_model.pkl', 'rb') as f:
    text_classifier = pickle.load(f)

with open('image_predictor.pkl', 'rb') as f:
    image_predictor = pickle.load(f)

# Emotion Labels for different models
emotion_labels_img = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise"]
emotion_labels_audio = ["Happiness", "Neutral", "Sadness", "Anger", "Fear", "Disgust"]
emotion_labels_text = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]

# Function to extract text from audio using SpeechRecognition
def extract_text_from_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_data = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio_data)
        return text
    except (sr.UnknownValueError, sr.RequestError) as e:
        print(f"Error in audio text extraction: {e}")
        return None

class MultiModalSentimentAnalysis:
    def __init__(self, audio_model, text_model, image_model):
        self.audio_model = audio_model
        self.text_model = text_model
        self.image_model = image_model

    def predict(self, text=None, image_path=None, audio_path=None):
        predictions = []

        # Predict from text if provided
        if text is not None:
            try:
                text_prediction = self.text_model.text_classify(text)
                text_emotion = emotion_labels_text[int(text_prediction)]
                predictions.append(text_emotion)
                print(f"Predicted emotion from text: {text_emotion}")
            except Exception as e:
                print(f"Error in text prediction: {e}")
        
        # Predict from image if provided
        if image_path is not None:
            try:
                image_prediction = self.image_model.image_classify(image_path)
                image_emotion = emotion_labels_img[int(image_prediction)]
                predictions.append(image_emotion)
                print(f"Predicted emotion from image: {image_emotion}")
            except Exception as e:
                print(f"Error in image prediction: {e}")
        
        # Predict from audio if provided
        if audio_path is not None:
            try:
                audio_prediction = self.audio_model.audio_classify(audio_path)
                audio_emotion = emotion_labels_audio[int(audio_prediction)]
                predictions.append(audio_emotion)
                print(f"Predicted emotion from audio: {audio_emotion}")
            except Exception as e:
                print(f"Error in audio prediction: {e}")
        
        # Combine predictions (if more than one prediction is available)
        if len(predictions) > 0:
            # Majority voting logic
            most_common_emotion = max(set(predictions), key=predictions.count)
            print(f"Final predicted emotion: {most_common_emotion}")
            return most_common_emotion
        else:
            print("No data provided for prediction.")
            return None

    def process_video(self, video_path):
        # Extract audio from video
        video = VideoFileClip(video_path)
        audio_path = "temp_audio.wav"
        video.audio.write_audiofile(audio_path)
        
        # Extract frames from video
        cap = cv2.VideoCapture(video_path)
        frame_count = 0
        image_path = None
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            # Save only the first frame as an image for simplicity
            if frame_count == 1:
                image_path = "temp_image.jpg"
                cv2.imwrite(image_path, frame)
                break
        cap.release()

        # Predict sentiment using the extracted audio, text, and image
        text_input = extract_text_from_audio(audio_path)  # Extract text from the audio
        final_emotion = self.predict(text=text_input, image_path=image_path, audio_path=audio_path)

        # Clean up temporary files
        if os.path.exists(audio_path):
            os.remove(audio_path)
        if image_path and os.path.exists(image_path):
            os.remove(image_path)

        return final_emotion

# Instantiate the multi-modal sentiment analysis model
multi_modal_model = MultiModalSentimentAnalysis(
    audio_model=audio_classifier,
    text_model=text_classifier,
    image_model=image_predictor
)

# Example usage:

# Test with a video input
video_input_path = "./Sad.mp4"  # Replace with your video file path
final_prediction = multi_modal_model.process_video(video_input_path)
print(f"Final predicted emotion from video: {final_prediction}")


MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 856ms/step
Predicted emotion from text: Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted emotion from image: Disgust
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step
Predicted emotion from audio: Sadness
Final predicted emotion: Disgust
Final predicted emotion from video: Disgust


In [28]:
import pickle
import numpy as np
import cv2
import os
from moviepy.editor import VideoFileClip
import speech_recognition as sr

# Load the pre-trained models
with open('audio_classifier_model.pkl', 'rb') as f:
    audio_classifier = pickle.load(f)

with open('text_classifier_model.pkl', 'rb') as f:
    text_classifier = pickle.load(f)

with open('image_predictor.pkl', 'rb') as f:
    image_predictor = pickle.load(f)

# Emotion Labels for different models
emotion_labels_img = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise"]
emotion_labels_audio = ["Happiness", "Neutral", "Sadness", "Anger", "Fear", "Disgust"]
emotion_labels_text = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]

# Function to extract text from audio using SpeechRecognition
def extract_text_from_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_data = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio_data)
        return text
    except (sr.UnknownValueError, sr.RequestError) as e:
        print(f"Error in audio text extraction: {e}")
        return None

class MultiModalSentimentAnalysis:
    def __init__(self, audio_model, text_model, image_model):
        self.audio_model = audio_model
        self.text_model = text_model
        self.image_model = image_model

    def predict(self, text=None, image_path=None, audio_path=None):
        predictions = []

        # Predict from text if provided
        if text is not None:
            try:
                text_prediction = self.text_model.text_classify(text)
                text_emotion = emotion_labels_text[int(text_prediction)]
                predictions.append(text_emotion)
                print(f"Predicted emotion from text: {text_emotion}")
            except Exception as e:
                print(f"Error in text prediction: {e}")
        
        # Predict from image if provided
        if image_path is not None:
            try:
                image_prediction = self.image_model.image_classify(image_path)
                image_emotion = emotion_labels_img[int(image_prediction)]
                predictions.append(image_emotion)
                print(f"Predicted emotion from image: {image_emotion}")
            except Exception as e:
                print(f"Error in image prediction: {e}")
        
        # Predict from audio if provided
        if audio_path is not None:
            try:
                audio_prediction = self.audio_model.audio_classify(audio_path)
                audio_emotion = emotion_labels_audio[int(audio_prediction)]
                predictions.append(audio_emotion)
                print(f"Predicted emotion from audio: {audio_emotion}")
            except Exception as e:
                print(f"Error in audio prediction: {e}")
        
        # Combine predictions (if more than one prediction is available)
        if len(predictions) > 0:
            # Majority voting logic
            most_common_emotion = max(set(predictions), key=predictions.count)
            print(f"Final predicted emotion: {most_common_emotion}")
            return most_common_emotion
        else:
            print("No data provided for prediction.")
            return None

    def process_video(self, video_path):
        # Extract audio from video
        video = VideoFileClip(video_path)
        audio_path = "temp_audio.wav"
        video.audio.write_audiofile(audio_path)
        
        # Extract frames from video
        cap = cv2.VideoCapture(video_path)
        frame_count = 0
        image_path = None
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1

            # Display video frames in real-time
            cv2.imshow('Processing Video', frame)

            # Save only the first frame as an image for simplicity
            if frame_count == 1:
                image_path = "temp_image.jpg"
                cv2.imwrite(image_path, frame)
            
            # Check for 'q' key to stop the video window if needed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

        cap.release()
        cv2.destroyAllWindows()

        # Predict sentiment using the extracted audio, text, and image
        text_input = extract_text_from_audio(audio_path)  # Extract text from the audio
        final_emotion = self.predict(text=text_input, image_path=image_path, audio_path=audio_path)

        # Clean up temporary files
        if os.path.exists(audio_path):
            os.remove(audio_path)
        if image_path and os.path.exists(image_path):
            os.remove(image_path)

        return final_emotion

# Instantiate the multi-modal sentiment analysis model
multi_modal_model = MultiModalSentimentAnalysis(
    audio_model=audio_classifier,
    text_model=text_classifier,
    image_model=image_predictor
)

# Example usage:

# Test with a video input
video_input_path = "./Fear.mp4"  # Replace with your video file path
final_prediction = multi_modal_model.process_video(video_input_path)
print(f"Final predicted emotion from video: {final_prediction}")


MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.




Error in audio text extraction: 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted emotion from image: Surprise
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 290ms/step
Predicted emotion from audio: Sadness
Final predicted emotion: Sadness
Final predicted emotion from video: Sadness


In [20]:
import cv2
import numpy as np
import pickle
import os
import speech_recognition as sr
from moviepy.editor import VideoFileClip
from collections import Counter

# Load the pre-trained models
with open('audio_classifier_model.pkl', 'rb') as f:
    audio_classifier = pickle.load(f)

with open('text_classifier_model.pkl', 'rb') as f:
    text_classifier = pickle.load(f)

with open('image_predictor.pkl', 'rb') as f:
    image_predictor = pickle.load(f)

# Emotion Labels for different models
emotion_labels_img = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise"]
emotion_labels_audio = ["Happiness", "Neutral", "Sadness", "Anger", "Fear", "Disgust"]
emotion_labels_text = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]

# Function to extract text from audio using SpeechRecognition
def extract_text_from_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_data = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio_data)
        return text
    except (sr.UnknownValueError, sr.RequestError) as e:
        print(f"Error in audio text extraction: {e}")
        return None

class MultiModalSentimentAnalysis:
    def __init__(self, audio_model, text_model, image_model):
        self.audio_model = audio_model
        self.text_model = text_model
        self.image_model = image_model

    def majority_fusion(self, predictions):
        # Majority voting for predictions
        most_common_emotion, _ = Counter(predictions).most_common(1)[0]
        return most_common_emotion

    def predict(self, text=None, image_path=None, audio_path=None):
        predictions = []
        text_prediction = None

        # Extract and predict from audio if audio is provided
        if audio_path is not None:
            extracted_text = extract_text_from_audio(audio_path)
            
            if extracted_text:
                # Predict sentiment using text classifier from extracted text
                text_prediction = self.text_model.text_classify(extracted_text)
                text_emotion = emotion_labels_text[int(text_prediction)]
                predictions.append(text_emotion)
                print(f"Predicted emotion from extracted text: {text_emotion}")
            else:
                print("No text extracted from audio. Proceeding with audio and image prediction.")
            
            try:
                # Predict sentiment using audio classifier from raw audio features
                audio_prediction = self.audio_model.audio_classify(audio_path)
                audio_emotion = emotion_labels_audio[int(audio_prediction)]
                predictions.append(audio_emotion)
                print(f"Predicted emotion from audio features: {audio_emotion}")
            except Exception as e:
                print(f"Error in audio prediction: {e}")

        # Predict from image if image path is provided
        if image_path is not None:
            try:
                # Predict sentiment using image classifier
                image_prediction = self.image_model.image_classify(image_path)
                image_emotion = emotion_labels_img[int(image_prediction)]
                predictions.append(image_emotion)
                print(f"Predicted emotion from image: {image_emotion}")
            except Exception as e:
                print(f"Error in image prediction: {e}")

        # Apply fusion logic if predictions are available
        if len(predictions) > 0:
            final_emotion = self.majority_fusion(predictions)
            print(f"Final predicted emotion: {final_emotion}")
            return final_emotion
        else:
            print("No data provided for prediction.")
            return None

    def process_video(self, video_path):
        # Extract audio from video
        video = VideoFileClip(video_path)
        audio_path = "temp_audio.wav"
        video.audio.write_audiofile(audio_path)
        
        # Extract frames from video
        cap = cv2.VideoCapture(video_path)
        frame_count = 0
        image_path = None
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            # Save only the first frame as an image for simplicity
            if frame_count == 1:
                image_path = "temp_image.jpg"
                cv2.imwrite(image_path, frame)
                break
        cap.release()

        # Predict sentiment using the extracted audio, text, and image
        final_emotion = self.predict(image_path=image_path, audio_path=audio_path)

        # Clean up temporary files
        if os.path.exists(audio_path):
            os.remove(audio_path)
        if image_path and os.path.exists(image_path):
            os.remove(image_path)

        return final_emotion

# Instantiate the multi-modal sentiment analysis model
multi_modal_model = MultiModalSentimentAnalysis(
    audio_model=audio_classifier,
    text_model=text_classifier,
    image_model=image_predictor
)

# Example usage
video_input_path = "./Sad.mp4"
final_prediction = multi_modal_model.process_video(video_input_path)
print(f"Final predicted emotion from video: {final_prediction}")


MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 963ms/step
Predicted emotion from extracted text: Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 272ms/step
Predicted emotion from audio features: Sadness
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted emotion from image: Disgust
Final predicted emotion: Neutral
Final predicted emotion from video: Neutral


In [5]:
import cv2
import numpy as np
import pickle
import speech_recognition as sr
from moviepy.editor import VideoFileClip
from collections import Counter
import io
from PIL import Image

# Load pre-trained models
with open('audio_classifier_model.pkl', 'rb') as f:
    audio_classifier = pickle.load(f)

with open('text_classifier_model.pkl', 'rb') as f:
    text_classifier = pickle.load(f)

with open('image_predictor.pkl', 'rb') as f:
    image_predictor = pickle.load(f)

# Emotion labels for different models
emotion_labels_img = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise"]
emotion_labels_audio = ["Happiness", "Neutral", "Sadness", "Anger", "Fear", "Disgust"]
emotion_labels_text = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]

# Function to extract audio from video and return as variable
def extract_audio_from_video(video_path):
    video = VideoFileClip(video_path)
    audio = video.audio
    audio_path = "temp_audio.wav"
    audio.write_audiofile(audio_path, codec='pcm_s16le')
    return audio_path

# Function to extract frames from video and return as a list of variables
def extract_frames_from_video(video_path, frame_interval=30):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_interval == 0:
            frames.append(frame)
        frame_count += 1

    cap.release()
    return frames

# Function to extract text from audio using SpeechRecognition
def extract_text_from_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio_data)
        return text
    except (sr.UnknownValueError, sr.RequestError) as e:
        print(f"Error in audio text extraction: {e}")
        return None

class MultiModalSentimentAnalysis:
    def __init__(self, audio_model, text_model, image_model):
        self.audio_model = audio_model
        self.text_model = text_model
        self.image_model = image_model

    def priority_majority_fusion(self, predictions, text_prediction=None):
        if len(set(predictions)) == len(predictions):
            if text_prediction == "Neutral":
                return "Neutral"
            return predictions[-1]

        most_common_emotion, _ = Counter(predictions).most_common(1)[0]
        if text_prediction == "Neutral" and most_common_emotion != "Neutral":
            return most_common_emotion
        return most_common_emotion

    def predict(self, audio_path=None, frames=None, text=None):
        predictions = []
        text_prediction = None

        # Text prediction
        if text:
            text_prediction = self.text_model.text_classify(text)
            text_emotion = emotion_labels_text[int(text_prediction)]
            predictions.append(text_emotion)
            print(f"Predicted emotion from text: {text_emotion}")

        # Audio prediction
        if audio_path:
            try:
                audio_prediction = self.audio_model.audio_classify(audio_path)
                audio_emotion = emotion_labels_audio[int(audio_prediction)]
                predictions.append(audio_emotion)
                print(f"Predicted emotion from audio: {audio_emotion}")
            except Exception as e:
                print(f"Error in audio prediction: {e}")

        # Image prediction
        if frames:
            frame_emotions = []
            for frame in frames:
                try:
                    frame_prediction = self.image_model.image_classify(frame)
                    frame_emotion = emotion_labels_img[int(frame_prediction)]
                    frame_emotions.append(frame_emotion)
                except Exception as e:
                    print(f"Error in image prediction for a frame: {e}")
            if frame_emotions:
                most_common_frame_emotion, _ = Counter(frame_emotions).most_common(1)[0]
                predictions.append(most_common_frame_emotion)
                print(f"Most common emotion from frames: {most_common_frame_emotion}")

        # Determine final prediction
        if predictions:
            final_emotion = self.priority_majority_fusion(predictions, text_prediction)
            print(f"Final predicted emotion: {final_emotion}")
            return final_emotion
        else:
            print("No data provided for prediction.")
            return None

# Example usage
audio_path = extract_audio_from_video("./Angry.mp4")
frames = extract_frames_from_video("./Angry.mp4")
text_from_audio = extract_text_from_audio(audio_path)

# Initialize the multi-modal model
multi_modal_model = MultiModalSentimentAnalysis(
    audio_model=audio_classifier,
    text_model=text_classifier,
    image_model=image_predictor
)

# Get the final prediction
final_prediction = multi_modal_model.predict(audio_path=audio_path, frames=frames, text=text_from_audio)


MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 858ms/step
Predicted emotion from text: Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 271ms/step
Predicted emotion from audio: Neutral
Error in image prediction for a frame: path should be path-like or io.BytesIO, not <class 'numpy.ndarray'>
Error in image prediction for a frame: path should be path-like or io.BytesIO, not <class 'numpy.ndarray'>
Error in image prediction for a frame: path should be path-like or io.BytesIO, not <class 'numpy.ndarray'>
Error in image prediction for a frame: path should be path-like or io.BytesIO, not <class 'numpy.ndarray'>
Final predicted emotion: Neutral
