<a href="https://colab.research.google.com/github/Sivabhavani123/Audio-and-Video-Fusion/blob/main/audio_and_video_fuse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install opencv-python deepface transformers librosa torch tensorflow
!pip install deepface opencv-python-headless



In [None]:
import cv2
from deepface import DeepFace
from collections import Counter

def detect_video_emotion(video_path, num_frames=100):
    """Detect the dominant facial emotion in a video using DeepFace.

    Frames are sampled at a fixed stride of ``max(1, total_frames // num_frames)``
    and every sampled frame is passed to ``DeepFace.analyze``. The emotion that
    is reported most often across the sampled frames is returned.

    Args:
        video_path: Source handed to ``cv2.VideoCapture`` (typically a file path).
        num_frames: Approximate number of frames to analyze. The stride is
            rounded down, so somewhat more than ``num_frames`` frames may be
            analyzed. Must be at least 1.

    Returns:
        The most frequent ``dominant_emotion`` label, or ``"Unknown"`` when the
        video cannot be opened or no frame could be analyzed.

    Raises:
        ValueError: If ``num_frames`` is smaller than 1.
    """
    if num_frames < 1:
        raise ValueError("num_frames must be a positive integer")

    emotions = []  # Dominant emotion of every successfully analyzed frame
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Could not open video: {video_path}")
            return "Unknown"

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        step = max(1, frame_count // num_frames)  # Sample frames evenly

        print("Analyzing video frames for emotions...")

        # Count frames locally (1-based, like the position reported after a
        # read) instead of querying the capture's position property.
        # NOTE(review): presumably that property is not accurate on every backend - confirm.
        frame_number = 0
        while cap.grab():  # grab() advances without decoding the frame
            frame_number += 1
            if frame_number % step != 0:
                continue

            # Decode only the frames that are actually analyzed.
            ret, frame = cap.retrieve()
            if not ret:
                continue

            try:
                analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)

                # Newer DeepFace versions return a list of results (one per
                # face); older versions return a single dictionary.
                result = analysis[0] if isinstance(analysis, list) else analysis
                emotions.append(result['dominant_emotion'])

                print(f"Frame {frame_number}: {emotions[-1]}")
            except Exception as e:
                # Best effort: one bad frame must not abort the whole video.
                print(f"Error analyzing frame {frame_number}: {e}")
    finally:
        # Always free the capture handle, even if something above raised.
        cap.release()

    # Most frequent emotion across all analyzed frames
    if not emotions:
        return "Unknown"
    return Counter(emotions).most_common(1)[0][0]

# Example usage: run the detector on one sample clip and print the result.
# NOTE(review): the path points into Google Drive; presumably Drive has to be
# mounted before this cell runs - confirm.
video_path = "/content/drive/MyDrive/data/6038291_Woman_Young_3840x2160.mp4"  # Replace with the path to your video file
dominant_emotion = detect_video_emotion(video_path)
print(f"Dominant Emotion in Video: {dominant_emotion}")


Analyzing video frames for emotions...
25-02-09 03:51:04 - facial_expression_model_weights.h5 will be downloaded...


Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/facial_expression_model_weights.h5
To: /root/.deepface/weights/facial_expression_model_weights.h5
100%|██████████| 5.98M/5.98M [00:00<00:00, 18.3MB/s]


Frame 2: neutral
Frame 4: neutral
Frame 6: neutral
Frame 8: neutral
Frame 10: neutral
Frame 12: neutral
Frame 14: neutral
Frame 16: neutral
Frame 18: neutral
Frame 20: neutral
Frame 22: neutral
Frame 24: neutral
Frame 26: neutral
Frame 28: neutral


In [None]:
!pip install speechbrain opensmile librosa torch transformers
!pip install speechbrain torch librosa opensmile
!pip install speechbrain torchaudio

In [None]:

import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import seaborn as sns
import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras import regularizers
import os
import glob
import pandas as pd
import IPython.display as ipd
import plotly.express as px
import scipy.io.wavfile
import sys
import warnings
# Silence warnings unless the interpreter was started with explicit -W options.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
import torch
import torchaudio
import librosa
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

# Load the pretrained wav2vec2 checkpoint and its matching processor.
# NOTE(review): the model card for this checkpoint appears to describe a
# dimensional (arousal / dominance / valence) regression model rather than a
# categorical classifier - confirm that Wav2Vec2ForSequenceClassification and
# the label list further down actually match this checkpoint.
model_name = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)

# Sampling rate the audio is converted to before it is handed to the processor.
TARGET_SAMPLE_RATE = 16000

# Load the classroom audio file; torchaudio returns a (channels, samples) tensor.
audio_path = "/content/drive/MyDrive/data/kids-laugh-45357.mp3"
waveform, sample_rate = torchaudio.load(audio_path)

# Mix multi-channel audio down to mono. Flattening a (channels, samples)
# tensor directly would append the channels one after another instead.
if waveform.dim() > 1 and waveform.size(0) > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# Ensure correct sampling rate (16kHz required)
if sample_rate != TARGET_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
    waveform = resampler(waveform)

# Convert to a 1-D numpy array (no normalization is applied here).
waveform = waveform.numpy().flatten()

# Process the audio
inputs = processor(waveform, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt", padding=True)

# Perform emotion classification without tracking gradients
with torch.no_grad():
    logits = model(**inputs).logits

# Get the predicted emotion label
predicted_index = torch.argmax(logits, dim=-1).item()
emotion_labels = ["neutral", "happy", "sad", "angry", "fear", "disgust", "surprise"]
# Guard the lookup: the number of model outputs is not tied to this list.
if predicted_index < len(emotion_labels):
    predicted_emotion = emotion_labels[predicted_index]
else:
    predicted_emotion = "Unknown"
print(f"Predicted Emotion in Audio: {predicted_emotion}")