In [1]:
import cv2
import numpy as np
import tensorflow as tf
import moviepy.editor as mp
import librosa
import os

from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")


In [2]:
import tensorflow as tf
import numpy as np
import cv2
import librosa
import os
import moviepy.editor as mp

# ✅ Load trained models
face_model = tf.keras.models.load_model("D:/depression_detect/models/mobilenet_affectnet.keras")
voice_model = tf.keras.models.load_model("D:/depression_detection/models/audio_emotion_model.keras")

# ✅ Prediction threshold
POSITIVE_THRESHOLD = 0.5

# ✅ Preprocess video frame
def preprocess_face_frame(frame):
    img = cv2.resize(frame, (224, 224))
    img = img / 255.0  # Normalize to [0, 1]
    return img

# ✅ Predict facial expression
def predict_facial_expression(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_rate = 5  # analyze every 5th frame

    count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if count % frame_rate == 0:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(preprocess_face_frame(frame_rgb))
        count += 1
    cap.release()

    if not frames:
        print("⚠️ No frames extracted.")
        return 0

    frames = np.array(frames)
    preds = face_model.predict(frames, verbose=0)
    avg_pred = np.mean(preds)
    return int(avg_pred >= POSITIVE_THRESHOLD)

# ✅ Extract audio from video
def extract_audio(video_path, output_audio_path="temp_audio.wav"):
    video = mp.VideoFileClip(video_path)
    video.audio.write_audiofile(output_audio_path, codec='pcm_s16le', verbose=False, logger=None)
    return output_audio_path

# ✅ Predict voice emotion
def predict_voice_emotion(audio_path):
    y, sr = librosa.load(audio_path, sr=16000)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    mfcc = np.mean(mfcc.T, axis=0)
    input_audio = np.expand_dims(mfcc, axis=0)

    pred = voice_model.predict(input_audio, verbose=0)
    return int(pred[0][0] >= POSITIVE_THRESHOLD)


# ✅ Combine and decide depression status
def detect_depression(video_path):
    print("🔍 Analyzing facial expressions...")
    face_result = predict_facial_expression(video_path)

    print("🔉 Analyzing voice emotions...")
    audio_path = extract_audio(video_path)
    voice_result = predict_voice_emotion(audio_path)

    print(f"🧠 Facial Expression: {'Positive' if face_result else 'Negative'}")
    print(f"🗣️ Voice Emotion: {'Positive' if voice_result else 'Negative'}")

    if face_result == 0 and voice_result == 0:
        return "😔 Depression Detected"
    elif face_result == 1 and voice_result == 1:
        return "😊 No Depression"
    else:
        return "❓ Inconclusive"

# ✅ Path to your test video
video_file = "D:/depression_detect/videos/videoplayback.mp4"

# ✅ Run analysis
final_result = detect_depression(video_file)
print(f"\n🔎 Final Status: {final_result}")


🔍 Analyzing facial expressions...
🔉 Analyzing voice emotions...
🧠 Facial Expression: Positive
🗣️ Voice Emotion: Negative

🔎 Final Status: ❓ Inconclusive


In [3]:
import cv2
import numpy as np
import librosa
import os
import tensorflow as tf
import moviepy.editor as mp
import csv
from datetime import datetime

# Load models
facial_model = tf.keras.models.load_model("D:/depression_detect/models/mobilenet_affectnet.keras")
voice_model = tf.keras.models.load_model("D:/depression_detection/models/audio_emotion_model.keras")

POSITIVE_THRESHOLD = 0.5
LOG_FILE = "D:/depression_detect/results/depression_analysis_log.csv"

# 🧠 Facial Expression Extraction
def extract_faces(video_path, every_n_frames=30):
    cap = cv2.VideoCapture(video_path)
    faces = []
    frame_count = 0
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % every_n_frames == 0:
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            detected = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
            for (x, y, w, h) in detected:
                face = frame[y:y + h, x:x + w]
                face = cv2.resize(face, (224, 224))
                faces.append(face)
        frame_count += 1

    cap.release()
    return np.array(faces)


# 🎤 Audio Extraction
def extract_audio(video_path):
    audio_path = "temp_audio.wav"
    video = mp.VideoFileClip(video_path)
    video.audio.write_audiofile(audio_path, verbose=False, logger=None)
    return audio_path

# 🎶 MFCC Feature Extraction
def extract_mfcc(audio_path):
    try:
        audio, sr = librosa.load(audio_path, sr=None)
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        mfcc = mfcc.T  # shape (time, 13)

        if mfcc.shape[0] < 1:
            return None

        mfcc = mfcc[:1]  # Use only the first frame
        mfcc = np.expand_dims(mfcc, axis=-1)  # shape (1, 13, 1)
        return mfcc
    except Exception as e:
        print(f"❌ Error extracting MFCC: {e}")
        return None


# 🤖 Facial Emotion Prediction
def predict_facial_emotion(video_path):
    faces = extract_faces(video_path)
    if faces.size == 0:
        print("❌ No face detected.")
        return 0, 0.0

    faces = faces / 255.0
    preds = facial_model.predict(faces, verbose=0)

    # ✅ Model returns only 1 output per face
    avg_pred = np.mean(preds[:, 0])
    return int(avg_pred >= POSITIVE_THRESHOLD), float(avg_pred)



# 🔊 Voice Emotion Prediction
def predict_voice_emotion(audio_path):
    mfcc = extract_mfcc(audio_path)
    if mfcc is None:
        return 0, 0.0

    pred = voice_model.predict(mfcc, verbose=0)[0][0]
    return int(pred >= POSITIVE_THRESHOLD), float(pred)


# 📋 Main Depression Detection Logic
def detect_depression(video_path):
    print("🔍 Analyzing facial expressions...")
    face_result, face_conf = predict_facial_emotion(video_path)

    print("🔉 Analyzing voice emotions...")
    audio_path = extract_audio(video_path)
    voice_result, voice_conf = predict_voice_emotion(audio_path)

    # Determine final label
    if face_result == 0 and voice_result == 0:
        status = "🟥 Depressed"
    elif face_result == 1 and voice_result == 1:
        status = "🟩 Not Depressed"
    else:
        status = "❓ Inconclusive"

    # Print summary
    print(f"\n🧠 Facial Expression: {'Positive' if face_result else 'Negative'} (Confidence: {face_conf:.2f})")
    print(f"🗣️ Voice Emotion: {'Positive' if voice_result else 'Negative'} (Confidence: {voice_conf:.2f})")
    print(f"\n🔎 Final Status: {status}")

    # Save to CSV
    save_to_csv(video_path, face_result, face_conf, voice_result, voice_conf, status)
    return status


# 💾 Save Results to CSV
def save_to_csv(video_path, face_result, face_conf, voice_result, voice_conf, status):
    file_exists = os.path.isfile(LOG_FILE)
    with open(LOG_FILE, mode='a', newline='') as file:
        writer = csv.writer(file)
        if not file_exists:
            writer.writerow([
                "Timestamp", "Video", "Facial Result", "Facial Confidence",
                "Voice Result", "Voice Confidence", "Final Status"
            ])
        writer.writerow([
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            os.path.basename(video_path),
            "Positive" if face_result else "Negative",
            f"{face_conf:.2f}",
            "Positive" if voice_result else "Negative",
            f"{voice_conf:.2f}",
            status.replace("🟥", "").replace("🟩", "").replace("❓", "").strip()
        ])


# 🧪 Run Example
video_file = "D:/depression_detect/videos/videoplayback.mp4"
final_result = detect_depression(video_file)


🔍 Analyzing facial expressions...
🔉 Analyzing voice emotions...

🧠 Facial Expression: Positive (Confidence: 0.73)
🗣️ Voice Emotion: Negative (Confidence: 0.01)

🔎 Final Status: ❓ Inconclusive


In [4]:
print(voice_model.input_shape)


(None, 13, 1)


In [6]:
# ✅ Step 1: Imports
import os
import numpy as np
import librosa
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# ✅ Step 2: Define RAVDESS dataset path
ravdess_dir = r"D:\depression_detect\datasets\ravdess"

# ✅ Step 3: Label Extraction Function (RAVDESS only)
def extract_emotion_label(filename):
    try:
        emotion_code = int(os.path.basename(filename).split("-")[2])
        if emotion_code in [2, 3, 8]:  # Calm, Happy, Surprised
            return 'positive'
        elif emotion_code in [4, 5, 6, 7]:  # Sad, Angry, Fearful, Disgust
            return 'negative'
    except:
        return None  # Ignore files with invalid format

# ✅ Step 4: Feature Extraction using MFCCs
def extract_features(file_path, max_len=173):
    y, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    if mfcc.shape[1] < max_len:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return np.mean(mfcc, axis=1)

# ✅ Step 5: Load and Prepare the Dataset
X_total, y_total = [], []

for root, _, files in os.walk(ravdess_dir):
    for file in files:
        if file.endswith('.wav'):
            full_path = os.path.join(root, file)
            label = extract_emotion_label(file)
            if label is not None:
                features = extract_features(full_path)
                X_total.append(features)
                y_total.append(label)

print(f"✅ Loaded {len(X_total)} valid audio samples from RAVDESS")

# ✅ Step 6: Convert to NumPy arrays
X_array = np.array(X_total)
y_encoded = LabelEncoder().fit_transform(y_total)  # 0 = negative, 1 = positive
y_array = np.array(y_encoded)

# ✅ Step 7: Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(X_array, y_array, test_size=0.2, random_state=42)

# ✅ Step 8: Build and Compile the Model
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(40,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# ✅ Step 9: Train the Model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, batch_size=32)

# ✅ Step 10: Save the Trained Model
model_path = r"D:\depression_detect\models\audio_emotion_model_ravdess.h5"
model.save(model_path)
print(f"✅ Model saved to: {model_path}")


✅ Loaded 1344 valid audio samples from RAVDESS
Epoch 1/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step - accuracy: 0.4942 - loss: 11.6829 - val_accuracy: 0.5576 - val_loss: 3.8933
Epoch 2/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5076 - loss: 6.1691 - val_accuracy: 0.5836 - val_loss: 1.3162
Epoch 3/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5047 - loss: 4.1321 - val_accuracy: 0.5576 - val_loss: 2.2680
Epoch 4/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.5205 - loss: 3.3742 - val_accuracy: 0.5465 - val_loss: 0.8406
Epoch 5/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5178 - loss: 2.7748 - val_accuracy: 0.5502 - val_loss: 0.7653
Epoch 6/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.4957 - loss: 2.5647 - val_accuracy: 0.4647 - val_l



✅ Model saved to: D:\depression_detect\models\audio_emotion_model_ravdess.h5


In [15]:
for i, layer in enumerate(loaded_audio_model.layers):
    try:
        output_shape = layer.output_shape
    except AttributeError:
        output_shape = "N/A"
    print(f"{i}: {layer.name} — {layer.__class__.__name__} — {output_shape}")


0: bidirectional — Bidirectional — N/A
1: dropout_1 — Dropout — N/A
2: batch_normalization_1 — BatchNormalization — N/A
3: bidirectional_1 — Bidirectional — N/A
4: dropout_2 — Dropout — N/A
5: batch_normalization_2 — BatchNormalization — N/A
6: dense_2 — Dense — N/A
7: dropout_3 — Dropout — N/A
8: batch_normalization_3 — BatchNormalization — N/A
9: dense_3 — Dense — N/A
