In [16]:
!pip install gradio


Collecting gradio
  Downloading gradio-5.25.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
[0mCollecting safehttpx<0.2.0,>=0.1

In [8]:
# Step 1: Mount Google Drive
# The dataset and the saved emotion model are read from paths under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Emotion Detection Model

In [None]:
#Step 2: Set Dataset Path

import os
import librosa
import numpy as np
import pandas as pd

# Root folder of the RAVDESS speech recordings on the mounted Drive.
# Change this if your copy of the dataset lives somewhere else.
DATASET_PATH = "/content/drive/MyDrive/RAVDESS_Dataset/audio_speech_actors_01-24"

# Placeholder list; the dataset-loading cell rebinds `data` before filling it.
data = list()


In [None]:
#Step 3: Define Feature Extraction
import librosa
import numpy as np

# Function to extract features from an audio file
def extract_features(file_path, augment=False):
    """Summarise one audio clip as a single 1-D feature vector.

    Loads at most 3 seconds of audio, starting 0.5 s into the file, optionally
    runs the augmentation chain, and concatenates the frame-averaged MFCC
    (n_mfcc=40), chroma and mel-spectrogram features.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        augment: When True, apply ``add_noise``, ``pitch_shift`` and
            ``time_stretch`` (in that order) before computing features.

    Returns:
        1-D numpy array laid out as [mfcc means, chroma means, mel means].
    """
    signal, sample_rate = librosa.load(file_path, duration=3, offset=0.5)

    if augment:
        # Each augmentation step consumes the output of the previous one.
        signal = time_stretch(pitch_shift(add_noise(signal), sample_rate))

    feature_matrices = (
        librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40),
        librosa.feature.chroma_stft(y=signal, sr=sample_rate),
        librosa.feature.melspectrogram(y=signal, sr=sample_rate),
    )

    # Collapse every feature matrix to one mean value per feature row.
    return np.hstack([np.mean(matrix.T, axis=0) for matrix in feature_matrices])

# Function to add noise
def add_noise(data, noise_factor=0.005):
    """Return `data` with additive Gaussian noise scaled by `noise_factor`.

    ``len(data)`` noise samples are drawn from NumPy's global random state
    (``np.random.randn``), so the result changes between calls unless that
    state is seeded.
    """
    return data + noise_factor * np.random.randn(len(data))

# Function to shift pitch
def pitch_shift(data, sr, n_steps=2):
    """Pitch-shift `data` by `n_steps` via ``librosa.effects.pitch_shift``.

    Args:
        data: Audio samples to shift.
        sr: Sampling rate of `data`.
        n_steps: Number of steps to shift by (default 2).

    Returns:
        The pitch-shifted signal returned by librosa.
    """
    shifted = librosa.effects.pitch_shift(data, sr=sr, n_steps=n_steps)
    return shifted

# Function to stretch time (speed up/down)
def time_stretch(data, rate=0.9):
    """Time-stretch `data` by `rate` via ``librosa.effects.time_stretch``.

    Args:
        data: Audio samples to stretch.
        rate: Stretch factor forwarded to librosa (default 0.9).

    Returns:
        The time-stretched signal returned by librosa.
    """
    # `rate` is keyword-only in current librosa releases; passing it
    # positionally raises TypeError.
    return librosa.effects.time_stretch(data, rate=rate)


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Define RAVDESS dataset path
#DATASET_PATH = "/content/drive/MyDrive/RAVDESS_Dataset/audio_speech_actors_01-24"

# Emotion labels: the third dash-separated field of a filename is the emotion code.
emotion_map = {
    "01": "neutral", "02": "calm", "03": "happy", "04": "sad",
    "05": "angry", "06": "fearful", "07": "disgust", "08": "surprised"
}

data = []

# Load dataset: one sub-folder per actor, one .wav file per utterance.
for folder in os.listdir(DATASET_PATH):
    folder_path = os.path.join(DATASET_PATH, folder)
    if not os.path.isdir(folder_path):
        continue

    for file in os.listdir(folder_path):
        if not file.endswith(".wav"):
            continue

        parts = file.split("-")
        if len(parts) < 3:
            print(f"⚠️ Skipping {file}: unexpected filename format")
            continue

        emotion = emotion_map.get(parts[2])
        if emotion is None:
            # An unmapped code would add a 9th class and break the 8-way
            # one-hot encoding applied after loading.
            print(f"⚠️ Skipping {file}: unknown emotion code {parts[2]!r}")
            continue

        # The actor ID is the last field ("24.wav"). Duplicate copies are named
        # like "24 (1).wav"; cut at "(" so the copy counter is not glued onto
        # the ID (which would turn 24 into 241 and flip its parity).
        actor_token = parts[-1].split(".")[0].split("(")[0].strip()
        if not actor_token.isdecimal():
            print(f"⚠️ Skipping {file}: cannot parse actor id")
            continue
        actor_id = int(actor_token)

        # Use only female voices (even actor IDs are female)
        if actor_id % 2 == 0:
            file_path = os.path.join(folder_path, file)
            feature_vector = extract_features(file_path)
            data.append([feature_vector, emotion])

# Collect the [feature_vector, emotion] rows into a DataFrame
df = pd.DataFrame(data, columns=["features", "emotion"])

# Replace each emotion name with its integer category code. pandas numbers
# string categories in sorted (alphabetical) order, not in RAVDESS code order.
df["emotion"] = df["emotion"].astype("category").cat.codes

# Feature matrix and label vector as numpy arrays
X = np.array(df["features"].tolist())
y = np.array(df["emotion"].tolist())

# One-hot encode the labels (8 emotion classes)
y = to_categorical(y, num_classes=8)

# Hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Conv1D expects a trailing channel axis: (samples, n_features, 1)
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

# CNN Model: two Conv1D/MaxPooling/Dropout stages followed by a dense classifier head
model = Sequential()

model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Conv1D(128, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(8, activation='softmax'))  # one output per emotion class

# Compile Model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train Model (the test split doubles as the validation set)
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=64,
)

# Save Model to Google Drive so the inference cell can load it
model.save("/content/drive/MyDrive/emotion_detection_model.h5")
print("✅ Model training complete and saved to Google Drive!")


In [None]:
# Evaluate Model on the held-out split (the same data passed as validation_data
# during training)
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"New Test Accuracy: {accuracy * 100:.2f}%")


In [14]:
import os
import numpy as np
import librosa
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

# ----------------------------------------
# FEATURE EXTRACTION FUNCTION
# ----------------------------------------
def extract_features(file_path):
    """Build the gender-model feature vector for one audio file.

    Loads at most 3 seconds of audio (starting 0.5 s in) and concatenates the
    per-row means of four librosa features: MFCC (n_mfcc=13), chroma, spectral
    contrast, and tonnetz computed on the harmonic component of the signal.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.

    Returns:
        1-D numpy array laid out as [mfcc, chroma, contrast, tonnetz] means.

    Raises:
        ValueError: If fewer than 1024 samples were loaded.
    """
    signal, sample_rate = librosa.load(file_path, duration=3, offset=0.5)

    if len(signal) < 1024:
        raise ValueError("Audio too short for n_fft")

    harmonic_signal = librosa.effects.harmonic(signal)
    feature_matrices = [
        librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13),
        librosa.feature.chroma_stft(y=signal, sr=sample_rate),
        librosa.feature.spectral_contrast(y=signal, sr=sample_rate),
        librosa.feature.tonnetz(y=harmonic_signal, sr=sample_rate),
    ]

    # One mean per feature row, concatenated in the order listed above.
    return np.hstack([np.mean(matrix, axis=1) for matrix in feature_matrices])

# ----------------------------------------
# LOAD DATA
# ----------------------------------------
data_dir = '/content/drive/MyDrive/RAVDESS_Dataset/audio_speech_actors_01-24'
X, y = [], []

for folder in os.listdir(data_dir):
    folder_path = os.path.join(data_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    for file in os.listdir(folder_path):
        if not file.endswith('.wav'):
            continue

        file_path = os.path.join(folder_path, file)
        try:
            # Extract actor ID using regex and handle cases like "17 (1)"
            cleaned_filename = re.sub(r'\s*\(.*\)', '', file)  # Remove (1), (2), etc.
            actor_id = int(cleaned_filename.split('-')[-1].split('.')[0])
            label = 0 if actor_id % 2 == 1 else 1  # 0 = Male, 1 = Female

            features = extract_features(file_path)
        except Exception as e:
            print(f"⚠️ Skipping {file}: {e}")
            continue

        # Append the pair only after BOTH the label and the features succeeded,
        # so a failure can never leave X and y with different lengths.
        X.append(features)
        y.append(label)

X = np.array(X)
y = np.array(y)

print(f"✅ Loaded {len(X)} samples successfully.")

# ----------------------------------------
# NORMALIZE & RESHAPE FEATURES
# ----------------------------------------
# Split BEFORE scaling so the scaler's mean/variance are estimated from the
# training rows only; fitting on the full set leaks test-set statistics into
# training and inflates the validation metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Add a trailing axis of size 1: (samples, n_features) -> (samples, n_features, 1).
# `X` itself stays 2-D and unscaled; only its second dimension is read afterwards.
X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)

# ----------------------------------------
# BUILD LSTM MODEL
# ----------------------------------------
# Bidirectional LSTM over the feature axis, a second LSTM that collapses the
# sequence, then a small dense head ending in one sigmoid unit.
model = Sequential()
model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(X.shape[1], 1)))
model.add(Dropout(0.3))
model.add(LSTM(64))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# ----------------------------------------
# COMPILE & TRAIN MODEL
# ----------------------------------------
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stop],
    verbose=1,
)

# ----------------------------------------
# SAVE MODEL
# ----------------------------------------
# Relative path: the file is written to the notebook's current working directory.
model.save("gender_classification_model.keras")
print("✅ Gender classification model saved!")




✅ Loaded 1450 samples successfully.
Epoch 1/50


  super().__init__(**kwargs)


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 124ms/step - accuracy: 0.7564 - loss: 0.5643 - val_accuracy: 0.8207 - val_loss: 0.4524
Epoch 2/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 150ms/step - accuracy: 0.8555 - loss: 0.4255 - val_accuracy: 0.8379 - val_loss: 0.4472
Epoch 3/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 110ms/step - accuracy: 0.8506 - loss: 0.4312 - val_accuracy: 0.8379 - val_loss: 0.4599
Epoch 4/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 163ms/step - accuracy: 0.8056 - loss: 0.4744 - val_accuracy: 0.8276 - val_loss: 0.4168
Epoch 5/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 110ms/step - accuracy: 0.8190 - loss: 0.4271 - val_accuracy: 0.8724 - val_loss: 0.3712
Epoch 6/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 168ms/step - accuracy: 0.8353 - loss: 0.3814 - val_accuracy: 0.8483 - val_loss: 0.4061
Epoch 7/50
[1m37/37[0m [32m━━━━━━━━

In [None]:
import gradio as gr
import numpy as np
import librosa
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler
import pickle

# Load the two trained models: the emotion model from Google Drive and the
# gender model from /content.
emotion_model = load_model(filepath="/content/drive/MyDrive/emotion_detection_model.h5")
gender_model = load_model(filepath="/content/gender_classification_model.keras")

# Scaler for the gender model
# NOTE: this cell does not create `scaler`. predict_audio() uses the StandardScaler
# fitted in the gender-model training cell, so that cell must already have run in
# this runtime. If you saved the scaler to disk instead, load it like this:
# scaler = pickle.load(open("scaler.pkl", "rb"))
#
# If no saved scaler is available, re-fit one on the same training data that was
# used for the gender model (not shown here).

# Emotion labels, indexed by the emotion model's output class.
# The training cell encodes labels with `astype("category").cat.codes`, and
# pandas numbers string categories in alphabetical order. Class 0 is therefore
# "angry" (not "neutral" as in the RAVDESS code order).
emotion_labels = {
    0: "angry", 1: "calm", 2: "disgust", 3: "fearful",
    4: "happy", 5: "neutral", 6: "sad", 7: "surprised"
}

# Feature extractor for both models
def extract_features(file_path):
    """Compute the feature vectors for both models from one audio file.

    Loads at most 3 seconds of audio (starting 0.5 s in) and derives:
      * gender features  - means of MFCC (n_mfcc=13), chroma, spectral contrast
        and tonnetz (tonnetz computed on the harmonic component);
      * emotion features - means of MFCC (n_mfcc=40), chroma and mel spectrogram.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.

    Returns:
        Tuple ``(gender_features, emotion_features)`` of 1-D numpy arrays.

    Raises:
        ValueError: If fewer than 1024 samples were loaded.
    """
    signal, sample_rate = librosa.load(file_path, duration=3, offset=0.5)
    if len(signal) < 1024:
        raise ValueError("Audio too short")

    # Features for gender model
    gender_matrices = [
        librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13),
        librosa.feature.chroma_stft(y=signal, sr=sample_rate),
        librosa.feature.spectral_contrast(y=signal, sr=sample_rate),
        librosa.feature.tonnetz(y=librosa.effects.harmonic(signal), sr=sample_rate),
    ]
    gender_features = np.hstack([np.mean(matrix, axis=1) for matrix in gender_matrices])

    # Features for emotion model
    emotion_matrices = [
        librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40),
        librosa.feature.chroma_stft(y=signal, sr=sample_rate),
        librosa.feature.melspectrogram(y=signal, sr=sample_rate),
    ]
    emotion_features = np.hstack([np.mean(matrix.T, axis=0) for matrix in emotion_matrices])

    return gender_features, emotion_features

# Prediction function
def predict_audio(audio_path):
    """Classify an audio clip: gender first, then emotion for female voices.

    Relies on module-level objects that must already exist in the kernel:
    ``scaler`` (fitted during gender-model training), ``gender_model``,
    ``emotion_model`` and ``emotion_labels``.

    Args:
        audio_path: Filesystem path of the clip. May be None or empty when
            nothing was uploaded or recorded.

    Returns:
        A human-readable result string. Failures are reported as a message
        starting with "❌" instead of being raised.
    """
    # Nothing was submitted: answer directly instead of letting the audio
    # loader fail on a missing path with an opaque error.
    if not audio_path:
        return "❌ Error processing audio: no audio was provided. Please upload or record a clip."

    try:
        gender_feat, emotion_feat = extract_features(audio_path)

        # Gender prediction: scale with the training-time scaler, then add the
        # trailing axis the LSTM input expects.
        gender_scaled = scaler.transform([gender_feat])
        gender_input = np.expand_dims(gender_scaled, axis=2)
        gender_prob = gender_model.predict(gender_input)[0][0]
        gender = "Female" if gender_prob >= 0.5 else "Male"

        if gender == "Male":
            return f"🚫 Detected Gender: Male (Confidence: {1 - gender_prob:.2f})\nPlease upload a female voice."

        # Emotion prediction: shape the vector as (1, n_features, 1) for the CNN.
        emotion_input = np.expand_dims(emotion_feat, axis=0)
        emotion_input = np.expand_dims(emotion_input, axis=2)
        emotion_pred = emotion_model.predict(emotion_input)
        emotion_label = emotion_labels[np.argmax(emotion_pred)]
        emotion_conf = np.max(emotion_pred)

        return (
            f"✅ Detected Gender: Female (Confidence: {gender_prob:.2f})\n"
            f"🎭 Emotion: {emotion_label.capitalize()} (Confidence: {emotion_conf:.2f})"
        )
    except Exception as e:
        # Deliberate best-effort: surface the failure in the UI textbox.
        return f"❌ Error processing audio: {str(e)}"

# Gradio Interface: one audio input (delivered to predict_audio as a file path)
# and one text output.
interface = gr.Interface(
    title="🎙️ Emotion & Gender Detection (Female Voices Only)",
    description="Upload or record a voice clip. System detects gender first — only female voices are accepted for emotion prediction.",
    fn=predict_audio,
    inputs=gr.Audio(type="filepath", label="Upload or Record Voice"),
    outputs=gr.Textbox(label="Prediction"),
)

# debug=True keeps the cell running so errors and logs appear in the notebook.
interface.launch(debug=True)




Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ce35e3501f8758ef20.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 422ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5