In [9]:
import librosa
print(librosa.__version__)


0.11.0


In [2]:
import torch
import torch.nn as nn
import torchaudio
import sounddevice as sd
import numpy as np
import librosa
import os

# ========== Define the CNN Model ==========
class EmotionCNN(nn.Module):
    def __init__(self):
        super(EmotionCNN, self).__init__()
        self.model = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Flatten(),
            nn.Linear(32 * 32 * 32, 128),   # CHANGED from 64 → 128
            nn.ReLU(),
            nn.Linear(128, 8)               # CHANGED from 64 → 128 input
        )

    def forward(self, x):
        return self.model(x)

# ========== Load Model ==========
MODEL_PATH = r"C:\Users\Vihas\Documents\Pytesting\voice_tone\emotion_model.pth"
model = EmotionCNN()
model.load_state_dict(torch.load(MODEL_PATH))
model.eval()

# ========== Emotion Label Mapping ==========
emotion_labels = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

# ========== Mel Spectrogram Extraction ==========
def extract_mel(y, sr=22050):
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    mel_db = mel_db[:, :128]  # Ensure consistent shape
    return mel_db


# ========== Main Loop ==========
print("Press 'r' to record 3 seconds of your voice...")

while True:
    key = input("Press 'r' to record or 'q' to quit: ").strip().lower()

    if key == 'q':
        print("Exiting...")
        break

    elif key == 'r':
        print("Recording for 3 seconds...")
        duration = 3  # seconds
        sr = 22050

        recording = sd.rec(int(duration * sr), samplerate=sr, channels=1)
        sd.wait()
        y = recording.flatten()

        mel = extract_mel(y, sr)

        # Normalize and prepare for model
        mel_tensor = torch.tensor(mel).unsqueeze(0).unsqueeze(0).float()  # Shape: [1, 1, 128, 128]

        # Predict
        with torch.no_grad():
            output = model(mel_tensor)
            predicted = torch.argmax(output, dim=1).item()
            emotion = emotion_labels[predicted]

        print(f"Predicted Emotion: {emotion}")


  model.load_state_dict(torch.load(MODEL_PATH))


Press 'r' to record 3 seconds of your voice...
Recording for 3 seconds...
Predicted Emotion: fearful
Recording for 3 seconds...
Predicted Emotion: happy
Recording for 3 seconds...
Predicted Emotion: calm
Recording for 3 seconds...
Predicted Emotion: disgust
Exiting...
