In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import joblib

In [2]:
class Generator(nn.Module):
    def __init__(self, noise_dim, num_classes, feature_dim):
        super(Generator, self).__init__()
        self.label_embedding = nn.Embedding(num_classes, num_classes)
        self.model = nn.Sequential(
            nn.Linear(noise_dim + num_classes, 128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, feature_dim),
        )

    def forward(self, noise, labels):
        label_input = self.label_embedding(labels)
        input = torch.cat((noise, label_input), -1)
        return self.model(input)

class Discriminator(nn.Module):
    def __init__(self, num_classes, feature_dim):
        super(Discriminator, self).__init__()
        self.label_embedding = nn.Embedding(num_classes, num_classes)
        input_dim = feature_dim + num_classes  
        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, features, labels):
        label_input = self.label_embedding(labels)
        input = torch.cat((features, label_input), -1)
        return self.model(input)

In [3]:
class EmotionDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return torch.tensor(self.features[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)

def train_gan(generator, discriminator, dataloader, epochs, noise_dim, num_classes):
    optimizer_G = optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
    optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))
    adversarial_loss = nn.BCELoss()

    for epoch in range(epochs):
        for features, labels in dataloader:
            batch_size = features.size(0)

            valid = torch.ones(batch_size, 1)
            fake = torch.zeros(batch_size, 1)

            
            optimizer_D.zero_grad()
            real_loss = adversarial_loss(discriminator(features, labels), valid)

            noise = torch.randn(batch_size, noise_dim)
            gen_labels = torch.randint(0, num_classes, (batch_size,))
            fake_features = generator(noise, gen_labels)
            fake_loss = adversarial_loss(discriminator(fake_features.detach(), gen_labels), fake)

            d_loss = (real_loss + fake_loss) / 2
            d_loss.backward()
            optimizer_D.step()

            optimizer_G.zero_grad()

            gen_features = generator(noise, gen_labels)
            g_loss = adversarial_loss(discriminator(gen_features, gen_labels), valid)

            g_loss.backward()
            optimizer_G.step()

        print(f"Epoch {epoch+1}/{epochs}, D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}")

In [4]:
def extract_features(file_path, sr=16000):
    import librosa
    y, sr = librosa.load(file_path, sr=sr)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    mel = librosa.feature.melspectrogram(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)

    features = np.hstack([
        np.mean(mfccs, axis=1), np.std(mfccs, axis=1),
        np.mean(chroma, axis=1), np.std(chroma, axis=1),
        np.mean(mel, axis=1), np.std(mel, axis=1),
        np.mean(contrast, axis=1), np.std(contrast, axis=1),
        np.mean(tonnetz, axis=1), np.std(tonnetz, axis=1)
    ])
    return features

def load_ravdess_dataset(dataset_path, emotion_map):
    features = []
    labels = []

    for root, _, files in os.walk(dataset_path):
        for file in files:
            if file.endswith(".wav"):
                file_path = os.path.join(root, file)
                emotion_code = file.split("-")[2]
                emotion_label = emotion_map.get(emotion_code, None)
                if emotion_label is not None:
                    try:
                        feature_vector = extract_features(file_path)
                        features.append(feature_vector)
                        labels.append(emotion_label)
                    except Exception as e:
                        print(f"Error processing {file}: {e}")
    return np.array(features), np.array(labels)

EMOTION_MAP = {
    "01": "calmness",
    "02": "calmness",
    "03": "joy",
    "04": "sadness",
    "05": "anger",
    "06": "fear",
    "07": "disgust",
    "08": "surprise",
    "09": "excitement",
    "10": "gratitude",
    "11": "indifference",
    "12": "frustration"
}


In [5]:
DATASET_PATH = r"C:\\Users\\lenovo\\Desktop\\model\\English\\EnglishAudioSent\\RAVDESS Emotional speech audio"
X, y = load_ravdess_dataset(DATASET_PATH, EMOTION_MAP)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [6]:
joblib.dump(label_encoder, "english_label_encoder.pkl")

dataset = EmotionDataset(X, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

generator = Generator(noise_dim=100, num_classes=len(label_encoder.classes_), feature_dim=X.shape[1])
discriminator = Discriminator(num_classes=len(label_encoder.classes_), feature_dim=X.shape[1])

# Train the GAN
train_gan(generator, discriminator, dataloader, epochs=100, noise_dim=100, num_classes=len(label_encoder.classes_))

Epoch 1/100, D Loss: 0.3087, G Loss: 0.7977
Epoch 2/100, D Loss: 0.2887, G Loss: 0.9069
Epoch 3/100, D Loss: 0.5698, G Loss: 1.1647
Epoch 4/100, D Loss: 0.1839, G Loss: 1.6106
Epoch 5/100, D Loss: 0.1125, G Loss: 2.4617
Epoch 6/100, D Loss: 0.0618, G Loss: 3.3607
Epoch 7/100, D Loss: 0.0524, G Loss: 3.2379
Epoch 8/100, D Loss: 0.4391, G Loss: 3.9008
Epoch 9/100, D Loss: 0.0345, G Loss: 4.5962
Epoch 10/100, D Loss: 0.0109, G Loss: 5.6314
Epoch 11/100, D Loss: 0.0054, G Loss: 7.5405
Epoch 12/100, D Loss: 0.3807, G Loss: 4.4444
Epoch 13/100, D Loss: 0.0667, G Loss: 7.0554
Epoch 14/100, D Loss: 0.0015, G Loss: 9.1905
Epoch 15/100, D Loss: 0.0050, G Loss: 5.8254
Epoch 16/100, D Loss: 0.0051, G Loss: 5.4397
Epoch 17/100, D Loss: 0.0008, G Loss: 11.2128
Epoch 18/100, D Loss: 0.0006, G Loss: 8.4003
Epoch 19/100, D Loss: 0.0039, G Loss: 5.5394
Epoch 20/100, D Loss: 0.0234, G Loss: 5.8280
Epoch 21/100, D Loss: 0.0000, G Loss: 20.1119
Epoch 22/100, D Loss: 0.0005, G Loss: 8.9812
Epoch 23/100, D L

In [7]:
torch.save(discriminator.state_dict(), "english_discriminator_model.pth")

In [8]:
discriminator.load_state_dict(torch.load("english_discriminator_model.pth"))
discriminator.eval()
label_encoder = joblib.load("english_label_encoder.pkl")

  discriminator.load_state_dict(torch.load("english_discriminator_model.pth"))


In [10]:
def predict_emotion_gan(audio_path, discriminator, label_encoder):
    features = extract_features(audio_path)
    features = torch.tensor(features, dtype=torch.float32).unsqueeze(0)
    num_classes = len(label_encoder.classes_)

    # Evaluate 
    probabilities = []
    for emotion_label in range(num_classes):
        label_tensor = torch.tensor([emotion_label], dtype=torch.long)
        prob = discriminator(features, label_tensor).item()  
        probabilities.append(prob)

    predicted_label_index = np.argmax(probabilities)
    predicted_emotion = label_encoder.inverse_transform([predicted_label_index])[0]
    return predicted_emotion

audio_path = r"C:\\Users\\lenovo\\Desktop\\model\\English\\EnglishAudioSent\\RAVDESS Emotional speech audio\\Actor_01\\03-01-01-01-01-01-01.wav"
predicted_emotion = predict_emotion_gan(audio_path, discriminator, label_encoder)
print(f"The predicted emotion is: {predicted_emotion}")

The predicted emotion is: joy
