In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import joblib

In [2]:
class Generator(nn.Module):
    def __init__(self, noise_dim, num_classes, feature_dim):
        super(Generator, self).__init__()
        self.label_embedding = nn.Embedding(num_classes, num_classes)
        self.model = nn.Sequential(
            nn.Linear(noise_dim + num_classes, 128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, feature_dim),
        )

    def forward(self, noise, labels):
        label_input = self.label_embedding(labels)
        input = torch.cat((noise, label_input), -1)
        return self.model(input)

class Discriminator(nn.Module):
    def __init__(self, num_classes, feature_dim):
        super(Discriminator, self).__init__()
        self.label_embedding = nn.Embedding(num_classes, num_classes)
        input_dim = feature_dim + num_classes  
        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, features, labels):
        label_input = self.label_embedding(labels)
        input = torch.cat((features, label_input), -1)
        return self.model(input)

In [3]:
class EmotionDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return torch.tensor(self.features[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)

In [4]:
def train_gan(generator, discriminator, dataloader, epochs, noise_dim, num_classes):
    optimizer_G = optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))
    optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))
    adversarial_loss = nn.BCELoss()

    for epoch in range(epochs):
        for features, labels in dataloader:
            batch_size = features.size(0)

            valid = torch.ones(batch_size, 1)
            fake = torch.zeros(batch_size, 1)

            optimizer_D.zero_grad()


            real_loss = adversarial_loss(discriminator(features, labels), valid)


            noise = torch.randn(batch_size, noise_dim)
            gen_labels = torch.randint(0, num_classes, (batch_size,))
            fake_features = generator(noise, gen_labels)
            fake_loss = adversarial_loss(discriminator(fake_features.detach(), gen_labels), fake)

            d_loss = (real_loss + fake_loss) / 2
            d_loss.backward()
            optimizer_D.step()

            optimizer_G.zero_grad()

            gen_features = generator(noise, gen_labels)
            g_loss = adversarial_loss(discriminator(gen_features, gen_labels), valid)

            g_loss.backward()
            optimizer_G.step()

        print(f"Epoch {epoch+1}/{epochs}, D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}")

In [5]:
def extract_features(file_path, sr=16000):
    import librosa
    y, sr = librosa.load(file_path, sr=sr)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    mel = librosa.feature.melspectrogram(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)

    features = np.hstack([
        np.mean(mfccs, axis=1), np.std(mfccs, axis=1),
        np.mean(chroma, axis=1), np.std(chroma, axis=1),
        np.mean(mel, axis=1), np.std(mel, axis=1),
        np.mean(contrast, axis=1), np.std(contrast, axis=1),
        np.mean(tonnetz, axis=1), np.std(tonnetz, axis=1)
    ])
    return features

In [6]:
def load_dataset(dataset_paths):
    features = []
    labels = []
    for label, folder in dataset_paths.items():
        for file_name in os.listdir(folder):
            file_path = os.path.join(folder, file_name)
            try:
                feature_vector = extract_features(file_path)
                features.append(feature_vector)
                labels.append(label)
            except Exception as e:
                print(f"Error processing {file_name}: {e}")
    return np.array(features), np.array(labels)

In [7]:
def predict_emotion_gan(audio_path, discriminator, label_encoder):

    features = extract_features(audio_path)
    features = torch.tensor(features, dtype=torch.float32).unsqueeze(0)
    num_classes = len(label_encoder.classes_)

    probabilities = []
    for emotion_label in range(num_classes):
        label_tensor = torch.tensor([emotion_label], dtype=torch.long)
        prob = discriminator(features, label_tensor).item()  
        probabilities.append(prob)

    predicted_label_index = np.argmax(probabilities)
    predicted_emotion = label_encoder.inverse_transform([predicted_label_index])[0]

    return predicted_emotion

In [8]:
DATASET_PATHS = {
    "angry": r"C:\\Users\\lenovo\\Desktop\\model\\Tamil\\TamilAudioSent\\final_data\\angry",
    "fear": r"C:\\Users\\lenovo\\Desktop\\model\\Tamil\\TamilAudioSent\\final_data\\fear",
    "happy": r"C:\\Users\\lenovo\\Desktop\\model\\Tamil\\TamilAudioSent\\final_data\\happy",
    "neutral": r"C:\\Users\\lenovo\\Desktop\\model\\Tamil\\TamilAudioSent\\final_data\\neutral",
    "sad": r"C:\\Users\\lenovo\\Desktop\\new\\model\\TamilAudioSent\\final_data\\sad",
}

In [9]:

X, y = load_dataset(DATASET_PATHS)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [10]:
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [11]:
dataset = EmotionDataset(X, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

generator = Generator(noise_dim=100, num_classes=5, feature_dim=X.shape[1])
discriminator = Discriminator(num_classes=5, feature_dim=X.shape[1])

train_gan(generator, discriminator, dataloader, epochs=100, noise_dim=100, num_classes=5)

Epoch 1/100, D Loss: 0.3763, G Loss: 0.6566
Epoch 2/100, D Loss: 0.3634, G Loss: 0.6808
Epoch 3/100, D Loss: 0.3425, G Loss: 0.7283
Epoch 4/100, D Loss: 0.3323, G Loss: 0.7542
Epoch 5/100, D Loss: 0.3303, G Loss: 0.7668
Epoch 6/100, D Loss: 0.3178, G Loss: 0.8122
Epoch 7/100, D Loss: 0.3085, G Loss: 0.8479
Epoch 8/100, D Loss: 0.3148, G Loss: 0.8499
Epoch 9/100, D Loss: 0.2849, G Loss: 0.9621
Epoch 10/100, D Loss: 0.3423, G Loss: 0.8491
Epoch 11/100, D Loss: 0.3046, G Loss: 0.9864
Epoch 12/100, D Loss: 0.2954, G Loss: 1.0201
Epoch 13/100, D Loss: 0.4481, G Loss: 0.6422
Epoch 14/100, D Loss: 0.2778, G Loss: 0.9533
Epoch 15/100, D Loss: 0.2697, G Loss: 1.1096
Epoch 16/100, D Loss: 0.2216, G Loss: 1.2962
Epoch 17/100, D Loss: 0.2249, G Loss: 1.2319
Epoch 18/100, D Loss: 0.2144, G Loss: 1.3079
Epoch 19/100, D Loss: 0.2366, G Loss: 1.1507
Epoch 20/100, D Loss: 0.2839, G Loss: 1.1802
Epoch 21/100, D Loss: 0.1263, G Loss: 1.9516
Epoch 22/100, D Loss: 0.2196, G Loss: 1.4325
Epoch 23/100, D Los

In [12]:
torch.save(discriminator.state_dict(), "discriminator_model.pth")

In [15]:
discriminator.load_state_dict(torch.load("discriminator_model.pth"))
discriminator.eval()
label_encoder = joblib.load("label_encoder.pkl")

audio_path = r"C:\\Users\\lenovo\\Desktop\\new\\Tamil\\TamilAudioSent\\final_data\\happy\\audio_0.wav"
predicted_emotion = predict_emotion_gan(audio_path, discriminator, label_encoder)
print(f"The predicted emotion is: {predicted_emotion}")

  discriminator.load_state_dict(torch.load("discriminator_model.pth"))


The predicted emotion is: happy
