In [None]:
pip install torch torchaudio numpy scikit-learn pandas

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# Create one folder per emotion class. os.makedirs is used instead of a shell
# `mkdir -p` so the cell does not depend on a shell alias being available;
# exist_ok=True makes re-running the cell harmless.
import os

for emotion_name in ("happy", "sad", "angry"):
    os.makedirs(os.path.join("data", emotion_name), exist_ok=True)


In [None]:
import os

# Report whether the dataset root exists and how many .wav files each
# immediate subfolder contains.
root = 'data'
if os.path.exists(root):
    print(f"‚úÖ 'data' folder found at: {os.path.abspath(root)}")

    subfolders = os.listdir(root)
    print("Subfolders found:", subfolders)

    for folder_name in subfolders:
        folder_path = os.path.join(root, folder_name)
        if not os.path.isdir(folder_path):
            continue  # plain files directly under the root are not class folders
        wav_count = sum(1 for entry in os.listdir(folder_path) if entry.endswith('.wav'))
        print(f"{folder_name}/ - {wav_count} .wav files")
else:
    print("‚ùå 'data' folder not found!")


‚úÖ 'data' folder found at: /content/data
Subfolders found: ['angry', 'happy', 'sad']
angry/ - 0 .wav files
happy/ - 0 .wav files
sad/ - 0 .wav files


In [None]:
from google.colab import files
import shutil
import os

# Open the Colab file chooser; the keys of the result are used below as the
# uploaded file names.
uploaded = files.upload()

# Emotion label whose folder receives this upload (e.g., "happy")
emotion = "sad"
target_dir = f"./data/{emotion}"
os.makedirs(target_dir, exist_ok=True)

# Relocate each uploaded file into the folder of the chosen emotion
for filename in uploaded:
    destination = os.path.join(target_dir, filename)
    shutil.move(filename, destination)
    print(f"Moved {filename} to {target_dir}")


Saving YAF_thumb_sad.wav to YAF_thumb_sad.wav
Moved YAF_thumb_sad.wav to ./data/sad


In [None]:
from google.colab import files
import shutil
import os

# Open the Colab file chooser; the keys of the result are used below as the
# uploaded file names.
uploaded = files.upload()

# Emotion label whose folder receives this upload (e.g., "happy")
emotion = "happy"
target_dir = f"./data/{emotion}"
os.makedirs(target_dir, exist_ok=True)

# Relocate each uploaded file into the folder of the chosen emotion
for filename in uploaded:
    destination = os.path.join(target_dir, filename)
    shutil.move(filename, destination)
    print(f"Moved {filename} to {target_dir}")


Saving YAF_youth_happy.wav to YAF_youth_happy.wav
Moved YAF_youth_happy.wav to ./data/happy


In [None]:
from google.colab import files
import shutil
import os

# Open the Colab file chooser; the keys of the result are used below as the
# uploaded file names.
uploaded = files.upload()

# Emotion label whose folder receives this upload (e.g., "happy")
emotion = "angry"
target_dir = f"./data/{emotion}"
os.makedirs(target_dir, exist_ok=True)

# Relocate each uploaded file into the folder of the chosen emotion
for filename in uploaded:
    destination = os.path.join(target_dir, filename)
    shutil.move(filename, destination)
    print(f"Moved {filename} to {target_dir}")


Saving YAF_youth_angry.wav to YAF_youth_angry.wav
Moved YAF_youth_angry.wav to ./data/angry


In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

# Create train/val split
def split_dataset(dataset, val_ratio=0.3, random_state=42):
    """Split ``dataset`` into stratified training and validation subsets.

    Args:
        dataset: Object supporting ``len()`` and exposing a ``labels``
            sequence with one class label per sample; the labels are used
            to stratify the split.
        val_ratio: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Seed forwarded to ``train_test_split`` so that the
            split is reproducible between runs. Pass ``None`` to get a
            different random split on every call.

    Returns:
        A ``(train_subset, val_subset)`` tuple of ``Subset`` objects that
        index into ``dataset``.

    NOTE(review): stratified splitting presumably needs several samples per
    class — confirm against the scikit-learn documentation before using this
    on very small datasets.
    """
    indices = list(range(len(dataset)))
    train_indices, val_indices = train_test_split(
        indices,
        test_size=val_ratio,
        stratify=dataset.labels,
        random_state=random_state,
    )
    return Subset(dataset, train_indices), Subset(dataset, val_indices)


In [None]:
def main():
    """Train an emotion classifier on ``data/`` with a held-out validation split.

    Builds the dataset, splits it into train/validation subsets, trains the
    classifier, and then writes two artifacts to the working directory:
    ``emotion_model.pth`` (the model ``state_dict``) and
    ``label_encoder.pkl`` (the fitted label encoder needed to turn predicted
    class indices back into emotion names at inference time).
    """
    # Imported locally so this cell does not depend on another cell's imports.
    import joblib

    full_dataset = EmotionDataset(data_dir='data/')
    train_dataset, val_dataset = split_dataset(full_dataset)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    # Validation order does not matter, so shuffling is disabled.
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    num_classes = len(set(full_dataset.labels))
    model = EmotionClassifier(output_dim=num_classes).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    train(model, train_loader, val_loader, optimizer, criterion)

    torch.save(model.state_dict(), 'emotion_model.pth')
    # Persist the encoder alongside the weights; without it the saved model's
    # integer predictions cannot be mapped back to emotion names.
    joblib.dump(full_dataset.label_encoder, 'label_encoder.pkl')


In [None]:
def train(model, train_loader, val_loader, optimizer, criterion, epochs=10):
    """Train ``model`` and print loss and accuracy after every epoch.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        epochs: Number of passes over ``train_loader``.

    Each epoch prints the summed training loss, the training accuracy and the
    validation accuracy. An accuracy is reported as ``nan`` when its loader
    produced no samples, instead of raising ``ZeroDivisionError``.
    """
    for epoch in range(epochs):
        model.train()
        total_loss, correct, total = 0, 0, 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            # The predicted class is the index of the largest score per row.
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        # Guard against an empty training loader.
        train_acc = correct / total if total else float("nan")

        # Validation: evaluation mode, no gradient tracking.
        model.eval()
        val_correct, val_total = 0, 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
                outputs = model(inputs)
                _, predicted = torch.max(outputs, 1)
                val_correct += (predicted == labels).sum().item()
                val_total += labels.size(0)

        # Guard against an empty validation loader.
        val_acc = val_correct / val_total if val_total else float("nan")

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Train Acc: {train_acc:.2f}, Val Acc: {val_acc:.2f}")


In [None]:
def predict(model, filepath, label_encoder):
    """Predict the emotion label for a single audio file.

    The audio is converted to MFCC features with the same transform settings
    as ``EmotionDataset.__getitem__`` (``n_mels=40``, ``n_fft=400``,
    ``hop_length=160``) so that inference features match the features the
    model was trained on.

    Args:
        model: Trained classifier; it is switched to evaluation mode.
        filepath: Path of the audio file to classify.
        label_encoder: Fitted encoder whose ``inverse_transform`` maps a
            predicted class index back to its emotion name.

    Returns:
        The predicted emotion as returned by ``label_encoder.inverse_transform``.
    """
    model.eval()
    waveform, sr = torchaudio.load(filepath)
    # Average the channels so a multi-channel file yields one channel; the
    # squeeze(0) below expects a leading dimension of size 1.
    waveform = waveform.mean(dim=0, keepdim=True)
    waveform = torchaudio.functional.resample(waveform, sr, SAMPLE_RATE)

    mfcc = torchaudio.transforms.MFCC(
        sample_rate=SAMPLE_RATE,
        n_mfcc=NUM_MFCC,
        # Must stay identical to the settings used when building training features.
        melkwargs={"n_mels": 40, "n_fft": 400, "hop_length": 160},
    )(waveform)
    mfcc = mfcc.squeeze(0).transpose(0, 1)  # [Time, MFCC]

    # Pad with zeros or trim so every input has exactly max_len frames.
    max_len = 200
    if mfcc.shape[0] < max_len:
        pad = torch.zeros(max_len - mfcc.shape[0], NUM_MFCC)
        mfcc = torch.cat((mfcc, pad), dim=0)
    else:
        mfcc = mfcc[:max_len, :]

    mfcc = mfcc.unsqueeze(0).to(DEVICE)  # Add batch dim
    with torch.no_grad():
        output = model(mfcc)
        pred = torch.argmax(output, dim=1).item()
        emotion = label_encoder.inverse_transform([pred])[0]
        return emotion


In [None]:
import os
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import numpy as np
import joblib  # For saving the label encoder

# Constants
SAMPLE_RATE = 16_000  # sample rate passed to resample() and the MFCC transform
NUM_MFCC = 13  # number of MFCC coefficients requested per frame
# Use the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = 'cpu'
if torch.cuda.is_available():
    DEVICE = 'cuda'

# Dataset Class
class EmotionDataset(Dataset):
    """Audio-emotion dataset backed by a ``data_dir/<emotion>/*.wav`` folder tree.

    Every immediate subdirectory of ``data_dir`` is treated as one emotion
    class and every ``.wav`` file inside it becomes one sample. Class names
    are mapped to integer ids with a ``LabelEncoder``.

    Attributes set by ``__init__``:
        data: list of ``.wav`` file paths, one per sample.
        labels: integer class ids returned by ``label_encoder.fit_transform``.
        label_encoder: the fitted ``LabelEncoder``.
        label_classes: ``label_encoder.classes_``.
        max_len: number of MFCC frames each sample is padded or trimmed to.
        mfcc_transform: the MFCC transform shared by all samples.
    """

    def __init__(self, data_dir, max_len=200):
        """Index the ``.wav`` files under ``data_dir``.

        Args:
            data_dir: Root folder containing one subfolder per emotion.
            max_len: Number of MFCC frames per sample after padding/trimming.
                The default matches the input size the classifier in this
                notebook is built for.
        """
        self.data = []
        self.labels = []
        self.label_encoder = LabelEncoder()
        self.max_len = max_len

        # sorted() makes the sample order independent of directory listing order.
        for emotion in sorted(os.listdir(data_dir)):
            emotion_dir = os.path.join(data_dir, emotion)
            if not os.path.isdir(emotion_dir):
                # A plain file directly under data_dir is not a class folder;
                # listing it would raise.
                continue
            for file in sorted(os.listdir(emotion_dir)):
                if file.endswith('.wav'):
                    self.data.append(os.path.join(emotion_dir, file))
                    self.labels.append(emotion)

        self.labels = self.label_encoder.fit_transform(self.labels)
        self.label_classes = self.label_encoder.classes_

        # Built once here instead of on every __getitem__ call.
        self.mfcc_transform = torchaudio.transforms.MFCC(
            sample_rate=SAMPLE_RATE,
            n_mfcc=NUM_MFCC,
            melkwargs={"n_mels": 40, "n_fft": 400, "hop_length": 160}
        )

    def __len__(self):
        """Return the number of indexed ``.wav`` files."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(mfcc, label)``.

        ``mfcc`` has ``max_len`` rows (frames) and ``NUM_MFCC`` columns;
        ``label`` is the sample's integer class id.
        """
        filepath = self.data[idx]
        label = self.labels[idx]

        waveform, sr = torchaudio.load(filepath)
        # Average the channels so a multi-channel file yields one channel; the
        # squeeze(0) below expects a leading dimension of size 1.
        waveform = waveform.mean(dim=0, keepdim=True)
        waveform = torchaudio.functional.resample(waveform, sr, SAMPLE_RATE)

        # Convert to MFCC
        mfcc = self.mfcc_transform(waveform)
        mfcc = mfcc.squeeze(0).transpose(0, 1)  # Shape: [Time, MFCC]

        # Pad with zeros or trim so every sample has exactly max_len frames.
        num_frames = mfcc.shape[0]
        if num_frames < self.max_len:
            pad = mfcc.new_zeros(self.max_len - num_frames, NUM_MFCC)
            mfcc = torch.cat((mfcc, pad), dim=0)
        else:
            mfcc = mfcc[:self.max_len, :]

        return mfcc, label

# Simple Neural Network
class EmotionClassifier(nn.Module):
    """Two-layer fully connected classifier over flattened MFCC frames.

    Args:
        input_dim: Number of features per frame.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of classes (size of the output score vector).
        seq_len: Number of frames per sample. The first layer expects
            ``input_dim * seq_len`` flattened features; the default of 200
            keeps checkpoints saved with earlier versions loadable.
    """

    def __init__(self, input_dim=13, hidden_dim=64, output_dim=3, seq_len=200):
        super().__init__()
        self.fc1 = nn.Linear(input_dim * seq_len, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return per-class scores for a batch ``x`` whose first dim is the batch."""
        # reshape (unlike view) also accepts non-contiguous inputs.
        x = x.reshape(x.size(0), -1)  # Flatten
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# Training Loop
def train(model, loader, optimizer, criterion, epochs=10):
    """Run ``epochs`` passes over ``loader`` and print the summed batch loss of each pass.

    Args:
        model: Module being optimised; put into training mode once up front.
        loader: Iterable of ``(inputs, labels)`` batches.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        epochs: Number of passes over ``loader``.
    """
    model.train()

    def run_batch(batch):
        # One optimisation step on a single batch; returns the loss as a Python number.
        features, targets = batch
        features, targets = features.to(DEVICE), targets.to(DEVICE)
        predictions = model(features)
        batch_loss = criterion(predictions, targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    for epoch_number in range(1, epochs + 1):
        epoch_loss = 0
        for batch in loader:
            epoch_loss += run_batch(batch)

        print(f"Epoch {epoch_number}/{epochs}, Loss: {epoch_loss:.4f}")

# Main
def main():
    """Train on every sample under ``data/`` and save the model and label encoder."""
    emotion_data = EmotionDataset(data_dir='data/')
    batches = DataLoader(emotion_data, batch_size=8, shuffle=True)

    # One output score per class discovered in the dataset.
    classifier = EmotionClassifier(output_dim=len(emotion_data.label_classes))
    classifier = classifier.to(DEVICE)
    adam = torch.optim.Adam(classifier.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    train(classifier, batches, adam, loss_fn)

    # Persist the weights and the encoder that maps class ids back to names.
    torch.save(classifier.state_dict(), 'emotion_model.pth')
    joblib.dump(emotion_data.label_encoder, 'label_encoder.pkl')
    print(f"‚úÖ Model and label encoder saved. Classes: {emotion_data.label_classes}")


if __name__ == '__main__':
    main()



Epoch 1/10, Loss: 2.7664
Epoch 2/10, Loss: 17.4947
Epoch 3/10, Loss: 0.0004
Epoch 4/10, Loss: 0.0000
Epoch 5/10, Loss: 0.0000
Epoch 6/10, Loss: 0.0000
Epoch 7/10, Loss: 0.0166
Epoch 8/10, Loss: 0.3790
Epoch 9/10, Loss: 0.0000
Epoch 10/10, Loss: 0.0000
‚úÖ Model and label encoder saved. Classes: ['angry' 'happy' 'sad']


In [None]:
import os
import torch
import torchaudio
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
import joblib
from google.colab import files
import numpy as np

# Constants
SAMPLE_RATE = 16_000  # sample rate passed to resample() and the MFCC transform
NUM_MFCC = 13  # number of MFCC coefficients requested per frame
_cuda_available = torch.cuda.is_available()
DEVICE = "cuda" if _cuda_available else "cpu"  # prefer the GPU when torch reports one

# Model definition (must match training)
class EmotionClassifier(torch.nn.Module):
    """Two-layer fully connected classifier over flattened MFCC frames.

    The layer names (``fc1``, ``fc2``) and default sizes must stay in sync
    with the model that produced the saved ``state_dict``.

    Args:
        input_dim: Number of features per frame.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of classes (size of the output score vector).
        seq_len: Number of frames per sample. The first layer expects
            ``input_dim * seq_len`` flattened features; the default of 200
            matches the padding length used by ``preprocess_audio``.
    """

    def __init__(self, input_dim=13, hidden_dim=64, output_dim=3, seq_len=200):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim * seq_len, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return per-class scores for a batch ``x`` whose first dim is the batch."""
        # reshape (unlike view) also accepts non-contiguous inputs.
        x = x.reshape(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# Preprocessing function
def preprocess_audio(filepath, max_len=200):
    """Load an audio file and turn it into a fixed-size MFCC model input.

    Args:
        filepath: Path of the audio file to load.
        max_len: Number of MFCC frames to pad or trim to. The default
            matches the input size ``EmotionClassifier`` is built for by
            default.

    Returns:
        Tensor with a leading batch dimension of 1, followed by ``max_len``
        frames of ``NUM_MFCC`` coefficients each.
    """
    waveform, sr = torchaudio.load(filepath)
    # Average the channels so a multi-channel file yields one channel; the
    # squeeze(0) below expects a leading dimension of size 1.
    waveform = waveform.mean(dim=0, keepdim=True)
    waveform = torchaudio.functional.resample(waveform, sr, SAMPLE_RATE)

    mfcc = torchaudio.transforms.MFCC(
        sample_rate=SAMPLE_RATE,
        n_mfcc=NUM_MFCC,
        melkwargs={"n_mels": 40, "n_fft": 400, "hop_length": 160}
    )(waveform)

    mfcc = mfcc.squeeze(0).transpose(0, 1)  # [Time, MFCC]

    # Pad with zeros or trim to the fixed length.
    num_frames = mfcc.shape[0]
    if num_frames < max_len:
        pad = mfcc.new_zeros(max_len - num_frames, NUM_MFCC)
        mfcc = torch.cat((mfcc, pad), dim=0)
    else:
        mfcc = mfcc[:max_len, :]

    return mfcc.unsqueeze(0)  # Add batch dimension

# Upload a test .wav file
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded: fail with a clear message instead of an
    # IndexError/StopIteration on the lookup below.
    raise RuntimeError("No file was uploaded; re-run this cell and choose a .wav file.")
file_path = next(iter(uploaded))  # only the first uploaded file is classified

# Load model and label encoder
# NOTE(security): joblib.load unpickles arbitrary Python objects — only load a
# label_encoder.pkl that this notebook produced itself.
label_encoder = joblib.load('label_encoder.pkl')
num_classes = len(label_encoder.classes_)

model = EmotionClassifier(output_dim=num_classes).to(DEVICE)
# weights_only=True restricts loading to tensor data rather than arbitrary pickles.
state_dict = torch.load('emotion_model.pth', map_location=DEVICE, weights_only=True)
model.load_state_dict(state_dict)
model.eval()

# Preprocess and predict
input_tensor = preprocess_audio(file_path).to(DEVICE)
with torch.no_grad():
    outputs = model(input_tensor)
    # The index of the largest score is the predicted class id.
    predicted_index = torch.argmax(outputs, dim=1).item()
    predicted_label = label_encoder.inverse_transform([predicted_index])[0]

print(f"üéôÔ∏è Predicted Emotion: {predicted_label}")


Saving YAF_young_angry.wav to YAF_young_angry.wav
üéôÔ∏è Predicted Emotion: angry
