In [26]:
import numpy as np
import librosa

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, random_split, Subset
import torch.nn.functional as F

from torchvision.utils import make_grid  # |   Utility stuff for plotting
import matplotlib.pyplot as plt          # |  <- I use this one a lot for plotting, seaborn is a good alternative
from matplotlib.image import imread      # |  it reads images... (png -> usable input (like a numpy array for ex))
import os
import random
from tqdm import tqdm  # | This one is a cute one for making a loading bar, I like it and we'll use it here

In [27]:
def load_dataset(directory, batch_size):
    """Load the CREMA-D AudioWAV clips in ``directory`` and build a DataLoader.

    Every file whose name ends in ``.wav`` is read with ``librosa.load`` using
    ``sr=None``. The emotion label is the third ``_``-separated field of the
    file name, mapped to an integer class index. All waveforms are right-padded
    with zeros to the length of the longest clip so they can be stacked into
    batches.

    Args:
        directory: Path of the folder that holds the ``.wav`` files.
        batch_size: Number of clips per batch produced by the DataLoader.

    Returns:
        A ``(dataset, dataloader)`` pair. ``dataset`` is a list of
        ``(waveform, label)`` tuples, where ``waveform`` is a 1-D float32
        tensor of the common padded length and ``label`` is an int.
        ``dataloader`` iterates over that same list in shuffled batches.

    Raises:
        ValueError: If ``directory`` contains no ``.wav`` files.
        KeyError: If a file name carries an emotion code missing from the
            label mapping.
        IndexError: If a file name has fewer than three ``_``-separated fields.
    """
    emotion_to_number = {'NEU': 0, 'HAP': 1, 'SAD': 2, 'ANG': 3, 'DIS': 4, 'FEA': 5}

    # os.listdir returns entries in arbitrary order; sorting makes the order of
    # the returned list reproducible (relevant for any later index-based split).
    wav_names = sorted(name for name in os.listdir(directory) if name.endswith('.wav'))
    if not wav_names:
        # Fail early with a clear message instead of building an empty loader.
        raise ValueError(f"No .wav files found in {directory!r}")

    clips = []
    max_len = 0
    for filename in wav_names:
        filepath = os.path.join(directory, filename)
        emotion = filename.split('_')[2]
        emotion_label = emotion_to_number[emotion]
        samples, _ = librosa.load(filepath, sr=None)
        max_len = max(max_len, len(samples))
        clips.append((torch.tensor(samples, dtype=torch.float32), emotion_label))

    # Zero-pad on the right so every waveform has exactly max_len samples.
    dataset = [(torch.nn.functional.pad(waveform, (0, max_len - waveform.size(0))), label)
               for waveform, label in clips]

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True)

    return dataset, dataloader

In [28]:
# Folder holding the CREMA-D .wav clips, relative to the current working directory.
crema_d_directory = "./AudioWAV"
# Keep both the padded (waveform, label) list and the shuffled loader built on it.
dataset, dataloader = load_dataset(directory=crema_d_directory, batch_size=32)

In [31]:
# Sanity check: pull a single batch and show the per-clip length and the labels.
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    inp, out = first_batch
    print(len(inp[0]))
    print(out)

80080
tensor([3, 3, 4, 3, 1, 1, 1, 3, 2, 0, 4, 2, 3, 2, 5, 0, 4, 4, 0, 3, 1, 3, 0, 0,
        5, 0, 2, 0, 5, 5, 3, 2])


In [33]:
class AudioCNN(nn.Module):
    """1-D convolutional classifier over fixed-length waveforms.

    Two ``Conv1d`` (kernel size 3, padding 1) + ReLU stages, each followed by
    max-pooling with kernel size 2, then two fully connected layers that
    produce one logit per class.

    Args:
        input_size: Number of samples in every input waveform. The first
            linear layer is sized from it, so inputs of another length fail
            in ``forward``.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, num_classes):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)

        # The convolutions keep the length (kernel 3, padding 1); each of the
        # two pooling steps halves it with floor division, giving input_size // 4.
        conv_output_size = input_size // 4

        self.fc1 = nn.Linear(64 * conv_output_size, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            x: Waveform batch shaped ``(batch, 1, input_size)`` or
                ``(batch, input_size)``; the latter gets a channel axis added.
        """
        # A DataLoader over 1-D waveforms yields (batch, length); Conv1d with
        # in_channels=1 needs an explicit channel axis: (batch, 1, length).
        if x.dim() == 2:
            x = x.unsqueeze(1)
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, kernel_size=2)
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, kernel_size=2)
        # Flatten (batch, 64, input_size // 4) for the fully connected layers.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [34]:
# Every waveform in `dataset` is zero-padded to the same length, so the first
# item gives the input length the model must be built for (80080 samples in
# the run whose output is recorded earlier in this notebook).
input_size = dataset[0][0].size(0)
num_classes = 6  # one logit per emotion code: NEU, HAP, SAD, ANG, DIS, FEA
model = AudioCNN(input_size=input_size, num_classes=num_classes)