In [3]:
import sys, os
sys.path.append(os.path.abspath(".."))


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from src.dataset import SpeechDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

In [6]:
dataset = SpeechDataset("../data/processed/melspec", feature_type="melspec")
len(dataset)
loader = DataLoader(dataset, batch_size=16, shuffle=True)


In [7]:
class CNNEmotion(nn.Module):
    def __init__(self, n_classes=6):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, 3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.pool = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(32 * 32 * 32, 128)
        self.fc2 = nn.Linear(128, n_classes)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = self.pool(x)
        x = torch.relu(self.conv2(x))
        x = self.pool(x)
        x = x.reshape(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

model = CNNEmotion().to(device)
model


CNNEmotion(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=32768, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=6, bias=True)
)

In [8]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [None]:
epochs = 10
loss_history = []

for epoch in range(epochs):
    total = 0

    for x, y in loader:
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()

        total += loss.item()

    loss_history.append(total)
    print("epoch", epoch+1, "loss", total)


epoch 1 loss 4422.859383404255
epoch 2 loss 3280.7062982320786
epoch 3 loss 2770.5539474338293
epoch 4 loss 2230.1546207368374
epoch 5 loss 1771.6681041792035
epoch 6 loss 1406.652057416737


In [None]:
import matplotlib.pyplot as plt

plt.plot(loss_history)
plt.title("Training loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.grid()
plt.show()


In [None]:
torch.save(model.state_dict(), "cnn_baseline.pth")
