In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [None]:
class AudioDataset(Dataset):
    """In-memory dataset pairing precomputed audio feature maps with class labels.

    Both ``.npy`` files are read once at construction time and held as tensors,
    so ``__getitem__`` is a plain tensor index.

    Args:
        features_path: Path to a ``.npy`` file of feature maps, expected shape
            (N, H, W) -- e.g. (N, 150, 646). A channel axis is inserted at
            dim 1, giving (N, 1, H, W).
        labels_path: Path to a ``.npy`` file with one label per sample,
            expected shape (N,).

    Raises:
        ValueError: If the two files hold a different number of samples.
    """

    def __init__(self, features_path, labels_path):
        features = np.load(features_path)  # (N, 150, 646)
        labels = np.load(labels_path)      # (N,)

        # Fail fast: __len__ is driven by the labels, so a mismatch would
        # otherwise surface later as an IndexError or silently skipped samples.
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels disagree on sample count: "
                f"{len(features)} vs {len(labels)}"
            )

        # as_tensor reuses the NumPy buffer when the dtype already matches,
        # avoiding a second full copy of a potentially large feature array.
        self.X = torch.as_tensor(features, dtype=torch.float32).unsqueeze(1)  # (N,1,150,646)
        self.y = torch.as_tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]
        
# Load the saved feature/label arrays into memory, then wrap the dataset in a
# loader that serves reshuffled mini-batches of up to 32 samples each epoch.
dataset = AudioDataset(
    features_path="features.npy",
    labels_path="labels.npy",
)
train_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

In [2]:
import torch
import torch.nn as nn

class CNN_LSTM_Model_PyTorch(nn.Module):
    """CNN feature extractor followed by a bidirectional LSTM classifier.

    Three Conv2d -> ReLU -> BatchNorm2d -> MaxPool2d(2) stages shrink the input
    height and width by a factor of 8. The pooled width is then treated as the
    sequence axis and the (channels x pooled height) values at each position
    are fed to a bidirectional LSTM. The final hidden states of both
    directions are concatenated and passed through a small dense head.

    Args:
        num_classes: Number of output logits.
        input_height: Height (dim 2) of the input feature maps. It fixes the
            LSTM input size, so inputs passed to ``forward`` must have this
            height. Defaults to 150.

    Raises:
        ValueError: If ``input_height`` is smaller than 8, which would leave
            no rows after the three pooling stages.
    """

    def __init__(self, num_classes=24, input_height=150):
        super().__init__()

        # Each MaxPool2d(2) floors the size in half; three of them equal // 8.
        pooled_height = input_height // 8
        if pooled_height < 1:
            raise ValueError(
                f"input_height must be at least 8 to survive three 2x poolings, "
                f"got {input_height}"
            )

        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(2)
        )

        # The pooled height is folded into the per-step feature vector.
        self.lstm_input_size = 128 * pooled_height
        self.lstm = nn.LSTM(self.lstm_input_size, 128, batch_first=True, bidirectional=True)

        self.dropout1 = nn.Dropout(0.3)
        self.fc1 = nn.Linear(128*2, 64)
        self.dropout2 = nn.Dropout(0.3)
        self.out = nn.Linear(64, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of feature maps.

        Args:
            x: Tensor of shape (B, 1, input_height, W).

        Returns:
            Tensor of shape (B, num_classes) holding unnormalised logits.
        """
        x = self.cnn(x)  # (B, C, H, W)

        # Make width the sequence axis and merge channels with height:
        # (B, C, H, W) -> (B, W, C, H) -> (B, W, C*H)
        x = x.permute(0, 3, 1, 2).flatten(2)

        # Only the final hidden states are needed, not the per-step outputs.
        _, (h_n, _) = self.lstm(x)
        # For this single-layer bidirectional LSTM, h_n[-2] is the forward
        # direction's final state and h_n[-1] the backward direction's.
        h = torch.cat((h_n[-2], h_n[-1]), dim=1)  # (B, 256)

        h = self.dropout1(h)
        h = torch.relu(self.fc1(h))
        h = self.dropout2(h)
        out = self.out(h)

        return out


In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The class defined in this notebook is CNN_LSTM_Model_PyTorch; the shorter
# name CNN_LSTM_Model does not exist and raised a NameError.
model = CNN_LSTM_Model_PyTorch().to(device)

# CrossEntropyLoss takes raw logits (the model applies no softmax) together
# with integer class indices.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [None]:
epochs = 10
for epoch in range(1, epochs + 1):
    # Training mode: dropout is active and batch-norm uses batch statistics.
    model.train()
    total_loss = 0

    for inputs, targets in train_loader:
        # Move the batch onto the same device as the model.
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Standard step: clear stale gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        # .item() extracts a plain Python float, detached from the graph.
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch}/{epochs} Loss: {avg_loss:.4f}")


In [None]:
# Persist only the learned parameters (the state dict), not the module object;
# reloading requires constructing the model class first.
checkpoint_path = "cnn_lstm_model.pth"
trained_weights = model.state_dict()
torch.save(trained_weights, checkpoint_path)
print("✅ Model saved as cnn_lstm_model.pth")