In [14]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import librosa
import librosa.display
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [11]:
def load_data(data_folder):
    file_list = os.listdir(data_folder)
    data = []
    labels = []
    for filename in file_list:
        filepath = os.path.join(data_folder, filename)
        # Load audio file
        audio, sr = librosa.load(filepath, sr=44100)  # Adjust the sampling rate as needed
        # Convert audio to spectrogram
        spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
        data.append(spectrogram)
        labels.append(int(filename.split('.')[0]))  # Assuming filenames are numbered as described
    return data, np.array(labels)  # Convert labels to numpy array

In [9]:
def pad_spectrograms(spectrograms):
    max_len = max(spec.shape[1] for spec in spectrograms)
    padded_specs = []
    for spec in spectrograms:
        pad_len = max_len - spec.shape[1]
        padded_spec = np.pad(spec, ((0, 0), (0, pad_len)), mode='constant')
        padded_specs.append(padded_spec)
    return padded_specs

In [5]:
def create_model(input_shape, num_classes):
    model = nn.Sequential(
        nn.Conv2d(1, 32, kernel_size=(3, 3), padding=(1, 1)),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2),
        nn.Conv2d(32, 64, kernel_size=(3, 3), padding=(1, 1)),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2),
        nn.Flatten(),
        nn.Linear(64 * (input_shape[0] // 4) * (input_shape[1] // 4), 128),
        nn.ReLU(),
        nn.Linear(128, num_classes)
    )
    return model

In [6]:
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}")

In [7]:
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = (correct / total) * 100
    print(f"Accuracy on test set: {accuracy}%")
    return accuracy

In [15]:
# Load data
data, labels = load_data('D:\\vs_code\\DL\\proj\\resources\\fma_small_edited_truncated')  # Specify your data folder path
padded_data = pad_spectrograms(data)

# Split data into train and test sets
X = torch.tensor(padded_data, dtype=torch.float32).unsqueeze(1)  # Add channel dimension
y = torch.tensor(labels, dtype=torch.long)  # Convert to torch.long type

# Initialize the model
input_shape = X.shape[2:]
num_classes = len(np.unique(labels))  # Number of unique classes in the labels
model = create_model(input_shape, num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Perform k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}/{kf.n_splits}")

    # Split data into train and test sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Create DataLoader for train and test sets
    train_loader = DataLoader(list(zip(X_train, y_train)), batch_size=32, shuffle=True)
    test_loader = DataLoader(list(zip(X_test, y_test)), batch_size=32)

    # Train the model
    train_model(model, train_loader, criterion, optimizer)

    # Evaluate the model
    accuracy = evaluate_model(model, test_loader)
    accuracies.append(accuracy)

# Calculate and print average accuracy
avg_accuracy = np.mean(accuracies)
print(f"Average accuracy across all folds: {avg_accuracy}%")

Fold 1/5
Epoch 1, Loss: 4.092637538909912
Epoch 2, Loss: 1309.659912109375
Epoch 3, Loss: 1775.8828125
Epoch 4, Loss: 1170.735107421875
Epoch 5, Loss: 972.8038330078125
Epoch 6, Loss: 688.670654296875
Epoch 7, Loss: 367.48779296875
Epoch 8, Loss: 149.6774139404297
Epoch 9, Loss: 37.54979705810547
Epoch 10, Loss: 39.79206848144531
Accuracy on test set: 0.0%
Fold 2/5
Epoch 1, Loss: 73.35342407226562
Epoch 2, Loss: 30.14513397216797
Epoch 3, Loss: 15.916583061218262
Epoch 4, Loss: 6.105024814605713
Epoch 5, Loss: 2.916755199432373
Epoch 6, Loss: 0.20061416923999786
Epoch 7, Loss: 0.05822592228651047
Epoch 8, Loss: 0.1661190241575241
Epoch 9, Loss: 0.03883408010005951
Epoch 10, Loss: 0.06223553419113159
Accuracy on test set: 50.0%
Fold 3/5
Epoch 1, Loss: 0.2935710847377777
Epoch 2, Loss: 0.1811506748199463
Epoch 3, Loss: 0.08751821517944336
Epoch 4, Loss: 0.040061503648757935
Epoch 5, Loss: 0.01561435405164957
Epoch 6, Loss: 0.004152162931859493
Epoch 7, Loss: 0.0009464593604207039
Epoch 8