In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision.models as models
import torchvision.transforms as transforms
import os
import librosa
import numpy as np
import cv2
import torch.nn.functional as F

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [10]:
def load_data(data_folder):
    file_list = os.listdir(data_folder)
    data = []
    labels = []
    for filename in file_list:
        filepath = os.path.join(data_folder, filename)
        # Load audio file
        audio, sr = librosa.load(filepath, sr=5500,mono=True)  # Adjust the sampling rate as needed
        # Convert audio to spectrogram and resize
        spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
        
        data.append(spectrogram)
        labels.append(filename)
    return data, labels

# Function to define the model
def define_model(num_classes):
    resnet50 = models.resnet50(pretrained=False)
    # Change the first convolutional layer to accept one input channel
    resnet50.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
    num_features = resnet50.fc.in_features
    resnet50.fc = nn.Linear(num_features, num_classes)
    return resnet50

# Function to train the model
def train_model(model, train_loader, criterion, optimizer, num_epochs,save_path):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}:")
        for i, (spectrograms, labels) in enumerate(train_loader, 1):
            # spectrograms, labels = spectrograms, labels  # Move data to GPU if available
            optimizer.zero_grad()
            outputs = model(spectrograms)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if i % 10 == 0 or i == num_batches:
                print(f"  Batch {i}/{num_batches}, Loss: {loss.item():.4f}")
        epoch_loss = running_loss / len(train_loader.dataset)
        print(f"  Epoch {epoch + 1} Loss: {epoch_loss:.4f}")
    
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")


In [11]:
def pad_spectrograms(spectrograms):
    max_len = max(spec.shape[2] for spec in spectrograms)
    padded_specs = []
    for spec in spectrograms:
        pad_len = max_len - spec.shape[2]
        padded_spec = F.pad(spec, (0, pad_len))
        padded_specs.append(padded_spec)
    return padded_specs

In [12]:

# Set up dataset and dataloader
data_folder = "D:\\vs_code\\DL\\proj\\resources\\fma_small_edited_truncated"
data, labels = load_data(data_folder)
transform = transforms.Compose([
    transforms.ToTensor(),
])

tensor_data = [transform(spec) for spec in data]

# Pad spectrograms
padded_data = pad_spectrograms(tensor_data)

# Extracting labels as tensors
label_tensor = torch.tensor([int(label.split('.')[0]) for label in labels]) # Assuming filenames are like '0.mp3', '1.mp3', etc.


dataset = list(zip(padded_data, label_tensor))  # Zip spectrograms and labels
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

num_classes = len(set(label_tensor))

model = define_model(num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_model(model, train_loader, criterion, optimizer, num_epochs=10,save_path="D:\\vs_code\\DL\\proj\\resources\\resnet_50_trained.pth")


Epoch 1/10:
  Batch 1/1, Loss: 2.7173
  Epoch 1 Loss: 0.2717
Epoch 2/10:
  Batch 1/1, Loss: 1.7097
  Epoch 2 Loss: 0.1710
Epoch 3/10:
  Batch 1/1, Loss: 1.2118
  Epoch 3 Loss: 0.1212
Epoch 4/10:
  Batch 1/1, Loss: 0.7438
  Epoch 4 Loss: 0.0744
Epoch 5/10:
  Batch 1/1, Loss: 0.2260
  Epoch 5 Loss: 0.0226
Epoch 6/10:
  Batch 1/1, Loss: 0.0658
  Epoch 6 Loss: 0.0066
Epoch 7/10:
  Batch 1/1, Loss: 0.0137
  Epoch 7 Loss: 0.0014
Epoch 8/10:
  Batch 1/1, Loss: 0.0065
  Epoch 8 Loss: 0.0007
Epoch 9/10:
  Batch 1/1, Loss: 0.0039
  Epoch 9 Loss: 0.0004
Epoch 10/10:
  Batch 1/1, Loss: 0.0027
  Epoch 10 Loss: 0.0003
Model saved to D:\vs_code\DL\proj\resources\resnet_50_trained.pth
