In [1]:
import torch
import torch.nn as nn
import torchaudio
import librosa
from sklearn.metrics import classification_report

class ChordClassifier(nn.Module):
    def __init__(self):
        super(ChordClassifier, self).__init__()
        
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        
        self.fc1 = nn.Linear(64 * 8 * 69, 128)  # 64 * 8 * 69 = 35328
        #2, 64, 8, 69]
        self.fc2 = nn.Linear(128, 1)  # Output of 1 for binary classification (minor vs major)

        self.pool = nn.MaxPool2d(2, 2)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):

        #print("start")
        
        #print(x.shape)
        x = self.relu(self.conv1(x))
        print(x.shape)
       # x = self.pool(x)

        #print("after pooling")
        #print(x.shape)
        x = self.relu(self.conv2(x))
        x = self.pool(x)
        
       # print("after pooling")
        #print(x.shape)
        x = self.relu(self.conv3(x))
        x = self.pool(x)
        
       # print("after pooling")
        #print(x.shape)

        # Print shape before flattening to check dimensions
        print("Shape before flattening:", x.shape)
        
        x = x.view(x.size(0), -1)  # Flatten for the fully connected layer
       # print("Shape after flattening:", x.shape)  # Print the shape after flattening
        
        #x = x.flatten()#-1, 64 * 8 * 8)  # Flatten for the fully connected layer
      #  #print("after flatting")
      #  print(x.shape)
        x = self.relu(self.fc1(x))

       # print("passt")
        x = self.sigmoid(self.fc2(x))  # Binary classification output
        return x


    # Modify the evaluate function to generate a classification report
    def evaluate(self, dataloader, device):
        self.eval()  # Set model to evaluation mode (deactivates dropout if any)
        all_preds = []
        all_labels = []
    
        # Disable gradient calculations for inference to save memory
        with torch.no_grad():
            for inputs, labels in dataloader:
                # Move data to the appropriate device (GPU or CPU)
                inputs, labels = inputs.to(device), labels.to(device)
    
                # Forward pass
                outputs = self(inputs)
                
                # Sigmoid output is a probability, threshold at 0.5 for binary classification
                predicted = (outputs > 0.8).float()
    
                # Store the predictions and labels for the classification report
                all_preds.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
    
        # Get a classification report
        report = classification_report(all_labels, all_preds, target_names=["Minor", "Major"], zero_division=0)
        print(report)
        return report


In [2]:
from torch.utils.data import Dataset, DataLoader
import os
import torch
import torchaudio.transforms as T
from sklearn.model_selection import train_test_split

class ChordDataset(Dataset):
    def __init__(self, file_paths, labels, transform=None):
        self.file_paths = file_paths  # List of file paths to .wav files
        self.labels = labels  # List of labels (0 for minor, 1 for major)
        self.transform = transform

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]
        
        waveform, sample_rate = torchaudio.load(file_path)  # Load the .wav file
        
        # Convert waveform to Mel-spectrogram
        mel_spec_transform = T.MelSpectrogram(sample_rate=sample_rate, n_mels=64)
        mel_spectrogram = mel_spec_transform(waveform)
        
        # Add a channel dimension for the CNN input
        mel_spectrogram = mel_spectrogram
        
        if self.transform:
            mel_spectrogram = self.transform(mel_spectrogram)
        
        return mel_spectrogram, torch.tensor(label, dtype=torch.float32)

# Example usage for DataLoader
def get_dataloader(file_paths, labels, batch_size=16, shuffle=True):
    dataset = ChordDataset(file_paths, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader


In [3]:
def train_model(model, dataloader, num_epochs=10, learning_rate=0.2):
    criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for binary classification
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(dataloader):
            # Move data to the appropriate device (GPU or CPU)
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            # Print statistics
            running_loss += loss.item()
            if i % 10 == 9:  # Print every 10 mini-batches
                print(f'Epoch {epoch+1}, Batch {i+1}, Loss: {running_loss / 10:.4f}')
                running_loss = 0.0

    print('Finished Training')

# Example device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ChordClassifier().to(device)

# Define the path to the directory containing the .wav files
file_dir = r'C:\Users\rapha\repositories\guitar_hero\data\raw'

# Get a list of all .wav files in the directory
file_paths = [os.path.join(file_dir, f) for f in os.listdir(file_dir) if f.endswith('.wav')]

labels = [0 if 'Minor' in f else 1 for f in os.listdir(file_dir) if f.endswith('.wav')]  # 0 for Minor, 1 for Major

# Split data into train and test sets (80% train, 20% test)
train_paths, test_paths, train_labels, test_labels = train_test_split(file_paths, labels, test_size=0.2, random_state=42, stratify=labels)

# Get the DataLoader for training and testing sets
train_dataloader = get_dataloader(train_paths, train_labels)
test_dataloader = get_dataloader(test_paths, test_labels)

# Train the model on the training data
train_model(model, train_dataloader, num_epochs=20, learning_rate=0.2)

# Evaluate the model on the test data using the evaluate method from before
model.evaluate(test_dataloader, device)




start
torch.Size([16, 1, 64, 552])
torch.Size([16, 16, 64, 552])
after pooling
torch.Size([16, 16, 32, 276])
after pooling
torch.Size([16, 32, 16, 138])
after pooling
torch.Size([16, 64, 8, 69])
Shape before flattening: torch.Size([16, 64, 8, 69])
Shape after flattening: torch.Size([16, 35328])
torch.Size([16, 35328])
passt
start
torch.Size([16, 1, 64, 552])
torch.Size([16, 16, 64, 552])
after pooling
torch.Size([16, 16, 32, 276])
after pooling
torch.Size([16, 32, 16, 138])
after pooling
torch.Size([16, 64, 8, 69])
Shape before flattening: torch.Size([16, 64, 8, 69])
Shape after flattening: torch.Size([16, 35328])
torch.Size([16, 35328])
passt
start
torch.Size([3, 1, 64, 552])
torch.Size([3, 16, 64, 552])
after pooling
torch.Size([3, 16, 32, 276])
after pooling
torch.Size([3, 32, 16, 138])
after pooling
torch.Size([3, 64, 8, 69])
Shape before flattening: torch.Size([3, 64, 8, 69])
Shape after flattening: torch.Size([3, 35328])
torch.Size([3, 35328])
passt
start
torch.Size([16, 1, 64, 5

'              precision    recall  f1-score   support\n\n       Minor       0.56      1.00      0.71         5\n       Major       0.00      0.00      0.00         4\n\n    accuracy                           0.56         9\n   macro avg       0.28      0.50      0.36         9\nweighted avg       0.31      0.56      0.40         9\n'