In [11]:
from google.colab import drive
drive.mount('/content/drive')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import librosa
import numpy as np
import torch
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

class AudioClassifierLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super(AudioClassifierLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # LSTM layer
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)

        # Fully connected layers
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)  # Binary classification

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = out[:, -1, :]  # Take the last output
        out = self.relu(self.fc1(out))
        out = self.relu(self.fc2(out))
        out = self.fc3(out)
        out = self.sigmoid(out)  # Output between 0 and 1
        return out

# Initialize the model
input_dim = 33  # Based on your feature extraction
hidden_dim = 128
num_layers = 2
model = AudioClassifierLSTM(input_dim, hidden_dim, num_layers)

# Define loss and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [13]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

def train_model(model, train_loader, val_loader, epochs=10, lr=0.001, graph=False, save_path='model.pth'):
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        model.train()
        epoch_train_loss = 0
        for inputs, labels in train_loader:
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item()

        train_losses.append(epoch_train_loss / len(train_loader))

        model.eval()
        epoch_val_loss = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs.squeeze(), labels.float())
                epoch_val_loss += loss.item()
        val_losses.append(epoch_val_loss / len(val_loader))

        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_losses[-1]}, Val Loss: {val_losses[-1]}')

    # Plotting the training and validation losses
    if graph:
        plt.figure(figsize=(10, 5))
        plt.plot(range(1, epochs + 1), train_losses, label='Training Loss')
        plt.plot(range(1, epochs + 1), val_losses, label='Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.title('Training and Validation Loss')
        plt.legend()
        plt.grid(True)
        plt.show()

    torch.save(model.state_dict(), save_path)

In [5]:
from sklearn.metrics import confusion_matrix, classification_report

def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            preds = (outputs.squeeze() > 0.5).long()  # Convert probabilities to class labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    print(confusion_matrix(all_labels, all_preds))
    print(classification_report(all_labels, all_preds))


In [6]:
def extract_features(file_path):
    # Load the audio file
    y, sr = librosa.load(file_path, sr=None)

    # Extract MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

    # Extract Chroma
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)

    # Extract Spectral Contrast
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

    # Onset detection (to calculate note durations)
    onset_frames = librosa.onset.onset_detect(y=y, sr=sr)
    onset_times = librosa.frames_to_time(onset_frames, sr=sr)

    # Calculate durations between onsets
    note_durations = np.diff(onset_times)  # Durations between onsets
    if len(note_durations) > 0:
        avg_note_duration = np.mean(note_durations)  # Average note duration
    else:
        avg_note_duration = 0  # Handle the case of no detected onsets

    # Combine all features into a single feature vector
    features = np.concatenate((
        mfccs.mean(axis=1), # Timbre
        chroma.mean(axis=1), # Pitch Classes (ex: C#2)
        spectral_contrast.mean(axis=1), # Contrast between peaks and valleys of frequency
        np.array([avg_note_duration])  # Add the average note duration as a feature
    ))

    return features


In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

# Create a custom dataset class
class AudioDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [7]:
import torch
from os import walk

# Auxilliary Function
# Import MIDI & MP3 Files
synthesized_path = "/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Human-ness Training Data/MIDI_Synth/"
synthesized_temp = []
synthesized_files = []
human_path = "/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Human-ness Training Data/Human/"
human_temp = []
human_files = []

# Parsing through MIDI files
for (dir_path, dir_names, file_names) in walk(synthesized_path):
  synthesized_temp.extend(file_names)

for file in synthesized_temp:
  midi_file_path = synthesized_path + file
  synthesized_files.append(midi_file_path)

print(synthesized_files) # Check

# Parsing through Human files
for (dir_path, dir_names, file_names) in walk(human_path):
  human_temp.extend(file_names)

for file in human_temp:
  human_file_path = human_path + file
  human_files.append(human_file_path)

print(human_files) # Check

# Initialize lists to store features and labels
features_list = []
labels_list = []

# Process synthesized files
for file_path in synthesized_files:
    features = extract_features(file_path)
    features_list.append(features)
    labels_list.append(0)  # Label for synthesized audio

# Process human files
for file_path in human_files:
    features = extract_features(file_path)
    features_list.append(features)
    labels_list.append(1)  # Label for human audio

# Convert lists to numpy arrays
X = np.array(features_list)
y = np.array(labels_list)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

['/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Human-ness Training Data/MIDI_Synth/MIDI_1.mp3', '/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Human-ness Training Data/MIDI_Synth/MIDI_0.mp3', '/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Human-ness Training Data/MIDI_Synth/MIDI_2.mp3', '/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Human-ness Training Data/MIDI_Synth/MIDI_3.mp3', '/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Human-ness Training Data/MIDI_Synth/MIDI_4.mp3', '/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Human-ness Training Data/MIDI_Synth/MIDI_5.mp3', '/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Human-ness Training Data/MIDI_Synth/MIDI_7.mp3', '/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Human-ness Training Data/MIDI_Synth/MIDI_6.mp3', '/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Human-ness Training Data/MIDI_Synth/MIDI_8.mp3', 

In [10]:
# Create datasets and dataloaders
train_dataset = AudioDataset(X_train, y_train)
test_dataset = AudioDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [14]:
# Training

save_path = r"/content/drive/MyDrive/MIDI4STRINGS_but_better/Training Data/Human-ness Training Data/human-ness_model.pth"
model.to(device)
train_model(model, train_loader, test_loader, epochs=10, lr=0.001, graph=True, save_path=save_path)

RuntimeError: For unbatched 2-D input, hx and cx should also be 2-D but got (3-D, 3-D) tensors