In [1]:
import os
import wave
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor


In [2]:
# Set the path of the directory containing the WAV files
directory_path = 'audio'

# Initialize lists to store audio data and labels
audio_data_list = []
labels_list = []

# Loop through all files in the directory
for filename in os.listdir(directory_path):
    # Check if the file is a WAV file
    if filename.endswith('.wav'):
        # Open the WAV file
        file_path = os.path.join(directory_path, filename)
        audio_file = wave.open(file_path, 'rb')
        
        # Get the sample rate
        sample_rate = audio_file.getframerate()
        
        # Read all frames from the file
        audio_frames = audio_file.readframes(audio_file.getnframes())
        
        # Close the file
        audio_file.close()
        
        # Convert the frames to a NumPy array
        audio_data = np.frombuffer(audio_frames, dtype=np.int16)
        
        # Append the audio data and label to the lists
        audio_data_list.append(audio_data)
        labels_list.append(filename.split('_')[0])  # Assumes that the label is the first part of the filename
        print(f'Loaded file {filename} with sample rate {sample_rate} and length {len(audio_data)}')
# Convert the lists to NumPy arrays
audio_data_array = np.array(audio_data_list, dtype=object)
labels_array = np.array(labels_list)

# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(audio_data_array, labels_array, test_size=0.2, random_state=42)

Loaded file 84_121123_000007_000001.wav with sample rate 24000 and length 24960
Loaded file 84_121123_000008_000000.wav with sample rate 24000 and length 100800
Loaded file 84_121123_000008_000001.wav with sample rate 24000 and length 132720
Loaded file 84_121123_000008_000002.wav with sample rate 24000 and length 81120
Loaded file 84_121123_000008_000003.wav with sample rate 24000 and length 396000
Loaded file 84_121123_000008_000004.wav with sample rate 24000 and length 129120
Loaded file 84_121123_000009_000000.wav with sample rate 24000 and length 87840
Loaded file 84_121123_000009_000007.wav with sample rate 24000 and length 152880
Loaded file 84_121123_000009_000008.wav with sample rate 24000 and length 33840
Loaded file 84_121123_000010_000000.wav with sample rate 24000 and length 145440
Loaded file test1.wav with sample rate 24000 and length 42481


In [5]:
batch_size = 20
train_loader = DataLoader(dataset=train_data,
                              batch_size=batch_size,
                              shuffle=True,
                              )
test_loader = DataLoader(dataset=test_data,
                              batch_size=batch_size,
                              shuffle=False,
                              )

In [6]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
# inputs = Input(shape=(8000,1))

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        
        # Using the nn.Sequential function, define a 5 layers of convolutions
        self.conv_relu_stack =  nn.Sequential(
            #First Conv1D layer
            nn.Conv1d(8,13, 3, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(3),
            nn.Dropout(0.3),

            #Second Conv1D layer
            nn.Conv1d(16, 11, 3, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(3),
            nn.Dropout(0.3),

            #Third Conv1D layer
            nn.Conv1d(32, 9, 3, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(3),
            nn.Dropout(0.3),

            #Fourth Conv1D layer
            nn.Conv1d(64, 7, 3, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(3),
            nn.Dropout(0.3),

            #Dense Layer 1
            nn.Linear(256,256),
            nn.ReLU(),
            nn.Dropout(0.3),

            #Dense Layer 2
            nn.Linear(128,128),
            nn.ReLU(),
            nn.Dropout(0.3),
            # inputs of linear are number of audio files used
            nn.Linear(10,10),
            nn.Softmax())

        # model = Model(inputs, outputs)
        # model.summary()

    def forward(self, x):
        
        x = self.conv_relu_stack(x)
        x = self.flatten(x)
        logits = self.Linear(x)
        return logits

model = NeuralNetwork().to(device)
print(model)

Using cpu device
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (conv_relu_stack): Sequential(
    (0): Conv1d(8, 13, kernel_size=(3,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (3): Dropout(p=0.3, inplace=False)
    (4): Conv1d(16, 11, kernel_size=(3,), stride=(1,))
    (5): ReLU()
    (6): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (7): Dropout(p=0.3, inplace=False)
    (8): Conv1d(32, 9, kernel_size=(3,), stride=(1,))
    (9): ReLU()
    (10): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (11): Dropout(p=0.3, inplace=False)
    (12): Conv1d(64, 7, kernel_size=(3,), stride=(1,))
    (13): ReLU()
    (14): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
    (15): Dropout(p=0.3, inplace=False)
    (16): Linear(in_features=256, out_features=256, bias=True)
    (17): ReLU()
    (18): Dropout(p=0.3, inpla

  return torch._C._cuda_getDeviceCount() > 0


In [7]:
# Use cross-entropy loss as the loss function
loss_fn = nn.CrossEntropyLoss()
learning_rate = 1e-2

# Define a pytorch optimizer using stochastic gradient descent (SGD)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [8]:
# # def train(model, device, train_loader, criterion, optimizer, scheduler, epoch):
# def train(model, device, train_loader, optimizer)
#   model.train()
#   data_len = len(train_loader.dataset)
#   for batch_idx, _data in enumerate(train_loader):
#     spectrograms, labels, input_lengths, label_lengths = _data 
#     spectrograms, labels = spectrograms.to(device), labels.to(device)

#     optimizer.zero_grad()

#     output = model(spectrograms)  # (batch, time, n_class)
#     output = F.log_softmax(output, dim=2)
#     output = output.transpose(0, 1) # (time, batch, n_class)

#     loss = criterion(output, labels, input_lengths, label_lengths)
#     loss.backward()

#     # print('loss', loss.item())
#     # print('learning_rate', scheduler.get_lr())

#     optimizer.step()
#     scheduler.step()
#     if batch_idx % 10 == 0 or batch_idx == data_len:
#       print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
#         epoch, batch_idx * len(spectrograms), data_len,
#         100. * batch_idx / len(train_loader), loss.item())
#       )

# def test(model, device, test_loader, criterion):
#   print('\nevaluating...')
#   model.eval()
#   test_loss = 0
#   test_char_edit_dist = []
#   test_word_edit_dist = []
#   with torch.no_grad():
#     for data in test_loader:
#       spectrograms, labels, input_lengths, label_lengths = data 
#       spectrograms, labels = spectrograms.to(device), labels.to(device)

#       output = model(spectrograms) # (batch, time, n_class)
#       output = F.log_softmax(output, dim=2)
#       output = output.transpose(0, 1) # (time, batch, n_class)

#       loss = criterion(output, labels, input_lengths, label_lengths)
#       test_loss += loss.item() / len(test_loader)

#       decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)
#       for j in range(len(decoded_preds)):
#         test_char_edit_dist.append(editdistance.eval(decoded_targets[j], decoded_preds[j]))
#         test_word_edit_dist.append(editdistance.eval(decoded_targets[j].split(" "), decoded_preds[j].split(" ")))

#   avg_char_edit_dist = sum(test_char_edit_dist)/len(test_char_edit_dist)
#   avg_word_edit_dist = sum(test_word_edit_dist)/len(test_word_edit_dist)

#   print("Test set:")
#   print("Average loss: {:.4f}".format(test_loss))
#   print("Average character edit distance: {:4f}".format(avg_char_edit_dist))
#   print("Average word edit distance: {:.4f}".format(avg_word_edit_dist))


In [11]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    
    # set the model to train mode
    model.train()
    
    losses = []
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        
        # Compute training loss
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        
        # Calculate model gradients from the loss and optimize the network
        loss.backward()
        optimizer.step()

        loss, current = loss.item(), batch * len(X)
        if batch % 100 == 0:
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
        losses.append(loss)
        
    return np.array(losses).mean()

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    
    # Set the model to eval mode
    model.eval()
    
    test_loss, correct = 0, 0
    with torch.no_grad():	# no_grad mode doesn't compute gradients
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X) # compute predictions from X
            test_loss +=  loss_fn(pred,y).item() # compute the test loss
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct

In [12]:
# Set the path of the directory containing the WAV files
directory_path = 'audio'

# Initialize lists to store audio data and labels
audio_data_list = []
labels_list = []

# Loop through all files in the directory
for filename in os.listdir(directory_path):
    # Check if the file is a WAV file
    if filename.endswith('.wav'):
        # Open the WAV file
        file_path = os.path.join(directory_path, filename)
        audio_file = wave.open(file_path, 'rb')
        
        # Get the sample rate
        sample_rate = audio_file.getframerate()
        
        # Read all frames from the file
        audio_frames = audio_file.readframes(audio_file.getnframes())
        
        # Close the file
        audio_file.close()
        
        # Convert the frames to a NumPy array
        audio_data = np.frombuffer(audio_frames, dtype=np.int16)
           # Append the audio data and label to the lists
        audio_data_list.append(audio_data)
        labels_list.append(filename.split('_')[0])  # Assumes that the label is the first part of the filename
        print(f'Loaded file {filename} with sample rate {sample_rate} and length {len(audio_data)}')
# Assuming audio files are all 1 second long and have the same sampling rate of 44100 Hz

data_array = np.reshape(audio_data_array, (audio_data_array.shape[0], -1))  # Shape: (11, 44100)
# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(data_array, labels_array, test_size=0.2, random_state=42)
print("Data array shape:", data_array.shape)
print("Label array shape:", labels_array.shape)


# clf = SVM(n_iters=1000)

# clf.fit(train_data, train_labels)

# # Predict labels for the test data
# predictions = clf.predict(test_data)

# # Compute the accuracy
# accuracy = np.sum(predictions == test_labels) / len(test_labels)
# print(f'SVM Accuracy: {accuracy:.2f}')

Loaded file 84_121123_000007_000001.wav with sample rate 24000 and length 24960
Loaded file 84_121123_000008_000000.wav with sample rate 24000 and length 100800
Loaded file 84_121123_000008_000001.wav with sample rate 24000 and length 132720
Loaded file 84_121123_000008_000002.wav with sample rate 24000 and length 81120
Loaded file 84_121123_000008_000003.wav with sample rate 24000 and length 396000
Loaded file 84_121123_000008_000004.wav with sample rate 24000 and length 129120
Loaded file 84_121123_000009_000000.wav with sample rate 24000 and length 87840
Loaded file 84_121123_000009_000007.wav with sample rate 24000 and length 152880
Loaded file 84_121123_000009_000008.wav with sample rate 24000 and length 33840
Loaded file 84_121123_000010_000000.wav with sample rate 24000 and length 145440
Loaded file test1.wav with sample rate 24000 and length 42481
Data array shape: (11, 1)
Label array shape: (11,)


In [13]:
epochs = 10

# for plotting the training loss
history = {'losses': [], 'accuracies': []}
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    history['losses'].append(train(train_loader, model, loss_fn, optimizer))
    history['accuracies'].append(test(test_loader, model, loss_fn))
    
    plt.clf()
    fig1 = plt.figure()
    plt.plot(history['losses'], 'r-', lw=2, label='loss')
    plt.legend()
    display.clear_output(wait=True)
    display.display(plt.gcf())

    plt.clf()
    fig2 = plt.figure()
    plt.plot(history['accuracies'], 'b-', lw=1, label='accuracy')
    plt.legend()
#     display.clear_output(wait=True)
    display.display(plt.gcf())
print("Done!")

Epoch 1
-------------------------------


  return collate([torch.as_tensor(b) for b in batch], collate_fn_map=collate_fn_map)


RuntimeError: stack expects each tensor to be equal size, but got [33840] at entry 0 and [132720] at entry 1