In [1]:
from audio_file import AudioFile

In [2]:
af = AudioFile('audio/2019-11-12-10-00_Schleusinger-Allee_70Kmh_129304_M_W_CL_ME_CH12.wav')

In [3]:
af.y_info

{'date_time': 'audio/2019-11-12-10-00',
 'location': 'Schleusinger-Allee',
 'speed': '70',
 'sample_position': '129304',
 'is_background': False,
 'daytime': 'M',
 'weather': 'W',
 'vehicle': 'C',
 'direction': 'L',
 'microphone_type': 'ME',
 'channels': '12'}

In [13]:
import glob
import os

audio_files_list = []

for filepath in glob.iglob('audio/*.wav'):
  audio_files_list.append(AudioFile(filepath))

In [14]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np


class VehicleDataset(Dataset):
    def __init__(self, audio_files, label_encoder):
        self.data = []
        self.labels = []

        for i, audio in enumerate(audio_files):
            x_info = audio.x_info()
            y_info = audio.y_info()
            
            features = np.array([np.mean(x_info[key], axis=0) for key in x_info.keys()])
            features = features.flatten()

            if i == 0:
                global NUM_FEATURES
                NUM_FEATURES = len(features)

            self.data.append(features)
            self.labels.append(label_encoder.transform([y_info['vehicle']])[0])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)


In [15]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit([af.y_info['vehicle'] for af in audio_files_list])
dataset = VehicleDataset(audio_files_list, label_encoder)

In [16]:
import torch.nn as nn
import torch.nn.functional as F

class AudioClassifier(nn.Module):
    def __init__(self, num_features):
        super(AudioClassifier, self).__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 4)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In [25]:
from torch.optim import Adam

assert(NUM_FEATURES)

model = AudioClassifier(NUM_FEATURES)
loss_function = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

def train_model(model, data_loader, epochs):
    model.train()
    for epoch in range(epochs):
        for features, labels in data_loader:
            optimizer.zero_grad()
            outputs = model(features)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

train_loader = DataLoader(dataset, batch_size=32, shuffle=True)
train_model(model, train_loader, 1000)

Epoch 1, Loss: 2.7293896675109863
Epoch 2, Loss: 1.8141292333602905
Epoch 3, Loss: 1.2183750867843628
Epoch 4, Loss: 1.0405184030532837
Epoch 5, Loss: 0.9497033357620239
Epoch 6, Loss: 0.8534680008888245
Epoch 7, Loss: 0.7903956770896912
Epoch 8, Loss: 0.7747653126716614
Epoch 9, Loss: 0.7389553189277649
Epoch 10, Loss: 0.6649367809295654
Epoch 11, Loss: 0.6140540242195129
Epoch 12, Loss: 0.6254065036773682
Epoch 13, Loss: 0.6572405099868774
Epoch 14, Loss: 0.6600188612937927
Epoch 15, Loss: 0.6477543115615845
Epoch 16, Loss: 0.6471648216247559
Epoch 17, Loss: 0.6539628505706787
Epoch 18, Loss: 0.648456335067749
Epoch 19, Loss: 0.6261404156684875
Epoch 20, Loss: 0.6031878590583801
Epoch 21, Loss: 0.5998354554176331
Epoch 22, Loss: 0.6055379509925842
Epoch 23, Loss: 0.6072683334350586
Epoch 24, Loss: 0.6033161282539368
Epoch 25, Loss: 0.602854311466217
Epoch 26, Loss: 0.6065876483917236
Epoch 27, Loss: 0.6069430708885193
Epoch 28, Loss: 0.6004536747932434
Epoch 29, Loss: 0.5934854745864