In [2]:
import torch
import torchaudio

from cnn_arch_2 import CNNNetwork
from dataset import UrbanSoundDataset
from train import AUDIO_DIR, ANNOTATIONS_FILE, SAMPLE_RATE, NUM_SAMPLES

In [14]:
# Human-readable class names. The position of a name in this list is the
# class index: predict() looks names up with both the model's argmax index
# and the dataset target, so the order must not change.
class_mapping = [
    "air_conditioner", "car_horn", "children_playing", "dog_bark", "drilling",
    "engine_idling", "gun_shot", "jackhammer", "siren", "street_music",
]

def predict(model, input, target, class_mapping):
    """Run one inference pass and translate class indices into names.

    Args:
        model: Module invoked as ``model(input)``. It is switched to
            evaluation mode and left in that mode.
        input: Batched model input; only the first row of the model output
            is inspected.
        target: Index of the expected class within ``class_mapping``.
        class_mapping: Sequence of class names indexed by class index.

    Returns:
        A ``(predicted, expected)`` tuple of class names.
    """
    model.eval()
    # Pure inference: gradients are not needed.
    with torch.no_grad():
        # Model output looks like (1, num_classes): [[0.1, 0.01, ..., 0.6]]
        first_row = model(input)[0]
        best_index = torch.argmax(first_row, dim=0)
        return class_mapping[best_index], class_mapping[target]


if __name__ == "__main__":
    # Load back the trained model.
    cnn = CNNNetwork()
    # map_location="cpu" lets a checkpoint saved from a GPU run load on a
    # CPU-only machine; the dataset below is built on "cpu" as well.
    # NOTE: torch.load unpickles the file, so only load checkpoints that
    # come from a trusted source.
    state_dict = torch.load("cnn.pth", map_location="cpu")
    cnn.load_state_dict(state_dict)

    # Build the urban sound dataset with a mel-spectrogram transform.
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

    usd = UrbanSoundDataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            "cpu")

    # Get a sample from the dataset for inference. Index the dataset once
    # and unpack, instead of fetching the same item twice.
    input, target = usd[124]
    # The sample has no batch dimension; the model needs 4 dimensions
    # [batch_size, num_channels, freq, time], so add a leading batch axis.
    input.unsqueeze_(0)

    # Make an inference.
    predicted, expected = predict(cnn, input, target, class_mapping)
    print(f"Predicted: '{predicted}', expected: '{expected}'")

Predicted: 'children_playing', expected: 'engine_idling'
