In [1]:
import dill
import torch
import numpy as np
import pandas as pd
import clip
import os
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as T
import torchvision.transforms as VT
from torch.utils.data import DataLoader, Dataset
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## ESC-50

In [2]:
class ESC50Dataset(Dataset):
    """ESC-50 clips served as normalized mel spectrograms with their category.

    Every clip is loaded from ``base_path``, converted to ``target_channels``
    channels, resampled to ``target_sample_rate`` and turned into a mel
    spectrogram before normalization.

    Args:
        base_path: Directory that contains the audio files.
        meta_path: CSV file (path or file-like) with at least the columns
            ``filename``, ``fold`` and ``category``.
        fold: Collection of fold ids to keep; rows of other folds are dropped.
    """

    def __init__(self, base_path, meta_path, fold):
        self.base_path = base_path
        self.df = pd.read_csv(meta_path)
        self.df = self.df[self.df['fold'].isin(fold)]
        self.target_sample_rate = 44100
        self.target_channels = 2
        # Every clip is resampled to target_sample_rate before the mel step,
        # so both transforms can be built once instead of once per item.
        self._mel_spectrogram = T.MelSpectrogram(sample_rate=self.target_sample_rate)
        # NOTE(review): 0.485 / 0.229 look like image-normalization constants;
        # presumably they match what the audio encoder was trained with - confirm.
        self._normalize = VT.Normalize(mean=[0.485], std=[0.229])

    def __len__(self):
        """Return the number of clips kept after fold filtering."""
        return len(self.df)

    @staticmethod
    def rechannel(aud, new_channel):
        """Return ``aud`` with exactly ``new_channel`` channels.

        Args:
            aud: ``(signal, sample_rate)`` pair; the first axis of ``signal``
                is the channel axis.
            new_channel: Desired number of channels.

        Returns:
            ``(signal, sample_rate)``. The input pair itself is returned when
            the channel count already matches. Extra channels are dropped;
            missing channels are filled by repeating the existing ones.
        """
        sig, sr = aud
        current = sig.shape[0]
        if current == new_channel:
            return aud
        if current > new_channel:
            # Down-mix by keeping the leading channels (stereo -> mono keeps channel 0).
            resig = sig[:new_channel]
        else:
            # Up-mix by tiling the existing channels (mono -> stereo duplicates it),
            # then trim in case new_channel is not a multiple of the current count.
            repeats = -(-new_channel // current)
            resig = torch.cat([sig] * repeats)[:new_channel]
        return (resig, sr)

    @staticmethod
    def resample(aud, newsr):
        """Return ``aud`` resampled to ``newsr``.

        Args:
            aud: ``(signal, sample_rate)`` pair.
            newsr: Target sample rate.

        Returns:
            ``(signal, newsr)``; the input pair itself when the rate already
            matches.
        """
        sig, sr = aud
        if sr == newsr:
            return aud
        # Resample operates on the last (time) axis, so all channels go in one call.
        resig = torchaudio.transforms.Resample(sr, newsr)(sig)
        return (resig, newsr)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(normalized mel spectrogram, category)``."""
        row = self.df.iloc[idx]
        filename = os.path.join(self.base_path, row['filename'])
        label = row['category']
        audio = torchaudio.load(filename)

        # Bring every clip to the same channel count and sample rate.
        audio = self.rechannel(audio, self.target_channels)
        waveform, _ = self.resample(audio, self.target_sample_rate)

        spectrogram = self._normalize(self._mel_spectrogram(waveform))
        return spectrogram, label

In [11]:
# Evaluate on the full ESC-50 set: every one of the five folds is kept.
base_path = 'ESC-50-master/audio'
meta_path = 'ESC-50-master/meta/esc50.csv'
fold = list(range(1, 6))

test_dataset = ESC50Dataset(base_path=base_path, meta_path=meta_path, fold=fold)
# Order is irrelevant for accuracy, so the loader does not shuffle.
test_dataloader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [5]:
# Distinct category names; their order fixes the index used for the text
# embeddings and for mapping predictions back to names.
category_column = test_dataset.df['category']
classes = category_column.unique()

In [6]:
# Load CLIP ViT-B/32 on the selected device; the second return value is not
# used here. The model is kept in eval mode.
loaded_clip, _ = clip.load("ViT-B/32", device=device)
clip_model = loaded_clip.eval()

In [7]:
# One CLIP text embedding per class, stored in the same order as `classes`.
# Each entry keeps the leading batch axis produced by tokenizing one prompt.
feature_classes = []
with torch.no_grad():
    for class_name in classes:
        prompt = "This is a sound of " + class_name
        prompt_tokens = clip.tokenize(prompt).to(device)
        prompt_embedding = clip_model.encode_text(prompt_tokens)
        # Cast to float32 so the embedding can be compared with the audio features.
        feature_classes.append(prompt_embedding.to(dtype=torch.float))

In [8]:
# Load the audio encoder trained on UrbanSound.
# SECURITY: this unpickles a whole model object (via dill), which can execute
# arbitrary code - only load checkpoints from a trusted source.
# map_location puts the weights on the same device the inputs are moved to,
# regardless of the device the checkpoint was saved from.
resnet = torch.load('UrbanSoundResnet/resnet_model.pth', map_location=device, pickle_module=dill)
resnet.eval()

ResNet(
  (conv1): Conv2d(2, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [9]:
# Top-1 zero-shot accuracy: each clip is assigned the class whose text
# embedding has the highest cosine similarity with its audio embedding.
# Relies on `resnet`, `feature_classes`, `classes`, `test_dataloader` and
# `test_dataset` from earlier cells.

# Each entry of feature_classes is a (1, dim) tensor, so this is (num_classes, dim).
text_matrix = torch.cat(feature_classes, dim=0)

acc = 0
with torch.no_grad():  # inference only: do not build an autograd graph per batch
    for spectrograms, labels in test_dataloader:
        spectrograms = spectrograms.to(device)
        # assumes the encoder returns one (dim,) embedding per clip, i.e. (batch, dim) - TODO confirm
        audio_features = resnet(spectrograms)
        # Broadcast (batch, 1, dim) against (1, num_classes, dim) -> (batch, num_classes).
        logit = F.cosine_similarity(audio_features.unsqueeze(1), text_matrix.unsqueeze(0), dim=-1)

        max_indices = torch.max(logit, dim=1).indices

        result = tuple(classes[int(i)] for i in max_indices)
        acc += sum(x == y for x, y in zip(labels, result))
print(f"Acc: {acc/len(test_dataset)}")



Acc: 0.03


In [13]:
# Top-5 zero-shot accuracy: a clip counts as correct when its true class is
# among the five classes with the highest audio/text cosine similarity.
# Relies on `resnet`, `feature_classes`, `classes`, `test_dataloader` and
# `test_dataset` from earlier cells.

# Each entry of feature_classes is a (1, dim) tensor, so this is (num_classes, dim).
text_matrix = torch.cat(feature_classes, dim=0)

acc = 0
with torch.no_grad():  # inference only: do not build an autograd graph per batch
    for spectrograms, labels in test_dataloader:
        spectrograms = spectrograms.to(device)
        # assumes the encoder returns one (dim,) embedding per clip, i.e. (batch, dim) - TODO confirm
        audio_features = resnet(spectrograms)
        # Broadcast (batch, 1, dim) against (1, num_classes, dim) -> (batch, num_classes).
        logit = F.cosine_similarity(audio_features.unsqueeze(1), text_matrix.unsqueeze(0), dim=-1)

        topk_index = torch.topk(logit, k=5, dim=1).indices.cpu().tolist()
        for label, candidates in zip(labels, topk_index):
            # Position of the true label inside `classes`.
            target_index = np.where(classes == label)[0][0]
            if target_index in candidates:
                acc += 1

print(f"Acc: {acc/len(test_dataset)}")

Acc: 0.1225


## UrbanSound8K

In [3]:
class UrbanSoundDataset(Dataset):
    """Audio clips listed in a metadata CSV, served as single-channel mel spectrograms.

    Args:
        base_path: Directory prefix joined with each row's ``path`` value.
        meta_path: CSV file (path or file-like) with ``path`` and ``label`` columns.
    """

    def __init__(self, base_path, meta_path):
        self.df = pd.read_csv(meta_path)
        self.base_path = base_path

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(normalized mel spectrogram, label)``."""
        record = self.df.iloc[idx]
        audio_path = os.path.join(self.base_path, record['path'])
        target = record['label']
        signal, rate = torchaudio.load(audio_path)

        # The mel transform is built per clip with that file's own sample rate.
        to_mel = T.MelSpectrogram(sample_rate=rate)
        mel = to_mel(signal)

        # Multi-channel clips: keep only the first channel, preserving the channel axis.
        has_extra_channels = mel.shape[0] > 1
        if has_extra_channels:
            mel = mel[:1]

        normalizer = VT.Normalize(mean=[0.485], std=[0.229])
        return normalizer(mel), target

In [4]:
# UrbanSound8K evaluation set. `meta_path` is a CSV with `path` and `label`
# columns and must exist relative to the working directory.
base_path = 'UrbanSound8K/audio/'
meta_path = 'output.csv'

test_dataset = UrbanSoundDataset(base_path=base_path, meta_path=meta_path)
# One clip per batch; order is irrelevant for accuracy, so no shuffling.
test_dataloader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

FileNotFoundError: [Errno 2] No such file or directory: 'output.csv'

In [11]:
# Top-1 zero-shot accuracy: each clip is assigned the class whose text
# embedding has the highest cosine similarity with its audio embedding.
# Relies on `resnet`, `feature_classes`, `classes`, `test_dataloader` and
# `test_dataset` from earlier cells.
# NOTE(review): `classes` / `feature_classes` must have been rebuilt for this
# dataset's labels before running this cell - confirm.

# Each entry of feature_classes is a (1, dim) tensor, so this is (num_classes, dim).
text_matrix = torch.cat(feature_classes, dim=0)

acc = 0
with torch.no_grad():  # inference only: do not build an autograd graph per batch
    for spectrograms, labels in test_dataloader:
        spectrograms = spectrograms.to(device)
        # assumes the encoder returns one (dim,) embedding per clip, i.e. (batch, dim) - TODO confirm
        audio_features = resnet(spectrograms)
        # Broadcast (batch, 1, dim) against (1, num_classes, dim) -> (batch, num_classes).
        logit = F.cosine_similarity(audio_features.unsqueeze(1), text_matrix.unsqueeze(0), dim=-1)

        max_indices = torch.max(logit, dim=1).indices

        result = tuple(classes[int(i)] for i in max_indices)
        acc += sum(x == y for x, y in zip(labels, result))
print(f"Acc: {acc/len(test_dataset)}")



Acc: 0.1216374269005848


## Load model pretrained on AudioSet

In [3]:
# Load the audio encoder pretrained on AudioSet.
# SECURITY: this unpickles a whole model object (via dill), which can execute
# arbitrary code - only load checkpoints from a trusted source.
# map_location puts the weights on the same device the inputs are moved to,
# regardless of the device the checkpoint was saved from.
resnet = torch.load('new_AudiosetResnet/resnet_model_next.pth', map_location=device, pickle_module=dill)

In [4]:
# Put the loaded model in evaluation mode; as the cell's last expression the
# returned module is also displayed (the ResNet repr below).
resnet.eval()

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

## Load model pretrained on ESC-50

In [11]:
# Load the audio encoder pretrained on ESC-50.
# SECURITY: this unpickles a whole model object (via dill), which can execute
# arbitrary code - only load checkpoints from a trusted source.
# map_location puts the weights on the same device the inputs are moved to,
# regardless of the device the checkpoint was saved from.
resnet_1 = torch.load('ESC50resnet/resnet_model.pth', map_location=device, pickle_module=dill)
resnet_1.eval()

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [18]:
# Top-5 zero-shot accuracy for `resnet_1`: a clip counts as correct when its
# true class is among the five classes with the highest audio/text cosine
# similarity.
# Relies on `resnet_1`, `feature_classes`, `classes`, `test_dataloader` and
# `test_dataset` from earlier cells.

# Each entry of feature_classes is a (1, dim) tensor, so this is (num_classes, dim).
text_matrix = torch.cat(feature_classes, dim=0)

acc = 0
with torch.no_grad():  # inference only: do not build an autograd graph per batch
    for spectrograms, labels in test_dataloader:
        spectrograms = spectrograms.to(device)
        # assumes the encoder returns one (dim,) embedding per clip, i.e. (batch, dim) - TODO confirm
        audio_features = resnet_1(spectrograms)
        # Broadcast (batch, 1, dim) against (1, num_classes, dim) -> (batch, num_classes).
        logit = F.cosine_similarity(audio_features.unsqueeze(1), text_matrix.unsqueeze(0), dim=-1)

        topk_index = torch.topk(logit, k=5, dim=1).indices.cpu().tolist()
        for label, candidates in zip(labels, topk_index):
            # Position of the true label inside `classes`.
            target_index = np.where(classes == label)[0][0]
            if target_index in candidates:
                acc += 1

print(f"Acc: {acc/len(test_dataset)}")

Acc: 0.5953216374269006
