In [3]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio
import pandas as pd
from pathlib import Path
import os
import dill
import pandas as pd
import numpy as np
import torchvision.models as models
import torch.nn as nn
import clip
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Resize, Compose, ToTensor, Normalize
from pathlib import Path
import torch.optim as optim
import torchaudio.transforms as T
import torchvision.transforms as VT
from tqdm import tqdm
from torch.optim.lr_scheduler import CosineAnnealingLR
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
class AudioUtil:
    @staticmethod
    def open(audio_file):
        sig, sr = torchaudio.load(audio_file)
        return (sig, sr)

    @staticmethod
    def rechannel(aud, new_channel):
        sig, sr = aud
        if sig.shape[0] == new_channel:
            return aud
        if new_channel == 1:
            resig = sig[:1, :]
        else:
            resig = torch.cat([sig, sig])
        return (resig, sr)

    @staticmethod
    def resample(aud, newsr):
        sig, sr = aud
        if sr == newsr:
            return aud
        resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
        if sig.shape[0] > 1:
            retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
            resig = torch.cat([resig, retwo])
        return (resig, newsr)

    @staticmethod
    def pad_trunc(aud, max_ms, rand=False):
        sig, sr = aud
        max_len = sr // 1000 * max_ms
        if sig.shape[1] > max_len:
            sig = sig[:, :max_len]
        elif sig.shape[1] < max_len:
            pad_begin_len = random.randint(0, max_len - sig.shape[1]) if rand else 0
            pad_end_len = max_len - sig.shape[1] - pad_begin_len
            pad_begin = torch.zeros((sig.shape[0], pad_begin_len))
            pad_end = torch.zeros((sig.shape[0], pad_end_len))
            sig = torch.cat((pad_begin, sig, pad_end), 1)
        return (sig, sr)

    @staticmethod
    def time_shift(aud, shift_limit):
        sig, sr = aud
        shift_amt = int(random.random() * shift_limit * sig.shape[1])
        return (sig.roll(shift_amt), sr)

    @staticmethod
    def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
        sig, sr = aud
        spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
        spec = transforms.AmplitudeToDB(top_db=80)(spec)
        return spec

    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
        aug_spec = spec
        freq_mask_param = max_mask_pct * spec.shape[1]
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec)
        time_mask_param = max_mask_pct * spec.shape[2]
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec)
        return aug_spec


In [5]:
class SoundDS(Dataset):
    def __init__(self, df, data_path, apply_augmentation=True):
        self.df = df
        self.data_path = str(data_path)
        self.duration = 4000
        self.sr = 44100
        self.channel = 2
        self.apply_augmentation = apply_augmentation  # 控制是否应用数据增强
        
    def __len__(self):
        return len(self.df)    
    
    def __getitem__(self, idx):
        audio_file = self.data_path + self.df.loc[idx, 'path']
        class_id = self.df.loc[idx, 'label']

        aud = AudioUtil.open(audio_file)
        reaud = AudioUtil.resample(aud, self.sr)
        rechan = AudioUtil.rechannel(reaud, self.channel)

        # 应用或跳过时间偏移
        if self.apply_augmentation:
            dur_aud = AudioUtil.pad_trunc(rechan, self.duration,True)
            shift_aud = AudioUtil.time_shift(dur_aud, 0.2)
        else:
            dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
            shift_aud = dur_aud

        sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)

        # 应用或跳过频谱图增强
        if self.apply_augmentation:
            aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1,n_freq_masks=2, n_time_masks=2)
        else:
            aug_sgram = sgram

        return aug_sgram, class_id


In [6]:
df_train=pd.read_csv("train_test/Utrain.csv")
df_test=pd.read_csv("train_test/Utest.csv")

In [7]:
dataset_train=SoundDS(df_train,"UrbanSound8K/audio/",True)
dataset_test=SoundDS(df_test,"UrbanSound8K/audio/",False)
dataloader_train=DataLoader(dataset_train,batch_size=128,shuffle=True)
dataloader_test=DataLoader(dataset_test,batch_size=128,shuffle=False)

In [8]:
class AudioTextLoss(nn.Module):
    def __init__(self):
        super(AudioTextLoss, self).__init__()
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
        self.loss_audio = nn.CrossEntropyLoss()
        self.loss_text = nn.CrossEntropyLoss()

    def forward(self, audio_features, text_features):
        # Normalize features
        audio_features = audio_features / audio_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Calculate cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_audio = logit_scale * audio_features @ text_features.t()
        logits_per_text = logit_scale * text_features @ audio_features.t()

        # Calculate batch size for ground truth
        batch_size = audio_features.shape[0]
        ground_truth = torch.arange(batch_size, dtype=torch.long, device=audio_features.device)

        # Compute loss as the average of audio-to-text and text-to-audio losses
        return (
            self.loss_audio(logits_per_audio, ground_truth)
            + self.loss_text(logits_per_text, ground_truth)
        ) / 2


In [9]:
class MLPLayers(nn.Module):
    def __init__(self, input_features, output_features, dropout_p=0.1):
        super(MLPLayers, self).__init__()
        self.nonlin = nn.ReLU()
        self.sequential = nn.Sequential(
            nn.Linear(input_features, output_features),
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(output_features, output_features)
        )

    def forward(self, x):
        return self.sequential(self.nonlin(x))

resnet =models.resnet18(weights=models.ResNet18_Weights.DEFAULT)

resnet.conv1 = nn.Conv2d(2, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

num_ftrs = resnet.fc.in_features
resnet.fc = MLPLayers(input_features=num_ftrs, output_features=512)

In [10]:
clip_model, _ = clip.load("ViT-B/32", device=device)
clip_model = clip_model.eval()
loss_fn=AudioTextLoss()
resnet.to(device)
clip_model.to(device)

100%|███████████████████████████████████████| 338M/338M [00:11<00:00, 32.0MiB/s]


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [12]:
def train(model, data_loader, optimizer, clip_model, epochs=30):
    model.train()
    for epoch in range(epochs):
        loss_epoch=0
        for spectrograms,labels in tqdm(data_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            # print(labels)
            spectrograms = spectrograms.to(device)
            optimizer.zero_grad()
            audio_features = model(spectrograms)
            labels=["This is a sound of "+label for label in labels]
            text_tokens = clip.tokenize(labels).to(device)
            with torch.no_grad():
                text_features = clip_model.encode_text(text_tokens)
            text_features = text_features.to(dtype=torch.float)
            loss = loss_fn(audio_features, text_features)
            loss_epoch += loss
            loss.backward()
            optimizer.step()
        scheduler.step(loss_epoch/len(dataset_train))
        print(f"Epoch: {epoch+1}, loss: {loss_epoch/len(dataset_train)}")

In [13]:
optimizer = optim.Adam(resnet.parameters(), lr=0.1)
scheduler = CosineAnnealingLR(optimizer, T_max=5, eta_min=0.01)

train(resnet, dataloader_train, optimizer, clip_model, epochs=30)

In [None]:
model_save_path = "UrbanSoundResnet"
os.makedirs(model_save_path, exist_ok=True)

torch.save(resnet, os.path.join(model_save_path, "resnet_model.pth"), pickle_module=dill)

## Model next train

In [14]:
resnet_test = torch.load('UrbanSoundResnet/resnet_model.pth', pickle_module=dill)
resnet_test.eval()

ResNet(
  (conv1): Conv2d(2, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [15]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
optimizer = optim.AdamW(resnet_test.parameters(), lr=0.01, weight_decay=1e-4)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, min_lr=1e-6)

train(resnet_test, dataloader_train, optimizer, clip_model, epochs=30)

Epoch 1/30: 100%|██████████| 62/62 [01:55<00:00,  1.86s/it]


Epoch: 1, loss: 0.02336684614419937


Epoch 2/30: 100%|██████████| 62/62 [00:53<00:00,  1.17it/s]


Epoch: 2, loss: 0.022436875849962234


Epoch 3/30: 100%|██████████| 62/62 [00:51<00:00,  1.20it/s]


Epoch: 3, loss: 0.02223445102572441


Epoch 4/30: 100%|██████████| 62/62 [00:51<00:00,  1.20it/s]


Epoch: 4, loss: 0.022237107157707214


Epoch 5/30: 100%|██████████| 62/62 [00:51<00:00,  1.19it/s]


Epoch: 5, loss: 0.02207835018634796


Epoch 6/30: 100%|██████████| 62/62 [00:51<00:00,  1.21it/s]


Epoch: 6, loss: 0.021945444867014885


Epoch 7/30: 100%|██████████| 62/62 [00:50<00:00,  1.22it/s]


Epoch: 7, loss: 0.02189304493367672


Epoch 8/30: 100%|██████████| 62/62 [00:50<00:00,  1.22it/s]


Epoch: 8, loss: 0.02192847989499569


Epoch 9/30: 100%|██████████| 62/62 [00:51<00:00,  1.21it/s]


Epoch: 9, loss: 0.021883022040128708


Epoch 10/30: 100%|██████████| 62/62 [00:51<00:00,  1.21it/s]


Epoch: 10, loss: 0.02177342213690281


Epoch 11/30: 100%|██████████| 62/62 [00:51<00:00,  1.20it/s]


Epoch: 11, loss: 0.02174779772758484


Epoch 12/30: 100%|██████████| 62/62 [00:51<00:00,  1.21it/s]


Epoch: 12, loss: 0.021778089925646782


Epoch 13/30: 100%|██████████| 62/62 [00:51<00:00,  1.19it/s]


Epoch: 13, loss: 0.02169603295624256


Epoch 14/30: 100%|██████████| 62/62 [00:52<00:00,  1.18it/s]


Epoch: 14, loss: 0.021706121042370796


Epoch 15/30: 100%|██████████| 62/62 [00:51<00:00,  1.21it/s]


Epoch: 15, loss: 0.021588385105133057


Epoch 16/30: 100%|██████████| 62/62 [00:51<00:00,  1.20it/s]


Epoch: 16, loss: 0.02159712091088295


Epoch 17/30: 100%|██████████| 62/62 [00:52<00:00,  1.19it/s]


Epoch: 17, loss: 0.021624134853482246


Epoch 18/30: 100%|██████████| 62/62 [00:52<00:00,  1.18it/s]


Epoch: 18, loss: 0.021613264456391335


Epoch 19/30: 100%|██████████| 62/62 [00:51<00:00,  1.20it/s]


Epoch: 19, loss: 0.02158198691904545


Epoch 20/30: 100%|██████████| 62/62 [00:51<00:00,  1.20it/s]


Epoch: 20, loss: 0.0215392354875803


Epoch 21/30: 100%|██████████| 62/62 [00:51<00:00,  1.20it/s]


Epoch: 21, loss: 0.021601952612400055


Epoch 22/30: 100%|██████████| 62/62 [00:51<00:00,  1.20it/s]


Epoch: 22, loss: 0.02155141346156597


Epoch 23/30: 100%|██████████| 62/62 [00:51<00:00,  1.19it/s]


Epoch: 23, loss: 0.021569067612290382


Epoch 24/30: 100%|██████████| 62/62 [00:52<00:00,  1.19it/s]


Epoch: 24, loss: 0.02154034934937954


Epoch 25/30: 100%|██████████| 62/62 [00:51<00:00,  1.19it/s]


Epoch: 25, loss: 0.02149629406630993


Epoch 26/30: 100%|██████████| 62/62 [00:51<00:00,  1.20it/s]


Epoch: 26, loss: 0.021556053310632706


Epoch 27/30: 100%|██████████| 62/62 [00:51<00:00,  1.19it/s]


Epoch: 27, loss: 0.021466746926307678


Epoch 28/30: 100%|██████████| 62/62 [00:51<00:00,  1.21it/s]


Epoch: 28, loss: 0.02150634489953518


Epoch 29/30: 100%|██████████| 62/62 [00:51<00:00,  1.21it/s]


Epoch: 29, loss: 0.021446295082569122


Epoch 30/30: 100%|██████████| 62/62 [00:50<00:00,  1.22it/s]

Epoch: 30, loss: 0.02146325819194317





In [16]:
model_save_path = "UrbanSoundResnet"
os.makedirs(model_save_path, exist_ok=True)

torch.save(resnet, os.path.join(model_save_path, "resnet_model_60epoch.pth"), pickle_module=dill)

## Test on testset

In [17]:
classes=dataset_test.df['label'].unique()
classes

array(['car_horn', 'dog_bark', 'air_conditioner', 'children_playing',
       'siren', 'engine_idling', 'jackhammer', 'drilling', 'street_music',
       'gun_shot'], dtype=object)

In [18]:
feature_classes=[]
for c in classes:
    # print(c)
    c = "This is a sound of " + c
    text_tokens=clip.tokenize(c).to(device)
    with torch.no_grad():
        text_features=clip_model.encode_text(text_tokens)
    text_features=text_features.to(dtype=torch.float)
    feature_classes.append(text_features)

In [41]:
resnet_test = torch.load('UrbanSoundResnet/resnet_model.pth', pickle_module=dill)
resnet_test.eval()

ResNet(
  (conv1): Conv2d(2, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [30]:
acc=0
for spectrograms, labels in dataloader_test:
    spectrograms = spectrograms.to(device)
    print()
    audio_features = resnet_test(spectrograms)
    logit = torch.stack([torch.stack([F.cosine_similarity(audio.unsqueeze(0), text) for text in feature_classes]) for audio in audio_features])
    _,topk_index=torch.topk(logit,k=1,dim=1)
    indices = [np.where(classes == label)[0][0] for label in labels]
    topk_index_numpy = topk_index.cpu().numpy()
    for i, index in enumerate(indices):
        if index in topk_index_numpy[i]:
            acc += 1
print(acc)
print(len(dataset_test))
print(f"Acc: {acc/len(dataset_test)}")

817
855
Acc: 0.9555555555555556
