In [1]:
import os
import dill
import pandas as pd
import numpy as np
import torch
import torchaudio
import torchvision.models as models
import torch.nn as nn
import clip
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Resize, Compose, ToTensor, Normalize
from pathlib import Path
import torch.optim as optim
import torchaudio.transforms as T
import torchvision.transforms as VT

from torch.utils.data import Dataset, DataLoader

In [2]:
class ESC50Dataset(Dataset):
    def __init__(self, base_path, meta_path, fold):
        self.base_path = base_path
        self.df = pd.read_csv(meta_path)
        self.df = self.df[self.df['fold'].isin(fold)]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        filename = os.path.join(self.base_path, row['filename'])
        label = row['category']
        waveform, sample_rate = torchaudio.load(filename)
        
        spectrogram = T.MelSpectrogram(sample_rate=sample_rate)(waveform)

        if spectrogram.shape[0] > 1:
            spectrogram = spectrogram[0, :, :].unsqueeze(0)

        mean = [0.485]
        std = [0.229]
        spectrogram = VT.Normalize(mean=mean, std=std)(spectrogram)

        return spectrogram, label

In [3]:
base_path = 'ESC-50-master/audio'
meta_path = 'ESC-50-master/meta/esc50.csv'
fold = [1,2,3,4]

dataset = ESC50Dataset(base_path, meta_path, fold)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [4]:
class AudioTextLoss(nn.Module):
    def __init__(self):
        super(AudioTextLoss, self).__init__()
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
        self.loss_audio = nn.CrossEntropyLoss()
        self.loss_text = nn.CrossEntropyLoss()

    def forward(self, audio_features, text_features):
        # Normalize features
        audio_features = audio_features / audio_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Calculate cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_audio = logit_scale * audio_features @ text_features.t()
        logits_per_text = logit_scale * text_features @ audio_features.t()

        # Calculate batch size for ground truth
        batch_size = audio_features.shape[0]
        ground_truth = torch.arange(batch_size, dtype=torch.long, device=audio_features.device)

        # Compute loss as the average of audio-to-text and text-to-audio losses
        return (
            self.loss_audio(logits_per_audio, ground_truth)
            + self.loss_text(logits_per_text, ground_truth)
        ) / 2


In [5]:
class MLPLayers(nn.Module):
    def __init__(self, input_features, output_features, dropout_p=0.1):
        super(MLPLayers, self).__init__()
        self.nonlin = nn.ReLU()
        self.sequential = nn.Sequential(
            nn.Linear(input_features, output_features),
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(output_features, output_features)
        )

    def forward(self, x):
        return self.sequential(self.nonlin(x))

resnet = models.resnet18(pretrained=False)

resnet.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

num_ftrs = resnet.fc.in_features
resnet.fc = MLPLayers(input_features=num_ftrs, output_features=512)



In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, _ = clip.load("ViT-B/32", device=device)
clip_model = clip_model.eval()
loss_fn=AudioTextLoss()
resnet.to(device)
clip_model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [7]:
def train(model, data_loader, optimizer, clip_model, epochs=30):
    model.train()
    for epoch in range(epochs):
        loss_epoch=0
        for spectrograms, labels in data_loader:
            spectrograms = spectrograms.to(device)
            optimizer.zero_grad()
            audio_features = model(spectrograms)
            labels=["This is a sound of "+label for label in labels]
            text_tokens = clip.tokenize(labels).to(device)
            with torch.no_grad():
                text_features = clip_model.encode_text(text_tokens)
            text_features = text_features.to(dtype=torch.float)
            loss = loss_fn(audio_features, text_features)
            loss_epoch += loss
            loss.backward()
            optimizer.step()
        # scheduler.step()
        print(f"Epoch: {epoch+1}, loss: {loss_epoch/len(dataset)}")

optimizer = optim.Adam(resnet.parameters(), lr=0.1)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

train(resnet, dataloader, optimizer, clip_model, epochs=100)



Epoch: 1, loss: 0.16678299009799957
Epoch: 2, loss: 0.1575911045074463
Epoch: 3, loss: 0.1485597789287567
Epoch: 4, loss: 0.14164896309375763
Epoch: 5, loss: 0.13647662103176117
Epoch: 6, loss: 0.13156019151210785
Epoch: 7, loss: 0.12815412878990173
Epoch: 8, loss: 0.11982974410057068
Epoch: 9, loss: 0.11282099783420563
Epoch: 10, loss: 0.10784037411212921
Epoch: 11, loss: 0.10320109128952026
Epoch: 12, loss: 0.09850771725177765
Epoch: 13, loss: 0.09768055379390717
Epoch: 14, loss: 0.09242719411849976
Epoch: 15, loss: 0.08992860466241837
Epoch: 16, loss: 0.0870678499341011
Epoch: 17, loss: 0.08135592937469482
Epoch: 18, loss: 0.07952971011400223
Epoch: 19, loss: 0.07517270743846893
Epoch: 20, loss: 0.07180191576480865
Epoch: 21, loss: 0.06798122078180313
Epoch: 22, loss: 0.06451481580734253
Epoch: 23, loss: 0.06459861248731613
Epoch: 24, loss: 0.06161371245980263
Epoch: 25, loss: 0.057276248931884766
Epoch: 26, loss: 0.056595366448163986
Epoch: 27, loss: 0.054111819714307785
Epoch: 28,

In [8]:
model_save_path = "newresnet"
os.makedirs(model_save_path, exist_ok=True)

torch.save(resnet, os.path.join(model_save_path, "resnet_model.pth"), pickle_module=dill)