In [None]:
import torch
from pytorch_grad_cam import GradCAM, HiResCAM, ScoreCAM, GradCAMPlusPlus, AblationCAM, XGradCAM, EigenCAM, FullGrad
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
from pytorch_grad_cam.utils.image import show_cam_on_image
from pyha_analyzer.dataset import get_datasets, make_dataloaders, PyhaDFDataset, config
import wandb
import torch
from pyha_analyzer.models.timm_model import TimmModel

import datetime
import logging
import os
from typing import Any, Tuple, Optional

import numpy as np
import torch
import torch.nn.functional as F
from torch.amp.autocast_mode import autocast
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchmetrics.classification import MultilabelAveragePrecision
from tqdm import tqdm


from pyha_analyzer import config
from pyha_analyzer.dataset import get_datasets, make_dataloaders, PyhaDFDataset
from pyha_analyzer.utils import set_seed
from pyha_analyzer.models.early_stopper import EarlyStopper
from pyha_analyzer.models.timm_model import TimmModel
import matplotlib.pyplot as plt

from torchaudio import transforms as audtr
# Short alias for the pyha_analyzer configuration object; the cells below read
# sample_rate, n_mels, n_fft, device, prepros_device, model, offset_col and
# duration_col from it.
cfg = config.cfg

In [None]:
# Start a Weights & Biases run with default arguments.
# NOTE(review): nothing else in this notebook references the run object directly;
# presumably a pyha_analyzer import expects an active run - confirm before removing.
wandb.init()

In [None]:
# Path of the saved state dict to visualise (relative to the working directory).
weights = "./models/eca_nfnet_l0-20240419-0202.pt"

# Build the network with the same head size as the checkpoint and place it on
# the configured device.
model_for_run = TimmModel(num_classes=132,
                          model_name=cfg.model).to(cfg.device)

# map_location remaps tensors that were saved on a different device (for
# example a GPU checkpoint opened on a CPU-only machine) onto cfg.device
# instead of failing while deserialising.
model_for_run.load_state_dict(torch.load(weights, map_location=cfg.device))

# The model is only used for inference/attribution here, so switch any
# train-time layers (dropout, normalisation statistics) to evaluation mode.
model_for_run.eval()

model = model_for_run

In [None]:
# Display the module tree; useful for locating layer attribute names such as
# model.model.final_conv, which is used as the CAM target layer further down.
model

In [None]:
# Mel-spectrogram transform configured from cfg, then moved to the
# preprocessing device.
convert_to_mel = audtr.MelSpectrogram(
    n_fft=cfg.n_fft,
    n_mels=cfg.n_mels,
    sample_rate=cfg.sample_rate,
)
convert_to_mel = convert_to_mel.to(cfg.prepros_device)

# Power spectrogram -> decibel conversion, kept on the same device.
decibel_convert = audtr.AmplitudeToDB(stype="power")
decibel_convert = decibel_convert.to(cfg.prepros_device)

def to_image(audio):
    """
    Turn an audio clip into a 3-channel spectrogram image scaled to 0..1
    """
    # Mel spectrogram followed by conversion to decibels.
    # Pylint reports these as not callable, but both are torch.nn.Module objects
    # pylint: disable-next=not-callable
    spec_db = decibel_convert(convert_to_mel(audio))

    # Standardise the image (https://medium.com/@hasithsura/audio-classification-d37a82d6715)
    centred = spec_db - spec_db.mean()
    standardised = centred / (spec_db.std() + 1e-6)

    # Sigmoid squashes to the 0..1 range; the mean value lands on 0.5
    unit_range = torch.sigmoid(standardised)

    # Repeat the single plane three times to form the channel dimension
    return torch.stack([unit_range] * 3)

In [None]:
import pandas as pd

In [None]:
import torchaudio
from pyha_analyzer import utils

In [None]:
# Load the chunked annotation table and mirror "IN FILE" into a "FILE NAME" column.
df = pd.read_csv(
    "GRABADOR-SDZG-AM-1_PIHA_single_w_confidences_chunked.csv"
).assign(**{"FILE NAME": lambda frame: frame["IN FILE"]})
df

In [None]:
def preprocess_audio(
    idx=0,
    audio_root="//e4e-nas.ucsd.edu/passive-acoustic-biodiversity/Peru_2019_Audiomoth_Sound_Recordings/",
    clip_seconds=5,
):
    """Load the recording for one annotation row and cut out its clip.

    Args:
        idx: Positional row number in the global ``df`` (looked up with
            ``.iloc``).
        audio_root: Directory joined with the row's ``"IN FILE"`` value to
            locate the recording. Defaults to the share that used to be
            hard-coded, so existing calls behave as before.
        clip_seconds: Length of the returned clip in seconds. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        The audio after ``utils.to_mono``, resampling to ``cfg.sample_rate``,
        slicing to the annotated offset/duration, and cropping or padding to
        ``clip_seconds * cfg.sample_rate`` samples with ``utils.crop_audio`` /
        ``utils.pad_audio``.
    """
    annotation = df.iloc[idx]

    audio, sample_rate = torchaudio.load(       #pyright: ignore [reportGeneralTypeIssues ]
        os.path.join(audio_root, annotation["IN FILE"])
    )

    # Collapse the channel dimension when one is present.
    if len(audio.shape) > 1:
        audio = utils.to_mono(audio)

    # Resample so that offsets below can be expressed in cfg.sample_rate samples.
    if sample_rate != cfg.sample_rate:
        resample = audtr.Resample(sample_rate, cfg.sample_rate)
        audio = resample(audio)

    # Annotation offset and duration are multiplied by the sample rate to get
    # sample indices; the start index is echoed to the cell output.
    frame_offset = int(annotation[cfg.offset_col] * cfg.sample_rate)
    print(frame_offset)
    num_frames = int(annotation[cfg.duration_col] * cfg.sample_rate)

    audio = audio[frame_offset: frame_offset + num_frames]

    # Single source of truth for the output length (previously recomputed
    # inline as 5 * cfg.sample_rate in each comparison).
    target_num_samples = int(clip_seconds * cfg.sample_rate)
    # Crop if too long
    if audio.shape[0] > target_num_samples:
        audio = utils.crop_audio(audio, target_num_samples)
    # Pad if too short
    if audio.shape[0] < target_num_samples:
        audio = utils.pad_audio(audio, target_num_samples)

    return audio

In [None]:
def show_last_layer_activation(audio):
    """Plot GradCAM and GradCAM++ overlays for one audio clip.

    The clip is converted with ``to_image``, passed through the global
    ``model`` with ``model.model.final_conv`` as the CAM target layer, and the
    two overlays are shown next to the plain spectrogram image in a 1x3 figure.

    Args:
        audio: Audio clip accepted by ``to_image`` (for example the value
            returned by ``preprocess_audio``).

    Returns:
        The GradCAM map for the single image in the batch. The GradCAM++ map
        is only plotted, not returned.
    """
    target_layers = [model.model.final_conv]
    img = to_image(audio)

    # Feed the network on the device its weights actually live on rather than
    # assuming CPU; the model is placed on cfg.device when it is created.
    model_device = next(model.parameters()).device
    # Note: input_tensor can be a batch tensor with several images!
    input_tensor = img.unsqueeze(0).to(model_device)

    # matplotlib and show_cam_on_image need a host-side HxWxC array; build it
    # once and reuse it for both overlays and the reference panel.
    rgb_img = img.detach().cpu().permute(1, 2, 0).numpy()

    # With targets=None pytorch-grad-cam explains the highest scoring class
    # (per the library documentation). Pass e.g. [ClassifierOutputTarget(64)]
    # to explain a specific class instead.
    targets = None

    # The context managers release the hooks registered on the target layer
    # as soon as the maps are computed, so repeated calls do not rely on
    # garbage collection to detach them.
    # You can also pass aug_smooth=True and eigen_smooth=True, to apply smoothing.
    with GradCAM(model=model, target_layers=target_layers) as cam, \
            GradCAMPlusPlus(model=model, target_layers=target_layers) as cam2:
        # Only one image is in the batch, so keep element 0 of each result.
        grayscale_cam = cam(input_tensor=input_tensor, targets=targets)[0, :]
        grayscale_cam2 = cam2(input_tensor=input_tensor, targets=targets)[0, :]

    visualization = show_cam_on_image(rgb_img, grayscale_cam, use_rgb=True)
    visualization2 = show_cam_on_image(rgb_img, grayscale_cam2, use_rgb=True)

    # One row with three panels: GradCAM, GradCAM++, and the input image.
    _, axs = plt.subplots(1, 3, figsize=(8, 16))
    panels = (
        (visualization, 'GradCAM'),
        (visualization2, 'GradCAM++'),
        (rgb_img, "Original Image"),
    )
    for ax, (image, title) in zip(axs, panels):
        ax.imshow(image)
        ax.set_title(title)
        # Flip the axis so row 0 of the image is drawn at the bottom.
        ax.invert_yaxis()

    plt.show()
    return grayscale_cam

In [None]:
# Preview the distinct rows whose "MANUAL ID" is "PIHA_1".
# (Disabled step kept for reference) df = df.drop(columns=["Unnamed: 0"])
df.loc[df["MANUAL ID"].eq("PIHA_1")].drop_duplicates()

In [None]:
from IPython.display import clear_output
# Render the CAM figure for every distinct "PIHA_1" annotation.
# NOTE(review): these are index *labels*, while preprocess_audio looks rows up
# with .iloc; the two agree only while df keeps the default RangeIndex from
# read_csv - confirm if df is ever re-indexed or filtered in place.
piha_index = df.loc[df["MANUAL ID"].eq("PIHA_1")].drop_duplicates().index
for i in piha_index:
    print(i)
    audio = preprocess_audio(i)
    show_last_layer_activation(audio)
    