In [1]:
import os
import pandas as pd
import torchaudio
import torchaudio.transforms as T
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import optuna
from torchvggish import vggish

In [2]:
class MarineMammalDataset(Dataset):
    def __init__(self, annotation_file, audio_dir, target_sample_rate, num_samples, device, transformation):
        self.annotations = pd.read_excel(annotation_file)  # Read Excel metadata
        self.audio_dir = audio_dir
        self.device = device
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        self.transformation = transformation.to(self.device)
        

        # Encode labels as integers
        self.label_encoder = LabelEncoder()
        self.annotations['species_name'] = self.label_encoder.fit_transform(self.annotations['species_name'])

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        #label = label.clone().detach().to(torch.long).to(self.device)

        signal, sr = torchaudio.load(audio_sample_path)

        # Send to GPU
        signal = signal.to(self.device)

        # Resample, mix down, cut, and pad
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)

        #Apply some more preprocessing 
        #signal = self._apply_noise_reduction(signal)
        #signal = self._remove_silence(signal)
        #signal = self._apply_highpass_filter(signal)
        #signal = self._normalize_volume(signal)
        #signal = self._clip_waveform(signal)
        #signal = self._apply_bandpass_filter(signal)


        mel_spec = self.transformation(signal.to(torch.float32))
        mel_spec = torch.log(mel_spec + 1e-6)  # Convert to log scale for VGGish

        mel_spec = torch.nn.functional.interpolate(
            mel_spec.unsqueeze(0), size=(64, 96), mode='bilinear', align_corners=False
        ).squeeze(0)

        # Ensure 3D shape
        mel_spec = mel_spec.unsqueeze(0) if mel_spec.ndim == 2 else mel_spec

        # Convert label to tensor
        
        label = torch.tensor(label, dtype=torch.long).to(self.device)

        # Convert label to tensor
        #label = torch.tensor(label, dtype=torch.long).to(self.device)
        #label = label.clone().detach().to(self.device)
        label = torch.tensor(label, dtype=torch.long).clone().detach().to(self.device)

        return mel_spec, label
    
    
    def _clip_waveform(self, signal, clip_value=0.99):
        return torch.clamp(signal, -clip_value, clip_value)
    
    def _pitch_shift(self, signal, n_steps=2, sample_rate=16000):
        pitch_shifter = T.PitchShift(sample_rate, n_steps=n_steps).to(signal.device)
        return pitch_shifter(signal)
    
    def _apply_bandpass_filter(self, signal):
    # Create a compose transformation if you want to apply multiple augmentations
        augment = Compose([
        HighPassFilter(min_cutoff_freq=10, max_cutoff_freq=240, p=1)
        ])
    
    # Convert signal to numpy for augmentations
        signal_np = signal.cpu().numpy().squeeze()
    
    # Apply augmentation
        augmented_signal = augment(signal_np, sample_rate=self.target_sample_rate)
    
    # Convert back to torch tensor
        return torch.tensor(augmented_signal).unsqueeze(0).to(self.device)
    
    def _normalize_volume(self, signal):
        rms = torch.sqrt(torch.mean(signal**2))
        normalized_signal = signal / rms
        return normalized_signal
    def _apply_noise_reduction(self, signal):
        noise = signal[:, :16000]  # Assume the first second contains noise
        reduced_signal = reduce_noise(
        y=signal.cpu().numpy().squeeze(), 
        sr=16000, 
        y_noise=noise.cpu().numpy().squeeze()
        )
        return torch.tensor(reduced_signal).unsqueeze(0).to(signal.device)
    
    def _remove_silence(self, signal, top_db=30):
        signal_np = signal.cpu().numpy().squeeze()
        intervals = librosa.effects.split(signal_np, top_db=top_db)
        non_silent_signal = np.concatenate([signal_np[start:end] for start, end in intervals])
        return torch.tensor(non_silent_signal).unsqueeze(0).to(signal.device)
    
    def _cut_if_necessary(self, signal):
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        if signal.shape[1] < self.num_samples:
            num_missing_samples = self.num_samples - signal.shape[1]
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate).to(self.device)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal
    
    def _get_audio_sample_path(self, index):
        file_name = self.annotations.iloc[index, 0]  # file_name column
        path = os.path.join(self.audio_dir, file_name)
        return os.path.normpath(path)

    def _get_audio_sample_label(self, index):
        return self.annotations.iloc[index, 2]  # Encoded species_name column


In [3]:
train_annotation_file = "E:/Project_Experiments/9_species_experiment/New_Test_2025/train_metadata.xlsx"
train_audio_dir = "E:/Project_Experiments/9_species_experiment/New_Test_2025/train"
test_annotation_file = "E:/Project_Experiments/9_species_experiment/New_Test_2025/test_metadata.xlsx"
test_audio_dir = "E:/Project_Experiments/9_species_experiment/New_Test_2025/test"
validate_annotation_file = "E:/Project_Experiments/9_species_experiment/New_Test_2025/validate_metadata.xlsx"
validate_audio_dir = "E:/Project_Experiments/9_species_experiment/New_Test_2025/validate"

# VGGish-specific configurations
    
device = "cuda" if torch.cuda.is_available() else "cpu"
sample_rate = 16000  # Required sample rate for VGGish
num_samples = sample_rate * 5  # 5 seconds of audio at 16 kHz
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=400,  # Smaller window size for VGGish
        hop_length=160,  # Matches VGGish expectation
        n_mels=64
    )

    # Create the dataset
train_dataset = MarineMammalDataset(train_annotation_file, train_audio_dir, sample_rate, num_samples, device, mel_spectrogram)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataset = MarineMammalDataset(test_annotation_file, test_audio_dir, sample_rate, num_samples, device, mel_spectrogram)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)
validate_dataset = MarineMammalDataset(validate_annotation_file, validate_audio_dir, sample_rate, num_samples, device, mel_spectrogram)
val_loader = DataLoader(validate_dataset, batch_size=32, shuffle=True)



In [5]:
    # Initialize the VGGish model
model = vggish().to(device)
    #print(model)

# Move PCA tensors to the same device as the model
if model.postprocess:
 model.pproc._pca_matrix = model.pproc._pca_matrix.to(device)
 model.pproc._pca_means = model.pproc._pca_means.to(device)

input_tensor = torch.rand(1, 1, 96, 64).to(device)
output = model(input_tensor)
    #print(f"Old Output : ", output)
    #summary(model, (1, 96, 64))
    # # Path to the pretrained weights
pretrained_weights_path = "C:/Users/myair/.cache/torch/hub/checkpoints/vggish-10086976.pth"

    # # Load pretrained weights
model.load_state_dict(torch.load(pretrained_weights_path, map_location=device, weights_only=True), strict=False)
    # # Freeze pretrained feature extractor layers
for param in model.features.parameters():
 param.requires_grad = False
num_classes = 9

for param in list(model.features[-5:].parameters()):  # Accessing the last  layers
 param.requires_grad = True
    
def create_fc(num_inputs,num_layers,neuron_per_layer,num_outputs,dropout_rate):
 layers=[]
 for i in range(num_layers):
  layers.append(nn.Linear(num_inputs,neuron_per_layer))
  layers.append(nn.LayerNorm(neuron_per_layer))
  layers.append(nn.ReLU())
  layers.append(nn.Dropout(dropout_rate, inplace=False))
  num_inputs=neuron_per_layer
  neuron_per_layer=num_inputs//2

 layers.append(nn.Linear(num_inputs,num_outputs))
 #Create the sequential layer 
 fc=nn.Sequential(*layers)
 return fc
fc=create_fc(12288,5,4096,9,0.3)
print(fc)


Sequential(
  (0): Linear(in_features=12288, out_features=4096, bias=True)
  (1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  (2): ReLU()
  (3): Dropout(p=0.3, inplace=False)
  (4): Linear(in_features=4096, out_features=2048, bias=True)
  (5): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
  (6): ReLU()
  (7): Dropout(p=0.3, inplace=False)
  (8): Linear(in_features=2048, out_features=1024, bias=True)
  (9): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (10): ReLU()
  (11): Dropout(p=0.3, inplace=False)
  (12): Linear(in_features=1024, out_features=512, bias=True)
  (13): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (14): ReLU()
  (15): Dropout(p=0.3, inplace=False)
  (16): Linear(in_features=512, out_features=256, bias=True)
  (17): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (18): ReLU()
  (19): Dropout(p=0.3, inplace=False)
  (20): Linear(in_features=256, out_features=9, bias=True)
)


In [5]:
def new_forward(x):
 with torch.set_grad_enabled(True):
  x = model.features(x)  # Extract features
  x = x.view(x.size(0), -1)  # Flatten the feature map
  x = model.fc(x)  # Pass through the updated classifier
 return x

def objective(trial):
        
        # 1. Using optuna generate number of hidden layers
        num_hidden_layers=trial.suggest_int("num_hidden_layers", 1, 5)
        # 2. Using optuna generate number of neurons per layer
        neurons_per_layer=trial.suggest_int("neurons_per_layer", 512,8192,step=512)
        # 3. Using optuna generate number of epochs
        num_epochs=trial.suggest_int("Epochs",4,12,step=2)
        # 4. Using optuna generate learning rate
        learning_rate=trial.suggest_float("Learning rate",1e-5,1e-3,log=True)
        # 5. Using optuna generate dropout rate
        dropout_rate=trial.suggest_float("Dropout_rate",0.1,0.5,step=0.1)
        # 6. Using optuna generate batch size
        batch_size=trial.suggest_categorical("batch_size", [16,32,64])
        # 7. Using optuna generate optimizer name
        optimizer_name=trial.suggest_categorical("Optimizer", ['Adam','SGD','RMSprop'])
        # 8. Using optuna generate weight decay
        weight_decay=trial.suggest_float("Weight decay",1e-5,1e-3,log=True)
        # initialize/update the model now
        model.fc=create_fc(12288,num_hidden_layers, neurons_per_layer,num_classes,dropout_rate).to(device)
        criterion = nn.CrossEntropyLoss()
        #optimizer = optim.Adam(model.parameters(), lr=(1e-5)*4)
        if optimizer_name=='Adam':
            optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        elif optimizer_name=='SGD':
            optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        else:
            optimizer = optim.RMSprop(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        
        model.forward=new_forward
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
        torch.autograd.set_detect_anomaly(True)
        #Training loop
        for epochs in range(num_epochs):
            model.train()
            running_loss = 0.0
            correct_predictions = 0
            total_samples = 0

            for inputs, labels in train_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                # Forward pass
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                # Backward pass
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Update metrics
                #running_loss += loss.item() * inputs.size(0)
                _, preds = torch.max(outputs, 1)
                correct_predictions += (preds == labels).sum().item()
                total_samples += labels.size(0)

            #epoch_loss = running_loss / len(train_loader.dataset)
            epoch_accuracy = correct_predictions / total_samples
        print(" Training accuracy :",epoch_accuracy)

        #Test loop : return testing accuracy: Need to maximize
        model.eval()
        correct=0
        total=0

        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)

            
            # Count correct predictions
                correct += (preds == labels).sum().item()
                total += labels.size(0)
        test_accuracy=correct/total*100
        return test_accuracy

    # # Save the fine-tuned model
    # torch.save(model, "E:/Project_Experiments/9_species_experiment/New_Test_2025/fine_tuned_marine_mammal_vggish.pth")


In [6]:
study=optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[I 2025-02-08 18:49:02,897] A new study created in memory with name: no-name-4f5e2658-f649-492f-bbe7-5bacba2f516b
  label = torch.tensor(label, dtype=torch.long).clone().detach().to(self.device)


 Training accuracy : 0.20286659316427783


[I 2025-02-08 18:53:30,095] Trial 0 finished with value: 43.06393244873342 and parameters: {'num_hidden_layers': 4, 'neurons_per_layer': 8192, 'Epochs': 4, 'Learning rate': 0.0009958936867781058, 'Dropout_rate': 0.5, 'batch_size': 64, 'Optimizer': 'SGD', 'Weight decay': 3.459022339340795e-05}. Best is trial 0 with value: 43.06393244873342.


 Training accuracy : 0.996141124586549


[I 2025-02-08 19:07:50,901] Trial 1 finished with value: 86.24849215922798 and parameters: {'num_hidden_layers': 5, 'neurons_per_layer': 6144, 'Epochs': 8, 'Learning rate': 4.607573491164103e-05, 'Dropout_rate': 0.30000000000000004, 'batch_size': 16, 'Optimizer': 'Adam', 'Weight decay': 0.0006344478983149422}. Best is trial 1 with value: 86.24849215922798.


 Training accuracy : 0.9727122381477398


[I 2025-02-08 19:15:51,011] Trial 2 finished with value: 86.12786489746684 and parameters: {'num_hidden_layers': 5, 'neurons_per_layer': 6656, 'Epochs': 4, 'Learning rate': 0.00038198771811713734, 'Dropout_rate': 0.2, 'batch_size': 64, 'Optimizer': 'Adam', 'Weight decay': 2.593135144318663e-05}. Best is trial 1 with value: 86.24849215922798.


 Training accuracy : 0.9986218302094818


[I 2025-02-08 19:22:00,071] Trial 3 finished with value: 84.07720144752714 and parameters: {'num_hidden_layers': 4, 'neurons_per_layer': 3584, 'Epochs': 6, 'Learning rate': 0.00010522141751643215, 'Dropout_rate': 0.1, 'batch_size': 32, 'Optimizer': 'RMSprop', 'Weight decay': 5.455179221516757e-05}. Best is trial 1 with value: 86.24849215922798.


 Training accuracy : 0.9980705622932745


[I 2025-02-08 19:35:04,215] Trial 4 finished with value: 88.05790108564536 and parameters: {'num_hidden_layers': 5, 'neurons_per_layer': 1024, 'Epochs': 10, 'Learning rate': 6.904599704334791e-05, 'Dropout_rate': 0.4, 'batch_size': 16, 'Optimizer': 'Adam', 'Weight decay': 0.0002838749061654009}. Best is trial 4 with value: 88.05790108564536.


 Training accuracy : 0.9691289966923925


[I 2025-02-08 19:39:32,981] Trial 5 finished with value: 85.52472858866103 and parameters: {'num_hidden_layers': 1, 'neurons_per_layer': 2048, 'Epochs': 4, 'Learning rate': 0.0008653620247278095, 'Dropout_rate': 0.2, 'batch_size': 16, 'Optimizer': 'Adam', 'Weight decay': 1.0191571086200156e-05}. Best is trial 4 with value: 88.05790108564536.


 Training accuracy : 0.9953142227122381


[I 2025-02-08 19:53:32,850] Trial 6 finished with value: 90.10856453558505 and parameters: {'num_hidden_layers': 2, 'neurons_per_layer': 5120, 'Epochs': 12, 'Learning rate': 0.0003154523365200716, 'Dropout_rate': 0.2, 'batch_size': 16, 'Optimizer': 'SGD', 'Weight decay': 0.0009255666600033805}. Best is trial 6 with value: 90.10856453558505.


 Training accuracy : 0.5283902976846747


[I 2025-02-08 19:58:49,436] Trial 7 finished with value: 76.7189384800965 and parameters: {'num_hidden_layers': 4, 'neurons_per_layer': 3584, 'Epochs': 6, 'Learning rate': 0.00016290130552062471, 'Dropout_rate': 0.4, 'batch_size': 32, 'Optimizer': 'SGD', 'Weight decay': 2.1858509223883854e-05}. Best is trial 6 with value: 90.10856453558505.


 Training accuracy : 0.9983461962513782


[I 2025-02-08 20:10:09,172] Trial 8 finished with value: 91.31483715319662 and parameters: {'num_hidden_layers': 4, 'neurons_per_layer': 1024, 'Epochs': 12, 'Learning rate': 4.795339902956599e-05, 'Dropout_rate': 0.5, 'batch_size': 32, 'Optimizer': 'Adam', 'Weight decay': 2.4332071091028467e-05}. Best is trial 8 with value: 91.31483715319662.


 Training accuracy : 0.9776736493936052


[I 2025-02-08 20:33:28,481] Trial 9 finished with value: 90.59107358262968 and parameters: {'num_hidden_layers': 5, 'neurons_per_layer': 512, 'Epochs': 12, 'Learning rate': 6.033394993955948e-05, 'Dropout_rate': 0.30000000000000004, 'batch_size': 64, 'Optimizer': 'RMSprop', 'Weight decay': 2.7157985860286698e-05}. Best is trial 8 with value: 91.31483715319662.


 Training accuracy : 0.9983461962513782


[I 2025-02-08 20:42:30,091] Trial 10 finished with value: 89.98793727382389 and parameters: {'num_hidden_layers': 3, 'neurons_per_layer': 2048, 'Epochs': 10, 'Learning rate': 1.0954334115483946e-05, 'Dropout_rate': 0.5, 'batch_size': 32, 'Optimizer': 'Adam', 'Weight decay': 0.00014518394957425395}. Best is trial 8 with value: 91.31483715319662.


 Training accuracy : 0.9994487320837927


[I 2025-02-08 20:52:39,889] Trial 11 finished with value: 91.43546441495778 and parameters: {'num_hidden_layers': 3, 'neurons_per_layer': 512, 'Epochs': 12, 'Learning rate': 2.9961526451147717e-05, 'Dropout_rate': 0.4, 'batch_size': 64, 'Optimizer': 'RMSprop', 'Weight decay': 1.0269702547456945e-05}. Best is trial 11 with value: 91.43546441495778.


 Training accuracy : 0.9994487320837927


[I 2025-02-08 21:02:09,011] Trial 12 finished with value: 91.43546441495778 and parameters: {'num_hidden_layers': 3, 'neurons_per_layer': 2048, 'Epochs': 12, 'Learning rate': 1.883082213141984e-05, 'Dropout_rate': 0.4, 'batch_size': 64, 'Optimizer': 'RMSprop', 'Weight decay': 1.1669642313350815e-05}. Best is trial 11 with value: 91.43546441495778.


 Training accuracy : 0.9997243660418964


[I 2025-02-08 21:09:51,271] Trial 13 finished with value: 91.79734620024126 and parameters: {'num_hidden_layers': 2, 'neurons_per_layer': 2560, 'Epochs': 10, 'Learning rate': 1.6403990527209376e-05, 'Dropout_rate': 0.4, 'batch_size': 64, 'Optimizer': 'RMSprop', 'Weight decay': 1.092318676668115e-05}. Best is trial 13 with value: 91.79734620024126.


 Training accuracy : 0.9994487320837927


[I 2025-02-08 21:17:52,430] Trial 14 finished with value: 91.91797346200241 and parameters: {'num_hidden_layers': 2, 'neurons_per_layer': 3072, 'Epochs': 10, 'Learning rate': 2.2175208722064758e-05, 'Dropout_rate': 0.4, 'batch_size': 64, 'Optimizer': 'RMSprop', 'Weight decay': 7.888853024533711e-05}. Best is trial 14 with value: 91.91797346200241.


 Training accuracy : 0.9994487320837927


[I 2025-02-08 21:25:42,803] Trial 15 finished with value: 92.15922798552472 and parameters: {'num_hidden_layers': 1, 'neurons_per_layer': 3584, 'Epochs': 10, 'Learning rate': 1.1011305896876287e-05, 'Dropout_rate': 0.30000000000000004, 'batch_size': 64, 'Optimizer': 'RMSprop', 'Weight decay': 9.970738182608503e-05}. Best is trial 15 with value: 92.15922798552472.


 Training accuracy : 0.9997243660418964


[I 2025-02-08 21:33:41,180] Trial 16 finished with value: 92.15922798552472 and parameters: {'num_hidden_layers': 1, 'neurons_per_layer': 4608, 'Epochs': 8, 'Learning rate': 1.015762453028712e-05, 'Dropout_rate': 0.30000000000000004, 'batch_size': 64, 'Optimizer': 'RMSprop', 'Weight decay': 9.897764669598588e-05}. Best is trial 15 with value: 92.15922798552472.


 Training accuracy : 0.9994487320837927


[I 2025-02-08 21:41:03,923] Trial 17 finished with value: 92.52110977080821 and parameters: {'num_hidden_layers': 1, 'neurons_per_layer': 4608, 'Epochs': 8, 'Learning rate': 1.1262002620627131e-05, 'Dropout_rate': 0.30000000000000004, 'batch_size': 64, 'Optimizer': 'RMSprop', 'Weight decay': 0.00016046101455048314}. Best is trial 17 with value: 92.52110977080821.


 Training accuracy : 0.9994487320837927


[I 2025-02-08 21:50:06,926] Trial 18 finished with value: 92.40048250904704 and parameters: {'num_hidden_layers': 1, 'neurons_per_layer': 5632, 'Epochs': 8, 'Learning rate': 3.040311240152974e-05, 'Dropout_rate': 0.1, 'batch_size': 64, 'Optimizer': 'RMSprop', 'Weight decay': 0.00020787765814208456}. Best is trial 17 with value: 92.52110977080821.


 Training accuracy : 0.9997243660418964


[I 2025-02-08 22:00:00,714] Trial 19 finished with value: 92.40048250904704 and parameters: {'num_hidden_layers': 1, 'neurons_per_layer': 5632, 'Epochs': 8, 'Learning rate': 3.313787706327997e-05, 'Dropout_rate': 0.1, 'batch_size': 64, 'Optimizer': 'RMSprop', 'Weight decay': 0.00021397511787444326}. Best is trial 17 with value: 92.52110977080821.
