In [None]:
! pip install git+https://github.com/ildoonet/pytorch-gradual-warmup-lr.git

In [None]:
import os
import pandas as pd
import torchaudio
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from torchvision import models, transforms
from torchvision.models import EfficientNet_B0_Weights
from sklearn.model_selection import train_test_split
import torch.multiprocessing as mp
from collections import Counter
from sklearn.utils import resample
from warmup_scheduler import GradualWarmupScheduler

In [None]:
# Report whether PyTorch can see a CUDA device.
print("Is CUDA available? ", torch.cuda.is_available())
# get_device_name(0) raises when no CUDA device exists, so only query it when
# CUDA is available; the rest of the notebook already falls back to the CPU.
if torch.cuda.is_available():
    print("Device name: ", torch.cuda.get_device_name(0))
else:
    print("Device name:  none (running on CPU)")

In [None]:
# Pick the compute device once: CUDA when PyTorch can see it, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Output directory for the rewritten metadata CSV and the saved model weights.
directory_path = '/kaggle/working/new_directory'
# exist_ok=True makes re-running this cell a no-op when the directory is already there.
os.makedirs(name=directory_path, exist_ok=True)
print(f"Directory '{directory_path}' has been created.")

In [None]:
# Load the raw metadata shipped with the dataset.
csv_file_path = '/kaggle/input/birds-audio-and-behavior-dataset/audio_metadata.csv'
df = pd.read_csv(csv_file_path)

# Build the audio path of every recording from its id (one .mp3 per id).
paths = [
    "/kaggle/input/birds-audio-and-behavior-dataset/Audio_files/" + str(audio_id) + '.mp3'
    for audio_id in df["id"]
]
df["path"] = paths

# A recording can carry several ';'-separated labels; keep only the first one.
df["sound_label"] = df["sound_label"].apply(lambda x: x.split(';')[0].strip())

# The species name columns are not used further down, so drop them before saving.
df = df.drop(['common_name', 'scientific_name'], axis=1)

# Persist the cleaned metadata; this is the CSV the dataset class reads later.
new_csv_file_path = '/kaggle/working/new_directory/audio_metadata.csv'
df.to_csv(new_csv_file_path, index=False)

In [None]:
# Keyword arguments forwarded verbatim to torchaudio.transforms.MelSpectrogram.
mel_spec_params = dict(
    sample_rate=44100,
    n_mels=256,
    f_min=20,
    f_max=22050,
    n_fft=2048,
    hop_length=512,
    normalized=True,
    center=True,
    pad_mode="constant",
    norm="slaney",
    onesided=True,
    mel_scale="slaney",
)

# Class name -> integer target; the position in the tuple is the class index.
label_mapping = {
    name: index
    for index, name in enumerate(
        ('Call', 'Song', 'Dawn song', 'Non-vocal', 'Duet', 'Flight song', 'Flight call')
    )
}

# Waveform augmentations kept for reference only (currently disabled):
# augmentations = [
#     lambda x: torchaudio.transforms.PitchShift(sample_rate=16000, n_steps=2).to(device)(x.to(device)),
#     lambda x: x.to(device) + torch.randn_like(x.to(device)) * 0.01,  # Add noise
#     lambda x: torch.roll(x.to(device), shifts=int(0.1 * x.size(1)), dims=1),  # Time shift
#     lambda x: torchaudio.transforms.Vol(3).to(device)(x.to(device))  # Change volume
# ]

top_db = 80        # passed as top_db to AmplitudeToDB
train_period = 10  # seconds of audio per training clip
val_period = 10    # seconds of audio per validation clip

# Durations in samples: seconds multiplied by the configured sample rate.
# The first 12 seconds of a recording are skipped when it is long enough.
remove_duration, train_duration, val_duration = (
    seconds * mel_spec_params["sample_rate"]
    for seconds in (12, train_period, val_period)
)


In [None]:
def normalize_melspec(X, eps=1e-6):
    """Standardise each spectrogram and rescale it to the [0, 1] range.

    Statistics are reduced over dims 1 and 2, so ``X`` is expected to be a
    3-D tensor whose leading dim indexes independent items.

    Args:
        X: Input tensor; it is moved to the global ``device`` first.
        eps: Guards the division by the standard deviation and is also the
            minimum value range an item needs in order to be rescaled.

    Returns:
        Tensor on ``device`` with the same shape as ``X``. Items whose
        standardised range is not larger than ``eps`` are returned as zeros.
    """
    X = X.to(device)

    # Per-item standardisation.
    mu = X.mean((1, 2), keepdim=True)
    sigma = X.std((1, 2), keepdim=True)
    standardized = (X - mu) / (sigma + eps)

    # Per-item extrema, reduced over the last two dims.
    lowest = standardized.min(-1)[0].min(-1)[0]
    highest = standardized.max(-1)[0].max(-1)[0]

    value_range = highest - lowest
    rescalable = value_range > eps * torch.ones_like(value_range).to(device)

    scaled = torch.zeros_like(standardized).to(device)
    if not rescalable.sum():
        # Nothing has a usable range: every item stays all-zero.
        return scaled

    low_sel = lowest[rescalable, None, None]
    high_sel = highest[rescalable, None, None]
    clipped = torch.max(torch.min(standardized[rescalable], high_sel), low_sel)
    scaled[rescalable] = (clipped - low_sel) / (high_sel - low_sel)
    return scaled

def read_wav(path):
    """Load an audio file, resample it to the configured rate and move it to ``device``.

    Args:
        path: Location of the audio file handed to ``torchaudio.load``.

    Returns:
        The resampled waveform tensor on the global ``device``.
    """
    waveform, source_rate = torchaudio.load(path, normalize=True)
    target_rate = mel_spec_params["sample_rate"]
    resampled = torchaudio.functional.resample(
        waveform, orig_freq=source_rate, new_freq=target_rate
    )
    return resampled.to(device)

def crop_start_wav(wav, remove_duration, use_duration):
    """Skip the start of a waveform and keep a fixed-length window after it.

    Args:
        wav: 2-D waveform tensor; the last dim is sliced as the sample axis.
        remove_duration: Number of leading samples to skip.
        use_duration: Number of samples to keep after the skipped part.

    Returns:
        ``wav[:, remove_duration:remove_duration + use_duration]`` when the
        waveform is strictly longer than both parts together; otherwise the
        whole waveform, unchanged. The result is on the global ``device``.
    """
    wav = wav.to(device)
    window_end = remove_duration + use_duration
    if wav.size(-1) <= window_end:
        # Too short to skip the intro and still fill the window: keep everything.
        return wav
    return wav[:, remove_duration:window_end]

In [None]:
class BirdCallDataset(Dataset):
    """Dataset that turns audio files listed in a CSV into mel-spectrogram images.

    The CSV must provide a ``path`` column (audio file location) and a
    ``sound_label`` column (class name). Both are read by name, which matches
    how the rest of the notebook accesses ``dataset.data['sound_label']`` and
    does not depend on the column order of the file.

    Args:
        csv_file: Path of the metadata CSV.
        transform: Optional callable applied to the ``(H, W, 3)`` uint8 image
            returned by ``prepare_spec``.
        augmentations: Optional iterable of callables applied, in order, to
            the cropped waveform before the spectrogram is computed.
    """

    def __init__(self, csv_file, transform=None, augmentations=None):
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        self.augmentations = augmentations
        self.mel_transform = torchaudio.transforms.MelSpectrogram(**mel_spec_params).to(device)
        self.db_transform = torchaudio.transforms.AmplitudeToDB(stype='power', top_db=top_db).to(device)

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _to_rgb_image(spec):
        """Convert a ``(C, H, W)`` spectrogram in [0, 1] to an ``(H, W, 3)`` uint8 array.

        One channel is repeated three times; for two channels the first one is
        appended again as the third; three channels are used as they are.

        Raises:
            RuntimeError: If ``spec`` has a channel count other than 1, 2 or 3.
        """
        channels = spec.size(0)
        if channels == 1:
            spec = spec.repeat(3, 1, 1)
        elif channels == 2:
            spec = torch.cat([spec, spec[0:1, :, :]], dim=0)
        elif channels != 3:
            raise RuntimeError(f"Unexpected tensor size: {spec.size()}")

        # Emit real 8-bit pixels. A float array scaled to 0-255 is not a valid
        # image for torchvision's ToPILImage: float ndarrays are either
        # rejected or multiplied by 255 again, depending on the version.
        pixels = (spec * 255).round().clamp(0, 255).to(torch.uint8)
        return pixels.permute(1, 2, 0).contiguous().cpu().detach().numpy()

    def prepare_spec(self, path):
        """Load ``path`` and return its normalised mel spectrogram as an RGB-like image.

        Returns:
            ``numpy.ndarray`` of shape ``(n_mels, frames, 3)`` and dtype uint8.
        """
        wav = read_wav(path).to(device)
        wav = crop_start_wav(wav, remove_duration, train_duration).to(device)

        # Waveform-level augmentations, applied in the order given.
        if self.augmentations:
            for aug in self.augmentations:
                wav = aug(wav).to(device)

        spec = normalize_melspec(self.db_transform(self.mel_transform(wav)))
        return self._to_rgb_image(spec)

    def __getitem__(self, idx):
        """Return ``(spectrogram, label)`` for row ``idx``; the label is the class name string."""
        row = self.data.iloc[idx]
        spec = self.prepare_spec(row["path"])

        if self.transform is not None:
            spec = self.transform(spec)

        return spec, row["sound_label"]


In [None]:
# Define the model class
class BirdCallClassifier(nn.Module):
    """EfficientNet-B0 with its final linear layer replaced by a new classification head.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        backbone = models.efficientnet_b0(weights=EfficientNet_B0_Weights.DEFAULT)

        # Swap the last classifier entry for Dropout + Linear sized to our classes.
        head_inputs = backbone.classifier[1].in_features
        backbone.classifier[1] = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(head_inputs, num_classes),
        )
        self.efficientnet = backbone

    def forward(self, x):
        """Move ``x`` to the global ``device`` and return the backbone's logits."""
        return self.efficientnet(x.to(device))

In [None]:
class GradualWarmupSchedulerV2(GradualWarmupScheduler):
    """GradualWarmupScheduler with its ``get_lr`` overridden.

    The constructor arguments are forwarded unchanged to the base class.
    """

    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        super().__init__(optimizer, multiplier, total_epoch, after_scheduler)

    def get_lr(self):
        """Return one learning rate per base learning rate for the current epoch."""
        past_warmup = self.last_epoch > self.total_epoch

        if not past_warmup:
            if self.multiplier == 1.0:
                # Linear ramp from 0 up to the base learning rate.
                progress = float(self.last_epoch) / self.total_epoch
                return [base_lr * progress for base_lr in self.base_lrs]
            # Linear ramp from the base learning rate up to base * multiplier.
            return [
                base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.)
                for base_lr in self.base_lrs
            ]

        if not self.after_scheduler:
            # No follow-up scheduler: hold the fully warmed-up rate.
            return [base_lr * self.multiplier for base_lr in self.base_lrs]

        if not self.finished:
            # First call after warm-up: re-base the follow-up scheduler on the
            # warmed-up learning rates.
            self.after_scheduler.base_lrs = [
                base_lr * self.multiplier for base_lr in self.base_lrs
            ]
            self.finished = True
        return self.after_scheduler.get_lr()

In [None]:
def _step_scheduler(scheduler, metric):
    """Advance ``scheduler`` by one epoch, passing ``metric`` only if its ``step`` accepts it."""
    import inspect  # local import: only needed here

    try:
        accepts_metric = "metrics" in inspect.signature(scheduler.step).parameters
    except (TypeError, ValueError):
        accepts_metric = False

    if accepts_metric:
        # Pass by keyword: warm-up wrappers declare step(epoch=None, metrics=None),
        # so a positional loss would be taken for the epoch number.
        scheduler.step(metrics=metric)
    else:
        scheduler.step()


def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, scheduler=None):
    """Train ``model`` and evaluate it on the validation split after every epoch.

    Args:
        model: Network to train; it is moved to the global ``device``.
        dataloaders: Mapping with ``'train'`` and ``'val'`` loaders that yield
            ``(inputs, labels)`` batches, where labels are class names found
            in ``label_mapping``.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        num_epochs: Number of passes over the training loader.
        scheduler: Optional learning-rate scheduler. It is stepped once per
            epoch, after the validation phase; schedulers whose ``step``
            accepts ``metrics`` receive the validation loss.

    Returns:
        The trained model (the same object, on ``device``).
    """
    model = model.to(device)

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        epoch_losses = {}
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0
            num_batches = len(dataloaders[phase])
            for batch_idx, (inputs, labels) in enumerate(dataloaders[phase]):
                inputs = inputs.to(device)
                # Batches carry class names; convert them to integer targets.
                numerical_labels = torch.tensor(
                    [label_mapping[label] for label in labels], dtype=torch.long
                ).to(device)

                # Print the progress
                print(f"Processing batch {batch_idx + 1}/{num_batches} in {phase} phase")

                optimizer.zero_grad()

                # Gradients are only tracked (and applied) in the training phase.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, numerical_labels)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Weight by batch size so the epoch loss is a per-sample mean.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == numerical_labels)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
            epoch_losses[phase] = epoch_loss

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

        # Step the scheduler once per epoch. The previous guard tested `phase`
        # after the phase loop had finished, where it is always 'val', so the
        # scheduler was never stepped.
        if scheduler is not None:
            _step_scheduler(scheduler, epoch_losses['val'])

    return model


In [None]:
# Define validation function
def validate_model(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Network to evaluate; it is moved to the global ``device``.
        dataloader: Loader yielding ``(inputs, labels)`` batches, where labels
            are class names found in ``label_mapping``.
        criterion: Loss called as ``criterion(outputs, targets)``.

    Returns:
        ``(epoch_loss, epoch_acc)``: the per-sample mean loss as a float and
        the accuracy as a double tensor.
    """
    model = model.to(device)
    model.eval()  # evaluation mode for the whole pass

    sample_count = len(dataloader.dataset)
    batch_count = len(dataloader)
    loss_sum = 0.0
    correct_sum = 0

    with torch.no_grad():
        for step, (inputs, labels) in enumerate(dataloader, start=1):
            inputs = inputs.to(device)
            # Batches carry class names; convert them to integer targets.
            targets = torch.tensor(
                [label_mapping[name] for name in labels], dtype=torch.long
            ).to(device)

            print(f"Processing batch {step}/{batch_count}")
            outputs = model(inputs)
            preds = torch.max(outputs, 1)[1]
            batch_loss = criterion(outputs, targets)

            # Weight by batch size so the final loss is a per-sample mean.
            loss_sum += batch_loss.item() * inputs.size(0)
            correct_sum += torch.sum(preds == targets)

    epoch_loss = loss_sum / sample_count
    epoch_acc = correct_sum.double() / sample_count

    print(f'Validation Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

    return epoch_loss, epoch_acc

In [None]:
# Save the model parameters
def save_model(model, path):
    """Write the parameters of ``model`` (its ``state_dict``) to ``path``."""
    weights = model.state_dict()
    torch.save(weights, path)

In [None]:
def balance_dataset(dataset):
    """Down-sample every class to the size of the rarest one.

    Args:
        dataset: Object whose ``data['sound_label']`` holds one label per item.

    Returns:
        A ``Subset`` of ``dataset`` with the same number of items per label,
        drawn without replacement using a fixed random state (42).
    """
    labels = dataset.data['sound_label']
    counts = Counter(labels)
    per_class = min(counts.values())  # size of the rarest class

    chosen = []
    for target in counts:
        members = [position for position, value in enumerate(labels) if value == target]
        chosen += resample(members, replace=False, n_samples=per_class, random_state=42)

    return Subset(dataset, chosen)

In [None]:
if __name__ == "__main__":
    # --- Run configuration -------------------------------------------------
    csv_file = '/kaggle/working/new_directory/audio_metadata.csv'
    num_classes = 7  # one output per entry in label_mapping
    batch_size = 64
    num_epochs = 8
    learning_rate = 0.001

    # --- Data --------------------------------------------------------------
    # Spectrogram array -> PIL image -> 224x224 -> tensor.
    image_pipeline = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    dataset = BirdCallDataset(csv_file, transform=image_pipeline, augmentations=None)

    # Stratified 60/20/20 split into train / validation / test indices.
    # Note: no class balancing is applied here.
    labels = dataset.data['sound_label']

    train_indices, temp_indices, train_labels, temp_labels = train_test_split(
        range(len(labels)), labels, stratify=labels, test_size=0.4, random_state=42)

    val_indices, test_indices, val_labels, test_labels = train_test_split(
        temp_indices, temp_labels, stratify=temp_labels, test_size=0.5, random_state=42)

    train_dataset = Subset(dataset, train_indices)
    val_dataset = Subset(dataset, val_indices)
    test_dataset = Subset(dataset, test_indices)

    # Only the training split is shuffled.
    split_datasets = {'train': train_dataset, 'val': val_dataset, 'test': test_dataset}
    dataloaders = {
        split: DataLoader(subset, batch_size=batch_size, shuffle=(split == 'train'), num_workers=0)
        for split, subset in split_datasets.items()
    }

    # --- Model, loss and optimisation --------------------------------------
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model = BirdCallClassifier(num_classes=num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # Plateau-based decay, wrapped in a warm-up scheduler with total_epoch=5.
    after_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    scheduler = GradualWarmupSchedulerV2(
        optimizer, multiplier=1, total_epoch=5, after_scheduler=after_scheduler)

    # --- Train, save, test -------------------------------------------------
    model = train_model(
        model, dataloaders, criterion, optimizer, num_epochs=num_epochs, scheduler=scheduler)
    save_model(model, '/kaggle/working/new_directory/bird_call_classifier.pth')

    # Final evaluation on the held-out test split.
    print("Testing the model on the test set:")
    validate_model(model, dataloaders['test'], criterion)