In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.nn import init
from torch.utils.data import Dataset, random_split
from sklearn.preprocessing import LabelEncoder
import torchaudio
from torchaudio import transforms
import random
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, recall_score
import tensorflow_hub as hub
import os
import noisereduce as nr
# Load the official Silero VAD model
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=True)
(get_speech_timestamps, _, _, _, collect_chunks) = utils

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


device = 'cuda' if torch.cuda.is_available() else 'cpu'




  from .autonotebook import tqdm as notebook_tqdm


Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to C:\Users\admin/.cache\torch\hub\master.zip


In [2]:
import shutil

cache_path = os.path.expanduser("~\\AppData\\Local\\Temp\\tfhub_modules")
shutil.rmtree(cache_path, ignore_errors=True)

In [3]:
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)













In [4]:
class_map_path = yamnet_model.class_map_path().numpy().decode('utf-8')
class_names =list(pd.read_csv(class_map_path)['display_name'])

In [5]:
df = pd.read_csv("train.csv")

In [6]:
label_encoder = LabelEncoder()
df["scientific_name"] = label_encoder.fit_transform(df["scientific_name"])
df['filename'] = 'train_audio/' + df['filename'].str[:]
df = df[['filename', 'scientific_name']]
df.rename(columns={'scientific_name': 'class'}, inplace=True)

In [7]:
new_df = df.groupby('class').head(5).reset_index(drop=True)
excluded_df = df.groupby('class').apply(lambda x: x.iloc[5:]).reset_index(drop=True)

  excluded_df = df.groupby('class').apply(lambda x: x.iloc[5:]).reset_index(drop=True)


In [8]:
labels = new_df['class'].to_numpy()
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
len(class_weights)

206

In [9]:
# Audio Util
class AudioUtil():
  # Load Audio Data from Source. Returns Tensor and Sample Rate of that Audio File.
  def read_file(audio_file_path, target_sr=16000):
    waveform, sr = torchaudio.load(audio_file_path)
    # converting to mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(sr, target_sr) # Converting raw data to target_sr
        waveform = resampler(waveform)
    denoised_waveform = torch.tensor(nr.reduce_noise(y=waveform, sr=sr))
    rms = torch.sqrt(torch.mean(denoised_waveform ** 2))
    normalized_waveform =  denoised_waveform * (0.1 / rms) if rms > 0 else waveform
    return (normalized_waveform, target_sr)
  
  def extract_non_speech_segments(audio_file, segment_length_sec=1.0, no_of_segments = 3):
    waveform, sr = audio_file
    waveform = waveform.squeeze(0)

    # Use Silero VAD to get timestamps of speech
    speech_timestamps = get_speech_timestamps(waveform, model, sampling_rate=sr)

    # Convert speech regions into a mask
    speech_mask = torch.zeros_like(waveform, dtype=torch.bool)
    for ts in speech_timestamps:
        speech_mask[ts['start']:ts['end']] = True

    # Split into 1 second segments
    segment_len = int(sr * segment_length_sec)
    total_segments = len(waveform) // segment_len
    non_speech_segments = []
    if total_segments > no_of_segments: total_segments = no_of_segments
    for i in range(total_segments):
        start = i * segment_len
        end = start + segment_len
        segment = waveform[start:end]

        # Skip if segment overlaps with speech
        if not speech_mask[start:end].any():
            non_speech_segments.append(segment)

    if not non_speech_segments:
      return None, sr
    stacked_segments = torch.stack(non_speech_segments)
    stacked_segments = stacked_segments.flatten().unsqueeze(0)
    return stacked_segments, sr
  
  def retrieve_embeddings(waveform):
     scores, embeddings, spectrogram = yamnet_model(waveform.squeeze(0))
     embeddings = torch.from_numpy(embeddings.numpy())
     return embeddings

  # Resizing all audio files to the same length
  def pad_trunc(aud, max_ms):
    sig, sr = aud
    num_rows, sig_len = sig.shape
    max_len = sr//1000 * max_ms

    if (sig_len > max_len):
      # Truncate the signal to the given length
      sig = sig[:,:max_len]

    elif (sig_len < max_len):
      # Length of padding to add at the beginning and end of the signal
      pad_begin_len = random.randint(0, max_len - sig_len)
      pad_end_len = max_len - sig_len - pad_begin_len

      # Pad with 0s
      pad_begin = torch.zeros((num_rows, pad_begin_len))
      pad_end = torch.zeros((num_rows, pad_end_len))

      sig = torch.cat((pad_begin, sig, pad_end), 1)
      
    return (sig, sr)
  
  # Data Augmentation of the Raw Audio Data using Time Shift.
  def time_shift(aud, shift_limit):
    sig,sr = aud
    _, sig_len = sig.shape
    shift_amt = int(random.random() * shift_limit * sig_len)
    return (sig.roll(shift_amt), sr)

  # Converting raw Audio Data to a Mel Spectrogram
  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    sig,sr = aud
    top_db = 80

    # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

    # Convert to decibels
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)
  
  # Data Augmentation on Mel-Spectrogram Data using Frequency Masks and Time Masks
  def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()
    aug_spec = spec

    freq_mask_param = max_mask_pct * n_mels
    for _ in range(n_freq_masks):
      aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = max_mask_pct * n_steps
    for _ in range(n_time_masks):
      aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

    return aug_spec

In [10]:
# Sound Dataset
class SoundDS(Dataset):
    def __init__(self, df):
        self.df = df
        self.duration = 3 * 1000
        self.sr = 16000
        self.channel = 2
        self.shift_pct = 0.4
                        
    # Number of items in dataset
    def __len__(self):
        return len(self.df)    
        
    # Get i'th item in dataset
    def __getitem__(self, idx):
        # Absolute file path of the audio file - concatenate the audio directory with the relative path
        row = self.df.iloc[idx]
        audio_file_path = row['filename']
        class_id = row['class']
        
        audio_file = AudioUtil.read_file(audio_file_path)
        aud,sr = AudioUtil.extract_non_speech_segments(audio_file)
        if aud is None:
            return self.__getitem__((idx + 1) % len(self))  # Skip to next index safely
        dur_aud, sr = AudioUtil.pad_trunc(audio_file, self.duration)
        embeddings = AudioUtil.retrieve_embeddings(dur_aud)
        #shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
        #sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
        #aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)
        
        return embeddings, class_id

In [11]:
# Train Test Val Split
myds = SoundDS(new_df)
val_ds = SoundDS(excluded_df)

# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(myds, batch_size=32, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=32, shuffle=False)

In [12]:
# Model
class EmbeddingClassifier(nn.Module):
    def __init__(self, num_classes):
        super(EmbeddingClassifier, self).__init__()
        
        self.pooling = nn.AdaptiveAvgPool1d(1)  # [B, T, 1024] → [B, 1024]
        
        self.fc1 = nn.Linear(1024, 512)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)

        self.fc2 = nn.Linear(512, 256)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)

        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        # x: [B, T, 1024]
        x = x.permute(0, 2, 1)          # [B, 1024, T]
        x = self.pooling(x).squeeze(-1) # [B, 1024]

        x = self.fc1(x)                 # [B, 512]
        x = self.relu1(x)
        x = self.dropout1(x)

        x = self.fc2(x)                 # [B, 256]
        x = self.relu2(x)
        x = self.dropout2(x)

        x = self.fc3(x)                 # [B, num_classes]
        return x


# Instantiate model
myModel = EmbeddingClassifier(num_classes=206)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)

# Confirm it's on the right device
print(next(myModel.parameters()).device)

cpu


In [15]:
def train_embedding_model(model, train_dl, num_epochs=25, class_weights=None, optimizer=None, scheduler=None, start_epoch=0, checkpoint_path=None):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Loss, optimizer, scheduler
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    # Initialize optimizer and scheduler if not provided
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    if scheduler is None:
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer, max_lr=0.001,
            steps_per_epoch=len(train_dl),
            epochs=num_epochs + start_epoch,
            anneal_strategy='linear'
        )

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        start_time = time.time()

        for inputs, labels in train_dl:
            inputs = inputs.to(device).float()   # [B, T, 1024]
            labels = labels.to(device).long()

            # Normalize embeddings (optional but common)
            mean, std = inputs.mean(), inputs.std()
            inputs = (inputs - mean) / std

            optimizer.zero_grad()
            outputs = model(inputs)               # [B, num_classes]
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        epoch_loss = running_loss / len(train_dl)
        epoch_acc = correct / total
        print(f"[Epoch {epoch+1}/{num_epochs}] Train Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}, Time: {time.time()-start_time:.1f}s")

        if checkpoint_path:
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict()
            }, checkpoint_path)
    print("Training complete.")
    return model, optimizer, scheduler

In [16]:
model, optimizer, scheduler = train_embedding_model(myModel, train_dl, num_epochs=1, class_weights=class_weights)

[Epoch 1/1] Train Loss: 5.3720, Accuracy: 0.0123, Time: 1192.8s
Training complete.


In [None]:
torch.save(model.state_dict(), "model_weights.pt")
scripted_model = torch.jit.script(model)
scripted_model.save("final_scripted_model.pt")

In [None]:
def inference(model, val_dl, scripted=False):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if scripted:
        model = torch.jit.load(model)  # if you passed a path
    model = model.to(device)
    model.eval()

    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for val_inputs, val_labels in val_dl:
            val_inputs = val_inputs.to(device).float()
            val_labels = val_labels.to(device).long()

            val_inputs = (val_inputs - val_inputs.mean()) / val_inputs.std()

            val_outputs = model(val_inputs)
            val_preds = val_outputs.argmax(dim=1)
            val_correct += (val_preds == val_labels).sum().item()
            val_total += val_labels.size(0)

    val_acc = val_correct / val_total if val_total > 0 else 0.0
    print(f"Inference Accuracy: {val_acc:.4f}")
    return val_acc

AttributeError: 'EmbeddingClassifier' object has no attribute 'reset_states'

In [None]:
model, optimizer, scheduler = train_embedding_model(myModel, train_dl, num_epochs=1, class_weights=class_weights, optimizer=optimizer, scheduler=scheduler)

In [None]:
def predict(model, input_tensor):
    model.eval()
    input_tensor = input_tensor.unsqueeze(0).to(device).float()
    input_tensor = (input_tensor - input_tensor.mean()) / input_tensor.std()

    with torch.no_grad():
        output = model(input_tensor)
        predicted_class = output.argmax(dim=1).item()
        probs = torch.softmax(output, dim=1)  # [1, num_classes]
    return predicted_class, probs