# Imports

In [None]:
import torch.nn as nn
import numpy as np
import random
import torch
import torch.nn as nn
import copy
import warnings
import torch
import librosa
import csv
import random
import os
import pandas as pd
import torchaudio
import subprocess
import tempfile
from skimage.transform import resize
from skimage.filters import gaussian
from skimage.color import rgb2gray
from skimage import exposure, util
from torchvision.models import resnet101
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import KFold

In [None]:
# Run-wide configuration.
warnings.filterwarnings('ignore')  # silence library warnings for the rest of the run

# Width of the classifier head that replaces the ResNet `fc` layer.
num_labels = 24

# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
def addChannels(img):
    """Return ``img`` repeated three times along a new leading axis.

    For a 2-D ``(H, W)`` input the result has shape ``(3, H, W)``.
    """
    return np.stack([img] * 3)

def horizontal_flip(img):
    """Mirror ``img`` along its second axis and expand it to three channels."""
    return addChannels(np.fliplr(img))

def vertical_flip(img):
    """Mirror ``img`` along its first axis and expand it to three channels."""
    return addChannels(np.flipud(img))

def addNoisy(img):
    """Add random noise to ``img`` and expand the result to three channels.

    ``util.random_noise`` converts its input to float and returns values in
    ``[0, 1]``.  The images produced by ``spec_to_image`` are ``uint8`` on a
    0-255 scale, and the other augmentations keep that scale, so the noisy
    image is scaled back to 0-255 to stay consistent with them.

    Returns a float array of shape ``(3, H, W)`` for a 2-D input.
    """
    noise_img = util.random_noise(img)
    if np.asarray(img).dtype == np.uint8:
        # Undo the implicit uint8 -> [0, 1] conversion done by random_noise.
        noise_img = noise_img * 255.0
    return addChannels(noise_img)

def contrast_stretching(img):
    """Rescale the intensity of ``img`` and expand the result to three channels.

    Uses ``exposure.rescale_intensity`` with its default input/output ranges.
    """
    stretched = exposure.rescale_intensity(img)
    three_channel = addChannels(stretched)
    return three_channel

def randomGaussian(img):
    """Blur ``img`` with a Gaussian filter and expand it to three channels.

    ``preserve_range=True`` keeps the blurred image on the input's own value
    scale.  Without it, integer images are converted to floats in ``[0, 1]``,
    which would not match the 0-255 scale of the other augmentations.
    """
    gaussian_img = gaussian(img, preserve_range=True)
    return addChannels(gaussian_img)

def randomGamma(img, gamma_range=(0.5, 1.5)):
    """Apply a randomly chosen gamma correction and expand to three channels.

    The previous implementation called ``exposure.adjust_gamma`` with its
    default ``gamma=1``, which leaves the image unchanged.  A gamma value is
    now drawn uniformly from ``gamma_range`` on every call.

    Args:
        img: 2-D image array with non-negative values.
        gamma_range: ``(low, high)`` bounds for the sampled gamma; both must be
            non-negative.

    Returns:
        The gamma-adjusted image stacked into three identical channels.
    """
    low, high = gamma_range
    gamma = random.uniform(low, high)
    img_gamma = exposure.adjust_gamma(img, gamma)
    return addChannels(img_gamma)

def spec_to_image(spec):
    """Convert a spectrogram into a ``(224, 400)`` ``uint8`` image.

    The spectrogram is resized, standardised (zero mean, unit variance) and
    then min-max scaled to the 0-255 range.

    A constant (or NaN-containing) spectrogram has no usable dynamic range;
    instead of dividing by zero and casting NaN to ``uint8``, an all-zero
    image is returned in that case.
    """
    eps = 1e-6
    spec = resize(spec, (224, 400))
    spec_norm = (spec - spec.mean()) / (spec.std() + eps)

    spec_min, spec_max = spec_norm.min(), spec_norm.max()
    value_range = spec_max - spec_min
    if not value_range > 0:
        # Covers both a zero range and a NaN range.
        return np.zeros(spec_norm.shape, dtype=np.uint8)

    spec_scaled = 255 * (spec_norm - spec_min) / value_range
    return spec_scaled.astype(np.uint8)

In [None]:
# Sample rate used to size the analysis window, and the window length in samples.
sr = 48000
length = 10 * sr
data = pd.read_csv("./dataset/train_tp.csv")

# Lowest / highest annotated frequency across all rows.  The starting bounds
# (24000 and 0) are kept so the result is unchanged when no row beats them.
fmin = min(24000, float(data['f_min'].astype(float).min()))
fmax = max(0, float(data['f_max'].astype(float).max()))

# Widen the band by 10% on each side and truncate to integers.
fmin = int(fmin * 0.9)
fmax = int(fmax * 1.1)

In [None]:
def _waveform_to_mono(waveform):
    """Convert a loaded waveform tensor into a 1-D NumPy array of samples."""
    samples = waveform.numpy()
    if samples.ndim > 1 and samples.shape[0] > 1:
        # Several channels: average them so callers always get a 1-D signal.
        return samples.mean(axis=0)
    return samples.reshape(-1)


def safe_load_audio(path):
    """Load an audio file as a mono waveform, with an ffmpeg fallback.

    The file is first read with ``torchaudio.load``.  If that raises, the file
    is transcoded to a temporary WAV with ``ffmpeg`` and read again.  The
    temporary file is always removed.

    Args:
        path: Path of the audio file to read.

    Returns:
        ``(samples, sample_rate)`` where ``samples`` is a 1-D NumPy array, or
        ``(None, None)`` when both loading attempts fail.  Failures are
        reported on stdout, not raised.
    """
    try:
        waveform, sr = torchaudio.load(path)
        return _waveform_to_mono(waveform), sr
    except Exception as e:
        # Deliberate best-effort: any decoding failure triggers the fallback.
        print(f"[torchaudio error] {path}: {e}")

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
        tmp_wav_path = tmpfile.name
    try:
        cmd = ["ffmpeg", "-y", "-i", path, tmp_wav_path]
        # check=True surfaces a failed conversion as the reported error instead
        # of a confusing follow-up failure when reading the empty output file.
        subprocess.run(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
        waveform, sr = torchaudio.load(tmp_wav_path)
        return _waveform_to_mono(waveform), sr
    except Exception as e2:
        print(f"[ffmpeg error] {path}: {e2}")
        return None, None
    finally:
        if os.path.exists(tmp_wav_path):
            os.remove(tmp_wav_path)

In [None]:
# Build one spectrogram image per annotation row of `data`.
#   data_list  - one unique sample key per successfully loaded row
#   label_list - species id of each sample, aligned with data_list
#   audio_data - sample key -> uint8 spectrogram image
label_list = []
data_list = []
audio_data = {}

for i, (_, row) in enumerate(tqdm(data.iterrows(), total=len(data))):
    recording_id = row['recording_id']
    species_id = int(row['species_id'])
    t_min, t_max = float(row['t_min']), float(row['t_max'])

    audio_path = f"./dataset/train/{recording_id}.flac"
    # Use a dedicated name so the module-level `sr` is not overwritten
    # (previously it could even end up as None after a failed load).
    wav, file_sr = safe_load_audio(audio_path)
    if wav is None:
        continue

    # Crop a `length`-sample window centred on the annotated call, shifted
    # back inside the recording when it would run past either end.
    center = np.round((t_min + t_max) / 2 * file_sr)
    start = max(0, center - length // 2)
    end = min(len(wav), start + length)
    start = max(0, end - length)
    audio_slice = wav[int(start):int(end)]

    spec = librosa.feature.melspectrogram(y=audio_slice, sr=file_sr, fmin=fmin, fmax=fmax)
    spec_db = librosa.power_to_db(spec, top_db=80)
    img = spec_to_image(spec_db)

    # Key by recording AND row position: if the same recording appears in more
    # than one row, keying by recording_id alone would overwrite the earlier
    # image and pair earlier labels with the wrong crop.
    sample_key = f"{recording_id}_{i}"
    data_list.append(sample_key)
    label_list.append(species_id)
    audio_data[sample_key] = img

In [None]:
class AudioData(Dataset):
    """Dataset of precomputed spectrogram images looked up in ``audio_data``.

    In ``"train"`` mode each item is passed through one augmentation picked at
    random; in any other mode the image is only expanded to three channels.
    """

    def __init__(self, X, y, data_type):
        """Collect the images for the keys in ``X`` and their labels from ``y``."""
        self.data_type = data_type
        # Candidate transforms; every one of them returns a 3-channel array.
        self.augs = [addNoisy, contrast_stretching,randomGaussian,randomGamma, vertical_flip, horizontal_flip, addChannels]
        positions = range(len(X))
        self.data = [audio_data[X[pos]] for pos in positions]
        self.labels = [y[pos] for pos in positions]

    def __len__(self):
        """Number of samples held by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``."""
        transform = random.choice(self.augs) if self.data_type == "train" else addChannels
        return transform(self.data[idx]), self.labels[idx]

In [None]:
# Optimisation settings shared by every fold.
epochs = 20                      # passes over the training loader per fold
learning_rate = 2e-4             # learning rate handed to the Adam optimizer
loss_fn = nn.CrossEntropyLoss()  # loss for the single-label classifier head

In [None]:
def train(model, loss_fn, train_loader, valid_loader, epochs, optimizer, scheduler):
    """Train ``model`` and return it with its best-validation-accuracy weights.

    Each epoch runs one pass over ``train_loader`` followed by one evaluation
    pass over ``valid_loader``.  The mean validation loss drives
    ``scheduler.step`` and the weights of the epoch with the highest validation
    accuracy are restored before returning.

    Args:
        model: Network to optimise; must already live on the global ``device``.
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar loss.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        valid_loader: Iterable of ``(inputs, targets)`` validation batches.
        epochs: Number of epochs to run.
        optimizer: Optimizer bound to ``model``'s parameters.
        scheduler: Scheduler whose ``step`` accepts the validation loss.

    Returns:
        ``model`` with the best state dict loaded.
    """
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in tqdm(range(1, epochs + 1)):
        # ---- training pass ----
        model.train()
        train_batch_losses = []
        for x, y in train_loader:
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            optimizer.zero_grad()
            loss = loss_fn(model(x), y)
            loss.backward()
            optimizer.step()
            train_batch_losses.append(loss.item())

        # ---- validation pass ----
        model.eval()
        valid_batch_losses = []
        trace_y = []
        trace_yhat = []
        # No gradients are needed for evaluation; skipping autograd bookkeeping
        # saves memory and time.
        with torch.no_grad():
            for x, y in valid_loader:
                x = x.to(device, dtype=torch.float32)
                y = y.to(device, dtype=torch.long)
                y_hat = model(x)
                valid_batch_losses.append(loss_fn(y_hat, y).item())
                trace_y.append(y.cpu().numpy())
                trace_yhat.append(y_hat.cpu().numpy())

        trace_y = np.concatenate(trace_y)
        trace_yhat = np.concatenate(trace_yhat)
        accuracy = np.mean(trace_yhat.argmax(axis=1) == trace_y)

        train_loss = np.mean(train_batch_losses)
        valid_loss = np.mean(valid_batch_losses)
        print(
            f"epoch = {epoch:d}, train_loss = {train_loss:.5f}, "
            f"val_loss = {valid_loss:.5f}, val_accuracy = {accuracy:.5f}"
        )

        scheduler.step(valid_loss)
        if accuracy > best_acc:
            best_acc = accuracy
            best_model_wts = copy.deepcopy(model.state_dict())

    model.load_state_dict(best_model_wts)
    return model

In [None]:
# K-fold cross-validation: train one ResNet-101 per fold and save its weights
# to ./model<fold_id>.pt.
nfold = 5
# NOTE(review): the name `skf` suggests StratifiedKFold was intended; KFold
# does not use the labels passed to split() - confirm which is wanted.
skf = KFold(n_splits=nfold, shuffle=True, random_state=32)

for fold_id, (train_index, val_index) in enumerate(skf.split(data_list, label_list)):
    print("Fold", fold_id)
    X_train = np.take(data_list, train_index)
    y_train = np.take(label_list, train_index, axis=0)
    X_val = np.take(data_list, val_index)
    y_val = np.take(label_list, val_index, axis=0)

    train_data = AudioData(X_train, y_train, "train")
    valid_data = AudioData(X_val, y_val, "valid")
    train_loader = DataLoader(train_data, batch_size=8, shuffle=True, drop_last=True)
    # Evaluate on every validation sample: dropping the last partial batch
    # (and shuffling, which changes what gets dropped) left some samples out
    # of the validation metrics.
    valid_loader = DataLoader(valid_data, batch_size=8, shuffle=False, drop_last=False)

    # ImageNet-pretrained backbone with a fresh num_labels-way classifier head.
    model = resnet101(pretrained=True)
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, num_labels)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)
    model = train(model, loss_fn, train_loader, valid_loader, epochs, optimizer, scheduler)
    torch.save(model.state_dict(), f"./model{fold_id}.pt")

    # Drop references to this fold's objects before the next fold is built.
    del train_data, valid_data, train_loader, valid_loader, model, X_train, X_val, y_train, y_val

Fold 0


Downloading: "https://download.pytorch.org/models/resnet101-63fe2227.pth" to /root/.cache/torch/hub/checkpoints/resnet101-63fe2227.pth
100%|██████████| 171M/171M [00:00<00:00, 284MB/s]
  5%|▌         | 1/20 [00:26<08:16, 26.11s/it]

epoch = 1, train_loss = 3.00798, val_loss = 2.56881, val_accuracy = 0.25000


 10%|█         | 2/20 [00:46<06:51, 22.85s/it]

epoch = 2, train_loss = 2.71547, val_loss = 3.01856, val_accuracy = 0.25000


 15%|█▌        | 3/20 [01:07<06:11, 21.84s/it]

epoch = 3, train_loss = 2.48854, val_loss = 3.11627, val_accuracy = 0.31667


 20%|██        | 4/20 [01:27<05:40, 21.30s/it]

epoch = 4, train_loss = 2.22845, val_loss = 2.10557, val_accuracy = 0.49167


 25%|██▌       | 5/20 [01:48<05:16, 21.12s/it]

epoch = 5, train_loss = 2.12518, val_loss = 2.12288, val_accuracy = 0.49167


 30%|███       | 6/20 [02:09<04:53, 20.93s/it]

epoch = 6, train_loss = 1.87094, val_loss = 1.75304, val_accuracy = 0.55417


 35%|███▌      | 7/20 [02:29<04:30, 20.79s/it]

epoch = 7, train_loss = 1.83867, val_loss = 1.24491, val_accuracy = 0.66667


 40%|████      | 8/20 [02:50<04:08, 20.70s/it]

epoch = 8, train_loss = 1.64922, val_loss = 1.30135, val_accuracy = 0.65417


 45%|████▌     | 9/20 [03:10<03:46, 20.63s/it]

epoch = 9, train_loss = 1.54027, val_loss = 1.23556, val_accuracy = 0.65000


 50%|█████     | 10/20 [03:31<03:26, 20.61s/it]

epoch = 10, train_loss = 1.50700, val_loss = 1.45188, val_accuracy = 0.63333


 55%|█████▌    | 11/20 [03:51<03:05, 20.62s/it]

epoch = 11, train_loss = 1.42224, val_loss = 1.33594, val_accuracy = 0.69583


 60%|██████    | 12/20 [04:12<02:44, 20.57s/it]

epoch = 12, train_loss = 1.27715, val_loss = 1.28129, val_accuracy = 0.67083


 65%|██████▌   | 13/20 [04:32<02:23, 20.54s/it]

epoch = 13, train_loss = 1.13402, val_loss = 1.18289, val_accuracy = 0.68750


 70%|███████   | 14/20 [04:53<02:03, 20.55s/it]

epoch = 14, train_loss = 1.13569, val_loss = 1.23145, val_accuracy = 0.66667


 75%|███████▌  | 15/20 [05:13<01:42, 20.49s/it]

epoch = 15, train_loss = 1.05641, val_loss = 1.32466, val_accuracy = 0.67083


 80%|████████  | 16/20 [05:34<01:21, 20.50s/it]

epoch = 16, train_loss = 1.07223, val_loss = 1.21676, val_accuracy = 0.70833


 85%|████████▌ | 17/20 [05:54<01:01, 20.51s/it]

epoch = 17, train_loss = 0.89545, val_loss = 1.17584, val_accuracy = 0.69583


 90%|█████████ | 18/20 [06:15<00:41, 20.50s/it]

epoch = 18, train_loss = 0.94487, val_loss = 1.01126, val_accuracy = 0.76250


 95%|█████████▌| 19/20 [06:35<00:20, 20.54s/it]

epoch = 19, train_loss = 0.85868, val_loss = 1.15693, val_accuracy = 0.74583


100%|██████████| 20/20 [06:56<00:00, 20.83s/it]

epoch = 20, train_loss = 0.73619, val_loss = 1.33360, val_accuracy = 0.68750





Fold 1


  5%|▌         | 1/20 [00:20<06:30, 20.57s/it]

epoch = 1, train_loss = 3.03822, val_loss = 2.62037, val_accuracy = 0.27083


 10%|█         | 2/20 [00:41<06:10, 20.60s/it]

epoch = 2, train_loss = 2.61069, val_loss = 4.34926, val_accuracy = 0.14167


 15%|█▌        | 3/20 [01:01<05:51, 20.66s/it]

epoch = 3, train_loss = 2.47784, val_loss = 2.83374, val_accuracy = 0.39583


 20%|██        | 4/20 [01:22<05:30, 20.65s/it]

epoch = 4, train_loss = 2.17087, val_loss = 1.81135, val_accuracy = 0.52083


 25%|██▌       | 5/20 [01:43<05:09, 20.62s/it]

epoch = 5, train_loss = 1.98487, val_loss = 2.06827, val_accuracy = 0.54167


 30%|███       | 6/20 [02:03<04:48, 20.60s/it]

epoch = 6, train_loss = 1.93574, val_loss = 2.32169, val_accuracy = 0.47500


 35%|███▌      | 7/20 [02:24<04:28, 20.62s/it]

epoch = 7, train_loss = 1.89814, val_loss = 1.79292, val_accuracy = 0.61250


 40%|████      | 8/20 [02:44<04:07, 20.63s/it]

epoch = 8, train_loss = 1.77715, val_loss = 1.35627, val_accuracy = 0.66250


 45%|████▌     | 9/20 [03:05<03:46, 20.63s/it]

epoch = 9, train_loss = 1.66600, val_loss = 1.63083, val_accuracy = 0.64167


 50%|█████     | 10/20 [03:26<03:26, 20.62s/it]

epoch = 10, train_loss = 1.54697, val_loss = 1.77421, val_accuracy = 0.65000


 55%|█████▌    | 11/20 [03:46<03:05, 20.63s/it]

epoch = 11, train_loss = 1.51715, val_loss = 1.94928, val_accuracy = 0.65417


 60%|██████    | 12/20 [04:07<02:45, 20.63s/it]

epoch = 12, train_loss = 1.49216, val_loss = 1.96793, val_accuracy = 0.58333


 65%|██████▌   | 13/20 [04:28<02:25, 20.74s/it]

epoch = 13, train_loss = 1.45152, val_loss = 1.35586, val_accuracy = 0.73333


 70%|███████   | 14/20 [04:49<02:04, 20.74s/it]

epoch = 14, train_loss = 1.28054, val_loss = 1.15343, val_accuracy = 0.77500


 75%|███████▌  | 15/20 [05:09<01:43, 20.69s/it]

epoch = 15, train_loss = 1.22433, val_loss = 1.25306, val_accuracy = 0.76250


 80%|████████  | 16/20 [05:30<01:22, 20.68s/it]

epoch = 16, train_loss = 1.16583, val_loss = 1.26968, val_accuracy = 0.77500


 85%|████████▌ | 17/20 [05:51<01:02, 20.69s/it]

epoch = 17, train_loss = 1.25517, val_loss = 1.17555, val_accuracy = 0.77500


 90%|█████████ | 18/20 [06:11<00:41, 20.67s/it]

epoch = 18, train_loss = 1.18120, val_loss = 1.16343, val_accuracy = 0.76667


 95%|█████████▌| 19/20 [06:32<00:20, 20.67s/it]

epoch = 19, train_loss = 1.25095, val_loss = 1.23244, val_accuracy = 0.78750


100%|██████████| 20/20 [06:53<00:00, 20.66s/it]

epoch = 20, train_loss = 1.18897, val_loss = 1.18092, val_accuracy = 0.77917





Fold 2


  5%|▌         | 1/20 [00:20<06:32, 20.64s/it]

epoch = 1, train_loss = 3.02771, val_loss = 2.94646, val_accuracy = 0.27917


 10%|█         | 2/20 [00:41<06:13, 20.75s/it]

epoch = 2, train_loss = 2.69436, val_loss = 2.51912, val_accuracy = 0.32917


 15%|█▌        | 3/20 [01:02<05:52, 20.76s/it]

epoch = 3, train_loss = 2.46153, val_loss = 2.00526, val_accuracy = 0.46250


 20%|██        | 4/20 [01:22<05:31, 20.75s/it]

epoch = 4, train_loss = 2.27563, val_loss = 2.87541, val_accuracy = 0.41250


 25%|██▌       | 5/20 [01:43<05:11, 20.75s/it]

epoch = 5, train_loss = 2.14671, val_loss = 1.64427, val_accuracy = 0.57083


 30%|███       | 6/20 [02:04<04:50, 20.73s/it]

epoch = 6, train_loss = 1.95475, val_loss = 1.50898, val_accuracy = 0.59583


 35%|███▌      | 7/20 [02:25<04:29, 20.73s/it]

epoch = 7, train_loss = 1.82259, val_loss = 2.00463, val_accuracy = 0.52917


 40%|████      | 8/20 [02:45<04:09, 20.76s/it]

epoch = 8, train_loss = 1.82369, val_loss = 1.27398, val_accuracy = 0.70417


 45%|████▌     | 9/20 [03:06<03:48, 20.76s/it]

epoch = 9, train_loss = 1.77711, val_loss = 10.86437, val_accuracy = 0.30833


 50%|█████     | 10/20 [03:27<03:27, 20.76s/it]

epoch = 10, train_loss = 1.72009, val_loss = 1.55019, val_accuracy = 0.64167


 55%|█████▌    | 11/20 [03:48<03:06, 20.75s/it]

epoch = 11, train_loss = 1.56112, val_loss = 1.49614, val_accuracy = 0.66250


 60%|██████    | 12/20 [04:08<02:45, 20.72s/it]

epoch = 12, train_loss = 1.58813, val_loss = 1.40656, val_accuracy = 0.64167


 65%|██████▌   | 13/20 [04:29<02:25, 20.75s/it]

epoch = 13, train_loss = 1.44912, val_loss = 0.97748, val_accuracy = 0.77083


 70%|███████   | 14/20 [04:50<02:04, 20.74s/it]

epoch = 14, train_loss = 1.31713, val_loss = 1.00303, val_accuracy = 0.76250


 75%|███████▌  | 15/20 [05:11<01:43, 20.74s/it]

epoch = 15, train_loss = 1.30521, val_loss = 0.98059, val_accuracy = 0.78750


 80%|████████  | 16/20 [05:31<01:22, 20.73s/it]

epoch = 16, train_loss = 1.23111, val_loss = 0.97149, val_accuracy = 0.76667


 85%|████████▌ | 17/20 [05:52<01:02, 20.74s/it]

epoch = 17, train_loss = 1.28593, val_loss = 1.01176, val_accuracy = 0.75833


 90%|█████████ | 18/20 [06:13<00:41, 20.70s/it]

epoch = 18, train_loss = 1.24422, val_loss = 0.94864, val_accuracy = 0.77917


 95%|█████████▌| 19/20 [06:33<00:20, 20.68s/it]

epoch = 19, train_loss = 1.16258, val_loss = 0.95239, val_accuracy = 0.78750


100%|██████████| 20/20 [06:54<00:00, 20.73s/it]

epoch = 20, train_loss = 1.18772, val_loss = 1.00408, val_accuracy = 0.79583





Fold 3


  5%|▌         | 1/20 [00:20<06:34, 20.77s/it]

epoch = 1, train_loss = 3.07657, val_loss = 2.91900, val_accuracy = 0.22917


 10%|█         | 2/20 [00:41<06:13, 20.73s/it]

epoch = 2, train_loss = 2.80015, val_loss = 2.30610, val_accuracy = 0.27083


 15%|█▌        | 3/20 [01:02<05:51, 20.69s/it]

epoch = 3, train_loss = 2.56895, val_loss = 1.79790, val_accuracy = 0.48750


 20%|██        | 4/20 [01:22<05:29, 20.62s/it]

epoch = 4, train_loss = 2.37778, val_loss = 1.79673, val_accuracy = 0.53333


 25%|██▌       | 5/20 [01:43<05:09, 20.61s/it]

epoch = 5, train_loss = 2.17384, val_loss = 1.54803, val_accuracy = 0.55833


 30%|███       | 6/20 [02:03<04:48, 20.60s/it]

epoch = 6, train_loss = 2.00449, val_loss = 1.55582, val_accuracy = 0.60000


 35%|███▌      | 7/20 [02:24<04:28, 20.66s/it]

epoch = 7, train_loss = 1.92951, val_loss = 1.28939, val_accuracy = 0.70833


 40%|████      | 8/20 [02:45<04:08, 20.68s/it]

epoch = 8, train_loss = 1.83323, val_loss = 1.92612, val_accuracy = 0.60417


 45%|████▌     | 9/20 [03:06<03:47, 20.70s/it]

epoch = 9, train_loss = 1.84415, val_loss = 1.15501, val_accuracy = 0.72083


 50%|█████     | 10/20 [03:26<03:27, 20.74s/it]

epoch = 10, train_loss = 1.76207, val_loss = 1.38075, val_accuracy = 0.73333


 55%|█████▌    | 11/20 [03:47<03:06, 20.74s/it]

epoch = 11, train_loss = 1.73578, val_loss = 1.91837, val_accuracy = 0.60000


 60%|██████    | 12/20 [04:08<02:45, 20.73s/it]

epoch = 12, train_loss = 1.66350, val_loss = 1.43817, val_accuracy = 0.68750


 65%|██████▌   | 13/20 [04:29<02:25, 20.75s/it]

epoch = 13, train_loss = 1.63335, val_loss = 1.34132, val_accuracy = 0.70417


 70%|███████   | 14/20 [04:49<02:04, 20.77s/it]

epoch = 14, train_loss = 1.35298, val_loss = 1.13178, val_accuracy = 0.75000


 75%|███████▌  | 15/20 [05:10<01:44, 20.82s/it]

epoch = 15, train_loss = 1.31360, val_loss = 1.08978, val_accuracy = 0.75417


 80%|████████  | 16/20 [05:31<01:23, 20.80s/it]

epoch = 16, train_loss = 1.23376, val_loss = 1.12523, val_accuracy = 0.73750


 85%|████████▌ | 17/20 [05:52<01:02, 20.78s/it]

epoch = 17, train_loss = 1.16944, val_loss = 1.06830, val_accuracy = 0.75417


 90%|█████████ | 18/20 [06:13<00:41, 20.76s/it]

epoch = 18, train_loss = 1.23203, val_loss = 0.98734, val_accuracy = 0.77083


 95%|█████████▌| 19/20 [06:33<00:20, 20.81s/it]

epoch = 19, train_loss = 1.30541, val_loss = 1.03949, val_accuracy = 0.77083


100%|██████████| 20/20 [06:54<00:00, 20.73s/it]

epoch = 20, train_loss = 1.12691, val_loss = 0.99874, val_accuracy = 0.77917





Fold 4


  5%|▌         | 1/20 [00:20<06:31, 20.58s/it]

epoch = 1, train_loss = 3.01601, val_loss = 6.40297, val_accuracy = 0.18333


 10%|█         | 2/20 [00:41<06:11, 20.66s/it]

epoch = 2, train_loss = 2.66518, val_loss = 2.31184, val_accuracy = 0.42083


 15%|█▌        | 3/20 [01:01<05:50, 20.63s/it]

epoch = 3, train_loss = 2.43312, val_loss = 2.21704, val_accuracy = 0.42917


 20%|██        | 4/20 [01:22<05:29, 20.58s/it]

epoch = 4, train_loss = 2.30936, val_loss = 1.68958, val_accuracy = 0.50417


 25%|██▌       | 5/20 [01:43<05:08, 20.60s/it]

epoch = 5, train_loss = 2.11652, val_loss = 1.87662, val_accuracy = 0.53750


 30%|███       | 6/20 [02:03<04:48, 20.60s/it]

epoch = 6, train_loss = 2.07532, val_loss = 2.18421, val_accuracy = 0.53333


 35%|███▌      | 7/20 [02:24<04:27, 20.58s/it]

epoch = 7, train_loss = 1.85438, val_loss = 1.62811, val_accuracy = 0.66667


 40%|████      | 8/20 [02:44<04:07, 20.65s/it]

epoch = 8, train_loss = 1.84862, val_loss = 1.50382, val_accuracy = 0.63750


 45%|████▌     | 9/20 [03:05<03:46, 20.62s/it]

epoch = 9, train_loss = 1.79377, val_loss = 1.57519, val_accuracy = 0.61250


 50%|█████     | 10/20 [03:25<03:25, 20.58s/it]

epoch = 10, train_loss = 1.66796, val_loss = 1.41090, val_accuracy = 0.66667


 55%|█████▌    | 11/20 [03:46<03:05, 20.60s/it]

epoch = 11, train_loss = 1.66882, val_loss = 1.45409, val_accuracy = 0.71667


 60%|██████    | 12/20 [04:07<02:44, 20.57s/it]

epoch = 12, train_loss = 1.59568, val_loss = 1.22810, val_accuracy = 0.72500


 65%|██████▌   | 13/20 [04:27<02:23, 20.56s/it]

epoch = 13, train_loss = 1.46231, val_loss = 1.54104, val_accuracy = 0.70417


 70%|███████   | 14/20 [04:48<02:03, 20.58s/it]

epoch = 14, train_loss = 1.49215, val_loss = 1.28003, val_accuracy = 0.75000


 75%|███████▌  | 15/20 [05:08<01:42, 20.56s/it]

epoch = 15, train_loss = 1.40442, val_loss = 1.38032, val_accuracy = 0.66667


 80%|████████  | 16/20 [05:29<01:22, 20.56s/it]

epoch = 16, train_loss = 1.45534, val_loss = 1.31229, val_accuracy = 0.73333


 85%|████████▌ | 17/20 [05:50<01:01, 20.59s/it]

epoch = 17, train_loss = 1.32022, val_loss = 1.07613, val_accuracy = 0.75833


 90%|█████████ | 18/20 [06:10<00:41, 20.58s/it]

epoch = 18, train_loss = 1.21471, val_loss = 1.13709, val_accuracy = 0.78750


 95%|█████████▌| 19/20 [06:31<00:20, 20.59s/it]

epoch = 19, train_loss = 1.18850, val_loss = 1.10911, val_accuracy = 0.78750


100%|██████████| 20/20 [06:51<00:00, 20.59s/it]

epoch = 20, train_loss = 1.24220, val_loss = 1.08144, val_accuracy = 0.78750





In [None]:
def load_test_file(f):
    """Load one test recording and turn it into spectrogram images.

    The recording is read from ``./dataset/test/`` with ``safe_load_audio``
    (the same loader used for the training data) and cut into consecutive
    windows of ``length`` samples.  A final partial window is replaced by the
    last ``length`` samples of the recording so every window is full whenever
    the recording is long enough.

    Args:
        f: File name inside ``./dataset/test/``.

    Returns:
        A list with one ``(3, 224, 400)`` ``uint8`` array per window.

    Raises:
        RuntimeError: If the file cannot be decoded.
    """
    path = './dataset/test/' + f
    # torchaudio.load() has no `sr` keyword, so the previous direct call
    # failed; the shared loader also provides the ffmpeg fallback.
    wav, sr = safe_load_audio(path)
    if wav is None:
        raise RuntimeError(f"could not load test audio: {path}")

    wav = np.asarray(wav)
    if wav.ndim > 1:
        # Downmix so len(wav) counts samples rather than channels.
        wav = wav.mean(axis=0)

    total = len(wav)
    segments = int(np.ceil(total / length))

    mel_array = []
    for i in range(segments):
        end = min((i + 1) * length, total)
        start = max(0, end - length)
        chunk = wav[start:end]

        spec = librosa.feature.melspectrogram(y=chunk, sr=sr, fmin=fmin, fmax=fmax)
        spec_db = librosa.power_to_db(spec, top_db=80)

        img = spec_to_image(spec_db)
        mel_array.append(np.stack((img, img, img)))

    return mel_array

In [None]:
# Rebuild one network per fold and load its saved weights to form the ensemble.
members = []
for i in range(nfold):
    # No pretrained weights are requested: every parameter is overwritten by
    # the checkpoint below, so downloading them would be wasted work.
    model = resnet101()
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, num_labels)
    # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
    model.load_state_dict(torch.load('./model' + str(i) + '.pt', map_location=device))
    model = model.to(device)
    model.eval()
    members.append(model)

In [None]:
# Delete the per-fold checkpoints now that the ensemble is in memory.  The
# count follows `nfold`, matching the loops that saved and loaded the files.
for fold_id in range(nfold):
    os.remove(f'./model{fold_id}.pt')

In [None]:

# Write submission.csv: one row per test recording holding, for every class,
# the per-window maximum logit averaged over the ensemble members.
print('Starting prediction loop')
with open('submission.csv', 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    # Header: recording_id followed by s0 .. s<num_labels - 1>.
    submission_writer.writerow(['recording_id'] + [f's{k}' for k in range(num_labels)])

    # Same directory load_test_file() reads from.
    test_files = os.listdir('./dataset/test/')
    print(len(test_files))

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for i, test_file in enumerate(test_files):
            segments = load_test_file(test_file)
            # Stack in NumPy first; building a tensor from a list of arrays is slow.
            batch = torch.from_numpy(np.stack(segments)).float().to(device)

            output_list = []
            for m in members:
                output = m(batch)
                # Strongest response over the recording's windows, per class.
                maxed_output = torch.max(output, dim=0)[0]
                output_list.append(maxed_output.cpu())
            avg_maxed_output = torch.mean(torch.stack(output_list), dim=0)

            file_id = test_file.split('.')[0]
            write_array = [file_id]
            write_array.extend(out.item() for out in avg_maxed_output)
            submission_writer.writerow(write_array)

            if i % 100 == 0 and i > 0:
                print('Predicted for ' + str(i) + ' of ' + str(len(test_files)) + ' files')

print('Submission generated')

Starting prediction loop
1992
Predicted for 100 of 1993 files
Predicted for 200 of 1993 files
Predicted for 300 of 1993 files
Predicted for 400 of 1993 files
Predicted for 500 of 1993 files
Predicted for 600 of 1993 files
Predicted for 700 of 1993 files
Predicted for 800 of 1993 files
Predicted for 900 of 1993 files
Predicted for 1000 of 1993 files
Predicted for 1100 of 1993 files
Predicted for 1200 of 1993 files
Predicted for 1300 of 1993 files
Predicted for 1400 of 1993 files
Predicted for 1500 of 1993 files
Predicted for 1600 of 1993 files
Predicted for 1700 of 1993 files
Predicted for 1800 of 1993 files
Predicted for 1900 of 1993 files
Submission generated
