In [1]:
# Добавляем необходимые модули

import os
import gc
import random
import librosa as lb
import librosa.display as lbd
import soundfile as sf
from soundfile import SoundFile 
import numpy as np 
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
import torch
import torch.nn as nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import timm

In [2]:
# Parameters

# Optimisation settings
epochs = 10            # also shown in the progress-bar captions of train()/valid()
batch_size = 32
learning_rate = 1e-3
num_fold = 5           # not referenced in the visible cells

# Size of the classifier head built in Model
num_classes = 264

# Audio / spectrogram settings.
# NOTE(review): none of the four values below is read by the visible cells;
# ClefDataset and to_melspec rely on their own defaults instead.
max_time = 5
n_mels = 224
n_fft = 1024
hop_length = 512

In [3]:
# Helper functions for converting audio into a spectrogram image

def to_rgb(X, eps=1e-6, mean=None, std=None):
    """Standardise an array and rescale it to the 0-255 ``uint8`` range.

    Args:
        X: NumPy array to convert.
        eps: Small constant added to ``std`` before dividing; also the minimum
            value range required for the rescaling step.
        mean: Value subtracted from ``X``; scalar or array broadcastable
            against ``X``. Defaults to ``X.mean()``.
        std: Value ``X`` is divided by (after adding ``eps``); scalar or array
            broadcastable against ``X``. Defaults to ``X.std()``.

    Returns:
        A ``uint8`` array. Values are min-max scaled to 0..255 (truncated, not
        rounded); an all-zero array is returned when the standardised range
        does not exceed ``eps``.
    """
    # Compare against None explicitly: the previous `mean or X.mean()` form
    # silently replaced an explicit 0 and raised ValueError for array-valued
    # statistics (ambiguous truth value).
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()

    X = (X - mean) / (std + eps)
    _min, _max = X.min(), X.max()
    if (_max - _min) > eps:
        # Clipping to [_min, _max] would be a no-op here, so scale directly.
        V = 255 * (X - _min) / (_max - _min)
        return V.astype(np.uint8)
    # Constant input: there is no range to stretch.
    return np.zeros_like(X, dtype=np.uint8)

def to_melspec(y, sr, n_mels, fmin, fmax):
    """Return the mel spectrogram of ``y`` passed through ``power_to_db``, as float32.

    Args:
        y: Audio samples handed to ``librosa.feature.melspectrogram``.
        sr: Sampling rate of ``y``.
        n_mels: Number of mel bands.
        fmin: Lowest frequency of the mel filter bank.
        fmax: Highest frequency of the mel filter bank.

    Returns:
        ``float32`` array produced by ``librosa.power_to_db``.
    """
    mel_kwargs = dict(y=y, sr=sr, n_mels=n_mels, fmin=fmin, fmax=fmax)
    power_spec = lb.feature.melspectrogram(**mel_kwargs)
    return lb.power_to_db(power_spec).astype(np.float32)

In [4]:
class ClefDataset(Dataset):
    """Dataset that reads audio files and returns spectrogram images with labels.

    Each item is looked up in ``data`` by index label. The ``path`` column
    gives the audio file; every column from position 3 onward is returned as
    the label vector.

    Args:
        data: DataFrame with a ``path`` column; label columns start at
            position 3.
        sr: Target sampling rate.
        n_mels: Number of mel bands passed to ``to_melspec``.
        fmin: Lowest mel frequency.
        fmax: Highest mel frequency; defaults to ``sr // 2``.
        duration: Window length; multiplied by ``sr`` to get samples.
        step: Hop between validation windows in samples; defaults to one
            window length.
        res_type: Resampling method forwarded to ``librosa.resample``.
        resample: Whether to resample files whose rate differs from ``sr``.
        valid: If True, each file is cut into windows and ``path`` values are
            used as given; otherwise the whole file becomes one image and a
            directory prefix is prepended to ``path``.
        transform: Stored on the instance; not applied by this class.
    """

    # Image size produced by `normalize`: at most IMAGE_HEIGHT rows and exactly
    # IMAGE_WIDTH columns.
    IMAGE_HEIGHT = 128
    IMAGE_WIDTH = 256

    def __init__(self, data, sr=32000, n_mels=128, fmin=0, fmax=None, duration=5,
                 step=None, res_type="kaiser_fast", resample=True, valid=False, transform=None):
        self.data = data
        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax or self.sr // 2
        self.transform = transform
        self.duration = duration
        self.audio_length = self.duration*self.sr
        self.step = step or self.audio_length
        self.valid = valid
        # NOTE(review): this prefix is relative and differs from the
        # '/kaggle/input/birdclef-2023/' root used elsewhere in the notebook;
        # confirm it points at the training audio before enabling training.
        self.path = '' if valid else 'kaggle/input/train_audio/'
        self.res_type = res_type
        self.resample = resample

    def normalize(self, image):
        """Scale a uint8 image to [0, 1], fix its size and repeat it over 3 channels.

        The image is cropped to at most ``IMAGE_HEIGHT`` rows and
        ``IMAGE_WIDTH`` columns, then zero-padded on the right up to
        ``IMAGE_WIDTH`` columns.

        Returns:
            ``float32`` array of shape ``(3, rows, IMAGE_WIDTH)``.
        """
        image = image.astype("float32", copy=False) / 255.0
        image = image[:self.IMAGE_HEIGHT, :self.IMAGE_WIDTH]
        missing = self.IMAGE_WIDTH - image.shape[1]
        if missing > 0:
            # Pad using the image's own row count; a hard-coded 128 rows broke
            # every image whose height was not exactly 128.
            image = np.pad(image, ((0, 0), (0, missing)))
        return np.stack([image, image, image], axis=0)

    def audio_to_image(self, audio):
        """Convert raw audio samples into a normalised 3-channel spectrogram image."""
        melspec = to_melspec(audio, self.sr, self.n_mels, self.fmin, self.fmax)
        image = to_rgb(melspec)
        image = self.normalize(image)
        return image

    def _split_windows(self, audio):
        """Cut 1-D ``audio`` into full-length windows spaced ``step`` samples apart.

        Audio shorter than one window is zero-padded to exactly one window so
        that at least one window is always returned; trailing partial windows
        are dropped.
        """
        if len(audio) < self.audio_length:
            audio = np.pad(audio, (0, self.audio_length - len(audio)))
        starts = range(0, len(audio) - self.audio_length + 1, self.step)
        return [audio[start:start + self.audio_length] for start in starts]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Load one file and return ``(images, labels)``.

        ``images`` is a stack of per-window images when ``valid`` is True,
        otherwise a single image built from the whole file. ``labels`` is a
        float32 tensor of the row's columns from position 3 onward.
        """
        row = self.data.loc[idx]
        filepath = self.path + str(row['path'])
        audio, orig_sr = sf.read(filepath, dtype="float32")
        if audio.ndim > 1:
            # soundfile returns (frames, channels) for multichannel files;
            # mix down so the resampling and windowing below see 1-D audio.
            audio = audio.mean(axis=1)
        if self.resample and orig_sr != self.sr:
            # Keyword arguments work across librosa versions; the positional
            # form is rejected by current releases.
            audio = lb.resample(audio, orig_sr=orig_sr, target_sr=self.sr,
                                res_type=self.res_type)
        if self.valid:
            windows = self._split_windows(audio)
            images = np.stack([self.audio_to_image(window) for window in windows])
        else:
            images = self.audio_to_image(audio)
        # Convert explicitly: the row is a mixed-dtype Series, which
        # torch.tensor cannot be relied on to convert directly.
        labels = torch.tensor(row.iloc[3:].to_numpy(dtype=np.float32))
        return (images, labels)

In [5]:
# Build the label table, split it, and wrap both parts in datasets/loaders.

# Fixed seed so the train/validation split is identical on every run.
SPLIT_SEED = 42

# NOTE(review): ClefDataset as defined in this notebook stores `transform`
# but never applies it, so this pipeline currently has no effect.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Resize((120, 224))])

metadata = pd.read_csv('/kaggle/input/birdclef-2023/train_metadata.csv')

# Keep only the columns the dataset needs; ClefDataset reads the file name
# from a column called 'path' and treats columns from position 3 on as labels.
base_columns = (
    metadata[['primary_label', 'type', 'filename']]
    .rename(columns={'filename': 'path'})
)
label_dummies = pd.get_dummies(base_columns['primary_label'])

birds = list(label_dummies.columns)
filenames = base_columns.path.values.tolist()
data = pd.concat([base_columns, label_dummies], axis=1)

train_data, valid_data = train_test_split(
    data, train_size=0.7, shuffle=True, random_state=SPLIT_SEED)
# ClefDataset indexes rows with .loc[idx], so both parts need a 0..n-1 index.
train_data = train_data.reset_index(drop=True)
valid_data = valid_data.reset_index(drop=True)

train_dataset = ClefDataset(train_data, transform=transform)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

valid_dataset = ClefDataset(valid_data)
# Evaluation does not depend on sample order, so keep it deterministic.
valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=batch_size)

In [6]:
class Model(nn.Module):
    """timm ``tf_efficientnet_b1_ns`` backbone with a ``num_classes``-way linear head."""

    def __init__(self):
        super().__init__()
        backbone = timm.create_model("tf_efficientnet_b1_ns", pretrained=False)
        # Width of the features feeding the stock classifier layer.
        self.in_features = backbone.classifier.in_features
        # Swap the stock head for one sized to this task. The Sequential
        # wrapper determines the parameter names stored in checkpoints.
        head = nn.Linear(self.in_features, num_classes)
        backbone.classifier = nn.Sequential(head)
        self.model = backbone

    def forward(self, images):
        """Return the backbone's output for a batch of images."""
        return self.model(images)

In [7]:
# Default metric from Kaggle
def padded_cmap(solution, submission, padding_factor=5):
    """Macro-averaged average precision after appending all-ones padding rows.

    ``padding_factor`` rows filled with ones are appended to both frames
    before scoring, so every column of the padded solution contains at least
    one positive.

    Args:
        solution: DataFrame of ground-truth labels; a ``row_id`` column, if
            present, is dropped.
        submission: DataFrame of predicted scores; a ``row_id`` column, if
            present, is dropped. Its columns are expected to match those of
            ``solution``.
        padding_factor: Number of all-ones rows to append.

    Returns:
        The value returned by ``average_precision_score`` with
        ``average='macro'``.
    """
    solution = solution.drop(['row_id'], axis=1, errors='ignore')
    submission = submission.drop(['row_id'], axis=1, errors='ignore')

    padding_rows = pd.DataFrame(
        [[1] * len(solution.columns) for _ in range(padding_factor)],
        columns=solution.columns,
    )

    padded_solution = pd.concat([solution, padding_rows], ignore_index=True)
    padded_submission = pd.concat([submission, padding_rows], ignore_index=True)

    # Call the function imported at the top of the notebook; the `sklearn`
    # package name itself is never imported, so `sklearn.metrics...` raised
    # NameError.
    score = average_precision_score(
        padded_solution.values,
        padded_submission.values,
        average='macro',
    )
    return score

In [8]:
def loss_fn(outputs, labels):
    """Cross-entropy between ``outputs`` and ``labels`` with PyTorch's default settings."""
    return F.cross_entropy(outputs, labels)

def train(model, data_loader, optimizer, epoch):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Network to optimise; switched to training mode.
        data_loader: Iterable yielding ``(mels, labels)`` batches.
        optimizer: Optimiser stepping the model's parameters.
        epoch: Zero-based epoch index, used only for the progress-bar caption.

    Returns:
        Sum of batch losses divided by the number of batches (0.0 for an
        empty loader).
    """
    model.train()
    running_loss = 0.0
    loop = tqdm(data_loader, position=0)
    for mels, labels in loop:
        # Clear gradients first so nothing left over from earlier calls leaks
        # into this step.
        optimizer.zero_grad()

        outputs = model(mels)
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss

        loop.set_description(f"Epoch [{epoch+1}/{epochs}]")
        loop.set_postfix(loss=batch_loss)

    # The per-batch gc.collect() was dropped: batch tensors are released by
    # reference counting when the loop variables are rebound, and a full
    # collection on every step only slows training down.
    return running_loss / max(len(data_loader), 1)

def valid(model, data_loader, epoch):
    """Evaluate the model on one pass over ``data_loader``.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        data_loader: Iterable yielding ``(mels, labels)`` batches.
        epoch: Zero-based epoch index, used only for the progress-bar caption.

    Returns:
        Tuple ``(mean per-batch loss, padded cMAP score)``; the score is
        computed by ``padded_cmap`` on the labels and the sigmoid of the
        model outputs.
    """
    model.eval()

    running_loss = 0.0
    batch_labels = []
    batch_preds = []

    loop = tqdm(data_loader, position=0)
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # saves memory and time.
    with torch.no_grad():
        for mels, labels in loop:
            outputs = model(mels)
            loss = loss_fn(outputs, labels)

            batch_loss = loss.item()
            running_loss += batch_loss

            loop.set_description(f"Epoch [{epoch+1}/{epochs}]")
            loop.set_postfix(loss=batch_loss)

            batch_labels.append(labels.cpu())
            batch_preds.append(outputs.sigmoid().cpu())

    # Convert to NumPy before building DataFrames so pandas sees plain
    # numeric arrays rather than tensor objects.
    label_df = pd.DataFrame(torch.cat(batch_labels, dim=0).numpy())
    pred_df = pd.DataFrame(torch.cat(batch_preds, dim=0).numpy())
    current_score = padded_cmap(label_df, pred_df)

    return running_loss/len(data_loader), current_score

In [9]:
# Instantiate the network and an Adam optimiser over all of its parameters.
model = Model()
optimizer = Adam(params=model.parameters(), lr=learning_rate)

  model = create_fn(


In [10]:
# Training is switched off for this run; set RUN_TRAINING = True to train
# the model and checkpoint the best epoch.
RUN_TRAINING = False

if RUN_TRAINING:
    best_valid_padded_map = 0

    for epoch in range(epochs):
        train_loss = train(model, train_dataloader, optimizer, epoch)
        valid_loss, valid_padded_map = valid(model, valid_dataloader, epoch)

        print(f'mAP: {valid_padded_map}, loss: {valid_loss}')
        # Keep only the weights of the best-scoring epoch.
        if valid_padded_map > best_valid_padded_map:
            torch.save(model.state_dict(), f'./ckp.bin')
            best_valid_padded_map = valid_padded_map

    print(f'End of training. Best score: {best_valid_padded_map}')


input_file_path = '/kaggle/input/birdclef-2023/test_soundscapes/'
# os.listdir returns entries in arbitrary order; sort for reproducible output.
input_file_name = sorted(os.listdir(input_file_path))

# ClefDataset reads the 'path' column and treats columns from position 3 on
# as labels, so the first two columns are placeholders that keep 'path' in
# the first three positions and leave the label vector empty.
placeholder_ids = list(range(len(input_file_name)))
test_frame = pd.DataFrame({
    'primary_label': placeholder_ids,
    'type': placeholder_ids,
    'path': [input_file_path + name for name in input_file_name],
})

test_data = ClefDataset(test_frame, valid=True)

In [11]:
# Load the trained weights and predict every window of every test file.
model = Model()
state_dict = torch.load("/kaggle/input/birdclef-weights/best.pth",
                        map_location='cpu')
# NOTE(review): strict=False skips missing/unexpected keys without raising,
# so a mismatched checkpoint would leave layers at their random
# initialisation — confirm the checkpoint matches this architecture.
model.load_state_dict(state_dict, strict=False)
# A freshly built module is in training mode; switch to evaluation mode so
# layers such as batch norm and dropout behave deterministically.
model.eval()

predictions = []
with torch.no_grad():
    for en in range(len(test_data)):
        # Item 0 of each sample is the stack of window images for one file.
        images = torch.from_numpy(test_data[en][0])
        outputs = model(images).sigmoid().cpu().numpy()
        predictions.append(outputs)

  model = create_fn(


In [12]:
# Assemble one submission row per prediction window and write the CSV.
submission_columns = ['row_id'] + birds
# Seconds covered by each prediction row; equals the default `duration` of
# ClefDataset, which the test dataset was built with.
WINDOW_SECONDS = 5

assert len(predictions) == len(input_file_name), \
    "expected exactly one prediction array per test file"

file_frames = []
for file_name, pred in zip(input_file_name, predictions):
    # Strip the extension whatever its length instead of cutting 4 characters.
    stem = os.path.splitext(file_name)[0]
    # Each row is identified by the end time of its window.
    row_ids = [f'{stem}_{(k+1)*WINDOW_SECONDS}' for k in range(len(pred))]
    df = pd.DataFrame(pred, columns=birds)
    df.insert(0, 'row_id', row_ids)
    file_frames.append(df)

# Concatenate once; growing a DataFrame inside the loop copies every
# previous row on each iteration.
if file_frames:
    submission = pd.concat(file_frames, ignore_index=True)
else:
    submission = pd.DataFrame(columns=submission_columns)
submission.to_csv('submission.csv', index=False)