In [1]:
!pip install /kaggle/input/onnxruntimegpu/humanfriendly-10.0-py2.py3-none-any.whl
!pip install /kaggle/input/onnxruntimegpu/coloredlogs-15.0.1-py2.py3-none-any.whl
!pip install /kaggle/input/onnxruntimegpu/onnxruntime_gpu-1.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
import os
import sys
import cv2
import glob
from PIL import Image
import  matplotlib.pyplot as plt

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Utils
from tqdm.auto import tqdm
import librosa

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

!pip uninstall timm -y
sys.path.append('/kaggle/input/pytorch-image-models')
import timm

Processing /kaggle/input/onnxruntimegpu/humanfriendly-10.0-py2.py3-none-any.whl
Installing collected packages: humanfriendly
Successfully installed humanfriendly-10.0
[0mProcessing /kaggle/input/onnxruntimegpu/coloredlogs-15.0.1-py2.py3-none-any.whl
Installing collected packages: coloredlogs
Successfully installed coloredlogs-15.0.1
[0mProcessing /kaggle/input/onnxruntimegpu/onnxruntime_gpu-1.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: onnxruntime-gpu
Successfully installed onnxruntime-gpu-1.13.1
[0mFound existing installation: timm 0.6.12
Uninstalling timm-0.6.12:
  Successfully uninstalled timm-0.6.12
[0m

In [2]:
def get_test_transform(img_size):
    """Build the test-time albumentations pipeline for one target size.

    The pipeline rescales the image so that its shorter side equals
    ``img_size`` and then converts it to a tensor with ``ToTensorV2``.

    Args:
        img_size: target length of the shorter image side, in pixels.

    Returns:
        An ``A.Compose`` callable; invoke it as ``pipeline(image=arr)["image"]``.
    """
    # cv2.INTER_AREA is the OpenCV interpolation flag with value 3.
    resize = A.SmallestMaxSize(max_size=img_size, interpolation=cv2.INTER_AREA, p=1)
    to_tensor = ToTensorV2(p=1.0)
    return A.Compose([resize, to_tensor])

class Customize_Dataset(Dataset):
    """Dataset that yields each spectrogram image at two resolutions.

    Args:
        df: DataFrame with an ``image_path`` column of image file paths.
        transforms: factory called as ``transforms(img_size=...)`` that returns
            an albumentations-style pipeline (called with ``image=...`` and
            indexed with ``["image"]``).
    """

    def __init__(self, df, transforms):
        self.df = df
        self.image_path = df['image_path'].values
        self.transforms = transforms

    def _scaled_tensor(self, img, img_size):
        """Apply the transform for ``img_size`` and scale values by 1/255 as float32."""
        tensor = self.transforms(img_size=img_size)(image=img)["image"]
        # Convert the transform output directly instead of wrapping it in
        # torch.tensor(...), which copy-constructs from a tensor and warns.
        return torch.as_tensor(tensor).to(torch.float32) / 255

    def __getitem__(self, index):
        """Return ``{'image_128': ..., 'image_192': ...}`` for the image at ``index``.

        Raises:
            FileNotFoundError: if OpenCV cannot read the file.
        """
        path = self.image_path[index]

        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError(f"cv2.imread could not read image: {path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        return {
            'image_128': self._scaled_tensor(img, 128),
            'image_192': self._scaled_tensor(img, 192),
        }

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return len(self.df)

## Make PNG

In [3]:
%%time
from multiprocessing.pool import ThreadPool as Pool

def _mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Standardize a 2-D array and rescale it to uint8 in [0, 255].

    Args:
        X: numeric array (here one dB-scaled mel spectrogram).
        eps: guard against division by zero and threshold for a "flat" input.
        mean, std: optional statistics; computed from ``X`` when ``None``.

    Returns:
        uint8 array with the shape of ``X``; all zeros when the standardized
        range does not exceed ``eps``.
    """
    # `is None` (not `or`) so an explicit 0 is honoured.
    mean = X.mean() if mean is None else mean
    std = X.std() if std is None else std
    X = (X - mean) / (std + eps)

    lo, hi = X.min(), X.max()
    if (hi - lo) > eps:
        return (255 * (X - lo) / (hi - lo)).astype(np.uint8)
    return np.zeros_like(X, dtype=np.uint8)


def _audio_to_img(path, period=5, sr=None, n_mels=128, fmin=20, fmax=16000):
    """Load an audio file and return one uint8 mel-spectrogram image per chunk.

    The signal is cut into ``period``-second chunks that start at multiples of
    ``period`` below the number of whole seconds in the file; the last chunk is
    zero-padded to full length.

    Returns:
        Array whose first axis indexes the chunks; it has zero chunks when the
        clip is shorter than one second.
    """
    data, sr = librosa.load(path, sr=sr)
    whole_seconds = int(len(data) // sr)
    chunk_len = sr * period

    ## split audio by period
    chunks = [data[start * sr: (start + period) * sr]
              for start in range(0, whole_seconds, period)]
    if not chunks:
        # Clip shorter than one second: nothing to convert.
        return np.zeros((0, n_mels, 0), dtype=np.uint8)

    missing = chunk_len - len(chunks[-1])
    if missing > 0:
        # np.pad keeps the sample dtype and avoids a Python-list round trip.
        chunks[-1] = np.pad(chunks[-1], (0, missing))

    ## audio to melspectrogram (all chunks in one batched call)
    melspec = librosa.feature.melspectrogram(y=np.stack(chunks),
                                             sr=sr,
                                             n_mels=n_mels,
                                             fmin=fmin,
                                             fmax=fmax)
    melspec = librosa.power_to_db(melspec).astype(np.float32)
    return np.array([_mono_to_color(spec) for spec in melspec])


def process(path):
    """Convert one soundscape file into per-chunk PNG spectrograms.

    Images are written to ``test_img/<file stem>/<end second>.png`` where the
    end second is 5, 10, 15, ... for consecutive 5-second chunks.

    Args:
        path: path to an ``.ogg`` audio file.

    Returns:
        None.
    """
    period = 5
    imgs = _audio_to_img(path,
                         period=period,
                         sr=32000,
                         n_mels=128,
                         fmin=0,
                         fmax=None)

    dirs = path.replace('.ogg', '').split('/')[-1]
    os.makedirs(f'test_img/{dirs}', exist_ok=True)
    for chunk_no, img in enumerate(imgs, start=1):
        end_sec = chunk_no * period
        Image.fromarray(img.astype(np.uint8)).save(f'test_img/{dirs}/{end_sec}.png')
    return None


paths= glob.glob('/kaggle/input/birdclef-2023/test_soundscapes/**/*ogg', recursive=True)

# Consume the results so that (a) an exception raised inside `process` is
# re-raised here instead of being silently dropped with the AsyncResult, and
# (b) the progress bar advances when a file is actually finished.
with Pool(2) as pool:
    for _ in tqdm(pool.imap_unordered(process, paths), total=len(paths)):
        pass

print('finish')

  0%|          | 0/1 [00:00<?, ?it/s]

finish
CPU times: user 10.2 s, sys: 859 ms, total: 11.1 s
Wall time: 12.1 s


In [4]:
class Customize_Model(nn.Module):
    """Thin wrapper that forwards its input to ``self.model``.

    NOTE(review): ``__init__`` never assigns ``self.model``; presumably instances
    are restored by ``torch.load`` with that attribute already attached, and the
    class only has to exist for unpickling -- confirm against the training code.
    """

    def __init__(self, model_name, cls):
        # Both arguments are accepted but unused here.
        super().__init__()

    def forward(self, image):
        """Return ``self.model(image)`` unchanged."""
        return self.model(image)
    
import onnxruntime

def load_onnx(path):
    """Open the ONNX model at ``path`` and return its inference session.

    The session is created with ``CUDAExecutionProvider`` as the only
    requested provider.
    """
    return onnxruntime.InferenceSession(path, providers=['CUDAExecutionProvider'])

## CFG

In [5]:
# Ensemble configuration. The three lists are parallel: entry i gives the input
# size, the weight directory and the blending weight of ensemble member i.
CFG= {
    'img_size': [
        128,
        128,
        128,
    ],
    'model': [
        '/kaggle/input/cvnxt-v2-128',
        '/kaggle/input/effv2s-128',
        '/kaggle/input/effb0-128',
    ],
    'model_weight': [
        0.4,
        0.35,
        0.25,
    ],
    
    # Used both as an on/off flag and as the number of TTA views to average.
    'TTA': 1,
}

assert len(CFG['img_size']) == len(CFG['model']) == len(CFG['model_weight']), \
    "CFG['img_size'], CFG['model'] and CFG['model_weight'] must have the same length"

## load model
Models= []
for model_dir in CFG['model']:
    models= []
    # sorted() makes the load order reproducible; glob order is arbitrary.
    for m in sorted(glob.glob(model_dir+'/**')):
        # NOTE(review): these are substring matches on the full path, not
        # extension checks -- confirm against the actual file names.
        if 'ts' in m:
            models.append( torch.jit.load(m, map_location= 'cpu') )
        elif 'onnx' in m:
            models.append( load_onnx(m) )
        else:
            # SECURITY: torch.load unpickles the file and can execute arbitrary
            # code; only point this at trusted checkpoints.
            models.append( torch.load(m, map_location= 'cpu') )
    if not models:
        # Fail here rather than later inside the inference loop.
        raise FileNotFoundError(f"no model files found under {model_dir}")
    Models.append(models)
# From here on CFG['model'] holds the loaded models, one list per directory.
CFG['model']= Models
print(f"length of model: {len(Models)}")

length of model: 3


## Prepare Dataset

In [6]:
## make dataloader
paths= glob.glob('/kaggle/working/test_img/**/*png', recursive=True)
if not paths:
    raise FileNotFoundError("no PNG files found under /kaggle/working/test_img")
print(paths[0])
# Order by the numeric suffix of the soundscape directory, then by the numeric
# file stem (the chunk's end second), so rows follow recording time.
paths= sorted(paths, key= lambda st:(int(st.split('/')[-2].split('_')[-1]), 
                                     int(st.split('/')[-1].split('.')[-2])) )
# Build the frame in one shot instead of growing it row by row with .loc.
test_df= pd.DataFrame({'image_path': paths})

test_dataset= Customize_Dataset(test_df, get_test_transform)
# shuffle=False keeps batch order aligned with the row order of test_df.
test_loader= DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=2)
test_df.head()

/kaggle/working/test_img/soundscape_29201/310.png


Unnamed: 0,image_path
0,/kaggle/working/test_img/soundscape_29201/5.png
1,/kaggle/working/test_img/soundscape_29201/10.png
2,/kaggle/working/test_img/soundscape_29201/15.png
3,/kaggle/working/test_img/soundscape_29201/20.png
4,/kaggle/working/test_img/soundscape_29201/25.png


## Inference

In [7]:
def onnx_inference(models, img, tta=None):
    """Average the softmax outputs of several ONNX sessions for one image.

    Args:
        models: sessions exposing ``get_inputs()``, ``get_outputs()`` and
            ``run()`` as used below.
        img: numpy array holding a single image; a leading batch axis is added.
        tta: number of test-time-augmentation views whose raw outputs are
            averaged per session before the softmax (view order: original,
            flip of last axis, flip of second-to-last axis, both flips).
            ``0`` disables TTA. Defaults to ``CFG['TTA']``.

    Returns:
        1-D numpy array: the mean over sessions of the per-session softmax.

    Raises:
        ValueError: if ``models`` is empty or ``tta`` is outside ``0..4``.
    """
    if tta is None:
        tta = CFG['TTA']
    tta = int(tta)
    if not 0 <= tta <= 4:
        raise ValueError(f"tta must be between 0 and 4, got {tta}")

    sessions = list(models)
    if not sessions:
        raise ValueError("onnx_inference needs at least one session")

    batch = np.expand_dims(img, axis=0)
    flip_axes = ((), (-1,), (-2,), (-1, -2))

    def view(k):
        # Only build the views that are actually used; copy to a contiguous
        # buffer because np.flip returns a negatively-strided view.
        if k == 0:
            return batch
        return np.ascontiguousarray(np.flip(batch, axis=flip_axes[k]))

    total = None
    for session in sessions:
        in_name = session.get_inputs()[0].name
        out_names = [output.name for output in session.get_outputs()]

        if tta:
            # Views are fed one at a time (batch size 1), as in the original
            # code, which cited GPU OOM with batched ONNX input.
            logits = None
            for k in range(tta):
                out = session.run(out_names, {in_name: view(k)})[0]
                logits = out if logits is None else logits + out
            logits = logits / tta
        else:
            logits = session.run(out_names, {in_name: batch})[0]

        probs = torch.from_numpy(np.asarray(logits)).softmax(dim=-1).numpy()
        # Separate accumulator: the per-session TTA sum must not overwrite the
        # running cross-session sum.
        total = probs if total is None else total + probs

    return (total / len(sessions))[0]


def inference(model, img, tta=None):
    """Average the softmax outputs of several torch models for one image.

    Args:
        model: iterable of callables with ``eval()`` (e.g. ``nn.Module`` or
            TorchScript modules); each is called on a batch of size 1.
        img: tensor holding a single image; a leading batch axis is added.
        tta: number of test-time-augmentation views whose raw outputs are
            averaged per model before the softmax (view order: original,
            flip of last axis, flip of second-to-last axis, both flips).
            ``0`` disables TTA. Defaults to ``CFG['TTA']``.

    Returns:
        1-D numpy array: the mean over models of the per-model softmax.

    Raises:
        ValueError: if ``model`` is empty or ``tta`` is outside ``0..4``.
    """
    if tta is None:
        tta = CFG['TTA']
    tta = int(tta)
    if not 0 <= tta <= 4:
        raise ValueError(f"tta must be between 0 and 4, got {tta}")

    members = list(model)
    if not members:
        raise ValueError("inference needs at least one model")

    batch = torch.unsqueeze(img, 0)
    flip_axes = ((), (-1,), (-2,), (-1, -2))

    def view(k):
        # Only build the views that are actually used.
        return batch if k == 0 else batch.flip(flip_axes[k])

    total = None
    with torch.no_grad():
        for m in members:
            m.eval()
            if tta:
                # Views are fed one at a time: the original code notes that
                # the TensorRT-compiled models only accept batch size 1.
                logits = None
                for k in range(tta):
                    out = m(view(k))
                    logits = out if logits is None else logits + out
                logits = logits / tta
            else:
                # Keep the batch axis here so both branches share the same
                # shape and the final [0] selects the class vector.
                logits = m(batch)

            probs = logits.softmax(dim=-1)
            total = probs if total is None else total + probs

    return (total / len(members)).cpu().numpy()[0]

In [8]:
# Weighted ensemble prediction for every image, stored as a stringified list
# (one string per row) in test_df['pred_cls'].
pred_rows = []
for batch in tqdm(test_loader):

    ## loop for a batch
    for sample_idx in range(len(batch['image_128'])):

        ## multi-model predict: weighted sum of each member's averaged output
        ensemble = None
        for indx, members in enumerate(CFG['model']):
            side = CFG['img_size'][indx]
            sample = batch[f'image_{side}'][sample_idx]

            # For ONNX sessions use onnx_inference(members, sample.numpy()) instead.
            weighted = inference(members, sample) * CFG['model_weight'][indx]
            ensemble = weighted if ensemble is None else ensemble + weighted

        pred_rows.append(str(ensemble.tolist()).replace('  ', ' '))

# The loader is not shuffled, so predictions line up with the rows of test_df.
test_df['pred_cls'] = pred_rows
test_df.head()

  0%|          | 0/30 [00:00<?, ?it/s]



Unnamed: 0,image_path,pred_cls
0,/kaggle/working/test_img/soundscape_29201/5.png,"[0.02313368022441864, 0.0029884090181440115, 0..."
1,/kaggle/working/test_img/soundscape_29201/10.png,"[0.004453688394278288, 0.0042408606968820095, ..."
2,/kaggle/working/test_img/soundscape_29201/15.png,"[0.009758793748915195, 0.003527225460857153, 0..."
3,/kaggle/working/test_img/soundscape_29201/20.png,"[0.0020427717827260494, 0.0011781221255660057,..."
4,/kaggle/working/test_img/soundscape_29201/25.png,"[0.002624419517815113, 0.0017555025406181812, ..."


## Submission

In [9]:
# ## fix continuous predict
# for i in tqdm(range(1,len(test_df)-1)):
#     previous= np.array(eval(test_df.loc[i-1, 'pred_cls']))
#     current= np.array(eval(test_df.loc[i, 'pred_cls']))
#     next_= np.array(eval(test_df.loc[i+1, 'pred_cls']))
#     if previous.argmax(0)==next_.argmax(0) and current.argmax(0)!=previous.argmax(0):
#         cls= current.argmax(0)
#         current[ previous.argmax(0) ]= current.max()+1e-5
#         test_df.loc[i, 'pred_cls']= str(current.tolist())

In [10]:
from io import StringIO
from csv import writer 
label_name= ['abethr1', 'abhori1', 'abythr1', 'afbfly1', 'afdfly1', 'afecuc1', 'affeag1', 'afgfly1', 'afghor1', 'afmdov1', 'afpfly1', 'afpkin1', 'afpwag1', 'afrgos1', 'afrgrp1', 'afrjac1', 'afrthr1', 'amesun2', 'augbuz1', 'bagwea1', 'barswa', 'bawhor2', 'bawman1', 'bcbeat1', 'beasun2', 'bkctch1', 'bkfruw1', 'blacra1', 'blacuc1', 'blakit1', 'blaplo1', 'blbpuf2', 'blcapa2', 'blfbus1', 'blhgon1', 'blhher1', 'blksaw1', 'blnmou1', 'blnwea1', 'bltapa1', 'bltbar1', 'bltori1', 'blwlap1', 'brcale1', 'brcsta1', 'brctch1', 'brcwea1', 'brican1', 'brobab1', 'broman1', 'brosun1', 'brrwhe3', 'brtcha1', 'brubru1', 'brwwar1', 'bswdov1', 'btweye2', 'bubwar2', 'butapa1', 'cabgre1', 'carcha1', 'carwoo1', 'categr', 'ccbeat1', 'chespa1', 'chewea1', 'chibat1', 'chtapa3', 'chucis1', 'cibwar1', 'cohmar1', 'colsun2', 'combul2', 'combuz1', 'comsan', 'crefra2', 'crheag1', 'crohor1', 'darbar1', 'darter3', 'didcuc1', 'dotbar1', 'dutdov1', 'easmog1', 'eaywag1', 'edcsun3', 'egygoo', 'equaka1', 'eswdov1', 'eubeat1', 'fatrav1', 'fatwid1', 'fislov1', 'fotdro5', 'gabgos2', 'gargan', 'gbesta1', 'gnbcam2', 'gnhsun1', 'gobbun1', 'gobsta5', 'gobwea1', 'golher1', 'grbcam1', 'grccra1', 'grecor', 'greegr', 'grewoo2', 'grwpyt1', 'gryapa1', 'grywrw1', 'gybfis1', 'gycwar3', 'gyhbus1', 'gyhkin1', 'gyhneg1', 'gyhspa1', 'gytbar1', 'hadibi1', 'hamerk1', 'hartur1', 'helgui', 'hipbab1', 'hoopoe', 'huncis1', 'hunsun2', 'joygre1', 'kerspa2', 'klacuc1', 'kvbsun1', 'laudov1', 'lawgol', 'lesmaw1', 'lessts1', 'libeat1', 'litegr', 'litswi1', 'litwea1', 'loceag1', 'lotcor1', 'lotlap1', 'luebus1', 'mabeat1', 'macshr1', 'malkin1', 'marsto1', 'marsun2', 'mcptit1', 'meypar1', 'moccha1', 'mouwag1', 'ndcsun2', 'nobfly1', 'norbro1', 'norcro1', 'norfis1', 'norpuf1', 'nubwoo1', 'pabspa1', 'palfly2', 'palpri1', 'piecro1', 'piekin1', 'pitwhy', 'purgre2', 'pygbat1', 'quailf1', 'ratcis1', 'raybar1', 'rbsrob1', 'rebfir2', 'rebhor1', 'reboxp1', 'reccor', 'reccuc1', 'reedov1', 'refbar2', 'refcro1', 'reftin1', 'refwar2', 'rehblu1', 
'rehwea1', 'reisee2', 'rerswa1', 'rewsta1', 'rindov', 'rocmar2', 'rostur1', 'ruegls1', 'rufcha2', 'sacibi2', 'sccsun2', 'scrcha1', 'scthon1', 'shesta1', 'sichor1', 'sincis1', 'slbgre1', 'slcbou1', 'sltnig1', 'sobfly1', 'somgre1', 'somtit4', 'soucit1', 'soufis1', 'spemou2', 'spepig1', 'spewea1', 'spfbar1', 'spfwea1', 'spmthr1', 'spwlap1', 'squher1', 'strher', 'strsee1', 'stusta1', 'subbus1', 'supsta1', 'tacsun1', 'tafpri1', 'tamdov1', 'thrnig1', 'trobou1', 'varsun2', 'vibsta2', 'vilwea1', 'vimwea1', 'walsta1', 'wbgbir1', 'wbrcha2', 'wbswea1', 'wfbeat1', 'whbcan1', 'whbcou1', 'whbcro2', 'whbtit5', 'whbwea1', 'whbwhe3', 'whcpri2', 'whctur2', 'wheslf1', 'whhsaw1', 'whihel1', 'whrshr1', 'witswa1', 'wlwwar', 'wookin1', 'woosan', 'wtbeat1', 'yebapa1', 'yebbar1', 'yebduc1', 'yebere1', 'yebgre1', 'yebsto1', 'yeccan1', 'yefcan', 'yelbis1', 'yenspu1', 'yertin1', 'yesbar1', 'yespet1', 'yetgre1', 'yewgre1']

# The sample submission supplies the column layout: row_id followed by one
# column per class.
submission= pd.read_csv('/kaggle/input/birdclef-2023/sample_submission.csv')
columns = list(submission.columns)

rows = []
for image_path, pred_str in tqdm(zip(test_df['image_path'], test_df['pred_cls']),
                                 total=len(test_df)):
    # .../<soundscape name>/<end second>.png  ->  row id "<name>_<second>"
    name, filename = image_path.split('/')[-2:]
    sec = filename.replace('.png', '')

    # Parse the stringified list with float() instead of eval(); accepts both
    # comma- and space-separated values.
    pred = [float(v) for v in pred_str.strip().strip('[]').replace(',', ' ').split()]
    if len(pred) != len(columns) - 1:
        raise ValueError(
            f"{image_path}: got {len(pred)} predictions for {len(columns) - 1} class columns")
    rows.append([f'{name}_{sec}'] + pred)

# Build the frame directly rather than round-tripping through CSV text.
submission = pd.DataFrame(rows, columns=columns)
submission.to_csv('submission.csv', index=False)
submission

  0%|          | 0/120 [00:00<?, ?it/s]

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,soundscape_29201_5,0.023134,0.002988,0.004583,0.001306,0.001112,0.002604,0.001863,0.001231,0.011490,...,0.001129,0.000653,0.001204,0.001155,0.001497,0.001872,0.000789,0.000638,0.001335,0.001952
1,soundscape_29201_10,0.004454,0.004241,0.000589,0.009551,0.001200,0.001251,0.003568,0.000584,0.002811,...,0.001461,0.000840,0.004904,0.008777,0.002080,0.000937,0.000529,0.000736,0.001866,0.001381
2,soundscape_29201_15,0.009759,0.003527,0.000804,0.002075,0.000658,0.002894,0.001385,0.001364,0.001362,...,0.000732,0.001207,0.001337,0.002287,0.000722,0.001030,0.000558,0.002681,0.001175,0.001174
3,soundscape_29201_20,0.002043,0.001178,0.000833,0.003569,0.001100,0.001499,0.000677,0.001112,0.002921,...,0.000543,0.001301,0.001553,0.000894,0.001166,0.000697,0.001301,0.000999,0.002156,0.000854
4,soundscape_29201_25,0.002624,0.001756,0.001436,0.004048,0.002925,0.002433,0.001298,0.001025,0.004824,...,0.000779,0.001182,0.003088,0.001915,0.000940,0.001370,0.002244,0.001309,0.003004,0.001020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,soundscape_29201_580,0.001517,0.001471,0.000603,0.001263,0.001444,0.001366,0.002021,0.001487,0.003103,...,0.000489,0.000845,0.001705,0.001630,0.001041,0.001638,0.002682,0.001175,0.002134,0.001070
116,soundscape_29201_585,0.003881,0.002163,0.000354,0.001247,0.000817,0.000917,0.036592,0.000954,0.001003,...,0.000921,0.000692,0.002804,0.001439,0.001900,0.001223,0.001951,0.001757,0.001538,0.000598
117,soundscape_29201_590,0.002116,0.001913,0.000838,0.003088,0.001194,0.002369,0.005293,0.001216,0.003008,...,0.000798,0.000742,0.002817,0.002137,0.000994,0.000603,0.002059,0.003036,0.002148,0.000627
118,soundscape_29201_595,0.002340,0.002736,0.000874,0.004873,0.002046,0.001992,0.001487,0.002018,0.002331,...,0.000413,0.000619,0.002540,0.013463,0.001160,0.001535,0.002057,0.001241,0.001928,0.000931
