### imports

In [1]:
import gc
import os
import pickle
import random
import time
from collections import Counter, defaultdict
from functools import partial
from pathlib import Path
from psutil import cpu_count

import math
import random

import librosa
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
#from skmultilearn.model_selection import iterative_train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from fastprogress import master_bar, progress_bar
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import transforms
from torchvision import transforms, datasets, models

from os.path import isfile, join, abspath, exists, isdir, expanduser
from os import listdir, makedirs, getcwd, remove

In [2]:
# Sanity check: is a CUDA device visible to PyTorch in this session?
torch.cuda.is_available()

True

### utils

In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and switches cuDNN to deterministic
    algorithm selection.

    Args:
      seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # The hash seed is read at interpreter start-up, so this only affects
    # Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats `deterministic`; keep it off.
    torch.backends.cudnn.benchmark = False

SEED = 1111
seed_everything(SEED)


In [4]:
# Use the CPU count reported by psutil both for the MKL/OpenMP thread-count
# environment variables and as the default `num_workers` of every DataLoader
# created after this cell.
# NOTE(review): numpy and torch are already imported at this point; confirm
# the thread-count variables still take effect when set this late.
N_JOBS = cpu_count()
os.environ.update({
    'MKL_NUM_THREADS': str(N_JOBS),
    'OMP_NUM_THREADS': str(N_JOBS),
})
# Rebinds the name `DataLoader` to a partial with num_workers pre-filled.
DataLoader = partial(DataLoader, num_workers=N_JOBS)

In [5]:
# from official code https://colab.research.google.com/drive/1AgPdhSp7ttY18O3fEoHOQKlt_3HJDLi8#scrollTo=cRCaCIb9oguU
def _one_sample_positive_class_precisions(scores, truth):
    """Calculate precisions for each true class for a single sample.

    Args:
      scores: np.array of (num_classes,) giving the individual classifier scores.
      truth: np.array of (num_classes,) bools indicating which classes are true.

    Returns:
      pos_class_indices: np.array of indices of the true classes for this sample.
      pos_class_precisions: np.array of precisions corresponding to each of those
        classes.
    """
    num_classes = scores.shape[0]
    pos_class_indices = np.flatnonzero(truth > 0)
    # Only calculate precisions if there are some true classes.
    if not len(pos_class_indices):
        return pos_class_indices, np.zeros(0)
    # Retrieval list of classes for this sample.
    retrieved_classes = np.argsort(scores)[::-1]
    # class_rankings[top_scoring_class_index] == 0 etc.
    # (builtin int/bool/float replace the np.int/np.bool/np.float aliases,
    # which no longer exist in current NumPy releases.)
    class_rankings = np.zeros(num_classes, dtype=int)
    class_rankings[retrieved_classes] = np.arange(num_classes)
    # Which of these is a true label?
    retrieved_class_true = np.zeros(num_classes, dtype=bool)
    retrieved_class_true[class_rankings[pos_class_indices]] = True
    # Num hits for every truncated retrieval list.
    retrieved_cumulative_hits = np.cumsum(retrieved_class_true)
    # Precision of retrieval list truncated at each hit, in order of pos_labels.
    precision_at_hits = (
            retrieved_cumulative_hits[class_rankings[pos_class_indices]] /
            (1 + class_rankings[pos_class_indices].astype(float)))
    return pos_class_indices, precision_at_hits


def calculate_per_class_lwlrap(truth, scores):
    """Calculate label-weighted label-ranking average precision.

    Arguments:
      truth: np.array of (num_samples, num_classes) giving boolean ground-truth
        of presence of that class in that sample.
      scores: np.array of (num_samples, num_classes) giving the classifier-under-
        test's real-valued score for each class for each sample.

    Returns:
      per_class_lwlrap: np.array of (num_classes,) giving the lwlrap for each
        class.
      weight_per_class: np.array of (num_classes,) giving the prior of each
        class within the truth labels.  Then the overall unbalanced lwlrap is
        simply np.sum(per_class_lwlrap * weight_per_class)
    """
    assert truth.shape == scores.shape
    num_samples, num_classes = scores.shape
    # Space to store a distinct precision value for each class on each sample.
    # Only the classes that are true for each sample will be filled in.
    precisions_for_samples_by_classes = np.zeros((num_samples, num_classes))
    for sample_num in range(num_samples):
        pos_class_indices, precision_at_hits = (
            _one_sample_positive_class_precisions(scores[sample_num, :],
                                                  truth[sample_num, :]))
        precisions_for_samples_by_classes[sample_num, pos_class_indices] = (
            precision_at_hits)
    labels_per_class = np.sum(truth > 0, axis=0)
    weight_per_class = labels_per_class / float(np.sum(labels_per_class))
    # Form average of each column, i.e. all the precisions assigned to labels in
    # a particular class.
    per_class_lwlrap = (np.sum(precisions_for_samples_by_classes, axis=0) /
                        np.maximum(1, labels_per_class))
    # overall_lwlrap = simple average of all the actual per-class, per-sample precisions
    #                = np.sum(precisions_for_samples_by_classes) / np.sum(precisions_for_samples_by_classes > 0)
    #           also = weighted mean of per-class lwlraps, weighted by class label prior across samples
    #                = np.sum(per_class_lwlrap * weight_per_class)
    return per_class_lwlrap, weight_per_class

### dataset

In [6]:
#!ls ../input/weight-best-ver1/

#cache_dir = expanduser(join('~', '.torch'))
#if not exists(cache_dir):
#    makedirs(cache_dir)
#models_dir = join(cache_dir, 'models')
#if not exists(models_dir):
#    makedirs(models_dir)
#!cp ../input/weight-best-ver1/* ~/.torch/models/

#!ls ~/.torch/models

# Root folders (relative to the notebook's working directory) for the
# competition data and for the precomputed mel-spectrogram inputs.
dataset_dir = Path('../input/freesound-audio-tagging-2019')
preprocessed_dir = Path('../input/fat2019_prep_mels1')

In [7]:
# Blend five previously generated submissions by averaging their per-class
# scores, then write the result out as the final submission file.
submission_files = [
    '../input/freesoundmixup/submission_1.csv',
    '../input/freesoundmixup/submission_2.csv',
    '../input/freesoundmixup/submission_3.csv',
    '../input/freesoundmixup/submission_4.csv',
    '../input/freesoundmixup/submission_5.csv',
]
submission_0, submission_1, submission_2, submission_3, submission_4 = map(pd.read_csv, submission_files)

test_df = pd.read_csv(dataset_dir/'sample_submission.csv')
labels = test_df.columns[1:].tolist()

# Rows are combined by position, so every file is assumed to list the clips
# in the same order as the sample submission.
submission = submission_0[labels].values
for frame in (submission_1, submission_2, submission_3, submission_4):
    submission = submission + frame[labels].values

test_df[labels] = submission/5
test_df.to_csv('submission.csv', index=False)
test_df.head()




Unnamed: 0,fname,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),Bicycle_bell,Burping_and_eructation,Bus,Buzz,Car_passing_by,Cheering,Chewing_and_mastication,Child_speech_and_kid_speaking,Chink_and_clink,Chirp_and_tweet,Church_bell,Clapping,Computer_keyboard,Crackle,Cricket,Crowd,Cupboard_open_or_close,Cutlery_and_silverware,Dishes_and_pots_and_pans,Drawer_open_or_close,Drip,Electric_guitar,Fart,Female_singing,Female_speech_and_woman_speaking,Fill_(with_liquid),Finger_snapping,Frying_(food),Gasp,Glockenspiel,Gong,...,Harmonica,Hi-hat,Hiss,Keys_jangling,Knock,Male_singing,Male_speech_and_man_speaking,Marimba_and_xylophone,Mechanical_fan,Meow,Microwave_oven,Motorcycle,Printer,Purr,Race_car_and_auto_racing,Raindrop,Run,Scissors,Screaming,Shatter,Sigh,Sink_(filling_or_washing),Skateboard,Slam,Sneeze,Squeak,Stream,Strum,Tap,Tick-tock,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,000ccb97.wav,-8.791524,-11.276542,-9.562972,-10.228289,-9.402058,-3.858264,-8.77376,-9.534332,-6.195141,-8.375465,-9.707643,-9.607131,-9.271647,-9.703127,-9.465942,-10.030047,-8.723332,-5.982788,-9.535469,-6.963907,-10.210032,-8.969843,-7.058739,-9.341362,-9.365251,-3.703495,-8.154037,-7.729991,-8.447921,-10.472861,-7.639707,-8.290491,-8.330125,-11.024823,-4.489432,-8.40038,-9.363029,-9.367074,-7.82037,...,-7.632224,-0.111849,-5.675154,-5.922147,-9.321105,-9.0323,-9.719732,-8.710148,-10.801785,-10.241268,-8.825757,-9.516094,-8.651019,-10.320634,-9.675553,-8.539225,-9.937739,-3.611117,-8.925399,-4.456472,-9.779232,-9.236957,-9.015209,-10.564683,-7.685301,-10.528816,-8.863146,-9.590097,-7.039071,-8.513955,-7.706279,-10.304735,-10.929057,-9.971642,-9.329429,-9.597869,-6.152478,-6.077756,-8.532779,-9.884321
1,0012633b.wav,-1.426058,-7.613961,-6.223971,-6.524019,-7.570473,-5.636334,-3.873757,-7.225362,-8.409531,-6.86619,-4.597991,-5.404117,-4.91043,-7.570663,-7.397243,-6.76926,-8.147783,-8.423029,-6.858411,-6.764313,-8.165224,-7.026639,-7.780612,-6.431773,-5.574741,-7.629361,-8.129289,-5.780359,-9.109972,-5.256652,-6.949477,-6.916475,-6.443898,-8.272712,-7.997171,-6.944674,-7.003649,-8.363314,-6.88628,...,-4.490533,-5.85466,-5.428852,-8.219059,-8.383932,-6.081883,-5.000207,-8.286637,-6.5474,-7.20153,-7.106834,-0.556286,-7.193039,-6.817559,-4.671455,-8.417385,-6.577154,-8.457267,-7.554844,-7.22472,-5.857558,-7.668516,-6.830085,-6.921608,-5.543547,-6.32147,-8.40388,-6.138568,-7.467389,-9.387594,-5.384216,-4.366921,-9.811607,-6.194322,-7.296856,-6.92417,-8.084876,-6.169709,-6.876653,-5.750203
2,001ed5f1.wav,-6.847269,-6.974532,-7.567574,-5.821598,-7.718976,-5.790901,-6.227512,-6.190699,-7.970877,-8.198745,-7.629197,-7.265667,-7.140046,-6.153306,-8.402987,-7.614854,-7.341763,-8.452734,-7.587851,-5.210319,-4.650168,-5.708196,-7.161912,-7.136221,-4.045441,-8.329041,-7.159221,-6.485064,-7.996245,-6.004496,-8.898528,-8.511965,-8.103209,-8.748619,-6.746857,-6.399633,-9.230689,-8.696593,-6.56886,...,-8.550846,-6.044494,-5.70674,-8.066385,-3.238464,-6.969715,-7.078382,-8.138712,-6.194111,-8.90876,-5.282379,-5.991313,-5.980667,-9.377781,-7.786143,-7.915596,-0.008984,-7.289499,-8.539924,-7.631226,-8.102122,-7.173686,-5.023476,-6.216912,-7.462472,-6.618219,-5.995512,-7.912551,-5.058311,-6.401876,-7.311523,-6.865546,-8.899119,-6.55062,-6.983426,-7.065683,-8.545472,-7.272933,-8.19633,-7.386028
3,00294be0.wav,-13.357406,-13.00642,-10.644485,-12.401155,-9.682833,-10.000592,-11.48301,-12.53441,-9.701755,-8.408823,-10.752382,-11.479132,-11.319611,-13.501399,-7.996554,-10.71715,-11.34768,-9.471247,-12.807303,-12.859415,-11.722944,-10.971406,-6.134005,-14.327222,-13.528902,-10.869341,-11.626091,-9.213711,-10.002096,-11.887448,-9.938213,-9.963949,-9.765237,-9.791996,-11.397982,-11.250865,-10.159858,-10.421098,-10.402865,...,-12.401876,-12.480584,-6.476386,-9.959803,-10.670586,-13.641009,-12.822018,-10.339299,-11.102044,-5.51063,-10.915506,-10.762195,-12.232727,5.244822,-13.669467,-10.64341,-9.359421,-9.357411,-11.3746,-12.174,-9.658916,-9.471611,-12.70465,-9.933787,-12.004688,-11.626382,-12.335631,-9.928554,-11.601612,-10.777857,-10.607757,-13.051289,-12.317013,-10.867906,-10.657203,-11.125907,-8.045822,-10.456474,-10.822442,-7.400805
4,003fde7a.wav,-9.068222,-9.204363,-10.453787,-10.641745,-9.98577,-10.005691,-10.754848,-10.539022,2.153161,-8.330508,-8.555352,-9.459157,-9.674171,-10.578801,-9.247939,-9.328031,-7.626411,-10.240534,-9.114473,-10.583421,-10.738965,-10.509126,-9.384538,-10.645274,-9.904908,-7.498883,-8.92571,-7.063823,-9.981207,-10.25519,-8.844185,-9.466584,-8.798328,-9.631821,-9.445584,-9.268835,-11.129076,-3.211059,-8.917941,...,-8.035894,-8.290565,-9.788488,-7.527767,-9.919047,-10.252807,-10.298046,-8.33932,-12.051169,-9.259693,-8.8225,-10.457736,-10.386249,-9.858006,-10.016462,-10.424938,-10.667839,-9.825079,-7.581996,-7.934007,-10.02474,-10.176035,-9.166938,-9.093817,-9.874481,-8.332733,-11.181279,-10.248159,-9.860875,-10.12099,-8.983416,-10.036682,-11.739288,-11.056126,-10.770756,-10.983463,-10.881329,-10.53504,-8.755375,-9.80627


In [8]:
# CSV tables read with pd.read_csv further down the notebook.
csvs = {
    'train_curated': dataset_dir / 'train_curated.csv',
    #'train_noisy': dataset_dir / 'train_noisy.csv',
    'train_noisy': preprocessed_dir / 'trn_noisy_best50s.csv',
    'sample_submission': dataset_dir / 'sample_submission.csv',
}

# Per-split paths under the competition folder.
# NOTE(review): presumably the raw audio directories; this dict is not
# referenced anywhere else in the visible notebook.
dataset = {
    'train_curated': dataset_dir / 'train_curated',
    'train_noisy': dataset_dir / 'train_noisy',
    'test': dataset_dir / 'test',
}

# Pickled model inputs, loaded with pickle.load further down the notebook.
mels = {
    'train_curated': preprocessed_dir / 'mels_train_curated.pkl',
    'train_noisy': preprocessed_dir / 'mels_trn_noisy_best50s.pkl',
    'test': preprocessed_dir / 'mels_test.pkl',  # NOTE: this data doesn't work at 2nd stage
}

In [9]:
# Label tables: one row per clip with `fname` and a `labels` column.
train_curated = pd.read_csv(csvs['train_curated'])
# NOTE(review): train_noisy is loaded but not used anywhere below unless the
# commented-out concat line is re-enabled.
train_noisy = pd.read_csv(csvs['train_noisy'])
#train_df = pd.concat([train_curated, train_noisy, train_noisy], sort=True, ignore_index=True)
# Only the curated clips are used; train_df is an alias of train_curated, not a copy.
train_df = train_curated
train_df.head()

Unnamed: 0,fname,labels
0,0006ae4e.wav,Bark
1,0019ef41.wav,Raindrop
2,001ec0ad.wav,Finger_snapping
3,0026c7cb.wav,Run
4,0026f116.wav,Finger_snapping


In [10]:
# Re-read the sample submission; this rebinds `test_df`, replacing the
# blended frame built in the earlier submission-averaging cell.
test_df = pd.read_csv(csvs['sample_submission'])
test_df.head()

Unnamed: 0,fname,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),Bicycle_bell,Burping_and_eructation,Bus,Buzz,Car_passing_by,Cheering,Chewing_and_mastication,Child_speech_and_kid_speaking,Chink_and_clink,Chirp_and_tweet,Church_bell,Clapping,Computer_keyboard,Crackle,Cricket,Crowd,Cupboard_open_or_close,Cutlery_and_silverware,Dishes_and_pots_and_pans,Drawer_open_or_close,Drip,Electric_guitar,Fart,Female_singing,Female_speech_and_woman_speaking,Fill_(with_liquid),Finger_snapping,Frying_(food),Gasp,Glockenspiel,Gong,...,Harmonica,Hi-hat,Hiss,Keys_jangling,Knock,Male_singing,Male_speech_and_man_speaking,Marimba_and_xylophone,Mechanical_fan,Meow,Microwave_oven,Motorcycle,Printer,Purr,Race_car_and_auto_racing,Raindrop,Run,Scissors,Screaming,Shatter,Sigh,Sink_(filling_or_washing),Skateboard,Slam,Sneeze,Squeak,Stream,Strum,Tap,Tick-tock,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,000ccb97.wav,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0012633b.wav,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,001ed5f1.wav,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,00294be0.wav,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,003fde7a.wav,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
# Class names: every sample-submission column after the first one (`fname`).
# Their order defines the column order of the label matrix and model output.
labels = test_df.columns[1:].tolist()
labels

['Accelerating_and_revving_and_vroom',
 'Accordion',
 'Acoustic_guitar',
 'Applause',
 'Bark',
 'Bass_drum',
 'Bass_guitar',
 'Bathtub_(filling_or_washing)',
 'Bicycle_bell',
 'Burping_and_eructation',
 'Bus',
 'Buzz',
 'Car_passing_by',
 'Cheering',
 'Chewing_and_mastication',
 'Child_speech_and_kid_speaking',
 'Chink_and_clink',
 'Chirp_and_tweet',
 'Church_bell',
 'Clapping',
 'Computer_keyboard',
 'Crackle',
 'Cricket',
 'Crowd',
 'Cupboard_open_or_close',
 'Cutlery_and_silverware',
 'Dishes_and_pots_and_pans',
 'Drawer_open_or_close',
 'Drip',
 'Electric_guitar',
 'Fart',
 'Female_singing',
 'Female_speech_and_woman_speaking',
 'Fill_(with_liquid)',
 'Finger_snapping',
 'Frying_(food)',
 'Gasp',
 'Glockenspiel',
 'Gong',
 'Gurgling',
 'Harmonica',
 'Hi-hat',
 'Hiss',
 'Keys_jangling',
 'Knock',
 'Male_singing',
 'Male_speech_and_man_speaking',
 'Marimba_and_xylophone',
 'Mechanical_fan',
 'Meow',
 'Microwave_oven',
 'Motorcycle',
 'Printer',
 'Purr',
 'Race_car_and_auto_racing',
 

In [12]:
# Number of target classes; used as the width of y_train and of the
# classifier's output layer.
num_classes = len(labels)
num_classes

80

In [13]:
# Multi-hot encode the comma-separated label strings:
# y_train[i, j] == 1 iff clip i of train_df is tagged with labels[j].
# A dict lookup replaces the linear `labels.index(...)` scan per tag.
label_to_idx = {name: pos for pos, name in enumerate(labels)}
y_train = np.zeros((len(train_df), num_classes), dtype=int)
for i, row in enumerate(train_df['labels'].str.split(',')):
    for label in row:
        y_train[i, label_to_idx[label]] = 1

#y_train= np.concatenate((y_train,y_train), axis=0)
y_train.shape

(4970, 80)

In [14]:
# Load the pickled inputs for the curated training clips and the test clips.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles from a source you trust.
with open(mels['train_curated'], 'rb') as curated:
    x_train = pickle.load(curated)

# The noisy-subset pickle is intentionally not opened: it was never read here.
# To train on it as well, load mels['train_noisy'] the same way, extend
# x_train with it, and extend y_train to match.

with open(mels['test'], 'rb') as test:
    x_test = pickle.load(test)

len(x_train), len(x_test)

(4970, 1120)

In [15]:
class FATTrainDataset(Dataset):
    """Training dataset yielding a random square crop of each input plus its label.

    Args:
      mels: indexable collection of arrays accepted by
        ``Image.fromarray(..., mode='RGB')``.
      labels: indexable collection of NumPy label vectors, aligned with ``mels``.
      transforms: callable applied to the cropped PIL image; its result must
        support ``.div_`` (e.g. a pipeline ending in a tensor).
    """

    def __init__(self, mels, labels, transforms):
        super().__init__()
        self.mels, self.labels, self.transforms = mels, labels, transforms

    def __len__(self):
        return len(self.mels)

    def __getitem__(self, idx):
        spectrogram = Image.fromarray(self.mels[idx], mode='RGB')
        # PIL reports (width, height); cut a height x height window at a
        # random horizontal offset.
        width, height = spectrogram.size
        left = random.randint(0, width - height)
        patch = spectrogram.crop([left, 0, left + height, height])
        # NOTE(review): the pipelines in this notebook already include
        # ToTensor; confirm the extra in-place division by 255 is intended.
        tensor = self.transforms(patch).div_(255)

        target = torch.from_numpy(self.labels[idx]).float()
        return tensor, target

In [16]:
class FATTestDataset(Dataset):
    """Inference dataset that repeats every clip ``tta`` times with random crops.

    The reported length is ``len(fnames) * tta``; index ``i`` maps to clip
    ``i % len(fnames)``, so each clip is visited ``tta`` times, each time with
    an independently drawn crop.

    Args:
      fnames: indexable collection of clip identifiers, aligned with ``mels``.
      mels: indexable collection of arrays accepted by
        ``Image.fromarray(..., mode='RGB')``.
      transforms: callable applied to the cropped PIL image; its result must
        support ``.div_``.
      tta: number of random crops drawn per clip.
    """

    def __init__(self, fnames, mels, transforms, tta=5):
        super().__init__()
        self.fnames, self.mels = fnames, mels
        self.transforms, self.tta = transforms, tta

    def __len__(self):
        return self.tta * len(self.fnames)

    def __getitem__(self, idx):
        clip_idx = idx % len(self.fnames)

        spectrogram = Image.fromarray(self.mels[clip_idx], mode='RGB')
        # PIL reports (width, height); cut a height x height window at a
        # random horizontal offset.
        width, height = spectrogram.size
        left = random.randint(0, width - height)
        patch = spectrogram.crop([left, 0, left + height, height])
        tensor = self.transforms(patch).div_(255)

        return tensor, self.fnames[clip_idx]

In [17]:
# Image pipelines applied to the cropped PIL spectrograms.
# 'train' adds random augmentation; 'test' only converts and rescales.
transforms_dict = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(0.8),
        # Random rotation within +/-9 degrees. The resample/expand/center
        # arguments were only restating library defaults, and `resample` is
        # no longer accepted by current torchvision, so they are omitted.
        transforms.RandomRotation(9),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.0,), std=(0.5,)),
    ]),
    'test': transforms.Compose([
        # No random augmentation at inference time.
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.0,), std=(0.5,)),
    ]),
}

### model

In [18]:
class Classifier(nn.Module):
    """VGG-style CNN with four double-convolution stages and an MLP head.

    ``features`` is four stages of (conv3x3 -> BN -> ReLU) x 2 followed by a
    2x2 max-pool, with 64, 128, 256 and 512 channels. ``fc`` maps the pooled
    512-vector through 256 and 128 hidden units to ``num_classes`` logits.
    Layer order (and therefore ``state_dict`` keys) is fixed, so saved
    weights keep loading.

    Args:
      num_classes: number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        def conv_unit(in_ch, out_ch):
            # 3x3 same-padding convolution followed by batch norm and ReLU.
            return [
                nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(inplace=True),
            ]

        feature_layers = []
        in_ch = 3
        for out_ch in (64, 128, 256, 512):
            feature_layers.extend(conv_unit(in_ch, out_ch))
            feature_layers.extend(conv_unit(out_ch, out_ch))
            feature_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            in_ch = out_ch
        self.features = nn.Sequential(*feature_layers)

        head_layers = []
        for in_dim, out_dim, drop_p in ((512, 256, 0.2), (256, 128, 0.2)):
            head_layers.extend([
                nn.Dropout(p=drop_p),
                nn.Linear(in_dim, out_dim),
                nn.BatchNorm1d(out_dim),
                nn.ReLU(inplace=True),
            ])
        head_layers.extend([
            nn.Dropout(p=0.1),
            nn.Linear(128, num_classes),
        ])
        self.fc = nn.Sequential(*head_layers)

        # Convolutions: zero-mean normal with std sqrt(2 / (k_h * k_w * out_channels)).
        # Batch norms: unit scale, zero shift.
        for layer in self.features:
            if isinstance(layer, nn.Conv2d):
                k_h, k_w = layer.kernel_size
                fan_out = k_h * k_w * layer.out_channels
                nn.init.normal_(layer.weight, mean=0.0, std=math.sqrt(2. / fan_out))
            elif isinstance(layer, nn.BatchNorm2d):
                nn.init.constant_(layer.weight, 1)
                nn.init.constant_(layer.bias, 0)

        # Linear weights: Kaiming-uniform (biases keep their default init).
        for layer in self.fc:
            if isinstance(layer, nn.Linear):
                nn.init.kaiming_uniform_(layer.weight)
            elif isinstance(layer, nn.BatchNorm1d):
                nn.init.constant_(layer.weight, 1)
                nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        feature_map = self.features(x)
        # Collapse the two spatial axes: mean over the last one, then max
        # over the remaining one, leaving one value per channel.
        pooled = feature_map.mean(dim=3).max(dim=2)[0]
        return self.fc(pooled)

In [19]:
# Instantiate once just to display the layer layout; the instance is not kept.
Classifier(num_classes=num_classes)

Classifier(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace)
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 256, kernel_size

### train

In [20]:
def train_model(x_train, y_train, train_transforms, *, valid_transforms=None,
                num_epochs=400, batch_size=128, test_batch_size=256,
                lr=1e-3, eta_min=1e-5, t_max=5,
                checkpoint_path='weight_best.pt'):
    """Train a ``Classifier`` on an 80/20 split and checkpoint the best epoch.

    The data is split with ``train_test_split(test_size=0.2,
    random_state=SEED)``. The model is optimised with Adam and
    ``BCEWithLogitsLoss`` under a cosine-annealing schedule. After every
    epoch the validation lwlrap is computed, and whenever it improves the
    model's ``state_dict`` is saved to ``checkpoint_path``.

    Args:
      x_train: indexable collection of inputs accepted by ``FATTrainDataset``.
      y_train: np.array of (num_samples, num_classes) multi-hot labels.
      train_transforms: transform pipeline for the training split.
      valid_transforms: transform pipeline for the validation split. ``None``
        (default) reuses ``train_transforms``, which is the original
        behaviour; pass a non-augmenting pipeline for a less noisy metric.
      num_epochs: number of passes over the training split.
      batch_size: training batch size.
      test_batch_size: validation batch size.
      lr: initial Adam learning rate.
      eta_min: minimum learning rate of the cosine schedule.
      t_max: ``T_max`` of ``CosineAnnealingLR`` (in epochs).
      checkpoint_path: where the best ``state_dict`` is written.

    Returns:
      dict with ``best_epoch`` (1-based, -1 if no epoch improved on 0) and
      ``best_lwlrap``.
    """
    if valid_transforms is None:
        valid_transforms = train_transforms

    # Use the GPU when present, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    num_classes = y_train.shape[1]

    x_trn, x_val, y_trn, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=SEED)

    train_dataset = FATTrainDataset(x_trn, y_trn, train_transforms)
    valid_dataset = FATTrainDataset(x_val, y_val, valid_transforms)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=test_batch_size, shuffle=False)

    model = Classifier(num_classes=num_classes).to(device)
    criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = Adam(params=model.parameters(), lr=lr, amsgrad=False)
    scheduler = CosineAnnealingLR(optimizer, T_max=t_max, eta_min=eta_min)

    best_epoch = -1
    best_lwlrap = 0.
    mb = master_bar(range(num_epochs))

    for epoch in mb:
        start_time = time.time()
        model.train()
        avg_loss = 0.

        for x_batch, y_batch in train_loader:
            preds = model(x_batch.to(device))
            loss = criterion(preds, y_batch.to(device))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            avg_loss += loss.item() / len(train_loader)

        model.eval()
        valid_preds = np.zeros((len(x_val), num_classes))
        avg_val_loss = 0.

        # No gradients are needed for validation; skipping graph construction
        # saves memory and time.
        with torch.no_grad():
            for i, (x_batch, y_batch) in enumerate(valid_loader):
                preds = model(x_batch.to(device))
                loss = criterion(preds, y_batch.to(device))

                preds = torch.sigmoid(preds)
                # The loader is not shuffled, so batch i fills rows
                # [i * test_batch_size, (i + 1) * test_batch_size).
                valid_preds[i * test_batch_size: (i+1) * test_batch_size] = preds.cpu().numpy()

                avg_val_loss += loss.item() / len(valid_loader)

        score, weight = calculate_per_class_lwlrap(y_val, valid_preds)
        lwlrap = (score * weight).sum()

        scheduler.step()

        elapsed = time.time() - start_time
        mb.write(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  val_lwlrap: {lwlrap:.6f}  time: {elapsed:.0f}s')

        if lwlrap > best_lwlrap:
            best_epoch = epoch + 1
            best_lwlrap = lwlrap
            torch.save(model.state_dict(), checkpoint_path)

    return {
        'best_epoch': best_epoch,
        'best_lwlrap': best_lwlrap,
    }

In [21]:
#result = train_model(x_train, y_train, transforms_dict['train'])

In [22]:
#result

### predict

In [23]:
def predict_model(test_fnames, x_test, test_transforms, num_classes, *, tta=5,
                  weights_path='../input/freesound-singleweight/weight_best.pt'):
    """Predict per-class probabilities for the test clips with random-crop TTA.

    Every clip is scored ``tta`` times (one random crop each, see
    ``FATTestDataset``), and the sigmoid outputs are averaged per file name.

    Args:
      test_fnames: indexable collection of clip file names.
      x_test: indexable collection of inputs aligned with ``test_fnames``.
      test_transforms: transform pipeline applied to each crop.
      num_classes: number of output classes of the saved model.
      tta: number of random crops averaged per clip.
      weights_path: ``state_dict`` file to load; defaults to the pretrained
        weights this notebook originally used (pass ``'weight_best.pt'`` to
        use a checkpoint written by ``train_model``).

    Returns:
      pd.DataFrame indexed by file name with columns ``'0'..str(num_classes-1)``.
      Rows follow the order of ``test_fnames``, so ``.values`` can be
      assigned positionally to a frame listing the clips in that order.
    """
    batch_size = 256

    # Use the GPU when present, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    test_dataset = FATTestDataset(test_fnames, x_test, test_transforms, tta=tta)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = Classifier(num_classes=num_classes)
    # Load onto the CPU first so a checkpoint saved from a GPU also loads on
    # a machine without one; the model is moved to `device` afterwards.
    model.load_state_dict(torch.load(weights_path, map_location='cpu'))
    model.to(device)
    model.eval()

    all_outputs, all_fnames = [], []

    with torch.no_grad():
        for images, fnames in progress_bar(test_loader):
            preds = torch.sigmoid(model(images.to(device)))
            all_outputs.append(preds.cpu().numpy())
            all_fnames.extend(fnames)

    test_preds = pd.DataFrame(data=np.concatenate(all_outputs),
                              index=all_fnames,
                              columns=[str(c) for c in range(num_classes)])
    # sort=False keeps groups in first-appearance order (the order of
    # test_fnames) instead of re-sorting the rows alphabetically.
    test_preds = test_preds.groupby(level=0, sort=False).mean()

    return test_preds

In [24]:
#test_preds = predict_model(test_df['fname'], x_test, transforms_dict['test'], num_classes, tta=20)

In [25]:
#test_df[labels] = test_preds.values
#test_df.to_csv('submission.csv', index=False)
#test_df.head()