In [1]:
import numpy as np
import librosa as lb
import pandas as pd
import random
import heapq
import time

from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import KFold,StratifiedKFold
from skimage.transform import resize
import os
import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, SequentialSampler
from torch.utils.tensorboard import SummaryWriter
# Expose only GPU 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Show the installed PyTorch version as the cell output.
torch.__version__

'1.5.1'

In [3]:
def setup_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner can select different kernels from run to run, which
    # undermines the deterministic flag above, so it is switched off as well.
    torch.backends.cudnn.benchmark = False


GLOBAL_SEED = 42
setup_seed(GLOBAL_SEED)

In [4]:
# Dataset locations and output directories used throughout the notebook.
train_path = '/root/cfl/b/Audio/data/train'
test_path = '/root/cfl/b/Audio/data/test'
feat_path = '/root/cfl/b/Audio/features'
res_path = '/root/cfl/b/Audio/res'
model_save = '/root/cfl/b/Audio/model_save'
tensorboard_path = '/root/cfl/b/Audio/tb_run'

# Create every directory the notebook writes to. `feat_path` is included because
# the feature-caching cell saves its .npz archive there and np.savez does not
# create missing parent directories. `exist_ok=True` makes the cell re-runnable.
for _out_dir in (feat_path, res_path, model_save, tensorboard_path):
    os.makedirs(_out_dir, exist_ok=True)

## 提取特征保存为图片

In [5]:
# Sample rate handed to librosa when loading audio (2 * 65535 = 131070).
SR = 65535 * 2
# Not referenced elsewhere in the visible notebook; the value 30 is hard-coded
# where the class count is needed.
NUM_CLASSES = 30
# Class names are the entries of the training directory; a label's integer id is
# its position in this list.
# NOTE(review): os.listdir returns entries in arbitrary order, so the ids are only
# stable while the directory listing order is — confirm, or sort consistently
# everywhere the listing is taken.
LABELS = os.listdir(train_path)

In [17]:
def mono_to_color(
    X: np.ndarray, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6
):
    """Convert a single-channel array into a 3-channel uint8 image.

    The input is replicated along a new trailing axis, standardized, clipped
    to ``[norm_min, norm_max]`` and rescaled to the range 0..255.

    Sources:
        https://www.kaggle.com/daisukelab/creating-fat2019-preprocessed-data
        https://www.kaggle.com/ttahara/training-birdsong-baseline-resnest50-fast

    Args:
        X: Array to convert.
        mean: Value subtracted before scaling; defaults to the array mean.
        std: Value the centred array is divided by; defaults to its standard
            deviation.
        norm_max: Upper clipping bound in standardized units; defaults to the
            standardized maximum.
        norm_min: Lower clipping bound in standardized units; defaults to the
            standardized minimum.
        eps: Added to ``std`` to avoid division by zero, and used as the
            threshold below which the input is treated as constant.

    Returns:
        uint8 array of shape ``X.shape + (3,)``; all zeros when the
        standardized input is (nearly) constant.
    """
    # Stack X as [X, X, X]
    X = np.stack([X, X, X], axis=-1)

    # Explicit `is None` checks: with the `value or default` idiom a legitimate
    # 0 / 0.0 argument would be silently replaced by the computed statistic.
    if mean is None:
        mean = X.mean()
    X = X - mean
    if std is None:
        std = X.std()
    Xstd = X / (std + eps)

    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    if (_max - _min) > eps:
        # Clip to the requested range, then normalize to [0, 255]
        V = Xstd
        V[V < norm_min] = norm_min
        V[V > norm_max] = norm_max
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Constant input: just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V


def normalize(image, mean=None, std=None):
    """Scale a channel-last 0-255 image to [0, 1] and return it channel-first.

    Args:
        image: Array whose axis 2 is the channel axis.
        mean: Optional value(s) subtracted after scaling.
        std: Optional value(s) divided by after subtracting ``mean``.
            Standardization is applied only when both ``mean`` and ``std``
            are given.

    Returns:
        float32 array with axis 2 of the input moved to the front.
    """
    scaled = image / 255.0
    standardize = (mean is not None) and (std is not None)
    if standardize:
        scaled = (scaled - mean) / std
    channel_first = np.moveaxis(scaled, 2, 0)
    return channel_first.astype(np.float32)


def get_melspec(x, sr, n_mels=256):
    """Compute a log-mel spectrogram of a waveform and encode it as a 3-channel image.

    Args:
        x: Audio samples, e.g. as returned by ``librosa.load``.
        sr: Sample rate of ``x``.
        n_mels: Number of mel bands.

    Returns:
        uint8 image produced by ``mono_to_color`` with its axes reversed by the
        final transpose, i.e. the channel axis comes first and the original
        first spectrogram axis comes last.
    """
    # The waveform is passed as `y=`: librosa >= 0.10 no longer accepts it as a
    # positional argument, while the keyword form also works on older releases.
    mel_spec = lb.feature.melspectrogram(
        y=x, sr=sr, n_mels=n_mels, n_fft=2048, hop_length=512, power=2
    )
    mel_spec = lb.power_to_db(mel_spec).astype(np.float32)
    image = mono_to_color(mel_spec)
    return image.transpose(2, 1, 0)


def crop_or_pad(y, length):
    """Force a 1-D signal to exactly ``length`` samples.

    Shorter signals are zero-padded at the end, longer ones are truncated and
    signals that already have the right length are returned unchanged.

    Args:
        y: 1-D sequence of samples.
        length: Required number of samples.

    Returns:
        The signal with ``length`` samples.
    """
    n_missing = length - len(y)
    if n_missing > 0:
        # Build the padding in the signal's own dtype. Default float64 zeros
        # would upcast float32 audio, so padded and unpadded clips would come
        # out with different dtypes. Inputs without a dtype (plain lists) keep
        # the NumPy default.
        pad_dtype = getattr(y, 'dtype', None)
        y = np.concatenate([y, np.zeros(n_missing, dtype=pad_dtype)])
    elif n_missing < 0:
        y = y[:length]
    return y

def preprocess_train(train_path):
    """Turn every training clip into a mel-spectrogram image with an integer label.

    Each entry of the module-level ``LABELS`` list is treated as a
    sub-directory of ``train_path``; the label id of a clip is the index of
    its directory in ``LABELS``.

    Args:
        train_path: Root directory containing one sub-directory per class.

    Returns:
        Tuple ``(x, y)``: the stacked spectrogram images and the matching
        int32 label ids.
    """
    x, y = [], []
    for i, label in enumerate(LABELS):
        label_dir = os.path.join(train_path, label)
        # os.listdir gives no ordering guarantee; sorting fixes the sample
        # order so the seeded K-fold split selects the same clips on every
        # extraction run.
        for wav_file in tqdm(sorted(os.listdir(label_dir))):
            wav_path = os.path.join(label_dir, wav_file)
            wav, sr = lb.load(wav_path, sr=SR)
            # SR samples at sample rate SR: exactly one second of audio.
            wav = crop_or_pad(wav, 1*SR)
            melspec = get_melspec(wav, sr)
            x.append(melspec)
            y.append(i)

    x, y = np.r_[x], np.r_[y]
    return x, y.astype(np.int32)

def preprocess_test(test_path):
    """Turn every clip in the test directory into a mel-spectrogram image.

    Args:
        test_path: Directory holding the test audio files.

    Returns:
        Tuple ``(x, keys)``: the stacked spectrogram images and the file
        names, in the same order.
    """
    file_names = os.listdir(test_path)
    images = []
    for file_name in tqdm(file_names):
        samples, rate = lb.load(os.path.join(test_path, file_name), sr=SR)
        # Fix the clip length to SR samples before computing the spectrogram.
        fixed_length = crop_or_pad(samples, 1*SR)
        images.append(get_melspec(fixed_length, rate))
    return np.r_[images], file_names

In [18]:
# Extract spectrogram images and label ids for the whole training set
# (slow: the recorded run took roughly 3-4 minutes per class directory).
X_train, y_train = preprocess_train(train_path)

100%|██████████| 2122/2122 [04:08<00:00,  8.53it/s]
100%|██████████| 2095/2095 [04:10<00:00,  8.36it/s]
100%|██████████| 2109/2109 [04:12<00:00,  8.34it/s]
100%|██████████| 2121/2121 [04:13<00:00,  8.37it/s]
100%|██████████| 2138/2138 [04:16<00:00,  8.34it/s]
100%|██████████| 1567/1567 [03:06<00:00,  8.42it/s]
100%|██████████| 2123/2123 [04:12<00:00,  8.40it/s]
100%|██████████| 2126/2126 [04:15<00:00,  8.32it/s]
100%|██████████| 2106/2106 [04:13<00:00,  8.30it/s]
100%|██████████| 2131/2131 [04:14<00:00,  8.38it/s]
100%|██████████| 1562/1562 [03:08<00:00,  8.29it/s]
100%|██████████| 1573/1573 [03:07<00:00,  8.41it/s]
100%|██████████| 2089/2089 [04:08<00:00,  8.39it/s]
100%|██████████| 1584/1584 [03:10<00:00,  8.34it/s]
100%|██████████| 2119/2119 [04:14<00:00,  8.34it/s]
100%|██████████| 2086/2086 [04:12<00:00,  8.27it/s]
100%|██████████| 2108/2108 [04:11<00:00,  8.37it/s]
100%|██████████| 1566/1566 [03:08<00:00,  8.32it/s]
100%|██████████| 1540/1540 [03:04<00:00,  8.33it/s]
100%|███████

In [19]:
# Extract spectrogram images for the test set; `test_names` holds the file names
# in the same order as the rows of `X_test`.
X_test, test_names = preprocess_test(test_path)

100%|██████████| 6835/6835 [13:30<00:00,  8.43it/s]


In [20]:
# Cache the train and test spectrograms (train rows first), the train labels and
# the test file names in one archive so extraction does not have to be repeated.
np.savez(
    os.path.join(feat_path, 'melspec_256_256'),
    X=np.concatenate([X_train, X_test], axis=0),
    y=y_train,
    test_names=test_names,
)

## 加载特征准备数据集

In [6]:
# Open the cached feature archive (np.savez appends the .npz suffix on save).
melspec = np.load(os.path.join(feat_path, 'melspec_256_256.npz'))

In [7]:
# Read the combined train+test image array out of the archive.
X = melspec['X']

In [8]:
# Split the combined array back into its train and test parts. The archive stores
# the train rows first and one label per train row, so the label count gives the
# boundary. A hard-coded row count would silently mis-split the data as soon as
# the dataset changed.
y_train = melspec['y']
n_train = len(y_train)
X_train = X[:n_train]
X_test = X[n_train:]
test_names = melspec['test_names']
# NOTE(review): the ids in `y_train` were assigned from an os.listdir(train_path)
# call made at extraction time; this relies on the listing order being unchanged.
labels = os.listdir(train_path)

In [9]:
# Drop the references to the archive and the combined array, then run a
# garbage-collection pass.
# NOTE(review): X_train and X_test were taken as slices of X, so the underlying
# buffer presumably stays alive through them — confirm if memory is a concern.
del melspec, X
gc.collect()

31

## 准备数据集

In [10]:
from torchvision import transforms
class CustomDataset(Dataset):
    """Dataset view over the module-level ``X_train`` / ``y_train`` / ``X_test`` arrays.

    Args:
        index: Row positions (into ``X_train`` or ``X_test``) this dataset exposes.
        is_train: When true, items are ``(image, label)`` pairs read from
            ``X_train`` / ``y_train``; otherwise items are images from ``X_test``.
    """

    def __init__(self, index, is_train=True):
        self.index = index
        self.is_train = is_train
        # Per-channel mean/std normalisation applied to the [0, 1]-scaled image.
        channel_norm = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        self.transformer = transforms.Compose([channel_norm])

    def __len__(self):
        """Number of rows exposed by this dataset."""
        return len(self.index)

    def __getitem__(self, idx):
        """Return the normalised float32 image at position ``idx`` (plus its label when training)."""
        row = self.index[idx]
        source = X_train if self.is_train else X_test
        image = self.transformer(torch.tensor(source[row] / 255.0, dtype=torch.float32))
        if not self.is_train:
            return image
        return image, y_train[row]

In [11]:
# Batch sizes for the three kinds of loaders.
BATCH_SIZE_TRAIN = 128
BATCH_SIZE_VAL = 128
BATCH_SIZE_TEST = 128
# Seeded, shuffled 5-fold split that is stratified on the class labels.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=GLOBAL_SEED)
# One (train, valid, test) loader triple per fold.
data_folds = []
# Validation row indices of each fold, kept for writing out-of-fold predictions.
valid_indexs = []    


# A single test loader over every test row, shared by all folds and read in order.
test_dataset = CustomDataset(np.arange(X_test.shape[0]), is_train=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE_TEST, sampler=SequentialSampler(test_dataset), shuffle=False, num_workers=0)

for idx, (train_index, valid_index) in enumerate(kf.split(X=X_train, y=y_train)):
    valid_indexs.append(valid_index)
    
    # Both datasets read from X_train / y_train, hence is_train=True for the
    # validation set as well.
    train_dataset = CustomDataset(train_index, is_train=True)
    val_dataset = CustomDataset(valid_index, is_train=True)

    # Training batches are shuffled; validation batches are read sequentially so
    # that predictions line up with `valid_index`.
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=True, num_workers=0)
    valid_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE_VAL, sampler=SequentialSampler(val_dataset), shuffle=False, num_workers=0)
    data_folds.append((train_dataloader, valid_dataloader, test_dataloader))

## 搭建模型

In [12]:
from resnest.torch import resnest50
import torchvision

In [13]:
class ResNest50(nn.Module):
    """ResNeSt-50 backbone with a three-layer MLP classification head.

    ResNeSt: Split-Attention Networks, https://arxiv.org/abs/2004.08955

    Args:
        num_classes: Width of the final linear layer. Defaults to 30, the
            value used throughout this notebook.
        pretrained: Forwarded to ``resnest50``. Defaults to True.
    """

    def __init__(self, num_classes=30, pretrained=True):
        super().__init__()
        self.model = resnest50(pretrained=pretrained)

        # Replace the backbone's `fc` attribute with a small MLP head that maps
        # the 2048 input features to `num_classes` logits.
        self.model.fc = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(1024, num_classes)
        )

    def forward(self, X):
        """Return the class logits for a batch of images."""
        return self.model(X)

In [14]:
def validate(model, val_dataloader, criterion, history, n_iters):
    """Evaluate ``model`` on the validation loader, log the result and checkpoint it.

    The scores are written to the module-level TensorBoard ``writer``. When
    the accuracy beats the lowest score in ``history['best_model']``, the
    model weights are saved to that entry's path and the entry is replaced.

    Args:
        model: Network to evaluate; it is left in eval mode.
        val_dataloader: Loader yielding ``(X, y)`` validation batches.
        criterion: Callable ``(logits, targets) -> loss``; assumed to return a
            per-batch mean.
        history: Dict whose ``'best_model'`` list holds ``(accuracy, path)``
            tuples with the lowest accuracy at position 0.
        n_iters: Global iteration index used as the TensorBoard step.

    Returns:
        ``(mean_loss, mean_accuracy)`` over all validation samples.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for X, y in val_dataloader:
            X, y = X.cuda(), y.cuda().long()
            y_output = model(X)
            batch_size = y.size(0)
            # Weight every batch by its size; averaging per-batch means would
            # give a short final batch the same influence as a full one.
            loss_sum += criterion(y_output, y).item() * batch_size
            _, y_preds = torch.max(y_output, 1)
            n_correct += (y_preds == y).sum().item()
            n_seen += batch_size
    mean_accs = n_correct / max(n_seen, 1)
    mean_costs = loss_sum / max(n_seen, 1)
    writer.add_scalar('age/validate_accuracy', mean_accs, n_iters)
    writer.add_scalar('age/validate_loss', mean_costs, n_iters)
    if mean_accs > history['best_model'][0][0]:
        # Reuse the file of the worst kept checkpoint for the new one.
        heapq.heapify(history['best_model'])
        checkpoint_pth = history['best_model'][0][1]
        heapq.heappushpop(history['best_model'], (mean_accs, checkpoint_pth))
        torch.save(model.state_dict(), checkpoint_pth)
    return mean_costs, mean_accs


def train(model, train_dataloader, val_dataloader, criterion, optimizer, epoch, history, validate_points, scheduler, step=True):
    """Run one training epoch with periodic validation and TensorBoard logging.

    Scalars are written through the module-level ``writer``.

    Args:
        model: Network to optimise; batches are moved to the GPU with ``.cuda()``.
        train_dataloader: Loader yielding ``(X, y)`` training batches.
        val_dataloader: Loader forwarded to ``validate``.
        criterion: Callable ``(logits, targets) -> loss``.
        optimizer: Optimizer stepped once per batch.
        epoch: 1-based epoch number; used for the progress-bar title and the
            global iteration index.
        history: Checkpoint bookkeeping dict forwarded to ``validate``.
        validate_points: Batch indices within the epoch after which
            validation is run.
        scheduler: Learning-rate scheduler.
        step: When true, the scheduler is stepped after every batch.
    """
    model.train()
    costs = []
    accs = []
    val_loss, val_acc = 0, 0
    with tqdm(total=len(train_dataloader.dataset), desc='Epoch{}'.format(epoch)) as pbar:
        for idx, batch in enumerate(train_dataloader):
            X, y = batch
            X, y = X.cuda(), y.cuda().long()
            y_output = model(X)
            loss = criterion(y_output, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step:
                scheduler.step()
            with torch.no_grad():
                costs.append(loss.item())
                _, y_preds = torch.max(y_output, 1)
                accs.append((y_preds == y).float().mean().item())
                pbar.update(y.size(0))
            # Global iteration index across epochs (epochs are 1-based).
            n_iters = idx + len(train_dataloader) * (epoch-1)
            if idx in validate_points:
                val_loss, val_acc = validate(model, val_dataloader, criterion, history, n_iters)
                # validate() leaves the model in eval mode.
                model.train()

            writer.add_scalar('age/train_accuracy', accs[-1], n_iters)
            writer.add_scalar('age/train_loss', costs[-1], n_iters)
            # get_last_lr() reports the rate the scheduler last applied; calling
            # get_lr() outside of step() is deprecated and can return wrong
            # values for chainable schedulers.
            writer.add_scalar('age/learning_rate', scheduler.get_last_lr()[0], n_iters)
            pbar.set_postfix_str('loss:{:.4f}, acc:{:.4f}, val-loss:{:.4f}, val-acc:{:.4f}'.format(np.mean(costs[-10:]), np.mean(accs[-10:]), val_loss, val_acc))
            torch.cuda.empty_cache()

    
def test(oof_train_test, model, test_dataloader, val_dataloader, valid_index, weight=1):
    """Accumulate weighted softmax predictions for the validation fold and the test set.

    ``oof_train_test`` is updated in place: the rows in ``valid_index``
    receive the validation predictions and the trailing rows receive the test
    predictions.

    Args:
        oof_train_test: Array with one row per train sample followed by one
            row per test sample, and one column per class.
        model: Network to run; it is switched to eval mode.
        test_dataloader: Loader yielding test image batches in row order.
        val_dataloader: Loader yielding ``(X, y)`` validation batches in the
            order of ``valid_index``.
        valid_index: Row indices of the validation samples.
        weight: Factor applied to the probabilities before they are added.
    """
    model.eval()
    y_preds = []
    y_preds_val = []
    with torch.no_grad():
        for X in test_dataloader:
            y_preds.append(model(X.cuda()).cpu())

        for X, _ in val_dataloader:
            y_preds_val.append(model(X.cuda()).cpu())

    # The softmax axis is given explicitly: leaving `dim` out is deprecated.
    val_probs = F.softmax(torch.cat(y_preds_val), dim=1).numpy()
    test_probs = F.softmax(torch.cat(y_preds), dim=1).numpy()

    oof_train_test[valid_index] += val_probs * weight
    # The test rows occupy the tail of the array, so their offset is derived
    # from the array sizes instead of a hard-coded train-set row count.
    test_start = oof_train_test.shape[0] - test_probs.shape[0]
    oof_train_test[test_start:] += test_probs * weight

In [None]:
def criterion(y_output, y_true):
    """Cross-entropy loss between class logits and integer class targets.

    Args:
        y_output: Logits with one column per class.
        y_true: Target class ids.

    Returns:
        Scalar loss tensor from ``nn.CrossEntropyLoss`` with its default settings.
    """
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(y_output, y_true)

# Train one model per fold, keep its best checkpoints and blend their predictions.
res_folds = []
acc_folds = []
model_name = 'resnest50'
best_checkpoint_num = 3
for idx, (train_dataloader, val_dataloader, test_dataloader) in enumerate(data_folds):
    # One row per train sample followed by one row per test sample.
    oof_train_test = np.zeros((X_train.shape[0] + X_test.shape[0], 30))
    # Placeholder entries (accuracy 0) that fix the checkpoint paths of this fold.
    history = {'best_model': []}
    for i in range(best_checkpoint_num):
        history['best_model'].append((0, os.path.join(model_save, '{}_checkpoint_fold_{}_{}.pth'.format(model_name, idx, i))))
    # Validate at the middle and at the last batch of every epoch.
    validate_points = list(np.linspace(0, len(train_dataloader)-1, 3).astype(int))[1:]

    model = ResNest50().cuda()
    optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.999), lr=1e-3)
    epochs = 7
    # Triangular cyclic schedule stepped per batch; step_size_up is half an
    # epoch's worth of batches.
    scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=1e-5, max_lr=2e-3, step_size_up=int(len(train_dataloader)/2), cycle_momentum=False, mode='triangular')

    # One writer per fold, closed when the fold is done. Creating a new
    # SummaryWriter every epoch opened a fresh event file each time and never
    # closed any of them.
    writer = SummaryWriter(log_dir=os.path.join(tensorboard_path, '{}_fold_{}'.format(model_name, idx)))
    try:
        for epoch in range(1, epochs+1):
            train(model, train_dataloader, val_dataloader, criterion, optimizer, epoch, history, validate_points, scheduler, step=True)
            gc.collect()
    finally:
        writer.close()

    # Blend the kept checkpoints, best first, with fixed weights.
    for (acc, checkpoint_pth), weight in zip(sorted(history['best_model'], reverse=True), [0.5, 0.3, 0.2]):
        model.load_state_dict(torch.load(checkpoint_pth, map_location= torch.device('cpu')), strict=True)
        test(oof_train_test, model, test_dataloader, val_dataloader, valid_indexs[idx], weight=weight)
    acc_folds.append(sorted(history['best_model'], reverse=True)[0][0])
    res_folds.append(oof_train_test)
    np.save(os.path.join(res_path, "{}_fold_{}.npy".format(model_name, idx)), oof_train_test)
    del model, history
    gc.collect()
    torch.cuda.empty_cache()

Epoch1: 100%|██████████| 46309/46309 [08:22<00:00, 92.14it/s, loss:0.1718, acc:0.9470, val-loss:0.1530, val-acc:0.9566] 
Epoch2: 100%|██████████| 46309/46309 [08:49<00:00, 87.44it/s, loss:0.1879, acc:0.9509, val-loss:0.1302, val-acc:0.9619] 
Epoch3: 100%|██████████| 46309/46309 [08:47<00:00, 87.80it/s, loss:0.1129, acc:0.9689, val-loss:0.1185, val-acc:0.9662] 
Epoch4: 100%|██████████| 46309/46309 [08:47<00:00, 87.81it/s, loss:0.1163, acc:0.9661, val-loss:0.1119, val-acc:0.9692] 
Epoch5: 100%|██████████| 46309/46309 [08:48<00:00, 87.55it/s, loss:0.1245, acc:0.9642, val-loss:0.1154, val-acc:0.9690] 
Epoch6: 100%|██████████| 46309/46309 [08:57<00:00, 86.10it/s, loss:0.1241, acc:0.9676, val-loss:0.1075, val-acc:0.9703] 
Epoch7: 100%|██████████| 46309/46309 [08:52<00:00, 86.90it/s, loss:0.0836, acc:0.9769, val-loss:0.1058, val-acc:0.9704] 
Epoch1: 100%|██████████| 46309/46309 [08:41<00:00, 88.85it/s, loss:0.1619, acc:0.9556, val-loss:0.1709, val-acc:0.9538] 
Epoch2: 100%|██████████| 46309/4

In [23]:
# Display the per-fold prediction arrays collected by the training loop.
res_folds

[array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [4.05189260e-02, 1.28448105e-01, 1.30232821e-02, ...,
         1.97078560e-02, 1.92916249e-02, 1.51436491e-02],
        [9.99314964e-01, 4.72229523e-09, 2.46480514e-10, ...,
         1.04051865e-10, 1.42188678e-06, 6.52036278e-10],
        ...,
        [2.60628326e-09, 2.05265420e-07, 1.25412249e-07, ...,
         7.45270863e-09, 2.58160886e-06, 4.72284030e-11],
        [2.32501975e-02, 2.21754618e-02, 3.47323222e-02, ...,
         2.51573386e-02, 2.74438350e-02, 2.37078201e-02],
        [1.56348458e-13, 4.84132573e-13, 8.61053225e-12, ...,
         6.28070542e-19, 4.50682092e-12, 2.50901304e-22]])]

In [5]:
# Reload the saved prediction array of every fold from disk.
res = [
    np.load(os.path.join(res_path, "{}_fold_{}.npy".format(model_name, fold)))
    for fold in range(len(data_folds))
]

In [14]:
# Average the fold predictions and take the most likely class for every test clip.
sub = pd.DataFrame()
sub['file_name'] = test_names
fold_mean = np.mean(res, axis=0)
# The test rows are the last len(test_names) rows of each prediction array, so
# the offset is computed from the sizes instead of a hard-coded row count.
test_start = len(fold_mean) - len(test_names)
sub['label'] = np.argmax(fold_mean[test_start:], axis=1)

In [15]:
# Translate the predicted class ids into class names (id -> position in `labels`).
sub['label'] = sub['label'].map(dict(enumerate(labels)))

In [16]:
# Display the submission table.
sub

Unnamed: 0,file_name,label
0,003gtit8kw.wav,one
1,006irl4pgx.wav,yes
2,007sh75o5w.wav,tree
3,009k6j5dbw.wav,three
4,009lyahcx8.wav,marvin
...,...,...
6830,zyvkhzi7pt.wav,house
6831,zzbo90jvjj.wav,nine
6832,zzgk3zkfr8.wav,right
6833,zzqta071j9.wav,three


In [17]:
# Write the submission under a timestamped name so that earlier files are kept.
now = time.strftime("%Y%m%d_%H%M%S")
fname = "".join(["submit_", model_name, "_", now, ".csv"])
sub.to_csv(os.path.join(res_path, fname), index=False)