In [6]:
import numpy as np
import librosa as lb
import librosa
import pandas as pd
import random
import time

from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import KFold,StratifiedKFold
from skimage.transform import resize
import os
import gc
import warnings
warnings.filterwarnings('ignore')

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, SequentialSampler
from torch.utils.tensorboard import SummaryWriter
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
torch.__version__

'1.5.1'

In [8]:
def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    
GLOBAL_SEED = 42
setup_seed(GLOBAL_SEED)

In [9]:
train_path = '/root/cfl/b/Audio/data/train'
test_path = '/root/cfl/b/Audio/data/test'
feat_path = '/root/cfl/b/Audio/features'
res_path = '/root/cfl/b/Audio/res'
model_save = '/root/cfl/b/Audio/model_save'
tensorboard_path = '/root/cfl/b/Audio/tb_run'
if not os.path.exists(model_save):
    os.makedirs(model_save)
if not os.path.exists(res_path):
    os.makedirs(res_path)
if not os.path.exists(tensorboard_path):
    os.makedirs(tensorboard_path)

## 提取特征保存为图片

In [10]:
SR = 65535
NUM_CLASSES = 30
LABELS = os.listdir(train_path)

In [6]:
def crop_or_pad(y, length):
    if len(y) < length:
        y = np.concatenate([y, np.zeros(length - len(y))])
    elif len(y) > length:
        y = y[: length]
    return y

def preprocess_train(train_path):
    x, y = [], []
    for i, label in enumerate(LABELS):
        label_dir = os.path.join(train_path, label)
        for wav_file in tqdm(os.listdir(label_dir)):
            wav_path = os.path.join(train_path, label, wav_file)
            wav, sr = lb.load(wav_path, sr=SR)
            wav = crop_or_pad(wav, 1*SR)
            x.append(wav)
            y.append(i)

    x, y = np.r_[x], np.r_[y]
#     x, y = shuffle(x, y, random_state=GLOBAL_SEED)
    return x, y.astype(np.int32)

def preprocess_test(test_path):
    x, keys = [], []

    for wav_file in tqdm(os.listdir(test_path)):
        wav_path = os.path.join(test_path, wav_file)
        wav, sr = lb.load(wav_path, sr=SR)
        wav = crop_or_pad(wav, 1*SR)
        x.append(wav)
        keys.append(wav_file)
    x = np.r_[x] 
    return x, keys

In [10]:
X_train, y_train = preprocess_train(train_path)

100%|██████████| 2122/2122 [02:05<00:00, 16.91it/s]
100%|██████████| 2095/2095 [02:05<00:00, 16.71it/s]
100%|██████████| 2109/2109 [02:17<00:00, 15.39it/s]
100%|██████████| 2121/2121 [01:54<00:00, 18.48it/s]
100%|██████████| 2138/2138 [01:54<00:00, 18.66it/s]
100%|██████████| 1567/1567 [01:23<00:00, 18.76it/s]
100%|██████████| 2123/2123 [01:53<00:00, 18.76it/s]
100%|██████████| 2126/2126 [01:54<00:00, 18.58it/s]
100%|██████████| 2106/2106 [01:53<00:00, 18.63it/s]
100%|██████████| 2131/2131 [01:54<00:00, 18.67it/s]
100%|██████████| 1562/1562 [01:23<00:00, 18.70it/s]
100%|██████████| 1573/1573 [01:23<00:00, 18.75it/s]
100%|██████████| 2089/2089 [01:51<00:00, 18.66it/s]
100%|██████████| 1584/1584 [01:24<00:00, 18.66it/s]
100%|██████████| 2119/2119 [01:53<00:00, 18.66it/s]
100%|██████████| 2086/2086 [01:52<00:00, 18.62it/s]
100%|██████████| 2108/2108 [01:52<00:00, 18.66it/s]
100%|██████████| 1566/1566 [01:23<00:00, 18.74it/s]
100%|██████████| 1540/1540 [01:22<00:00, 18.73it/s]
100%|███████

In [11]:
X_test, test_names = preprocess_test(test_path)

100%|██████████| 6835/6835 [06:17<00:00, 18.12it/s]


In [12]:
np.savez(os.path.join(feat_path, 'data'), X_train=X_train, X_test=X_test, y=y_train, test_names=test_names)

## 加载特征准备数据集

In [11]:
data = np.load(os.path.join(feat_path, 'data.npz'))

In [12]:
X_train = data['X_train']
X_test = data['X_test']
y_train = data['y']
test_names = data['test_names']
labels = os.listdir(train_path)

In [13]:
del data
gc.collect()

12

## 准备数据集

In [14]:
"https://www.kaggle.com/gopidurgaprasad/audio-augmentation-albumentations/"

import matplotlib.pyplot as plt
import IPython.display as ipd
import albumentations
from albumentations.core.transforms_interface import DualTransform, BasicTransform
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, PolarityInversion, Gain, AddGaussianSNR


class AudioTransform(BasicTransform):
    """Transform for Audio task"""

    @property
    def targets(self):
        return {"data": self.apply}
    
    def update_params(self, params, **kwargs):
        if hasattr(self, "interpolation"):
            params["interpolation"] = self.interpolation
        if hasattr(self, "fill_value"):
            params["fill_value"] = self.fill_value
        return params
    
      
class MelSpectrogram(AudioTransform):
    """Shifting time axis"""
    def __init__(self, parameters, always_apply=False, p=0.5):
        super(MelSpectrogram, self).__init__(always_apply, p)

        self.parameters = parameters
    
    def apply(self, data, **params):
        sound, sr = data

        melspec = librosa.feature.melspectrogram(sound, sr=sr, **self.parameters)
        melspec = librosa.power_to_db(melspec)
        melspec = melspec.astype(np.float32)

        return melspec, sr
    
    
class SpecAugment(AudioTransform):
    """Shifting time axis"""
    def __init__(self, num_mask=2, freq_masking=0.15, time_masking=0.20, always_apply=False, p=0.5):
        super(SpecAugment, self).__init__(always_apply, p)

        self.num_mask = num_mask
        self.freq_masking = freq_masking
        self.time_masking = time_masking
    
    def apply(self, data, **params):
        melspec, sr = data

        spec_aug = self.spec_augment(melspec, 
                                     self.num_mask,
                                     self.freq_masking,
                                     self.time_masking,
                                     melspec.min())
        


        return spec_aug, sr
    
    # Source: https://www.kaggle.com/davids1992/specaugment-quick-implementation
    def spec_augment(self, 
                    spec: np.ndarray,
                    num_mask=2,
                    freq_masking=0.15,
                    time_masking=0.20,
                    value=0):
        spec = spec.copy()
        num_mask = random.randint(1, num_mask)
        for i in range(num_mask):
            all_freqs_num, all_frames_num  = spec.shape
            freq_percentage = random.uniform(0.0, freq_masking)

            num_freqs_to_mask = int(freq_percentage * all_freqs_num)
            f0 = np.random.uniform(low=0.0, high=all_freqs_num - num_freqs_to_mask)
            f0 = int(f0)
            spec[f0:f0 + num_freqs_to_mask, :] = value

            time_percentage = random.uniform(0.0, time_masking)

            num_frames_to_mask = int(time_percentage * all_frames_num)
            t0 = np.random.uniform(low=0.0, high=all_frames_num - num_frames_to_mask)
            t0 = int(t0)
            spec[:, t0:t0 + num_frames_to_mask] = value

        return spec

    
class SpectToImage(AudioTransform):

    def __init__(self, always_apply=False, p=0.5):
        super(SpectToImage, self).__init__(always_apply, p)
    
    def apply(self, data, **params):
        image, sr = data
        delta = librosa.feature.delta(image)
        accelerate = librosa.feature.delta(image, order=2)
        image = np.stack([image, delta, accelerate], axis=-1)
        image = image.astype(np.float32) / 100.0

        return image

sound_augment = Compose([
    PolarityInversion(p=0.2),
    Gain(min_gain_in_db=-15, max_gain_in_db=15, p=0.3),
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.2),
    AddGaussianSNR(p=0.2)
#     TimeStretch(min_rate=0.8, max_rate=1.25, p=0.2)
#     Shift(min_fraction=-0.1, max_fraction=0.1, p=0.2),
])


melspectrogram_parameters = {
        "n_mels": 256,
        'n_fft': 2048, 
        'hop_length': 512
    }

spec_augment = albumentations.Compose([
    MelSpectrogram(parameters=melspectrogram_parameters, always_apply=True),
    SpecAugment(p=0.2),
    SpectToImage(always_apply=True)
])

to_image = albumentations.Compose([
    MelSpectrogram(parameters=melspectrogram_parameters, always_apply=True),
    SpectToImage(always_apply=True)
])

def augment(wav):
    data = sound_augment(samples=wav, sample_rate=SR), SR
    image = spec_augment(data=data)['data']
    return image.transpose(2, 1, 0)

def get_image(wav):
    data = wav, SR
    image = to_image(data=data)['data']
    return image.transpose(2, 1, 0)

In [15]:
from torchvision import transforms
class CustomDataset(Dataset):
    def __init__(self, index, is_train=True, is_valid=False):
        self.index = index
        self.is_train = is_train
        self.is_valid = is_valid
        self.transformer = transforms.Compose([
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    
    def __len__(self):
        return len(self.index)
    
    def __getitem__(self, idx):
        if self.is_train:
            if self.is_valid:
                img = get_image(X_train[self.index[idx]])
            else:
                img = augment(X_train[self.index[idx]])
            return torch.tensor(img, dtype=torch.float32), y_train[self.index[idx]]
        else:
            img = get_image(X_test[self.index[idx]])
            return torch.tensor(img, dtype=torch.float32)

In [16]:
BATCH_SIZE_TRAIN = 128
BATCH_SIZE_VAL = 128
BATCH_SIZE_TEST = 128
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=GLOBAL_SEED)
data_folds = []
valid_indexs = []    


test_dataset = CustomDataset(np.arange(X_test.shape[0]), is_train=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE_TEST, sampler=SequentialSampler(test_dataset), shuffle=False, num_workers=4)

for idx, (train_index, valid_index) in enumerate(kf.split(X=X_train, y=y_train)):
    valid_indexs.append(valid_index)
    
    train_dataset = CustomDataset(train_index, is_train=True)
    val_dataset = CustomDataset(valid_index, is_train=True, is_valid=True)

    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=True, num_workers=4)
    valid_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE_VAL, sampler=SequentialSampler(val_dataset), shuffle=False, num_workers=4)
    data_folds.append((train_dataloader, valid_dataloader, test_dataloader))

## 搭建模型

In [17]:
from resnest.torch import resnest50
import torchvision

In [19]:
class DenseNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = torchvision.models.densenet161(pretrained=True)
        self.model.classifier = nn.Linear(2208, 30)    
        
        
    def forward(self, X):
        return self.model(X)

In [20]:
def validate(model, val_dataloader, criterion, history, n_iters):
    model.eval()
    costs = []
    accs = []
    with torch.no_grad():
        for idx, batch in enumerate(val_dataloader):
            X, y = batch
            X, y = X.cuda(), y.cuda().long()
            y_output = model(X)    
            loss = criterion(y_output, y)
            costs.append(loss.item())
            _, y_preds = torch.max(y_output, 1)
            accs.append((y_preds == y).float().mean().item())
    mean_accs = np.mean(accs)
    mean_costs = np.mean(costs)
    writer.add_scalar('age/validate_accuracy', mean_accs, n_iters)
    writer.add_scalar('age/validate_loss', mean_costs, n_iters)
    if mean_accs > history['best_acc']:  
        history['best_acc'] = mean_accs
        checkpoint_pth = history['best_model_path']
        torch.save(model.state_dict(), checkpoint_pth)
    return mean_costs, mean_accs


def train(model, train_dataloader, val_dataloader, criterion, optimizer, epoch, history, validate_points, scheduler, step=True):
    model.train()
    costs = []
    accs = []
    val_loss, val_acc = 0, 0
    with tqdm(total=len(train_dataloader.dataset), desc='Epoch{}'.format(epoch)) as pbar:
        for idx, batch in enumerate(train_dataloader):
            X, y = batch
            X, y = X.cuda(), y.cuda().long()
            y_output = model(X)    
            loss = criterion(y_output, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step:
                scheduler.step()
            with torch.no_grad():
                costs.append(loss.item())
                _, y_preds = torch.max(y_output, 1)
                accs.append((y_preds == y).float().mean().item())
                pbar.update(y.size(0))
            n_iters = idx + len(train_dataloader) * (epoch-1)
            if idx in validate_points:
                val_loss, val_acc = validate(model, val_dataloader, criterion, history, n_iters)
                model.train()
            
            writer.add_scalar('age/train_accuracy', accs[-1], n_iters)
            writer.add_scalar('age/train_loss', costs[-1], n_iters)
            writer.add_scalar('age/learning_rate', scheduler.get_lr()[0], n_iters)
            pbar.set_postfix_str('loss:{:.4f}, acc:{:.4f}, val-loss:{:.4f}, val-acc:{:.4f}'.format(np.mean(costs[-10:]), np.mean(accs[-10:]), val_loss, val_acc))
            torch.cuda.empty_cache()

    
def test(oof_train_test, model, test_dataloader, val_dataloader, valid_index, weight=1):
    model.eval()
    y_preds = []
    y_preds_val = []
    with torch.no_grad():
        for idx, batch in enumerate(test_dataloader):
            X = batch
            X= X.cuda()
            y_output = model(X)    
            y_preds.append(y_output.cpu())
            
        for idx, batch in enumerate(val_dataloader):
            X, y = batch
            X = X.cuda()
            y_output = model(X)
            y_preds_val.append(y_output.cpu())
    
    oof_train_test[valid_index] += F.softmax(torch.cat(y_preds_val)).numpy() * weight
    oof_train_test[57886:] += F.softmax(torch.cat(y_preds)).numpy() * weight

In [21]:
def criterion(y_output, y_true):
    loss = nn.CrossEntropyLoss()(y_output, y_true)
    return loss

res_folds = []
acc_folds = []
model_name = 'densenet161_augment_0'
for idx, (train_dataloader, val_dataloader, test_dataloader) in enumerate(data_folds):
    oof_train_test = np.zeros((X_train.shape[0] + X_test.shape[0], 30))
    history = {'best_acc': 0, 'best_model_path':os.path.join(model_save, '{}_checkpoint_fold_{}.pth'.format(model_name, idx))}
    validate_points = list(np.linspace(0, len(train_dataloader)-1, 3).astype(int))[1:]
    model = DenseNet().cuda()
#     model = nn.DataParallel(model, device_ids=[0, 1])
    optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.999), lr=1e-3)
    epochs = 5
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
#     scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=1e-5, max_lr=2e-3, step_size_up=int(len(train_dataloader)/2), cycle_momentum=False, mode='triangular')
#     scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=3e-3, epochs=epochs, steps_per_epoch=len(train_dataloader), pct_start=0.2, anneal_strategy='linear', div_factor=30, final_div_factor=1e4)
    for epoch in range(1, epochs+1):
        writer = SummaryWriter(log_dir=os.path.join(tensorboard_path, '{}_fold_{}'.format(model_name, idx)))
        train(model, train_dataloader, val_dataloader, criterion, optimizer, epoch, history, validate_points, scheduler, step=False)
        scheduler.step()
        gc.collect()
    model.load_state_dict(torch.load(history['best_model_path'], map_location= torch.device('cpu')), strict=True)
    test(oof_train_test, model, test_dataloader, val_dataloader, valid_indexs[idx], weight=1)
    acc_folds.append(history['best_acc'])
    res_folds.append(oof_train_test)
    np.save(os.path.join(res_path, "{}_fold_{}.npy".format(model_name, idx)), oof_train_test)
    del model, history 
    gc.collect()
    torch.cuda.empty_cache()

Epoch1: 100%|██████████| 46308/46308 [08:02<00:00, 96.05it/s, loss:0.2494, acc:0.9248, val-loss:0.3475, val-acc:0.8942] 
Epoch2: 100%|██████████| 46308/46308 [07:55<00:00, 97.30it/s, loss:0.1829, acc:0.9444, val-loss:0.1290, val-acc:0.9616] 
Epoch3: 100%|██████████| 46308/46308 [07:58<00:00, 96.73it/s, loss:0.1444, acc:0.9630, val-loss:0.0991, val-acc:0.9705] 
Epoch4: 100%|██████████| 46308/46308 [07:59<00:00, 96.65it/s, loss:0.1124, acc:0.9722, val-loss:0.0892, val-acc:0.9734] 
Epoch5: 100%|██████████| 46308/46308 [07:48<00:00, 98.81it/s, loss:0.1071, acc:0.9689, val-loss:0.0812, val-acc:0.9763] 
Epoch1: 100%|██████████| 46309/46309 [07:56<00:00, 97.09it/s, loss:0.2573, acc:0.9220, val-loss:0.2728, val-acc:0.9170] 
Epoch2: 100%|██████████| 46309/46309 [07:58<00:00, 96.86it/s, loss:0.1932, acc:0.9446, val-loss:0.1291, val-acc:0.9606] 
Epoch3: 100%|██████████| 46309/46309 [07:57<00:00, 96.89it/s, loss:0.1054, acc:0.9722, val-loss:0.1013, val-acc:0.9703] 
Epoch4: 100%|██████████| 46309/4

In [22]:
acc_folds

[0.9762694200316628,
 0.9750819356886895,
 0.9709821428571429,
 0.9762416861869476,
 0.9791380494505495]

In [24]:
res_folds

[array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [7.85733759e-02, 4.25898797e-05, 3.19214882e-06, ...,
         9.37322238e-06, 3.30220617e-04, 5.94436733e-06],
        [9.99553263e-01, 2.14319229e-07, 4.09443608e-08, ...,
         1.15420889e-07, 1.89542170e-05, 9.49692147e-09],
        ...,
        [1.10477504e-06, 2.02628271e-05, 1.64571829e-05, ...,
         6.38984488e-07, 1.03654684e-05, 5.91033881e-07],
        [1.57495541e-03, 1.76962127e-03, 4.35744151e-02, ...,
         1.51485763e-03, 3.20568425e-03, 1.65873766e-03],
        [6.98836402e-07, 6.13883856e-07, 2.10523172e-06, ...,
         9.73290870e-09, 2.33207658e-07, 6.19521012e-09]]),
 array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 

In [25]:
res = []
for i in range(len(data_folds)):
    res.append(np.load(os.path.join(res_path, "{}_fold_{}.npy".format(model_name, i))))

In [27]:
sub = pd.DataFrame()
sub['file_name'] = test_names
sub['label'] = np.argmax(np.mean(res, axis=0)[57886:], axis=1)

In [28]:
sub['label'] = sub['label'].map({i:label for i, label in enumerate(labels)})

In [29]:
sub

Unnamed: 0,file_name,label
0,3o9p4zffh0.wav,marvin
1,srdw856mtq.wav,three
2,k42nwx43w4.wav,yes
3,6km36wy1rq.wav,five
4,mi8mrzrdra.wav,two
...,...,...
6830,mk1xjjrsuv.wav,happy
6831,0ctd4hbh13.wav,two
6832,akuoa16fdq.wav,stop
6833,vrjj8ay7x0.wav,tree


In [32]:
now = time.strftime("%Y%m%d_%H%M%S",time.localtime(time.time())) 
fname="submit_" + model_name + "_" + now + ".csv"    
sub.to_csv(os.path.join(res_path, fname), index=False)