In [270]:
import librosa
import torch
import numpy as np
from PIL import Image

from torchvision import transforms
from torch import nn

In [271]:
FILE_PATH = "/mnt/c/Users/86181/Datasets/Bird"
BIRD_LABEL = {
    "0009": "灰雁",
    "0017": "大天鹅",
    "0034": "绿头鸭",
    "0036": "绿翅鸭",
    "0074": "灰山鹑",
    "0077": "西鹌鹑",
    "0114": "雉鸡",
    "0121": "红喉潜鸟",
    "0180": "苍鹭",
    "0202": "普通鸬鹚",
    "0235": "苍鹰",
    "0257": "欧亚鵟",
    "0265": "西方秧鸡",
    "0281": "骨顶鸡",
    "0298": "黑翅长脚鹬",
    "0300": "凤头麦鸡",
    "0364": "白腰草鹬",
    "0368": "红脚鹬",
    "0370": "林鹬",
    "1331": "麻雀"
}

In [272]:
# CNN
class CNN(nn.Module):
    def __init__(self):
        super(CNN,self).__init__()

        self.conv1 = nn.Conv2d(in_channels=1,
                               out_channels=32,
                               kernel_size=(3, 3),
                               stride=(1, 1),
                               padding=0)

        self.conv2 = nn.Conv2d(in_channels=32,
                               out_channels=64,
                               kernel_size=(3, 3),
                               stride=(1, 1),
                               padding=0)

        self.conv3 = nn.Conv2d(in_channels=64,
                               out_channels=128,
                               kernel_size=(3, 3),
                               stride=(1, 1),
                               padding=0)

        self.conv4 = nn.Conv2d(in_channels=128,
                               out_channels=256,
                               kernel_size=(3, 3),
                               stride=(1, 1),
                               padding=0)

        self.batchNorm1 = nn.BatchNorm2d(32)
        self.batchNorm2 = nn.BatchNorm2d(64)
        self.batchNorm3 = nn.BatchNorm2d(128)
        self.batchNorm3 = nn.BatchNorm2d(256)
        self.relu = nn.LeakyReLU()
        self.max_pool = nn.MaxPool2d(kernel_size=2)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.linear1 = nn.Linear(in_features=256, out_features=32)
        self.linear2 = nn.Linear(in_features=32, out_features=20)

    def forward(self,x):
        x = self.conv1(x)
        x = self.batchNorm1(x)
        x = self.relu(x)
        x = self.max_pool(x)

        x = self.conv2(x)
        x = self.batchNorm2(x)
        x = self.relu(x)
        x = self.max_pool(x)

        x = self.conv3(x)
        # x = self.batchNorm3(x)
        x = self.relu(x)
        x = self.max_pool(x)
        x = self.dropout1(x)

        x = self.conv4(x)
        # x = self.batchNorm4(x)
        x = self.relu(x)
        x = self.max_pool(x)
        x = self.dropout1(x)

        x = x.view(x.shape[0], -1) # 将数据设置为1维
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout2(x)
        x = self.linear2(x)
        return x

In [273]:
def convert_to_mfcc(file_name):
    y, sr = librosa.load(file_name,sr=None)
    # spectrogram
    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
    S_dB = librosa.power_to_db(spectrogram, ref=np.max) # use positive value!
    mfcc = librosa.feature.mfcc(S=S_dB,sr=None,n_mfcc=40,dct_type=2)
    mfcc = Image.fromarray(mfcc)
    return mfcc

In [274]:
def softmax(x):
    x = x - np.max(x)
    return np.exp(x)/np.sum(np.exp(x))

In [275]:
cnn = torch.load('cnn.pkl')
transform = transforms.Compose([
        transforms.Resize((50,50)),
        transforms.ToTensor(),
        transforms.Normalize(mean=0.5,std=0.5)
    ])

In [276]:
class_name = "0281"
file_name = "290480_3.wav"
full_file_name = FILE_PATH+'/'+class_name+'/'+file_name
keys = list(BIRD_LABEL.keys())

input_data = convert_to_mfcc(full_file_name)

input_data = transform(input_data)
input_data = input_data.unsqueeze(0)
with torch.no_grad():
    output_result = cnn(input_data)
output_result = np.array(output_result.squeeze())
output_result = softmax(output_result)
print(output_result*100)

predicted_result = output_result.argmax()
predicted_kind = BIRD_LABEL[keys[predicted_result]]
print("predicted result: {}".format(predicted_kind))
print("actual result: {}".format(BIRD_LABEL[class_name]))



[1.1829198e-01 1.2484890e-08 3.0809904e-03 5.8781046e-01 2.9499416e-09
 1.6363701e-07 1.6667817e-02 2.3498764e+00 2.1472057e-10 5.5874353e-03
 6.8742892e-08 2.4216066e-03 2.7947916e-02 9.6553925e+01 1.5290799e-04
 1.1838568e-13 8.9512079e-04 3.3327785e-01 5.2874083e-07 5.9161819e-05]
predicted result: 骨顶鸡
actual result: 骨顶鸡
