In [1]:
!pip install -q pydub
!pip install -q colored

In [None]:
#CPU times: user 52min 25s, sys: 1min 33s, total: 53min 58s
#Wall time: 56min 7s

In [2]:
import os
import gc
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from colored import fg, bg, attr

import pydub
import librosa
from pydub import AudioSegment as AS
from librosa.core import power_to_db as ptdb
from librosa.feature import melspectrogram as melsp

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from torch import FloatTensor, LongTensor, DoubleTensor

from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences as pad

In [3]:
U = 64
H = 128
EPOCHS = 5
SPLIT = 0.8
MAXLEN = 1000000
BATCH_SIZE = 128
VAL_BATCH_SIZE = 128
LEARNING_RATE = 1e-2
K = [2**i for i in range(1, 9)]

In [4]:
TRAIN_AUDIO_PATH = 'birdsong-recognition/train_audio/'
SUBMISSION_PATH = 'birdsong-recognition/sample_submission.csv'

In [5]:
test_df = pd.read_csv('birdsong-recognition/test.csv')
train_df = pd.read_csv('birdsong-recognition/train.csv')

In [6]:
test_df.head()

Unnamed: 0,site,row_id,seconds,audio_id
0,site_1,site_1_41e6fe6504a34bf6846938ba78d13df1_5,5.0,41e6fe6504a34bf6846938ba78d13df1
1,site_1,site_1_41e6fe6504a34bf6846938ba78d13df1_10,10.0,41e6fe6504a34bf6846938ba78d13df1
2,site_1,site_1_41e6fe6504a34bf6846938ba78d13df1_15,15.0,41e6fe6504a34bf6846938ba78d13df1
3,site_1,site_1_41e6fe6504a34bf6846938ba78d13df1_20,20.0,41e6fe6504a34bf6846938ba78d13df1
4,site_1,site_1_41e6fe6504a34bf6846938ba78d13df1_25,25.0,41e6fe6504a34bf6846938ba78d13df1


In [7]:
train_df.head()

Unnamed: 0,rating,playback_used,ebird_code,channels,date,pitch,duration,filename,speed,species,...,xc_id,url,country,author,primary_label,longitude,length,time,recordist,license
0,3.5,no,aldfly,1 (mono),2013-05-25,Not specified,25,XC134874.mp3,Not specified,Alder Flycatcher,...,134874,https://www.xeno-canto.org/134874,United States,Jonathon Jongsma,Empidonax alnorum_Alder Flycatcher,-92.962,Not specified,8:00,Jonathon Jongsma,Creative Commons Attribution-ShareAlike 3.0
1,4.0,no,aldfly,2 (stereo),2013-05-27,both,36,XC135454.mp3,both,Alder Flycatcher,...,135454,https://www.xeno-canto.org/135454,United States,Mike Nelson,Empidonax alnorum_Alder Flycatcher,-82.1106,0-3(s),08:30,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...
2,4.0,no,aldfly,2 (stereo),2013-05-27,both,39,XC135455.mp3,both,Alder Flycatcher,...,135455,https://www.xeno-canto.org/135455,United States,Mike Nelson,Empidonax alnorum_Alder Flycatcher,-82.1106,0-3(s),08:30,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...
3,3.5,no,aldfly,2 (stereo),2013-05-27,both,33,XC135456.mp3,both,Alder Flycatcher,...,135456,https://www.xeno-canto.org/135456,United States,Mike Nelson,Empidonax alnorum_Alder Flycatcher,-82.1106,0-3(s),08:30,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...
4,4.0,no,aldfly,2 (stereo),2013-05-27,both,36,XC135457.mp3,level,Alder Flycatcher,...,135457,https://www.xeno-canto.org/135457,United States,Mike Nelson,Empidonax alnorum_Alder Flycatcher,-82.1106,0-3(s),08:30,Mike Nelson,Creative Commons Attribution-NonCommercial-Sha...


In [22]:
keys = set(train_df.ebird_code)
values = np.arange(0, len(keys))
code_dict = dict(zip(sorted(keys), values))

In [9]:
def normalize(x):
    return np.float32(x)/2**15

def read(file, norm=False):
    try: a = AS.from_mp3(file)
    except: return np.zeros(MAXLEN)
    y = np.array(a.get_array_of_samples())
    if a.channels == 2: y = y.reshape((-1, 2))
    if norm: return a.frame_rate, normalize(y)
    if not norm: return a.frame_rate, np.float32(y)

def write(file, sr, x, normalized=False):
    birds_audio_bitrate, file_format = '320k', 'mp3'
    ch = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    y = np.int16(x * 2 ** 15) if normalized else np.int16(x)
    song = AS(y.tobytes(), frame_rate=sr, sample_width=2, channels=ch)
    song.export(file, format=file_format, bitrate=birds_audio_bitrate)

In [10]:
def to_tensor(data):
    return [FloatTensor(point) for point in data]

def norm_melsp(data):
    return (data - np.mean(data))/(np.std(data) + 1e-7)

def get_signal(data):
    return np.float32(pad(data.flatten().reshape(1, -1), maxlen=MAXLEN).reshape(-1))

In [11]:
class BirdDataset(Dataset):
    def __init__(self, df, path):
        self.code_dict = code_dict
        self.classes = len(code_dict)
        self.df, self.path = df, path
        self.dataset_length = len(df)
        
    def __len__(self):
        return self.dataset_length
    
    def __getitem__(self, i):
        file_name = self.df.filename[i]
        ebird_code = self.df.ebird_code[i]
        num_code = self.code_dict[ebird_code]
        sr, data = read(self.path + ebird_code + '/' + file_name)
        code = to_categorical([num_code], num_classes=self.classes)
        return to_tensor([norm_melsp(ptdb(melsp(y=get_signal(data), sr=sr))), code])

In [25]:
split = int(SPLIT*len(train_df))
#print(split,len(train_df))
valid_df = train_df[split:].reset_index(drop=True)
train_df = train_df[:split].reset_index(drop=True)

train_set = BirdDataset(train_df, TRAIN_AUDIO_PATH)
valid_set = BirdDataset(valid_df, TRAIN_AUDIO_PATH)

valid_loader = DataLoader(valid_set, batch_size=VAL_BATCH_SIZE)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

In [13]:
class BirdPoolCNN(nn.Module):
    def __init__(self, h, k, u, o, device):
        super(BirdPoolCNN, self).__init__()
        self.dense_output = nn.Linear(u*len(k), o).to(device)
        self.conv1d = [nn.Conv1d(h, u, s).to(device) for s in k]
        
    def forward(self, x):
        x = x/x.max(axis=-1)[0].unsqueeze(-1)
        features = [layer(x) for layer in self.conv1d]
        features = [f.mean(axis=-1) for f in features]
        return self.dense_output(torch.cat(features, axis=1))

In [14]:
O = len(code_dict)
device = torch.device('cuda')
#K = [2**i for i in range(1, 9)]
network = BirdPoolCNN(h=H, k=K, u=U, o=O, device=device)
optimizer = Adam(params=network.parameters(), lr=LEARNING_RATE)

In [15]:
def cel(y_true, y_pred):
    y_true = torch.argmax(y_true, axis=-1)
    return nn.CrossEntropyLoss()(y_pred, y_true.squeeze())

def accuracy(y_true, y_pred):
    y_true = torch.argmax(y_true, axis=-1).squeeze()
    y_pred = torch.argmax(y_pred, axis=-1).squeeze()
    return (y_true == y_pred).float().sum()/len(y_true)

In [30]:
def print_metric(data, batch,
                 epoch, start,
                 end, metric, typ):

    t = typ, metric, "%s", data, "%s"
    if typ == "Train": pre = "BATCH " + str(batch-1) + "  "
    if typ == "Val": pre = "\nEPOCH " + str(epoch+1) + "  "
    time = np.round(end - start, 1); time = "Time: {} s".format(time)
    fonts = [(fg(211), attr('reset')), (fg(212), attr('reset')), (fg(213), attr('reset'))]
    print(fonts)
    #print(pre % fonts[0] + "{} {}: {}{}{}".format(*t) % fonts[1] + "  " + time % fonts[2])
    print(pre + "{} {}: {}{}{}".format(*t) + "  " + time)

In [34]:
start = time.time()
#file1 = open("logs.txt","w+") 

print("starting to Traing ...\n")
#file1.writelines("starting to Traing ...\n")
for epoch in range(EPOCHS):
    fonts = (fg(48), attr('reset'))
    logstr = ""
    print(("EPOCH " + str(epoch+1) + ""))
    logstr = ("EPOCH" + str(epoch+1)) + "\n"
    #file1.writelines(logstr)
    batch = 1
    network.train()
    for minibatch in train_loader:
        train_X, train_y = minibatch
        train_X = train_X.to(device)
        train_y = train_y.to(device)

        train_preds = network.forward(train_X)
        train_loss = cel(train_y, train_preds)
        train_accuracy = accuracy(train_y, train_preds)
        
        optimizer.zero_grad()
        train_loss.backward()

        optimizer.step()
        end = time.time()
        batch = batch + 1
        acc = np.round(train_accuracy.item(), 3)
        print_metric(acc, batch, 0, start, end, "Acc", "Train")
    
    val_loss = 0
    val_points = 0
    val_accuracy = 0

    with torch.no_grad():
        for minibatch in valid_loader:
            valid_X, valid_y = minibatch
            valid_X = valid_X.to(device)
            valid_y = valid_y.to(device)
            
            valid_points += len(valid_y)
            valid_preds = network.forward(valid_X)
            valid_loss += cel(valid_y, valid_preds)*len(valid_y)
            valid_accuracy += accuracy(valid_y, valid_preds)*len(valid_y)
            
    end = time.time()
    valid_loss /= valid_points
    valid_accuracy /= valid_points
    acc = np.round(valid_accuracy, 3)
    print_metric(acc, 0, epoch, start, end, "Acc", "Val")
print("ending to Traing ...")
#file1.writelines("ending to Traing ...\n")
#file1.close()

starting to Traing ...

EPOCH 1


ValueError: too many values to unpack (expected 2)