In [0]:
import tensorflow as tf, pandas as pd, numpy as np, time, os

In [2]:
!pip install pydub

Collecting pydub
  Downloading https://files.pythonhosted.org/packages/79/db/eaf620b73a1eec3c8c6f8f5b0b236a50f9da88ad57802154b7ba7664d0b8/pydub-0.23.1-py2.py3-none-any.whl
Installing collected packages: pydub
Successfully installed pydub-0.23.1


In [0]:
# pydub
from pydub import AudioSegment

In [4]:
# Fetch the silic repository into ./silic; later cells read its model/ and
# sample/ directories by relative path.
!git clone https://github.com/RedbirdTaiwan/silic.git

Cloning into 'silic'...
remote: Enumerating objects: 38, done.
remote: Counting objects: 100% (38/38), done.
remote: Compressing objects: 100% (27/27), done.
remote: Total 56 (delta 11), reused 37 (delta 10), pack-reused 18
Unpacking objects: 100% (56/56), done.


In [5]:
# Inspect the clone: repository root, then the model files and the sample
# recording that the cells below refer to.
!ls silic/
!ls silic/model
!ls silic/sample

LICENSE  model	README.md  sample  scripts
conv_labels.txt  frozen_graph.pb  sound_class.txt
11794.mp3


In [0]:
from google.colab import files

In [0]:
# Ask for files to upload through google.colab's `files` helper.
# NOTE(review): the returned value is not stored and no cell visible here
# refers to an uploaded file -- confirm whether this cell is still needed.
files.upload()

In [0]:
class Autolabel():
    """Score fixed-length clips of a recording with a frozen TensorFlow graph.

    Typical use: construct with the label and graph files, call ``readfile``
    to load a recording, call ``ailabel`` to score it, then read ``results``.

    Attributes:
        label_file: Path of the text file holding one label per line.
        graph_file: Path of the serialized ``GraphDef`` (frozen graph).
        filename: Path given to the most recent successful ``readfile`` call.
        labels_list: Labels read from ``label_file`` in file order. The first
            two entries are skipped when scoring; every later entry must
            parse as an integer sound id.
        graph: ``tf.Graph`` owned by this instance that holds the model.
        sess: Session bound to ``graph``.
        softmax_tensor: The ``labels_softmax:0`` tensor of the model.
        results: ``{sound_id: [[start_ms, score], ...]}`` filled by
            ``ailabel``.
    """

    # Length, in milliseconds, of every clip handed to the model.
    CLIP_DURATION_MS = 2000
    # Recordings with a higher frame rate are resampled down to this value.
    MAX_FRAME_RATE = 30000

    def __init__(self, label_file, graph_file):
        """Read the label list and load the frozen graph.

        Args:
            label_file: Path of the label file (one label per line).
            graph_file: Path of the frozen ``GraphDef`` file.
        """
        self.label_file = label_file
        self.graph_file = graph_file
        self.__sound = ''
        self.filename = ''
        self.results = {}
        # Close the label file deterministically instead of leaving the
        # handle to the garbage collector.
        with tf.io.gfile.GFile(self.label_file) as labels:
            self.labels_list = [line.rstrip() for line in labels]
        graph_def = tf.compat.v1.GraphDef()
        with tf.io.gfile.GFile(self.graph_file, 'rb') as f:
            graph_def.ParseFromString(f.read())
        # Import into a graph owned by this instance: re-running the
        # constructor (common in a notebook) then no longer piles duplicate
        # nodes onto the process-wide default graph.
        self.graph = tf.Graph()
        with self.graph.as_default():
            tf.import_graph_def(graph_def, name='')
        self.sess = tf.compat.v1.Session(graph=self.graph)
        self.softmax_tensor = self.graph.get_tensor_by_name('labels_softmax:0')

    def readfile(self, audio_file):
        """Load a recording and normalise it for the model.

        The decoder is picked from the last three characters of the path
        (``mp3``, ``wma``, ``m4a``, ``ogg``); anything else is read as WAV.
        The audio is then resampled down to ``MAX_FRAME_RATE`` if it is
        faster, reduced to its first channel if it has several, and converted
        to a sample width of 2.

        Args:
            audio_file: Path of the recording to load.
        """
        filext = audio_file[-3:].lower()
        if filext == "mp3":
            sound = AudioSegment.from_mp3(audio_file)
        elif filext in ("wma", "m4a"):
            sound = AudioSegment.from_file(audio_file, filext)
        elif filext == "ogg":
            sound = AudioSegment.from_ogg(audio_file)
        else:
            sound = AudioSegment.from_wav(audio_file)
        if sound.frame_rate > self.MAX_FRAME_RATE:
            sound = sound.set_frame_rate(self.MAX_FRAME_RATE)
        if sound.channels > 1:
            sound = sound.split_to_mono()[0]
        if not sound.sample_width == 2:
            sound = sound.set_sample_width(2)
        # Publish only after every step succeeded, so a failed load never
        # leaves ``filename`` pointing at a recording that is not loaded.
        self.filename = audio_file
        self.__sound = sound

    def ailabel(self, **kwargs):
        """Score the loaded recording clip by clip and fill ``results``.

        Clips are ``CLIP_DURATION_MS`` long and start every ``step``
        milliseconds; a trailing clip that would run past the end of the
        recording is not scored.

        Keyword Args:
            step: Milliseconds between clip starts. Defaults to the clip
                duration (back-to-back clips). Must be positive.
            target: Iterable of sound ids to report. Ids are converted with
                ``int`` so ``'12'`` and ``12`` select the same label. When
                missing or empty, every label from index 2 onward of
                ``labels_list`` is reported.

        Returns:
            ``False`` if no recording has been loaded, otherwise ``None``.

        Raises:
            ValueError: If ``step`` is not positive, or a target id (or a
                label from index 2 onward) cannot be converted to ``int``.
        """
        import io  # function-scope import keeps the class cell self-contained

        if not len(self.__sound):
            print('use ".readfile(<audio_file_path>)" to import a recording')
            return False
        clip_duration_ms = self.CLIP_DURATION_MS
        step = kwargs.get('step', clip_duration_ms)
        if step <= 0:
            raise ValueError('step must be a positive number of milliseconds')
        # Position i in this list corresponds to model output index i + 2.
        sound_ids = [int(label) for label in self.labels_list[2:]]
        target = kwargs.get('target')
        if not target:
            self.results = {sound_id: [] for sound_id in sound_ids}
        else:
            # Normalise to int: the lookup below always uses int ids, so
            # string targets would otherwise never match.
            self.results = {int(sound_id): [] for sound_id in target}
        last_start = len(self.__sound) - clip_duration_ms
        for t in range(0, last_start + 1, step):
            # Encode the clip as WAV in memory; no temporary file is written
            # next to the source audio, so nothing is left behind on error.
            wav_buffer = io.BytesIO()
            self.__sound[t:t + clip_duration_ms].export(wav_buffer, format="wav")
            predictions, = self.sess.run(
                self.softmax_tensor, {'wav_data:0': wav_buffer.getvalue()})
            for node_id, sound_id in enumerate(sound_ids, start=2):
                if sound_id in self.results:
                    score = round(predictions[node_id], 3)
                    self.results[sound_id].append([t, score])

In [0]:
# Build the labeller from the label list and frozen graph in the cloned
# silic repository.
test = Autolabel('silic/model/conv_labels.txt', 'silic/model/frozen_graph.pb')

In [0]:
# Load the sample recording that ships in the cloned repository.
test.readfile('silic/sample/11794.mp3')

In [0]:
# Score a clip starting every 1000 ms; the per-label scores are stored in
# test.results, not returned.
test.ailabel(step=1000)

In [21]:
# Report every scored clip whose score reaches 0.65, as tab-separated rows
# of "scientific_name:class_name", clip start in seconds, and score.
results = test.results
# Rows of the class table keyed by sound id: {sound_id: {column: value}}.
soundclass = pd.read_csv('silic/model/sound_class.txt', index_col=0, sep='\t').T.to_dict()
print('sound', 'time_start', 'ai_score', sep='\t')
print('===============================')
for soundid, scores in results.items():
  for score in scores:
    # Each entry is [start_ms, score]; skip clips below the threshold.
    if score[1] < 0.65:
      continue
    entry = soundclass[int(soundid)]
    sound = entry['scientific_name'] + ':' + entry['class_name']
    time_start = score[0]/1000
    ai_score = score[1]
    print(sound, time_start, ai_score, sep='\t')

sound	time_start	ai_score
Otus spilocephalus:S-01	60.0	0.654
Otus spilocephalus:S-01	236.0	0.738
Ninox japonica:S-01	1.0	0.66
Ninox japonica:S-01	5.0	0.81
Ninox japonica:S-01	36.0	0.951
Ninox japonica:S-01	37.0	0.824
Ninox japonica:S-01	38.0	0.911
Ninox japonica:S-01	39.0	0.949
Ninox japonica:S-01	40.0	0.942
Ninox japonica:S-01	41.0	0.867
Ninox japonica:S-01	42.0	0.805
Ninox japonica:S-01	43.0	0.985
Ninox japonica:S-01	44.0	0.865
Ninox japonica:S-01	45.0	0.876
Ninox japonica:S-01	46.0	0.812
Ninox japonica:S-01	47.0	0.934
Ninox japonica:S-01	49.0	0.868
Ninox japonica:S-01	50.0	0.821
Ninox japonica:S-01	51.0	0.949
Ninox japonica:S-01	93.0	0.943
Ninox japonica:S-01	94.0	0.789
Ninox japonica:S-01	95.0	0.939
Ninox japonica:S-01	96.0	0.895
Ninox japonica:S-01	97.0	0.977
Ninox japonica:S-01	98.0	0.876
Ninox japonica:S-01	99.0	0.953
Ninox japonica:S-01	100.0	0.943
Ninox japonica:S-01	101.0	0.938
Ninox japonica:S-01	102.0	0.943
Ninox japonica:S-01	103.0	0.808
Ninox japonica:S-01	104.0	0.976
Nin