In [None]:
import subprocess
import numpy as np
import IPython.display as ipd
import pathlib
import re
import typing
from collections import namedtuple
from operator import itemgetter
from matplotlib import pyplot as plt
from random import random
from ipywidgets import FloatProgress
import tensorflow as tf
from bisect import bisect_left
%matplotlib inline

In [None]:
# Notebook configuration. These are absolute paths; adjust them to the local
# environment before running the cells below.

# ffmpeg executable that is launched as a subprocess to probe and decode audio.
FFMPEG_BIN = r"/data/ffmpeg-git-20191022-amd64-static/ffmpeg"
# Folder that AUDIO_FILE and LABEL_FILE are resolved against.
AUDIO_FOLDER = pathlib.Path(r"/data")
# Audio recording to decode (file name relative to AUDIO_FOLDER).
AUDIO_FILE = r"teste-tom-silencio.ogg"
# Label file for that recording; each line is parsed as "<start> <end> <label>".
LABEL_FILE = r"teste-tom-silencio.txt"

In [None]:
# One labelled stretch of time: start/end/duration are seconds, label is the text tag.
Interval = namedtuple("Interval", ["start", "end", "duration", "label"])


class Labels:
    """Labelled time intervals loaded from a text file, queryable by time window.

    Each non-blank line of the file must read ``<start> <end> <label>`` with
    whitespace-separated fields (times in seconds). The label is everything
    after the second field, so it may itself contain spaces. Lines are
    assumed to be sorted by start time and non-overlapping; the binary search
    in :meth:`get_interval_info` relies on that ordering.

    Gaps between consecutive labels (and before the first one) are filled
    with intervals tagged ``empty_label``. Zero-length labels are recorded in
    the set of known labels but produce no interval.
    """

    def __init__(self, labels_file, empty_label = "EMPTY_LABEL"):
        """Parse ``labels_file`` into a contiguous list of intervals.

        Args:
            labels_file: path of the label file to read.
            empty_label: tag given to the unlabelled gaps.

        Raises:
            ValueError: if a non-blank line has fewer than three fields or a
                time field is not a number.
        """
        with open(labels_file) as handle:
            txt = handle.read()
        last_end = 0
        intervals = []
        uniques = set()
        for line in txt.splitlines():
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no label.
                continue
            # maxsplit=2 keeps a label containing whitespace in one piece.
            start_txt, end_txt, label = line.split(None, 2)
            label = label.strip()
            uniques.add(label)
            start = float(start_txt)
            end = float(end_txt)
            duration = end - start
            if start - last_end > 0:
                # Fill the unlabelled gap since the previous label.
                intervals.append(Interval(last_end, start, start - last_end, empty_label))
                uniques.add(empty_label)
            last_end = end
            if duration == 0:
                continue
            intervals.append(Interval(start, end, duration, label))
        self._intervals = intervals
        self._labels = uniques
        # End time of every interval, in order; the search key for bisect.
        self._index = [ival.end for ival in intervals]

    def _print_intervals(self):
        """Print every stored interval together with its position."""
        for idx, ival in enumerate(self._intervals):
            print(idx, ival)

    def _get_interval_info(self, start, end, start_offset=0):
        """Return the share of the window ``[start, end)`` covered by each label.

        Args:
            start: window start, in seconds.
            end: window end, in seconds.
            start_offset: index of the first stored interval to examine.

        Returns:
            A list of ``(label, fraction)`` pairs sorted by decreasing
            fraction, where fraction is covered time divided by the window
            length. Empty when no examined interval overlaps the window or
            when the window has no positive length.
        """
        total_duration = end - start
        if total_duration <= 0:
            # A degenerate window would otherwise divide by zero below.
            return []
        percents = {}
        for ival in self._intervals[start_offset:]:
            if ival.end <= start:
                continue
            if ival.start >= end:
                # Past the window: nothing further can overlap it.
                break
            clipped_start = max(ival.start, start)
            clipped_end = min(ival.end, end)
            share = (clipped_end - clipped_start) / total_duration
            percents[ival.label] = percents.get(ival.label, 0) + share
            if clipped_end == end:
                # This interval reaches the end of the window.
                break
        return sorted(percents.items(), key=itemgetter(1), reverse=True)

    def get_interval_info(self, start, end):
        """Same as :meth:`_get_interval_info`, skipping ahead with a binary search."""
        offset = bisect_left(self._index, start)
        return self._get_interval_info(start, end, offset)

    def iter_seconds(self, offset=0):
        """Yield the label shares of consecutive one-second windows.

        Windows start at ``offset`` seconds and advance by one second;
        iteration stops at the first window that overlaps no interval.
        """
        start = 0 + offset
        while True:
            info = self.get_interval_info(start, start + 1)
            if len(info) == 0:
                break
            start = start + 1
            yield info

In [None]:
class AudioFile:
    """Reads an audio file by running an external ffmpeg executable.

    Decoding always asks ffmpeg for mono, signed 16-bit little-endian PCM at
    44100 Hz written to its standard output, so one second of audio is
    ``SAMPLE_RATE * BYTES_PER_SAMPLE`` bytes.
    """

    SAMPLE_RATE = 44100      # samples per second requested from ffmpeg (-ar)
    BYTES_PER_SAMPLE = 2     # s16le: one 16-bit sample per frame, single channel

    # Pulls the duration, sample rate and channel layout out of ffmpeg's banner.
    _INFO_REGEX = re.compile(r"Duration\:\s([\d\:\.]+).*?\,\s(\d+)\sHz\,\s(\w+)\,", re.DOTALL)

    def __init__(self, file_path, FFMPEG_BIN):
        """Remember the audio file path and the ffmpeg executable to use."""
        self._file_path = file_path
        self._FFMPEG_BIN = FFMPEG_BIN

    def _get_info(self):
        """Probe the file with ffmpeg and return its duration.

        Runs ``ffmpeg -i <file> -hide_banner`` and parses the text ffmpeg
        writes to stderr.

        Returns:
            A one-element list holding the duration in whole seconds (the
            fractional part is dropped). The value is also stored in
            ``self.secs``.

        Raises:
            ValueError: if the ffmpeg output does not contain the expected
                duration / audio stream description.
        """
        with subprocess.Popen(
            [self._FFMPEG_BIN, "-i", self._file_path, "-hide_banner"],
            stderr=subprocess.PIPE,
        ) as proc:
            raw_info = proc.stderr.read()
        # The banner echoes file names and metadata, which may be non-ASCII;
        # never let a stray byte abort the probe.
        infos = raw_info.decode("utf-8", errors="replace")
        match = self._INFO_REGEX.search(infos)
        if match is None:
            raise ValueError(
                "could not find duration/audio stream in ffmpeg output for %r" % (self._file_path,)
            )
        time = match.group(1)
        times = time.split(":")
        secs = (int(times[0]) * 60 * 60) + (int(times[1]) * 60) + (int(times[2].split(".")[0]))
        self.secs = secs
        return [secs]

    def _decode_command(self):
        """Build the ffmpeg command line that streams raw PCM to stdout."""
        return [
            self._FFMPEG_BIN,
            '-i', self._file_path,
            '-f', 's16le',
            '-acodec', 'pcm_s16le',
            '-ar', str(self.SAMPLE_RATE),  # output sample rate in Hz
            '-ac', '1',  # one channel (mono)
            '-',
        ]

    def _yield_raw_buffer(self, chuck_size=44100 * 2):
        """Yield the decoded PCM stream as ``bytes`` chunks.

        Args:
            chuck_size: number of bytes per chunk (parameter name kept as-is
                for backward compatibility). The final chunk may be shorter.
        """
        with subprocess.Popen(
            self._decode_command(),
            stdout=subprocess.PIPE,
            bufsize=chuck_size * 2,
        ) as pipe:
            # read() returns b"" once ffmpeg closes its stdout.
            for chunk in iter(lambda: pipe.stdout.read(chuck_size), b""):
                yield chunk

    def _yield_raw_by_seconds(self, seconds = 1):
        """Yield raw PCM chunks each covering ``seconds`` seconds of audio."""
        # Whole number of samples first, so a chunk never splits a 16-bit sample.
        samples_per_chunk = int(seconds * self.SAMPLE_RATE)
        return self._yield_raw_buffer(samples_per_chunk * self.BYTES_PER_SAMPLE)

    def iter_seconds(self):
        """Yield one int16 tensor per second of audio.

        Each tensor is built from one chunk of 44100 samples; the last one
        may hold fewer samples.
        """
        for chunk in self._yield_raw_by_seconds(1):
            yield tf.constant(np.frombuffer(chunk, dtype=np.int16))
                
class AudioWithLabels:
    """Pairs each second of decoded audio with the labels covering that second."""

    def __init__(self, audio_file, labels_file, ffmpeg):
        """Open the audio through ``ffmpeg`` and parse the label file.

        Args:
            audio_file: path of the audio file to decode.
            labels_file: path of the label file describing that audio.
            ffmpeg: path of the ffmpeg executable.
        """
        self._audio = AudioFile(audio_file, ffmpeg)
        self._labels = Labels(labels_file)

    def iter_seconds(self):
        """Yield ``(audio, labels, percent)`` for every complete second.

        ``audio`` is the one-second sample block, ``labels`` the list of
        ``(label, fraction)`` pairs for the same second, and ``percent`` the
        position of that second divided by the duration reported by ffmpeg
        (``0.0`` when that duration is zero). Blocks that do not hold exactly
        44100 samples are skipped. Iteration ends when either the audio or
        the labels run out.
        """
        secs = self._audio._get_info()[0]
        paired = zip(self._audio.iter_seconds(), self._labels.iter_seconds())
        for idx, (audio, labels) in enumerate(paired):
            if len(audio) == 0 or len(labels) == 0:
                break
            if audio.shape[0] != 44100:
                # Incomplete second (typically the tail of the file).
                continue
            # The reported duration is truncated to whole seconds and may be 0.
            percent = idx / secs if secs > 0 else 0.0
            yield audio, labels, percent

In [None]:
# Smoke test: walk the whole recording second by second, printing the label
# shares of each second while a progress bar tracks the position in the file.
awl = AudioWithLabels(AUDIO_FOLDER / AUDIO_FILE, AUDIO_FOLDER / LABEL_FILE, FFMPEG_BIN)
# Progress runs from 0 to 1 because iter_seconds() reports a fraction of the duration.
f = FloatProgress(min=0, 
                  max=1, 
    description='lendo audio...',
    bar_style='warning',
    orientation='horizontal')

ipd.display(f) # display the bar

# a = audio samples for the second, l = [(label, fraction), ...], p = progress fraction.
for i, (a, l, p) in enumerate(awl.iter_seconds()):
    f.value = p
    print(i, l)
    
# Keep the parsed intervals (including the filled-in gaps) available for inspection.
intervals = awl._labels._intervals

In [None]:
# Decode the recording to raw PCM on ffmpeg's stdout and keep only the first
# chunk, to inspect what the byte stream looks like.
command = [
    FFMPEG_BIN,
    '-i', AUDIO_FOLDER / AUDIO_FILE,
    '-f', 's16le',            # raw signed 16-bit little-endian samples
    '-acodec', 'pcm_s16le',
    '-ar', '44100',           # output sample rate: 44100 Hz
    '-ac', '1',               # one channel (mono)
    '-',                      # write to stdout
]
# 44100 bytes of 16-bit mono audio is 22050 samples, i.e. half a second.
CHUNK_SIZE = 44100
with subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=CHUNK_SIZE * 2) as pipe:
    # A single read is enough: only the first chunk is wanted (it is empty
    # if ffmpeg produced no output).
    chunk = pipe.stdout.read(CHUNK_SIZE)

In [None]:
# Peek at some of the raw bytes read in the previous cell. Samples are two
# bytes each (s16le), so a slice starting at the odd offset 5 begins in the
# middle of a sample; this is only a visual check of the byte stream.
chunk[5:200]

In [None]:
# Train a small binary classifier: does a one-second window mostly contain "TOM"?
awl = AudioWithLabels(AUDIO_FOLDER / AUDIO_FILE, AUDIO_FOLDER / LABEL_FILE, FFMPEG_BIN)

# Each example is (44100 int16 samples, [1] or [0]). The label lists are sorted
# by decreasing share, so b[0][0] is the dominant label of that second.
ds_gen = tf.data.Dataset.from_generator(
    lambda: ((a, [1] if b[0][0] == "TOM" else [0]) for a, b, _ in awl.iter_seconds()),
    (tf.int16, tf.int16),
    (tf.TensorShape(44100), tf.TensorShape(1))
)

# Scale the samples by 1/32767, promote them to complex numbers with a zero
# imaginary part and take the FFT of every window.
ds = ds_gen.map(lambda a, b: (tf.complex(a / 32767, tf.constant(0, dtype=tf.float32)), b))
ds = ds.map(lambda a, b: (tf.signal.fft(a), b))
# NOTE(review): tf.signal.fft returns complex values while Dense layers work on
# real numbers; consider feeding tf.abs(...) (the magnitude spectrum) instead.
# Left unchanged here - confirm how the installed TF version handles this.

# Go through the recording 5 times per epoch, in batches of 5 seconds.
ds = ds.repeat(5).batch(5)

model = tf.keras.Sequential([
tf.keras.layers.Dense(16, activation='relu', input_shape=(44100,)),
tf.keras.layers.Dense(16, activation='relu'),
# binary_crossentropy expects a probability in [0, 1]; a relu output is
# unbounded above and has zero gradient once it reaches 0, so use sigmoid.
tf.keras.layers.Dense(1, activation='sigmoid')])

model.compile(optimizer=tf.keras.optimizers.Adam(0.01),
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(ds, epochs=15)