# Understanding audio features
## import packages

In [None]:
import numpy as np
import sounddevice as sd
from scipy import signal
import librosa
import librosa.display
import sugartensor as tf
from model import *
import data
import matplotlib.pyplot as plt
%matplotlib inline

## Load/record audio

In [None]:
# Recording parameters. `duration` is only used by the optional live-recording
# path below; `fs` is the single source of truth for the sampling rate.
duration = 2.0  # seconds
fs = 16000  # sampling rate in Hz

# Optional: capture audio from the microphone instead of reading a file.
# NOTE(review): sd.rec presumably returns a (frames, channels) array, while the
# cells below expect 1-D audio -- flatten it before use if this path is enabled.
# myrecording = sd.rec(int(duration * fs), samplerate=fs, channels=1, blocking=True)

# Load the sample as mono audio at the rate declared above, so the loaded data
# and every later use of `fs` cannot disagree.
myrecording, _ = librosa.load('sample.wav', mono=True, sr=fs)

## Verify the audio

In [None]:
#sd.play(myrecording[10000:20000], fs)

## Inspect the waveform and its speech section (leading and trailing silence can be removed to reduce data size)

In [None]:
# Two stacked panels: the complete waveform on top and a fixed sample window
# (samples 10000 to 160000) underneath.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(14, 8))

ax1.plot(myrecording)
ax1.set(title='Raw wave of myrecording', ylabel='Amplitude')

ax2.plot(myrecording[10000:160000])
ax2.set(title='speech section of myrecording', ylabel='Amplitude')

## Create and plot spectrogram

In [None]:
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute the log power spectrogram of an audio signal.

    Args:
        audio: Array of audio samples (intended for a 1-D signal).
        sample_rate: Sampling rate of ``audio`` in Hz.
        window_size: Length of each analysis window in milliseconds.
        step_size: Hop between the starts of consecutive windows in
            milliseconds.
        eps: Small constant added before taking the logarithm so that
            zero-power bins do not produce ``-inf``.

    Returns:
        A tuple ``(freqs, times, log_spec)`` where ``freqs`` and ``times`` are
        the axes returned by ``scipy.signal.spectrogram`` and ``log_spec`` is
        the natural-log spectrogram, transposed so that time is the first
        axis.

    Raises:
        ValueError: If the step is shorter than one sample or longer than the
            analysis window.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    step = int(round(step_size * sample_rate / 1e3))
    if not 0 < step <= nperseg:
        raise ValueError(
            'step_size must be at least one sample and no longer than '
            'window_size (got step=%d samples, window=%d samples)'
            % (step, nperseg))

    # scipy expects the number of *overlapping* samples, not the hop length.
    # Passing the hop directly is only correct when the hop is exactly half
    # the window, which is the case for the default arguments.
    noverlap = nperseg - step

    freqs, times, spec = signal.spectrogram(audio,
                                            fs=sample_rate,
                                            window='hann',
                                            nperseg=nperseg,
                                            noverlap=noverlap,
                                            detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [None]:
freqs, times, spectrogram = log_specgram(myrecording, fs)

# Waveform on top, log spectrogram below.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(14, 8))

ax1.plot(myrecording)
ax1.set(title='Raw wave of myrecording', ylabel='Amplitude')

# log_specgram returns a time-major array, so transpose it to put frequency on
# the vertical axis; the extent labels the axes in seconds and Hz.
time_freq_bounds = [times.min(), times.max(), freqs.min(), freqs.max()]
ax2.imshow(spectrogram.T, aspect='auto', origin='lower',
           extent=time_freq_bounds)
ax2.set(title='Spectrogram of myrecording',
        ylabel='Freqs in Hz',
        xlabel='Seconds')

## Create and plot the mel power spectrogram (the basis of MFCC features)

In [None]:
# Adapted from the librosa demo notebook:
# https://github.com/librosa/librosa/blob/master/examples/LibROSA%20demo.ipynb
# The audio is passed by keyword because newer librosa releases no longer
# accept it as a positional argument.
S = librosa.feature.melspectrogram(y=myrecording, sr=fs, n_mels=128)

# Convert to log scale (dB). We'll use the peak power (max) as reference.
log_S = librosa.power_to_db(S, ref=np.max)

fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(211)
ax1.set_title('Raw wave of myrecording')
ax1.set_ylabel('Amplitude')
ax1.plot(myrecording)

ax2 = fig.add_subplot(212)
# specshow is called without an explicit axes, so it is expected to draw on
# the current one, i.e. the ax2 that was just created.
mappable = librosa.display.specshow(log_S, sr=fs, x_axis='time', y_axis='mel')
ax2.set_title('Mel power spectrogram ')

# Dedicated axes to the right of the lower panel for the colorbar. The
# mappable is passed explicitly so the colorbar does not depend on
# matplotlib's "current image" lookup.
cax = plt.axes([0.95, 0.12, 0.03, 0.35])
plt.colorbar(mappable, format='%+02.0f dB', ax=ax2, cax=cax)


# Converting speech to text using transfer learning
## Set input file name and model location

In [None]:
# Path of the audio file to transcribe.
SPEECH_FILE_NAME = 'sample.wav'
# Directory searched for the latest checkpoint of the pre-trained model.
PRETRAINED_MODEL_LOCATION = 'asset/train'

## Define TensorFlow graph

In [None]:
# Build the inference graph: MFCC placeholder -> logits -> CTC beam-search
# decoding -> dense label tensor `y`.
#tf.reset_default_graph()
# set log level to debug
tf.sg_verbosity(10)

#
# hyper parameters
#

batch_size = 1     # batch size (a single utterance is decoded per run)

#
# inputs
#

# vocabulary size, taken from the project's `data` module
voca_size = data.voca_size

# mfcc feature of audio: (batch_size, time steps, 20 coefficients per step);
# the time dimension is left unspecified so any utterance length can be fed
x = tf.placeholder(dtype=tf.sg_floatx, shape=(batch_size, None, 20))

# sequence length except zero-padding: per batch item, count the time steps
# whose feature sum over the last axis is non-zero
# NOTE(review): sg_sum / sg_int are sugartensor helpers, presumably a
# reduce-sum and an integer cast -- confirm against sugartensor
seq_len = tf.not_equal(x.sg_sum(axis=2), 0.).sg_int().sg_sum(axis=1)

# encode audio feature
# NOTE(review): get_logit is not defined in this file; presumably it comes
# from `from model import *` -- confirm
logit = get_logit(x, voca_size=voca_size)

# ctc decoding; the first two axes of the logits are swapped before decoding,
# presumably to the time-major layout the decoder expects -- confirm
decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]), seq_len, merge_repeated=False)

# to dense tensor, using the first decoded result
# NOTE(review): the `+ 1` presumably shifts decoded indices to match the
# vocabulary indexing used by data.print_index -- confirm
y = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1

## Load audio file and extract MFCC features

In [None]:
# The audio loaded earlier into `myrecording` is reused here rather than
# reloading SPEECH_FILE_NAME.
# Arguments are passed by keyword (newer librosa releases reject positional
# audio/sample-rate arguments) and the sampling rate comes from `fs` instead of
# a repeated literal.
# The feature matrix is transposed and given a leading batch axis so that it
# lines up with the (batch, time, coefficients) placeholder `x`.
mfcc = librosa.feature.mfcc(y=myrecording, sr=fs).T[np.newaxis, :, :]

In [None]:
# Sanity-check the shapes of the loaded audio and of the MFCC batch fed to the
# network. The previous version referenced `wav`, which is never defined, and
# indexed `myrecording` with two axes.
print(myrecording.shape)
print(mfcc.shape)

## Run pre-trained model to extract text

In [None]:
# Restore the pre-trained weights and decode the MFCC batch into text.
with tf.Session() as sess:
    # Initialise all variables before the checkpoint values are loaded.
    tf.sg_init(sess)

    # Locate the most recent checkpoint and load its parameters.
    checkpoint_path = tf.train.latest_checkpoint(PRETRAINED_MODEL_LOCATION)
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint_path)

    # Evaluate the decoded label tensor for the extracted features.
    label = sess.run(y, feed_dict={x: mfcc})

    # Print the decoded labels via the project's data helper.
    data.print_index(label)