In [1]:
!pip install python_speech_features



In [2]:
from tensorflow.keras.models import load_model
from tensorflow.keras.metrics import Precision, Recall
import tensorflow.keras.backend as K
from tensorflow import keras
import librosa
import python_speech_features
import numpy as np
import textgrids
from sklearn import preprocessing
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from joblib import load

In [3]:


model = load_model('Archive 10.15.42 PM/model2.h5')
scaler = load('Archive 10.15.42 PM/scaler2.joblib')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_distributed (TimeDist  (None, 15, 28, 28, 64)    1664      
 ributed)                                                        
                                                                 
 time_distributed_1 (TimeDi  (None, 15, 14, 14, 64)    0         
 stributed)                                                      
                                                                 
 time_distributed_2 (TimeDi  (None, 15, 12, 12, 128)   73856     
 stributed)                                                      
                                                                 
 time_distributed_3 (TimeDi  (None, 15, 6, 6, 128)     0         
 stributed)                                                      
                                                                 
 time_distributed_4 (TimeDi  (None, 15, 4, 4, 128)     1

In [4]:
audio_file = "Data/Audio/Female/TMIT/SA2.wav"
preemphasis_coef = 0.97
num_nfft = 512
frame_step = 0.01
frame_length = 0.025
n_frames = 32
num_features = 32

In [5]:
dataset_valid = list()

# Load samples:
input_signal, fs = librosa.load(audio_file)


# Extract logfbank features:
features_logfbank_valid = python_speech_features.base.logfbank(signal=input_signal, samplerate=fs, winlen=frame_length, winstep=frame_step, nfilt=num_features, 
                                                               nfft=num_nfft, lowfreq=0, highfreq=None, preemph=preemphasis_coef)


spectrogram_image_valid = np.zeros((n_frames, n_frames))
for j in range(int(np.floor(features_logfbank_valid.shape[0] / n_frames))):
    spectrogram_image_valid = features_logfbank_valid[j * n_frames:(j + 1) * n_frames]
    dataset_valid.append((0, spectrogram_image_valid))




In [6]:
stride = int(15)

if stride - len(dataset_valid) > 0:
    for i in range(stride - len(dataset_valid)):
        dataset_valid.append((0, np.zeros((n_frames, n_frames))))
        
print(dataset_valid)

[(0, array([[-22.82996753, -22.77635926, -23.37173593, ..., -22.16289663,
        -25.71863596, -26.03968876],
       [-22.2587406 , -22.4842433 , -22.44176335, ..., -20.98990814,
        -22.36201164, -22.70398858],
       [-23.99110139, -23.04179037, -23.22710913, ..., -21.51373323,
        -22.26732778, -22.42417539],
       ...,
       [-16.33897086, -17.52864931, -12.07596248, ..., -16.4206274 ,
        -17.49826417, -17.7539053 ],
       [-14.98646007, -14.80442462, -12.15878143, ..., -15.13053368,
        -15.06937521, -15.01207838],
       [-15.00839514, -14.50145509, -11.8564718 , ..., -16.32940667,
        -18.37280687, -19.31463585]])), (0, array([[-14.41370476, -14.03694935, -11.83816604, ..., -15.00663266,
        -15.42271044, -15.36334216],
       [-13.77690694, -13.72173462, -12.25982692, ..., -15.23501705,
        -17.3723758 , -18.85962646],
       [-14.37088596, -13.98919648, -11.84754717, ..., -15.65358707,
        -16.26079153, -16.28376204],
       ...,
       [-1

In [7]:
# Split dataset on train and test:
X_valid = list()
y_valid = list()
for i in range(len(dataset_valid)):
    X_valid.append(dataset_valid[i][1])
    y_valid.append(dataset_valid[i][0])
    
X_valid = np.array(X_valid)
y_valid = np.array(y_valid)

# Reshaping for scaling:
X_valid = X_valid.reshape(X_valid.shape[0], X_valid.shape[1] * X_valid.shape[2])

# Scale data:
X_valid = scaler.transform(X_valid)

print(X_valid)

# And reshape back:
X_valid = X_valid.reshape(X_valid.shape[0], n_frames, n_frames)

print(X_valid)

[[-1.6093458  -1.55373654 -1.64545242 ... -0.98205444 -1.56546862
  -1.74154981]
 [ 0.53803333  0.50483661  0.73465985 ... -1.06595062 -2.17890319
  -2.64144966]
 [ 0.15559179  0.10670819  0.55513886 ... -0.9526831  -1.14624927
  -1.12550075]
 ...
 [ 4.21563818  3.81124829  3.17762977 ...  4.5396624   4.53862859
   4.60406803]
 [ 4.21563818  3.81124829  3.17762977 ...  4.5396624   4.53862859
   4.60406803]
 [ 4.21563818  3.81124829  3.17762977 ...  4.5396624   4.53862859
   4.60406803]]
[[[-1.6093458  -1.55373654 -1.64545242 ... -2.88079528 -3.90167311
   -3.84557354]
  [-1.45738388 -1.4941956  -1.45552326 ... -2.47308187 -2.79007939
   -2.75393314]
  [-1.90512119 -1.61595361 -1.61309021 ... -2.66021403 -2.77619769
   -2.67931467]
  ...
  [ 0.01347077 -0.35179693  0.67333136 ... -1.00614831 -1.26929248
   -1.22819073]
  [ 0.36368411  0.29649204  0.6503016  ... -0.56972019 -0.4631792
   -0.32529068]
  [ 0.35894116  0.3671537   0.71363615 ... -0.98205444 -1.56546862
   -1.74154981]]

 [[

In [8]:
# Reshape data for convolution layer:
X_valid_reshaped = X_valid[:int(np.floor(X_valid.shape[0] / stride) * stride)]


X_valid_reshaped = X_valid_reshaped.reshape((int(X_valid_reshaped.shape[0] / stride), stride, n_frames, n_frames, 1))

print(X_valid_reshaped)

[[[[[-1.6093458 ]
    [-1.55373654]
    [-1.64545242]
    ...
    [-2.88079528]
    [-3.90167311]
    [-3.84557354]]

   [[-1.45738388]
    [-1.4941956 ]
    [-1.45552326]
    ...
    [-2.47308187]
    [-2.79007939]
    [-2.75393314]]

   [[-1.90512119]
    [-1.61595361]
    [-1.61309021]
    ...
    [-2.66021403]
    [-2.77619769]
    [-2.67931467]]

   ...

   [[ 0.01347077]
    [-0.35179693]
    [ 0.67333136]
    ...
    [-1.00614831]
    [-1.26929248]
    [-1.22819073]]

   [[ 0.36368411]
    [ 0.29649204]
    [ 0.6503016 ]
    ...
    [-0.56972019]
    [-0.4631792 ]
    [-0.32529068]]

   [[ 0.35894116]
    [ 0.3671537 ]
    [ 0.71363615]
    ...
    [-0.98205444]
    [-1.56546862]
    [-1.74154981]]]


  [[[ 0.53803333]
    [ 0.50483661]
    [ 0.73465985]
    ...
    [-0.50139403]
    [-0.54880767]
    [-0.41212534]]

   [[ 0.68955839]
    [ 0.568139  ]
    [ 0.63948445]
    ...
    [-0.56499851]
    [-1.16944345]
    [-1.5201485 ]]

   [[ 0.53424485]
    [ 0.50486711]
    [ 0.73

In [10]:
prediction = model.predict(X_valid_reshaped)

print(prediction)

predicted_label = np.zeros(prediction.shape[1])
predicted_proba = np.zeros(prediction.shape[1])
ind = 0
for i in range(prediction.shape[1]):
    if prediction[0][i][0] >= prediction[0][i][1]:
        predicted_label[ind] = 0
        predicted_proba[ind] = prediction[0][i][0]
    else:
        predicted_label[ind] = 1
        predicted_proba[ind] = prediction[0][i][1]
    ind = ind + 1
        

[[[1.7986882e-02 9.8201305e-01]
  [5.3227492e-02 9.4677252e-01]
  [1.7689090e-03 9.9823111e-01]
  [8.0366735e-04 9.9919635e-01]
  [3.4216343e-04 9.9965787e-01]
  [1.7366360e-01 8.2633644e-01]
  [1.7801227e-04 9.9982196e-01]
  [1.7731670e-04 9.9982268e-01]
  [1.6766648e-04 9.9983227e-01]
  [1.6388734e-04 9.9983609e-01]
  [1.6228532e-04 9.9983764e-01]
  [1.6225656e-04 9.9983776e-01]
  [1.6471074e-04 9.9983525e-01]
  [1.7647889e-04 9.9982351e-01]
  [2.9361850e-04 9.9970633e-01]]]


In [11]:
predicted_label_widely = np.zeros(predicted_label.shape[0] * n_frames)
ind_start = 0
ind_stop = n_frames
shift_step = n_frames
for i in range(predicted_label.shape[0]):
    predicted_label_widely[ind_start:ind_stop] = predicted_label[i]
    ind_start = ind_start + shift_step
    ind_stop = ind_stop + shift_step

label_timeseries = np.zeros(input_signal.shape[0])
begin = int(0)
end = int(frame_length * fs)
shift_step = int(frame_step * fs)
for i in range(predicted_label_widely.shape[0]):
    label_timeseries[begin:end] = predicted_label_widely[i]
    begin = begin + shift_step
    end = end + shift_step
print(label_timeseries.shape)

(47558,)


In [12]:

# define time axis
Ns = input_signal.shape[0]  # number of sample
Ts = 1 / fs  # sampling period
print(Ts)
t = np.arange(Ns) * Ts  # time axis in seconds
print(Ns)
norm_coef = 1.1 * np.max(input_signal)

output = []
prev_label = 0
curr_label = 0
test = 0
start = 0
end = 0

print(label_timeseries)
for i in range(len(label_timeseries)):
    if label_timeseries[i] == 1:
        curr_label = 1
        if prev_label == 0:
            start = t[i]
    else:
        curr_label = 0
        if prev_label == 1:
            end = t[i]
            output.append([start, end])
            test = test + 1
    prev_label = curr_label

if curr_label == 1:
    end = t[i]
    output.append([start, end])
    test = test + 1

print(output, t[-1])

edge_ind = np.min([input_signal.shape[0], len(labels)])

plt.figure(figsize=(24, 6))
plt.plot(t[:edge_ind], input_signal[:edge_ind])
plt.plot(t[:edge_ind], np.array(labels[:edge_ind]) * norm_coef)
plt.plot(t[:edge_ind], label_timeseries[:edge_ind] * norm_coef)

print(len(label_timeseries))

plt.title("Ground truth labels")
plt.legend(['Signal', 'Speech', 'Predicted'])
plt.show()

4.5351473922902495e-05
47558
[1. 1. 1. ... 1. 1. 1.]
[[0.0, 2.156780045351474]] 2.156780045351474


NameError: name 'labels' is not defined