In [24]:
from kaldi.feat.mfcc import Mfcc, MfccOptions
from kaldi.matrix import SubVector, SubMatrix
from kaldi.util.options import ParseOptions
from kaldi.util.table import SequentialWaveReader
from kaldi.util.table import MatrixWriter
from numpy import mean
from sklearn.preprocessing import scale, MinMaxScaler
import numpy as np
import os
import keras
from keras.layers import Input, Dense, Conv1D, Conv2D, Reshape, MaxPooling1D, Flatten, UpSampling1D
from keras.layers import TimeDistributed, RepeatVector
from keras.models import Model
import tensorflow as tf
scaler = MinMaxScaler
from sklearn.model_selection import train_test_split
from time import time

In [26]:
start_time = time()
current_time = start_time
def log(message):
    global current_time
    t = time()
    print(f"{message}, {int(t - current_time)}s, total {int(t - start_time)}s")
    current_time = t
def start():
    global start_time
    global current_time
    start_time = time()
    current_time = start_time

In [2]:
wavs_dir = "/home/larkkin/data/VCTK-Corpus/wav48/"
dirs = os.listdir(wavs_dir)
files = []
for d in dirs:
    dir_files = os.listdir(os.path.join(wavs_dir, d))
    files.extend([os.path.join(wavs_dir, d, f) for f in dir_files if not f.startswith("noise") and\
                                                                     not f.endswith("raw")])
with open("testfile.scp", "w") as otp:
#     otp.write('\n'.join([f"{filename} {os.path.join(wavs_dir, filename)}" for filename in files]))
    otp.write('\n'.join([f"{filename.split('/')[-1]} {filename}" for filename in files]))
labels = [filename.split('/')[-1].split('_')[0] for filename in files]
labels_set = set(labels)
label_to_id = {label : i for i, label in enumerate(sorted(labels_set))}
y = np.array([label_to_id[label] for label in labels])
num_labels = len(labels_set)
num_labels

109

In [3]:
# def get_windows(x, size=mfcc_opts.frame_opts.window_size(), shift=mfcc_opts.frame_opts.window_shift()):
#     return np.array([np.array(x[i*shift:i*shift+size]) for i in range((x.shape[0] - size) // shift)])
# def get_windows(x, size, shift):
#     return np.array([np.array(x[i*shift:i*shift+size]) for i in range((x.shape[0] - size) // shift)])

In [4]:
def get_mfcc():
    usage = """Extract MFCC features.
               Usage:  example.py [opts...] <rspec> <wspec>
            """
    po = ParseOptions(usage)
    po.register_float("min-duration", 0.0,
                      "minimum segment duration")
    mfcc_opts = MfccOptions()
    mfcc_opts.frame_opts.samp_freq = 8000
    mfcc_opts.register(po)

    opts = po.parse_args()
    rspec, wspec = "scp:testfile.scp", "ark,t:test_mfcc.ark"
    mfcc = Mfcc(mfcc_opts)
    sf = mfcc_opts.frame_opts.samp_freq
    X_mfcc = []
#     X_raw = []
    y = []
    with SequentialWaveReader(rspec) as reader, \
             MatrixWriter(wspec) as writer:
        for key, wav in reader:
            if wav.duration < opts.min_duration:
                continue
            assert(wav.samp_freq >= sf)
            assert(wav.samp_freq % sf == 0)
            s = wav.data()
            # downsample to sf [default=8kHz]
            s = s[:,::int(wav.samp_freq / sf)]
            # mix-down stereo to mono
            m = SubVector(mean(s, axis=0))
            # compute MFCC features
            f = mfcc.compute_features(m, sf, 1.0)
            # standardize features
            f = SubMatrix(scale(f))
            # write features to archive
#             raw_windows = get_windows(m,
#                                       size=mfcc_opts.frame_opts.window_size(),
#                                       shift=mfcc_opts.frame_opts.window_shift())
            f = np.array(f)
#             f = f[:raw_windows.shape[0]]
#             assert(f.shape[0] == raw_windows.shape[0])
#             X_raw.append(MinMaxScaler().fit_transform(raw_windows))
            X_mfcc.append(f)
            answer = np.zeros(num_labels)
            answer[label_to_id[key.split('_')[0]]] = 1.0
            y.append(answer)
#     return np.array(X_mfcc), np.array(X_raw), np.array(y)
    return np.array(X_mfcc), np.array(y)

In [5]:
# X_mfcc, X_raw, y = get_mfcc()
# X_mfcc, y = get_mfcc()

In [6]:
# noised_wavs_dir = "/home/larkkin/data/noised/wav48"
# dirs = os.listdir(noised_wavs_dir)
# files = []
# for d in dirs:
#     dir_files = os.listdir(os.path.join(noised_wavs_dir, d))
#     files.extend([os.path.join(noised_wavs_dir, d, f) for f in dir_files if not f.startswith("noise") and\
#                                                                             not f.endswith("raw")])
# with open("testfile.scp", "w") as otp:
# #     otp.write("TEST ../data/p225_001.wav")
#     otp.write('\n'.join([f"{filename.split('/')[-1]} {os.path.join(wavs_dir, filename)}" for filename in files]))
# # X_mfcc_noised, X_raw_noised, y = get_mfcc()
# X_mfcc_noised, y = get_mfcc()

In [53]:
# np.save("../data/mfcc.npy", X_mfcc)
# np.save("../data/mfcc_noised.npy", X_mfcc_noised)
X_mfcc = np.load("../data/mfcc.npy")
X_mfcc_noised = np.load("../data/mfcc_noised.npy")

In [65]:
test_size = 0.1
X_mfcc_train, X_mfcc_test, X_mfcc_noised_train, X_mfcc_noised_test, y_train, y_test =\
    train_test_split(X_mfcc, X_mfcc_noised, y, test_size=test_size, stratify=y)


In [66]:
np.save("../data/mfcc_train.npy", X_mfcc_train)
np.save("../data/mfcc_test.npy", X_mfcc_test)
np.save("../data/mfcc_noised_train.npy", X_mfcc_noised_train)
np.save("../data/mfcc_noised_test.npy", X_mfcc_noised_test)

In [67]:
np.save("../data/y_train.npy", y_train)
np.save("../data/y_test.npy", y_test)

In [9]:
def Conv1DTranspose(input_tensor, filters, kernel_size, strides=2, padding='same', activation='relu'):
    x = keras.layers.Lambda(lambda x: keras.backend.expand_dims(x, axis=2))(input_tensor)
    x = Conv2DTranspose(filters=filters, kernel_size=(kernel_size, 1), strides=(strides, 1),
                        padding=padding, activation=activation)(x)
    x = keras.layers.Lambda(lambda x: keras.backend.squeeze(x, axis=2))(x)
    return x
def sampling(args):
    z_mean, z_log_sigma = args
    epsilon = keras.backend.random_normal(shape=(batch_size, latent_dim),
                              mean=0., stddev=1.)
    return z_mean + keras.backend.exp(z_log_sigma) * epsilon

In [68]:
latent_dim = 6
original_dim = 13
x_inp = Input(shape=(None, original_dim))
x = keras.layers.GRU(latent_dim,
                     stateful=False,
                     return_sequences=True,
                     go_backwards=True)(x_inp)
z = TimeDistributed(Dense(latent_dim * 2))(x)
x = keras.layers.GRU(original_dim,
                     stateful=False,
                     return_sequences=True,
                     go_backwards=True)(z)
x_decoded = TimeDistributed(Dense(original_dim, activation='sigmoid'))(x)
rnn_ae = Model(x_inp, x_decoded)
rnn_encoder = Model(x_inp, z)

In [69]:
rnn_ae.compile(optimizer='adam', loss='mse')

In [70]:
def train_generator(X, Y):
    for i in range(X.shape[0]):
        x = MinMaxScaler().fit_transform(X[i])
        x = x.reshape(1, *x.shape)
        y = MinMaxScaler().fit_transform(Y[i])
        y = y.reshape(1, *y.shape)
        yield (x, y)
def test_generator(X, k):
    x = MinMaxScaler().fit_transform(X[k])
    x = x.reshape(1, *x.shape)
    yield x

In [71]:
rnn_ae.fit_generator(train_generator(X_mfcc_noised_train, X_mfcc_train),
                     epochs=100,
                     verbose=1,
                     shuffle=True,
#           validation_data=({'mfcc_input': mfcc_seq_val, 'fbank_input': fbank_seq_val},
#                            {'main_output': y_val_seq, 'aux_output': y_val_seq}),
#           class_weight={0 : class_weights[0], 1 : class_weights[1]},
                     initial_epoch=0,
                     steps_per_epoch=10,
                     validation_steps=10)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 100/100


<keras.callbacks.History at 0x7f0d30be0080>

In [72]:
# test_k = 32735
# pred = rnn_ae.predict_generator(test_generator(X_mfcc_noised),
# #                      epochs=10,
#                      verbose=1,
#                      steps=1)
# true = MinMaxScaler().fit_transform(X_mfcc[test_k])
# noisy = MinMaxScaler().fit_transform(X_mfcc_noised[test_k])
# print(((pred - true)**2).mean())
# print(((noisy - true)**2).mean()) 
# # ((pred.reshape(*pred.shape[1:]) - X_mfcc[0])**2).mean()
# # r1 = keras.losses.mse(pred.reshape(*pred.shape[1:]),
# #                                  tf.convert_to_tensor(X_mfcc[0]))
# # r2 = keras.losses.mse(tf.convert_to_tensor(X_mfcc_noised[0]),
#                                  tf.convert_to_tensor(X_mfcc[0]))
def predict_ae(X):
    res = []
    for i in range(X.shape[0]):
        if i % 500 == 0:
            log(f"{i}/{X.shape[0]}")
        pred = rnn_ae.predict_generator(test_generator(X, i),
                                        verbose=0,
                                        steps=1)
        pred = pred.reshape(*pred.shape[1:])
        res.append(pred)
    return res

def encode(X):
    res = []
    for i in range(X.shape[0]):
        if i % 500 == 0:
            log(f"{i}/{X.shape[0]}")
        pred = rnn_encoder.predict_generator(test_generator(X, i),
                                        verbose=0,
                                        steps=1)
        pred = pred.reshape(*pred.shape[1:])
        res.append(pred)
    return res

In [73]:
# start()
# for i, x_pred in enumerate(predict(X_mfcc_test[:201])):
#     true = MinMaxScaler().fit_transform(X_mfcc_test[i])
#     noised = MinMaxScaler().fit_transform(X_mfcc_noised_test[i])
#     print(((x_pred - true)**2).mean() - ((noised - true)**2).mean())

In [74]:
rnn_ae.save("./models/rnn_ae")
rnn_encoder.save("./models/rnn_encoder")

In [75]:
start()
ae_features_train = np.array(encode(X_mfcc_noised_train))
np.save(arr=ae_features_train, file="../data/ae_features_noised_train.npy")

0/39817, 0s, total 0s
500/39817, 35s, total 35s
1000/39817, 35s, total 71s
1500/39817, 37s, total 109s
2000/39817, 36s, total 146s
2500/39817, 35s, total 181s
3000/39817, 35s, total 216s
3500/39817, 36s, total 252s
4000/39817, 35s, total 288s
4500/39817, 36s, total 324s
5000/39817, 35s, total 360s
5500/39817, 35s, total 395s
6000/39817, 36s, total 431s
6500/39817, 35s, total 466s
7000/39817, 36s, total 502s
7500/39817, 36s, total 538s
8000/39817, 35s, total 574s
8500/39817, 37s, total 611s
9000/39817, 36s, total 647s
9500/39817, 35s, total 683s
10000/39817, 37s, total 720s
10500/39817, 36s, total 756s
11000/39817, 35s, total 792s
11500/39817, 36s, total 829s
12000/39817, 36s, total 865s
12500/39817, 35s, total 900s
13000/39817, 36s, total 936s
13500/39817, 34s, total 971s
14000/39817, 36s, total 1008s
14500/39817, 34s, total 1042s
15000/39817, 37s, total 1080s
15500/39817, 35s, total 1115s
16000/39817, 35s, total 1150s
16500/39817, 35s, total 1186s
17000/39817, 36s, total 1222s
17500/3

In [76]:
start()
ae_features_test = np.array(encode(X_mfcc_noised_test))
np.save(arr=ae_features_test, file="../data/ae_features_noised_test.npy")

0/4425, 0s, total 0s
500/4425, 35s, total 35s
1000/4425, 35s, total 71s
1500/4425, 33s, total 105s
2000/4425, 36s, total 141s
2500/4425, 35s, total 177s
3000/4425, 35s, total 212s
3500/4425, 35s, total 247s
4000/4425, 34s, total 282s


In [38]:
rnn_ae.predict_generator(test_generator(X_mfcc_noised))

ValueError: `steps=None` is only valid for a generator based on the `keras.utils.Sequence` class. Please specify `steps` or use the `keras.utils.Sequence` class.

In [144]:

mfcc_input = Input(shape=(None, 13), name='mfcc_input')
gru1 = keras.layers.GRU(64,
                        stateful=False,
                        return_sequences=True)(mfcc_input)
aux_output = Dense(num_labels, activation='softmax', name='aux_output')(gru1)

raw_input = Input(shape=(None, 200), name='raw_input')
gru2 = keras.layers.GRU(20,
                        stateful=False,
                        return_sequences=True)(raw_input)

x = keras.layers.concatenate([gru1, gru2])
x = Dense(64, activation='relu')(x)
x = Dense(64, activation='relu')(x)

main_output = Dense(num_labels, activation='softmax', name='main_output')(x)

model = Model(inputs=[mfcc_input, raw_input], outputs=[main_output, aux_output])

model.compile(optimizer='adam', loss='categorical_crossentropy')

In [263]:
def train_generator(X_mfcc, X_raw, y):
    for i in range(X_mfcc.shape[0]):
        mfcc = X_mfcc[i]
        raw = X_raw[i]
        local_y = np.array([y[i]] * mfcc.shape[0]).reshape(1, mfcc.shape[0], num_labels)
        yield ({'mfcc_input': mfcc.reshape(1, *mfcc.shape),
                'raw_input': raw.reshape(1, *raw.shape)},
               {'main_output': local_y, 'aux_output': local_y})

In [20]:
print(len(X_mfcc), len(X_raw), y.shape)
model.fit_generator(train_generator(X_mfcc, X_raw, y),
          epochs=3,
          verbose=1,
          shuffle=True,
#           validation_data=({'mfcc_input': mfcc_seq_val, 'fbank_input': fbank_seq_val},
#                            {'main_output': y_val_seq, 'aux_output': y_val_seq}),
#           class_weight={0 : class_weights[0], 1 : class_weights[1]},
          initial_epoch=0,
          steps_per_epoch=10,
          validation_steps=10)

NameError: name 'X_raw' is not defined

In [293]:
predictions = []
i = 0
answers = []
for inp, otp in train_generator(X_mfcc, X_raw, y):
    i += 1
    if i % 100 == 0:
        print(i)
    current_res = model.predict(inp)
    predictions.append(current_res)
    answers.append(otp)
    

100
200
300
400
500


In [302]:
np.argmax(predictions[0][0].reshape(-1, 2), axis=1)

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,

In [316]:
counter = 0
corrects = 0
for i in range(len(answers)):
    if i % 100 == 0:
        print(i)
    a = answers[i]['main_output'].reshape(-1, 2)
    p = predictions[i][0].reshape(-1, 2)
    counter += a.shape[0]
    corrects += (np.argmax(p, axis=1) == np.argmax(a, axis=1)).sum()

0
100
200
300
400
500


In [317]:
keras.

0.614907053347938

In [312]:
counter

252887

In [148]:
def get_windows(x, size=mfcc_opts.frame_opts.window_size(), shift=mfcc_opts.frame_opts.window_shift()):
    return np.array([np.array(x[i*shift:i*shift+size]) for i in range((x.shape[0] - size) // shift)])

In [149]:
ordinary_windows = get_windows(X[0])

In [144]:
x  = X_mfcc[0]

(231, 231, (231,))

In [147]:
x[:-1].shape

(296, 13)

In [151]:
ordinary_windows.shape

(296, 200)

In [157]:
X_mfcc[-1].shape

(612, 13)

In [67]:
original_dim = 200
intermediate_dim = 40
latent_dim = 13
batch_size = 15
stride=1
x = Input(batch_shape=(batch_size, original_dim))
h = Dense(intermediate_dim, activation='relu')(x)
z_mean = Dense(latent_dim)(h)
z_log_sigma = Dense(latent_dim)(h)

z = keras.layers.Lambda(sampling)([z_mean, z_log_sigma])

decoder_h = Dense(intermediate_dim, activation='relu')
decoder_mean = Dense(original_dim, activation='sigmoid')
h_decoded = decoder_h(z)
x_decoded_mean = decoder_mean(h_decoded)
# end-to-end autoencoder
vae = Model(x, x_decoded_mean)

# encoder, from inputs to latent space
encoder = Model(x, z_mean)


In [128]:
original_dim = 200
latent_dim = 12
batch_size = 15
stride=2
x_inp = Input(batch_shape=(batch_size, original_dim))
x = Reshape((original_dim, 1))(x_inp)
x = Conv1D(64,3, activation='relu', padding='valid', strides=stride)(x)
x = MaxPooling1D(2)(x)
x = Conv1D(32,3, activation='relu', padding='valid', strides=stride)(x)
x = MaxPooling1D(2)(x)
h = Flatten()(x)

z_mean = Dense(latent_dim)(h)
z_log_sigma = Dense(latent_dim)(h)
z = keras.layers.Lambda(sampling)([z_mean, z_log_sigma])

x = Reshape((latent_dim, 1))(z)
x = Conv1DTranspose(x, 32, 3, activation='relu', padding='valid', strides=stride)
x = UpSampling1D(2)(x)
x = Conv1DTranspose(x, 64, 3, activation='relu', padding='valid', strides=stride)
x = UpSampling1D(2)(x)
x = Flatten()(x)
x_decoded = Dense(original_dim,activation = 'sigmoid')(x)
vae = Model(x_inp, x_decoded)
encoder = Model(x_inp, z_mean)

def vae_loss(x, x_decoded_mean):
    l = 5.01
    xent_loss = keras.losses.binary_crossentropy(x, x_decoded_mean)
    kl_loss = - l * keras.backend.mean(1 + z_log_sigma - keras.backend.square(z_mean) - \
                                     keras.backend.exp(z_log_sigma),
                                 axis=-1)
    return xent_loss + kl_loss


NameError: name 'sampling' is not defined

In [116]:
vae.compile(optimizer='rmsprop', loss=vae_loss)

In [120]:
vae.fit(plain_raw_features,
        plain_raw_features,
        batch_size=batch_size,
        epochs=1,
        shuffle=True,
        verbose=1)

In [158]:
X_raw[-1].shape

(612, 200)

In [182]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [108]:
plain_raw_features.max()

1.0000001

In [135]:
scale(np.arange(1, 10))



array([-1.54919334, -1.161895  , -0.77459667, -0.38729833,  0.        ,
        0.38729833,  0.77459667,  1.161895  ,  1.54919334])