In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import librosa as li
import time

from IPython import display
from concurrent.futures import ThreadPoolExecutor

%matplotlib inline

print("TensorFlow version is ", tf.__version__)

TensorFlow version is  2.3.1


In [3]:
PATH_TO_MIRA = 'AudioData/5 Мира'

paths = li.util.find_files(PATH_TO_MIRA)
print(len(paths))
print('\n'.join(paths[:10]))

70
D:\Work\AudioRec\AudioData\5 Мира\1 Мира\1.wav
D:\Work\AudioRec\AudioData\5 Мира\1 Мира\10.wav
D:\Work\AudioRec\AudioData\5 Мира\1 Мира\11.wav
D:\Work\AudioRec\AudioData\5 Мира\1 Мира\12.wav
D:\Work\AudioRec\AudioData\5 Мира\1 Мира\13.wav
D:\Work\AudioRec\AudioData\5 Мира\1 Мира\14.wav
D:\Work\AudioRec\AudioData\5 Мира\1 Мира\15.wav
D:\Work\AudioRec\AudioData\5 Мира\1 Мира\16.wav
D:\Work\AudioRec\AudioData\5 Мира\1 Мира\17.wav
D:\Work\AudioRec\AudioData\5 Мира\1 Мира\18.wav


In [4]:
def get_audio(path, sr=16000):
    example = li.load(path, sr=sr)[0]
    if example.size < sr:
        example = np.concatenate((example, np.zeros(sr-example.size)))
    else:
        example = example[:sr]
    return example

In [5]:
def get_data(paths, num_threads=4):
    with ThreadPoolExecutor(num_threads) as pool:
        data = list(pool.map(get_audio, paths))
    return np.array(data)

In [6]:
data = get_data(paths)

In [51]:
BATCH_SIZE = 10

train_dataset = tf.data.Dataset.from_tensor_slices(data).batch(BATCH_SIZE)
train_dataset

<BatchDataset shapes: (None, 16000), types: tf.float64>

In [7]:
discriminator = tf.keras.Sequential([
    tf.keras.layers.InputLayer((16000)),
    tf.keras.layers.Reshape((16000, 1)),
    tf.keras.layers.Conv1D(filters=10, kernel_size=4, strides=1, padding='same', use_bias=False),
    tf.keras.layers.MaxPool1D(4),
    tf.keras.layers.Conv1D(filters=1, kernel_size=4, strides=4, use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('tanh'),
    tf.keras.layers.GRU(100),
    tf.keras.layers.Dense(1)
], name='discriminator')

discriminator.summary()

Model: "discriminator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape (Reshape)            (None, 16000, 1)          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 16000, 10)         40        
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 4000, 10)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1000, 1)           40        
_________________________________________________________________
batch_normalization (BatchNo (None, 1000, 1)           4         
_________________________________________________________________
activation (Activation)      (None, 1000, 1)           0         
_________________________________________________________________
gru (GRU)                    (None, 100)             

In [38]:
generator = tf.keras.Sequential([
    tf.keras.layers.InputLayer((100,)),
    tf.keras.layers.Reshape((100, 1)),
    tf.keras.layers.GRU(100),
    tf.keras.layers.Reshape((100, 1)),
    tf.keras.layers.Conv1DTranspose(filters=1, kernel_size=4, strides=4, use_bias=False),
    tf.keras.layers.Activation('tanh'),
    tf.keras.layers.Conv1DTranspose(filters=10, kernel_size=2, strides=1, padding='same', use_bias=False),
    tf.keras.layers.PReLU(),
    tf.keras.layers.Conv1DTranspose(filters=20, kernel_size=4, strides=4, padding='same', use_bias=False),
    tf.keras.layers.PReLU(),
    tf.keras.layers.Conv1DTranspose(filters=1, kernel_size=10, strides=10, padding='same', use_bias=False),
    tf.keras.layers.Activation('tanh'),
    tf.keras.layers.Reshape((16000,))
], name='generator')

generator.summary()

Model: "generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_39 (Reshape)         (None, 100, 1)            0         
_________________________________________________________________
gru_21 (GRU)                 (None, 100)               30900     
_________________________________________________________________
reshape_40 (Reshape)         (None, 100, 1)            0         
_________________________________________________________________
conv1d_transpose_53 (Conv1DT (None, 400, 1)            4         
_________________________________________________________________
activation_28 (Activation)   (None, 400, 1)            0         
_________________________________________________________________
conv1d_transpose_54 (Conv1DT (None, 400, 10)           20        
_________________________________________________________________
p_re_lu_23 (PReLU)           (None, 400, 10)           40

In [39]:
latent_dim = 100
noise = tf.random.normal((10, 100))

In [40]:
test = generator(noise)
test.shape

TensorShape([10, 16000])

In [43]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

def discriminator_loss(real_output, fake_output):
    real_loss = loss(tf.ones_like(real_output), real_output)
    fake_loss = loss(tf.zeros_like(fake_output), fake_output)
    return real_loss + fake_loss

def generator_loss(fake_output):
    return loss(tf.ones_like(fake_output), fake_output)

In [44]:
gen_optim = tf.keras.optimizers.Adam(1e-4)
disc_optim = tf.keras.optimizers.Adam(1e-4)

In [71]:
@tf.function()
def train_step(train_batch):
    noise = tf.random.normal([BATCH_SIZE, latent_dim])
    
    with tf.GradientTape() as g_tape, tf.GradientTape() as d_tape:
        generated_audio = generator(noise, training=True)
        
        real_output = discriminator(train_batch, training=True)
        fake_output = discriminator(generated_audio, training=True)
        
        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)
        
    gen_grad = g_tape.gradient(gen_loss, generator.trainable_variables)
    disc_grad = d_tape.gradient(disc_loss, discriminator.trainable_variables)
    
    gen_optim.apply_gradients(zip(gen_grad, generator.trainable_variables))
    disc_optim.apply_gradients(zip(disc_grad, discriminator.trainable_variables))
    return gen_loss, disc_loss

In [72]:
def train(dataset, epochs):
    start = time.time()
    for i in range(epochs):
        for train_batch in dataset:
            gen_loss, disc_loss = train_step(train_batch)
            
        if (i+1) % 1 == 0:
            print('Epoch {:4d}: gen_loss - {:.4f} | disc_loss - {:.4f} | {:.3f} seconds'.format(
                i+1, gen_loss, disc_loss, time.time()-start))
            start = time.time()

In [68]:
train(train_dataset, 10)

Epoch    1: gen_loss - 0.6991 | disc_loss - 1.3649 | 15.658 seconds
Epoch    2: gen_loss - 0.7024 | disc_loss - 1.3579 | 15.614 seconds
Epoch    3: gen_loss - 0.7040 | disc_loss - 1.3519 | 15.573 seconds
Epoch    4: gen_loss - 0.7050 | disc_loss - 1.3454 | 15.511 seconds
Epoch    5: gen_loss - 0.7097 | disc_loss - 1.3330 | 15.776 seconds
Epoch    6: gen_loss - 0.7123 | disc_loss - 1.3196 | 15.575 seconds
Epoch    7: gen_loss - 0.7231 | disc_loss - 1.2935 | 15.384 seconds
Epoch    8: gen_loss - 0.7318 | disc_loss - 1.2582 | 15.490 seconds
Epoch    9: gen_loss - 0.7503 | disc_loss - 1.1947 | 15.550 seconds
Epoch   10: gen_loss - 0.7766 | disc_loss - 1.0382 | 15.398 seconds


In [69]:
test = generator(noise)

In [70]:
display.Audio(data=test[0], rate=16000)

In [66]:
tf.nn.sigmoid(discriminator(test))

<tf.Tensor: shape=(10, 1), dtype=float32, numpy=
array([[0.49664816],
       [0.49665135],
       [0.49664924],
       [0.49665028],
       [0.49664935],
       [0.49664864],
       [0.49665016],
       [0.49664935],
       [0.49664822],
       [0.49665093]], dtype=float32)>