In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import librosa as li
import time

from IPython import display
from concurrent.futures import ThreadPoolExecutor

%matplotlib inline

print("TensorFlow version is ", tf.__version__)

TensorFlow version is  2.3.1


In [2]:
PATH_TO_MIRA = 'AudioData/5 Мира'

paths = li.util.find_files(PATH_TO_MIRA)
print(len(paths))
print('\n'.join(paths[:10]))

70
F:\TF\AudioRec\AudioData\5 Мира\1 Мира\1.wav
F:\TF\AudioRec\AudioData\5 Мира\1 Мира\10.wav
F:\TF\AudioRec\AudioData\5 Мира\1 Мира\11.wav
F:\TF\AudioRec\AudioData\5 Мира\1 Мира\12.wav
F:\TF\AudioRec\AudioData\5 Мира\1 Мира\13.wav
F:\TF\AudioRec\AudioData\5 Мира\1 Мира\14.wav
F:\TF\AudioRec\AudioData\5 Мира\1 Мира\15.wav
F:\TF\AudioRec\AudioData\5 Мира\1 Мира\16.wav
F:\TF\AudioRec\AudioData\5 Мира\1 Мира\17.wav
F:\TF\AudioRec\AudioData\5 Мира\1 Мира\18.wav


In [3]:
def get_audio(path, sr=16000):
    example = li.load(path, sr=sr)[0]
    if example.size < sr:
        example = np.concatenate((example, np.zeros(sr-example.size)))
    else:
        example = example[:sr]
    return example

In [4]:
def get_data(paths, num_threads=4):
    with ThreadPoolExecutor(num_threads) as pool:
        data = list(pool.map(get_audio, paths))
    return np.array(data)

In [5]:
data = get_data(paths)

In [6]:
BATCH_SIZE = 10

train_dataset = tf.data.Dataset.from_tensor_slices(data).shuffle(70).batch(BATCH_SIZE)
train_dataset

<BatchDataset shapes: (None, 16000), types: tf.float64>

In [7]:
discriminator = tf.keras.Sequential([
    tf.keras.layers.InputLayer((16000)),
    tf.keras.layers.Reshape((16000, 1)),
    tf.keras.layers.Conv1D(filters=10, kernel_size=4, strides=1, padding='same', use_bias=False),
    tf.keras.layers.MaxPool1D(4),
    tf.keras.layers.Conv1D(filters=1, kernel_size=4, strides=4, use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('tanh'),
    tf.keras.layers.GRU(100),
    tf.keras.layers.Dense(1)
], name='discriminator')

discriminator.summary()

Model: "discriminator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape (Reshape)            (None, 16000, 1)          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 16000, 10)         40        
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 4000, 10)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1000, 1)           40        
_________________________________________________________________
batch_normalization (BatchNo (None, 1000, 1)           4         
_________________________________________________________________
activation (Activation)      (None, 1000, 1)           0         
_________________________________________________________________
gru (GRU)                    (None, 100)             

In [9]:
generator = tf.keras.Sequential([
    tf.keras.layers.InputLayer((100,)),
    tf.keras.layers.Reshape((100, 1)),
    tf.keras.layers.GRU(100),
    tf.keras.layers.Reshape((100, 1)),
    tf.keras.layers.Conv1DTranspose(filters=1, kernel_size=4, strides=4, use_bias=False),
    tf.keras.layers.PReLU(),
    tf.keras.layers.Conv1DTranspose(filters=10, kernel_size=2, strides=1, padding='same', use_bias=False),
    tf.keras.layers.PReLU(),
    tf.keras.layers.Conv1DTranspose(filters=20, kernel_size=4, strides=4, padding='same', use_bias=False),
    tf.keras.layers.PReLU(),
    tf.keras.layers.Conv1DTranspose(filters=1, kernel_size=10, strides=10, padding='same', use_bias=False),
    tf.keras.layers.Activation('tanh'),
    tf.keras.layers.Reshape((16000,))
], name='generator')

generator.summary()

Model: "generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 100, 1)            0         
_________________________________________________________________
gru_1 (GRU)                  (None, 100)               30900     
_________________________________________________________________
reshape_2 (Reshape)          (None, 100, 1)            0         
_________________________________________________________________
conv1d_transpose (Conv1DTran (None, 400, 1)            4         
_________________________________________________________________
p_re_lu (PReLU)              (None, 400, 1)            400       
_________________________________________________________________
conv1d_transpose_1 (Conv1DTr (None, 400, 10)           20        
_________________________________________________________________
p_re_lu_1 (PReLU)            (None, 400, 10)           40

In [10]:
latent_dim = 100
noise = tf.random.normal((10, 100))

In [11]:
test = generator(noise)
test.shape

TensorShape([10, 16000])

In [12]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

def discriminator_loss(real_output, fake_output):
    real_loss = loss(tf.ones_like(real_output), real_output)
    fake_loss = loss(tf.zeros_like(fake_output), fake_output)
    return real_loss + fake_loss

def generator_loss(fake_output):
    return loss(tf.ones_like(fake_output), fake_output)

In [13]:
gen_optim = tf.keras.optimizers.Adam(1e-4)
disc_optim = tf.keras.optimizers.Adam(1e-4)

In [14]:
@tf.function()
def train_step(train_batch):
    noise = tf.random.normal([BATCH_SIZE, latent_dim])
    
    with tf.GradientTape() as g_tape, tf.GradientTape() as d_tape:
        generated_audio = generator(noise, training=True)
        
        real_output = discriminator(train_batch, training=True)
        fake_output = discriminator(generated_audio, training=True)
        
        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)
        
    gen_grad = g_tape.gradient(gen_loss, generator.trainable_variables)
    disc_grad = d_tape.gradient(disc_loss, discriminator.trainable_variables)
    
    gen_optim.apply_gradients(zip(gen_grad, generator.trainable_variables))
    disc_optim.apply_gradients(zip(disc_grad, discriminator.trainable_variables))
    return gen_loss, disc_loss

In [23]:
def train(dataset, epochs):
    start = time.time()
    for i in range(epochs):
        for train_batch in dataset:
            gen_loss, disc_loss = train_step(train_batch)
            
        if (i+1) % 10 == 0:
            print('Epoch {:4d}: gen_loss - {:.4f} | disc_loss - {:.4f} | {:.3f} seconds'.format(
                i+1, gen_loss, disc_loss, time.time()-start))
            start = time.time()

In [24]:
train(train_dataset, 1000)

Epoch   10: gen_loss - 0.9212 | disc_loss - 0.8397 | 105.931 seconds
Epoch   20: gen_loss - 0.8976 | disc_loss - 1.1661 | 106.088 seconds
Epoch   30: gen_loss - 0.8757 | disc_loss - 0.7885 | 107.026 seconds
Epoch   40: gen_loss - 0.8429 | disc_loss - 0.8709 | 107.400 seconds
Epoch   50: gen_loss - 0.8206 | disc_loss - 1.3100 | 109.457 seconds
Epoch   60: gen_loss - 0.8074 | disc_loss - 1.0489 | 106.698 seconds
Epoch   70: gen_loss - 0.8162 | disc_loss - 1.4165 | 101.400 seconds
Epoch   80: gen_loss - 0.8649 | disc_loss - 1.4502 | 104.529 seconds
Epoch   90: gen_loss - 0.8916 | disc_loss - 1.4709 | 108.202 seconds
Epoch  100: gen_loss - 0.9470 | disc_loss - 0.9431 | 108.662 seconds
Epoch  110: gen_loss - 1.0020 | disc_loss - 1.2159 | 106.809 seconds
Epoch  120: gen_loss - 1.0288 | disc_loss - 1.4992 | 107.864 seconds
Epoch  130: gen_loss - 1.0364 | disc_loss - 0.8872 | 107.989 seconds
Epoch  140: gen_loss - 1.0386 | disc_loss - 1.4843 | 108.945 seconds
Epoch  150: gen_loss - 1.0469 | di

In [25]:
generator.save("models/generator-v0.1.h5")