# Import Modules

In [2]:

import tensorflow as tf
import os
from tqdm import tqdm, tqdm_notebook
from utils import emphasis
import tensorflow.keras.backend as K
from tensorflow.keras.utils import Sequence

import numpy as np
import librosa
import librosa.display


# SRCNN

## Model Definition

In [3]:
class SubPixel1D(tf.keras.layers.Layer):
    def __init__(self, r=2):
        super(SubPixel1D, self).__init__()
        self.r = r
    def call(self, I):
        """One-dimensional subpixel upsampling layer
        Calls a tensorflow function that directly implements this functionality.
         We assume input has dim (batch, width, r)
        """

        X = tf.transpose(I, [2,1,0]) # (r, w, b)
        X = tf.batch_to_space_nd(X, [self.r], [[0,0]]) # (1, r*w, b)
        X = tf.transpose(X, [2,1,0])
        return X

noisy = tf.keras.layers.Input(shape=(4096, 1))
clean = tf.keras.layers.Input(shape=(4096, 1))
x_input = noisy
x = x_input

# B = 8
# n_filters = [128, 256, 512, 512, 512, 512, 512, 512]
# kernel_sizes = [65, 33, 17, 9, 9, 9, 9, 9]

B = 4
n_filters = [128, 256, 512, 512]
kernel_sizes = [65, 33, 17, 9]

# B = 1
# n_filters = [128]
# kernel_sizes = [65]

# Downsampling Layers
encoder_features = []
for k, n_filter, kernel_size in zip(range(B), n_filters, kernel_sizes):
    x = tf.keras.layers.Conv1D(filters = n_filter,
                               kernel_size = kernel_size,
                               strides = 2,
                               padding = 'same')(x)
    x = tf.keras.layers.PReLU()(x)
    encoder_features.append(x)
    
# Bottleneck Layer
x = tf.keras.layers.Conv1D(filters = n_filters[-1],
                           kernel_size = kernel_sizes[-1],
                           strides = 2,
                           padding = 'same')(x)
x = tf.keras.layers.Dropout(rate=0.5)(x)
x = tf.keras.layers.PReLU()(x)

# Upsampling Layer
for k, n_filter, kernel_size, enc in reversed(list(zip(range(B), 
                                                  n_filters, 
                                                  kernel_sizes, 
                                                  encoder_features))):
    x = tf.keras.layers.Conv1D(filters = n_filter,
                               kernel_size = kernel_size,
                               strides = 1,
                               padding = 'same')(x)
    x = tf.keras.layers.Dropout(rate=0.5)(x)
    x = tf.keras.layers.PReLU()(x)
    x = SubPixel1D()(x)
    x = tf.keras.layers.Concatenate(axis=2)([x, enc])

# Final Conv Layer
x = tf.keras.layers.Conv1D(filters = 2,
                           kernel_size = 9,
                           strides = 1,
                           padding = 'same')(x)
x = SubPixel1D()(x)
x_final = tf.keras.layers.Add()([x, x_input])    
G = tf.keras.models.Model(inputs = [noisy], outputs = [x_final])    

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


## Load Model

In [None]:
G.load_weights('./model/weights.hdf5')
optim = tf.keras.optimizers.Adam(lr=1e-4)
def G_loss(true, fake):
    return 1 * K.sqrt(K.mean((fake - true) ** 2))

G.compile(loss = G_loss,
          optimizer = optim)

# Predict

In [1]:
window_size = 2 ** 12  # about 1 second of samples
sample_rate = 16000
stride = 0.5

def predict(file, window_size, stride, sample_rate):
    """
    Helper function for predicting the audio file
    by window size and sample rate with [1-stride] percent overlap (default 50%).
    """
    wav, sr = sf.read(file)
    hop = int(window_size * stride)
    result = np.zeros(wav.shape)
    for end_idx in range(window_size, len(wav), hop):
        start_idx = end_idx - window_size
        noisy_test = wav[start_idx:end_idx]
        noisy_test = noisy_test.reshape(1, -1, 1)
        z_test = np.random.randn(1, 8, 1024)
        clean_pred = G.predict((noisy_test, z_test), batch_size=1)
        result[start_idx: end_idx] += clean_pred.reshape(-1)
        
    s = hop
    e = end_idx - hop
    result[s:e] *= 0.5
    return result

y_pred = predict('../dataset/timit_noisy/test/DR1_FAKS0_SA2.wav', 
                     window_size, 
                     stride, 
                     sample_rate)

NameError: name 'sf' is not defined

High Resolution

In [None]:
y_clean, fs = sf.read('../dataset/timit_clean/test/DR1_FAKS0_SA2.wav')
librosa.display.waveplot(y_clean, sr = fs)

D_clean = librosa.stft(y_clean)
D_clean_db = librosa.amplitude_to_db(abs(D_clean))
plt.figure(figsize=(14,5))
librosa.display.specshow(D_clean_db, sr=fs, x_axis='time', y_axis='hz', cmap='jet')

ipd.Audio(y_clean, rate = fs)

Low Resolution

In [None]:
y_noisy, fs = sf.read('../dataset/timit_noisy/test/DR1_FAKS0_SA2.wav')
librosa.display.waveplot(y_noisy, sr = fs)

D_noisy = librosa.stft(y_noisy)
D_noisy_db = librosa.amplitude_to_db(abs(D_noisy))
plt.figure(figsize=(14,5))
librosa.display.specshow(D_noisy_db, sr=fs, x_axis='time', y_axis='hz', cmap='jet')

ipd.Audio(y_noisy, rate = fs)

Precited

In [None]:
librosa.display.waveplot(y_pred, sr = sample_rate)

D_pred = librosa.stft(y_pred)
D_pred_db = librosa.amplitude_to_db(abs(D_pred))
plt.figure(figsize=(14,5))
librosa.display.specshow(D_pred_db, sr=fs, x_axis='time', y_axis='hz', cmap='jet')

ipd.Audio(y_pred, rate = sample_rate)