# Wavenet Demo
Demo of our efficient generation implementation.

Trains WaveNet on a single WAV file, then regenerates that file starting from a single sample.

In [1]:
from time import time

from wavenet.utils import make_batch
from wavenet.models import Model, Generator

from IPython.display import Audio

%matplotlib inline

ImportError: No module named wavenet.utils

In [None]:
# Load the demo clip as one (inputs, targets) pair.
inputs, targets = make_batch('assets/voice.wav')

# Model settings; the sequence length comes from axis 1 of the loaded inputs.
num_channels = 1
gpu_fraction = 1.0
num_time_samples = inputs.shape[1]

model = Model(
    num_time_samples=num_time_samples,
    num_channels=num_channels,
    gpu_fraction=gpu_fraction,
)

# Last expression of the cell: an audio widget for the flattened input clip.
Audio(inputs.reshape(num_time_samples), rate=44100)

In [None]:
# Time a single call to model.train on the loaded inputs/targets.
tic = time()
model.train(inputs, targets)
toc = time()

# Difference of the two time() readings, i.e. elapsed seconds.
print('Training took {} seconds.'.format(toc-tic))

In [None]:
# Wrap the trained model in a Generator.
generator = Generator(model)

# Get first sample of input
# (keeps axis 0, takes a length-1 slice of axis 1, and index 0 of axis 2).
input_ = inputs[:, 0:1, 0]

# Time the generation run seeded with that single sample.
# NOTE(review): 32000 is presumably the number of samples to generate -
# confirm against Generator.run.
tic = time()
predictions = generator.run(input_, 32000)
toc = time()
print('Generating took {} seconds.'.format(toc-tic))

In [None]:
# Render an audio widget for the generated samples at 44100 Hz.
Audio(predictions, rate=44100)

In [2]:
import numpy as np
import os
from scipy.io import wavfile
import fast_gen as fg



In [3]:

def normalize(data):
    """Rescale ``data`` linearly so its minimum maps to -1 and its maximum to 1.

    Args:
        data: Array-like of numeric samples.

    Returns:
        Array of the same shape with values in ``[-1, 1]``. A constant
        signal (max == min) has no range to stretch, so it is returned as
        all zeros rather than the NaNs that a 0/0 division would produce.

    Raises:
        ValueError: If ``data`` is empty (raised by ``np.min``).
    """
    # Convert to float32 first so the shift and the division below are done
    # in floating point; after the shift the smallest sample sits at 0.
    temp = np.float32(data) - np.min(data)
    peak = np.max(temp)
    if peak == 0:
        # Flat signal: avoid dividing by zero.
        return np.zeros_like(temp)
    # [0, peak] -> [0, 1] -> [-0.5, 0.5] -> [-1, 1]
    return (temp / peak - 0.5) * 2


In [33]:
def make_batch(path, class_num, num_classes=109, min_samples=10000,
               max_samples=200000, out_samples=20000):
    """Load one wav file and turn it into a fixed-length training example.

    Clips whose sample count is not strictly between ``min_samples`` and
    ``max_samples`` are skipped. Accepted clips are normalized to
    ``[-1, 1]``, quantized onto 256 evenly spaced levels, zero-padded to
    ``max_samples`` and resampled down to ``out_samples`` points.

    Args:
        path: Path of the wav file to read.
        class_num: Class index of the clip; converted with ``int()`` and
            used as the hot position of the one-hot class vector.
        num_classes: Length of the one-hot class vector.
        min_samples: Exclusive lower bound on the clip length.
        max_samples: Exclusive upper bound on the clip length, and the
            length the sequences are padded to before resampling.
        out_samples: Length of the returned sequences.

    Returns:
        ``(inputs, targets_pred, target_class)`` where ``inputs`` and
        ``targets_pred`` are 1-D arrays of length ``out_samples`` and
        ``target_class`` is a one-hot vector of length ``num_classes``;
        or ``([], [], [])`` when the clip is skipped.

    Raises:
        IndexError: If ``int(class_num)`` is not a valid index into a
            vector of length ``num_classes``.
    """
    # Imported here so the function does not depend on a module-level
    # ``sc`` alias having been defined before it is called.
    from scipy.signal import resample

    data = wavfile.read(path)[1]
    # NOTE(review): the slicing/padding below assumes mono (1-D) data -
    # confirm that no multi-channel files are fed in.

    # Reject out-of-range clips before spending any work on them.
    if not (min_samples < data.shape[0] < max_samples):
        return [], [], []

    data_ = normalize(data)

    # Mu-law companding is intentionally not applied; quantization is linear.
    bins = np.linspace(-1, 1, 256)

    # Quantize inputs: every sample but the last, snapped to its bin value,
    # shaped (1, n - 1, 1).
    inputs = np.digitize(data_[0:-1], bins, right=False) - 1
    inputs = bins[inputs][None, :, None]

    # Encode targets as ints: the bin index of the *next* sample, (1, n - 1).
    targets_pred = (np.digitize(data_[1::], bins, right=False) - 1)[None, :]

    target_class = np.zeros(num_classes)
    target_class[int(class_num)] = 1

    # Zero-pad the time axis of both sequences up to ``max_samples``.
    inputs = np.pad(inputs,
                    ((0, 0), (0, max_samples - inputs.shape[1]), (0, 0)),
                    'constant', constant_values=0)
    targets_pred = np.pad(targets_pred,
                          ((0, 0), (0, max_samples - targets_pred.shape[1])),
                          'constant', constant_values=0)

    # Flatten to 1-D and resample to the fixed output length.
    # NOTE(review): resampling the integer bin indices in ``targets_pred``
    # produces non-integer values - confirm this is what training expects.
    inputs = resample(inputs.reshape(inputs.shape[1]), out_samples)
    targets_pred = resample(targets_pred.reshape(targets_pred.shape[1]),
                            out_samples)
    return inputs, targets_pred, target_class

def _iter_wav_files(root_path):
    """Yield ``(directory, filename)`` for each file under ``root_path`` whose name ends in 'wav'."""
    for directory, _subdirs, files in os.walk(root_path):
        for name in files:
            if name.endswith('wav'):
                yield directory, name


def generate_batches(root_path = 'data/wav48/', indexes = range(0,10)):
    """Build stacked training arrays from selected wav files under ``root_path``.

    Wav files are numbered 0, 1, 2, ... in the order ``os.walk`` yields
    them (no sorting is applied); the files whose number is in ``indexes``
    are passed to ``make_batch`` and the non-empty results are stacked.

    Args:
        root_path: Directory tree to search for wav files.
        indexes: Iterable of walk-order file numbers to include.

    Returns:
        ``(inputs, target_pred, target_class)``, each the ``np.vstack`` of
        the per-file arrays returned by ``make_batch``.

    Raises:
        ValueError: If no selected file produced an example (nothing
            matched ``indexes``, or ``make_batch`` returned empty results
            for every selected file).
    """
    # A set gives O(1) membership tests; knowing the largest requested
    # number lets the walk stop as soon as nothing more can match.
    wanted = set(indexes)
    last_wanted = max(wanted) if wanted else -1

    inputs = []
    target_pred = []
    target_class = []
    for ns, (directory, name) in enumerate(_iter_wav_files(root_path)):
        if ns > last_wanted:
            break
        if ns not in wanted:
            continue
        path_to_file = os.path.join(directory, name)
        # The last three characters of the containing directory path are
        # used as the class label (presumably a numeric directory suffix -
        # verify against the dataset layout).
        class_num = directory[-3:]
        inputs_loc, targets_pred_loc, targets_class_loc = make_batch(
            path_to_file, class_num)
        # make_batch signals "skipped" with empty results.
        if len(inputs_loc) != 0:
            inputs.append(inputs_loc)
            target_pred.append(targets_pred_loc)
            target_class.append(targets_class_loc)

    if not inputs:
        raise ValueError(
            'no usable wav files for the requested indexes under {!r}'.format(
                root_path))

    return np.vstack(inputs), np.vstack(target_pred), np.vstack(target_class)


In [34]:
def train_model(b_size = 5, num_files = 44257, num_epochs = 10000):
    """Train an ``fg.Model`` on mini-batches of wav files.

    File numbers ``0 .. num_files - 1`` are shuffled once and cut into
    consecutive batches of ``b_size``; a trailing partial batch is dropped.
    The same batches are visited in the same order on every epoch.

    Args:
        b_size: Number of files per batch.
        num_files: Total number of wav files to draw file numbers from.
        num_epochs: Number of passes over the batches.

    Returns:
        The model instance that was trained, or ``None`` if no batch was
        processed.
    """
    batch_size = b_size
    rnd = np.random.permutation(num_files)
    # Floor division: ``range`` needs an int on Python 3 as well.
    num_batches = num_files // batch_size

    # The model is built once, sized from the first batch, and the same
    # instance is passed every later batch.
    # NOTE(review): assumes every batch has the same number of time samples.
    model = None
    for epoch in range(num_epochs):
        for iteration in range(num_batches):
            indices = rnd[iteration*batch_size:(iteration + 1)*batch_size]
            inputs, targets_pred, targets_class = generate_batches(indexes = indices)
            if model is None:
                model = fg.Model(num_time_samples=inputs.shape[1],
                                 num_channels=1,
                                 gpu_fraction=1.0)
            model.train(inputs, targets_pred, targets_class)
    return model


In [35]:
# Build a one-file batch from the wav file numbered 5 in walk order.
inputs, targets_pred, targets_class  = generate_batches(indexes = [5])

In [36]:
# Inspect the shape of the stacked inputs (cell output follows).
inputs.shape

(1, 20000)

In [37]:
from IPython.display import Audio

In [38]:
# Listen to the flattened, resampled input sequence.
# NOTE(review): rate=4410 looks like 44100 / 10, matching the
# 200000 -> 20000 resample in make_batch - confirm the source files'
# actual sample rate.
Audio(inputs.reshape(inputs.shape[1]), rate=4410)

In [39]:
# Listen to the flattened target sequence at the 4410 Hz playback rate.
Audio(targets_pred.reshape(targets_pred.shape[1]), rate=4410)

In [11]:
import numpy as np

In [12]:
import scipy.signal as sc

In [13]:
# Resample the flattened input sequence to 30000 points with
# scipy.signal.resample.
x = sc.resample(inputs.reshape(inputs.shape[1]),30000)

In [15]:
# Play the 30000-point version at the same 4410 Hz rate.
Audio(x,rate=4410)