In [1]:
import os
import numpy as np
from cds_utils import *
import librosa ## audio preprocessing
import random
import multiprocessing as mp

import torch
import torch.utils.data
from sklearn.model_selection import train_test_split

In [4]:
# Frame length in samples: each audio channel is cut into chunks of this size,
# and the value is also embedded in the output directory names.
n_frame = 4096

In [2]:
def process(t):
    """Load one two-channel audio file and cut both channels into fixed-length frames.

    Parameters
    ----------
    t : tuple
        ``(drt, file, n_frame)``: directory, file name and frame length in
        samples. The arguments are packed into a single tuple so the function
        can be passed directly to ``Pool.map``.

    Returns
    -------
    (X, Y) : tuple of lists
        ``X`` holds the frames of the first channel and ``Y`` the frames of the
        second channel. Every frame is a 1-D array of exactly ``n_frame``
        samples; only the last frame may be zero-padded. An empty clip yields
        two empty lists.

    Raises
    ------
    ValueError
        From the channel unpacking when the loaded audio does not have exactly
        two channels.
    """
    drt, file, n_frame = t
    # mono=False keeps the channels separate; audio is resampled to 16 kHz.
    # NOTE(review): presumably channel 0 is the model input and channel 1 the
    # target -- confirm against the dataset description.
    [x,y],sr = librosa.load(os.path.join(drt, file), sr=16000, mono=False)
    audio_length = len(x)

    # Ceiling division: a clip whose length is an exact multiple of n_frame
    # needs no padding and must not produce an extra all-zero frame.
    frame_count = -(-audio_length // n_frame)
    padding = frame_count * n_frame - audio_length
    x = np.pad(x,pad_width=(0,padding),mode='constant')
    y = np.pad(y,pad_width=(0,padding),mode='constant')

    X = [x[k*n_frame:(k+1)*n_frame] for k in range(frame_count)]
    Y = [y[k*n_frame:(k+1)*n_frame] for k in range(frame_count)]
    return X,Y

def load_datas(n_frame,is_test = False,data_dirs = None):
    """Frame every ``.wav`` file of the training folders and split the frames.

    Parameters
    ----------
    n_frame : int
        Frame length in samples, forwarded to ``process``.
    is_test : bool
        When true, only the first 50 files (in sorted order) are used, for a
        quick smoke run.
    data_dirs : list of str, optional
        Folders to scan. Defaults to the five ``./datasets/TrainData/`` folders
        this notebook was written for.

    Returns
    -------
    train_X, train_y, val_X, val_y : numpy.ndarray
        Frame arrays; 30% of the frames go to the validation split, selected
        with a fixed random seed.

    Raises
    ------
    ValueError
        If no ``.wav`` file is found in any of the folders.
    """
    if data_dirs is None:
        data_dirs = ['./datasets/TrainData/Alexis/','./datasets/TrainData/vietnam/','./datasets/TrainData/Childer/',
                     './datasets/TrainData/CMU/','./datasets/TrainData/saarbrucken/']

    print("load Train Datas")
    args = [] ## [..(drt,file,n_frame)..]

    for drt in data_dirs:
        # Guarantee a trailing separator so plain `drt + file` concatenation stays valid.
        drt = os.path.join(drt, '')
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # is_test subset and the seeded split reproducible across machines.
        for file in sorted(os.listdir(drt)):
            # Match the extension, not any name that merely contains "wav".
            if file.endswith('.wav'):
                args.append((drt,file,n_frame))

    if is_test:
        args = args[:50]

    if not args:
        raise ValueError("no .wav files found in: %s" % ', '.join(data_dirs))

    pool = mp.Pool(mp.cpu_count())
    try:
        tmp = pool.map(process,args) ## [..(X,Y)..], X and Y are lists of n_frame-long frames
    finally:
        # Release the worker processes even when loading a file fails.
        pool.close()
        pool.join()

    X,y = [],[]
    for _X,_Y in tmp:
        X.extend(_X)
        y.extend(_Y)
    X = np.array(X)
    y = np.array(y)

    # NOTE(review): the split is per frame, so frames of one recording can land
    # in both train and validation -- confirm this is intended.
    train_X,val_X,train_y,val_y = train_test_split(X,y,test_size=0.3,random_state=42)
    return train_X,train_y,val_X,val_y

In [3]:
# Frame all training audio and split the frames 70/30 into train and validation sets.
train_X, train_y, val_X, val_y = load_datas(n_frame)

load Train Datas


In [10]:
def save_parallel_train(t):
    """Write one training frame pair to disk as two ``.npy`` files.

    ``t`` is an ``(i, n_frame)`` tuple: ``i`` selects the row of the global
    ``train_X`` / ``train_y`` arrays and also names the output file, while
    ``n_frame`` selects the target directory. Always returns 0.
    """
    index, frame_len = t
    x_path = './datasets/TrainData/trainX_%d/%d.npy'%(frame_len,index)
    y_path = './datasets/TrainData/trainy_%d/%d.npy'%(frame_len,index)
    np.save(x_path, train_X[index])
    np.save(y_path, train_y[index])
    return 0

def save_parallel_val(t):
    """Write one validation frame pair to disk as two ``.npy`` files.

    ``t`` is an ``(i, n_frame)`` tuple: ``i`` selects the row of the global
    ``val_X`` / ``val_y`` arrays and also names the output file, while
    ``n_frame`` selects the target directory. Always returns 0.
    """
    index, frame_len = t
    x_path = './datasets/TrainData/valX_%d/%d.npy'%(frame_len,index)
    y_path = './datasets/TrainData/valy_%d/%d.npy'%(frame_len,index)
    np.save(x_path, val_X[index])
    np.save(y_path, val_y[index])
    return 0

In [11]:
%%time
# Create the four output folders (inputs/targets for train and validation);
# exist_ok makes the cell safe to re-run.
for dir_template in ('./datasets/TrainData/trainX_%d/',
                     './datasets/TrainData/valX_%d/',
                     './datasets/TrainData/trainy_%d/',
                     './datasets/TrainData/valy_%d/'):
    os.makedirs(dir_template%n_frame,exist_ok=True)

CPU times: user 1.91 ms, sys: 0 ns, total: 1.91 ms
Wall time: 1.03 ms


In [12]:
%%time
# Write every training frame to its own .npy file, one task per frame index.
# Only (index, n_frame) is sent to the workers; save_parallel_train reads the
# frames from the global train_X / train_y arrays.
# NOTE(review): this presumably relies on workers inheriting the notebook's
# globals (fork start method) -- confirm before running on Windows/macOS.
pool = mp.Pool(mp.cpu_count())
try:
    pool.map(save_parallel_train,[(i,n_frame) for i in range(train_X.shape[0])])
finally:
    # Release the worker processes even when a save fails.
    pool.close()
    pool.join()

CPU times: user 138 ms, sys: 2.03 s, total: 2.17 s
Wall time: 11.8 s


In [13]:
%%time
# Write every validation frame to its own .npy file, one task per frame index.
# Only (index, n_frame) is sent to the workers; save_parallel_val reads the
# frames from the global val_X / val_y arrays.
# NOTE(review): this presumably relies on workers inheriting the notebook's
# globals (fork start method) -- confirm before running on Windows/macOS.
pool = mp.Pool(mp.cpu_count())
try:
    pool.map(save_parallel_val,[(i,n_frame) for i in range(val_X.shape[0])])
finally:
    # Release the worker processes even when a save fails.
    pool.close()
    pool.join()

CPU times: user 63.3 ms, sys: 2.07 s, total: 2.13 s
Wall time: 6.48 s
