In [1]:
import os
import numpy as np
from tqdm import tqdm_notebook
from utils import *
import librosa
import random
import multiprocessing as mp
import torch.utils.data
import pandas as pd
from sklearn.model_selection import train_test_split

seed_everything(42)

def process(t): ## args : drt, file,window,step
    drt,file,window,step,n_frame,top_db = t
    X,y = [],[]

    x,sr = librosa.load(drt+file,sr=16000,mono=False)
    itvs = librosa.effects.split(x[0],frame_length = n_frame+1, hop_length = step,top_db = top_db)
    
    for st_idx,end_idx in itvs:
        speech,egg = x[0][st_idx:end_idx],x[1][st_idx:end_idx]
        i=0
        while(i*step+window < len(speech)):
            tmp_speech = speech[i*step:i*step+window]
            tmp_egg = egg[i*step:i*step+window]
            if check_data(tmp_speech,tmp_egg,0.2,n_frame):
                X.append(tmp_speech)
                y.append(tmp_egg)
            i+=1
    return (X,y)
   
def load_datas(n_frame,window,step,top_db,is_test=False):
    X,y = [],[]
    pool = mp.Pool(mp.cpu_count())
    
    print("load Train Data")
    args = []
    
    for drt in ['./datasets/TrainData/Alexis/','./datasets/TrainData/vietnam/',
                './datasets/TrainData/CMU/','./datasets/TrainData/saarbrucken/']:
        for file in os.listdir(drt):
            if 'wav' in file:
                args.append((drt,file,window,step,n_frame,top_db))

    if is_test:
        args = args[:50]
    
    tmp = list(tqdm_notebook(pool.imap(process,args),total = len(args)))
    for _x,_y in tmp:
        if len(_x) >0 and len(_y) > 0:
            X +=_x
            y +=_y
    pool.close()
    pool.join()
    
    release_list(tmp)
    X = np.array(X)
    y = np.array(y)
    train_X,val_X, train_y, val_y = train_test_split(X, y, test_size=0.3, random_state=42)
    return train_X, train_y, val_X, val_y

In [2]:
train_X, train_y, val_X, val_y = load_datas(n_frame = 256,
                                            window = int(256*(1.25)),
                                            step = int(256//4),
                                            top_db = 20,
                                            is_test = False)

load Train Data


HBox(children=(IntProgress(value=0, max=19559), HTML(value='')))




In [3]:
train_X.shape

(3657259, 320)

In [4]:
val_X.shape

(1567398, 320)

In [5]:
%%time

def save_parallel_train(i):
    np.save('./datasets/TrainData/trainX/%d.npy'%i,train_X[i])
    np.save('./datasets/TrainData/trainy/%d.npy'%i,train_y[i])
    return 0

def save_parallel_val(i):
    np.save('./datasets/TrainData/valX/%d.npy'%i,val_X[i])
    np.save('./datasets/TrainData/valy/%d.npy'%i,val_y[i])
    return 0
os.makedirs('./datasets/TrainData/trainX/',exist_ok=True)
os.makedirs('./datasets/TrainData/valX/',exist_ok=True)
os.makedirs('./datasets/TrainData/trainy/',exist_ok=True)
os.makedirs('./datasets/TrainData/valy/',exist_ok=True)

pool = mp.Pool(mp.cpu_count())

tqdm_notebook(pool.imap(save_parallel_train,range(3657259)),total = 3657259)

pool.close()
pool.join()

HBox(children=(IntProgress(value=0, max=3657259), HTML(value='')))


CPU times: user 1min 56s, sys: 34.5 s, total: 2min 30s
Wall time: 11min 27s


In [6]:
%%time
pool = mp.Pool(mp.cpu_count())
tqdm_notebook(pool.imap(save_parallel_val,range(1567398)),total = 1567398)
pool.close()
pool.join()

HBox(children=(IntProgress(value=0, max=1567398), HTML(value='')))


