In [5]:
import tensorflow as tf
import numpy as np
from python_speech_features import mfcc, fbank, delta
from sklearn.preprocessing import StandardScaler
import scipy.io.wavfile as wav
import subprocess
import os, time, pickle
import random
import librosa

In [6]:
phn_61 = ['aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay', 'b', 'bcl', 'ch', 'd', 'dcl', 'dh',
          'dx', 'eh', 'el', 'em', 'en', 'eng', 'epi', 'er', 'ey', 'f', 'g', 'gcl', 'h#', 'hh', 'hv', 
          'ih', 'ix', 'iy', 'jh', 'k', 'kcl', 'l', 'm', 'n', 'ng', 'nx', 'ow', 'oy', 'p', 'pau', 'pcl',
          'q', 'r', 's', 'sh', 't', 'tcl', 'th', 'uh', 'uw', 'ux', 'v', 'w', 'y', 'z', 'zh']

mapping = {'ah': 'ax', 'ax-h': 'ax', 'ux': 'uw', 'aa': 'ao', 'ih': 'ix',
               'axr': 'er', 'el': 'l', 'em': 'm', 'en': 'n', 'nx': 'n',
               'eng': 'ng', 'sh': 'zh', 'hv': 'hh', 'bcl': 'h#', 'pcl': 'h#',
               'dcl': 'h#', 'tcl': 'h#', 'gcl': 'h#', 'kcl': 'h#',
               'q': 'h#', 'epi': 'h#', 'pau': 'h#'}

phn_39 = ['ae', 'ao', 'aw', 'ax', 'ay', 'b', 'ch', 'd', 'dh', 'dx', 'eh', 
             'er', 'ey', 'f', 'g', 'h#', 'hh', 'ix', 'iy', 'jh', 'k', 'l', 
             'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 't', 'th', 'uh', 'uw',
             'v', 'w', 'y', 'z', 'zh']

development_set = ['FAKS0', 'MMDB1', 'MBDG0', 'FEDW0', 'MTDT0', 'FSEM0', 'MDVC0', 'MRJM4', 'MJSW0', 'MTEB0',
                    'FDAC1', 'MMDM2', 'MBWM0', 'MGJF0', 'MTHC0', 'MBNS0', 'MERS0', 'FCAL1', 'MREB0', 'MJFC0',
                    'FJEM0', 'MPDF0', 'MCSH0', 'MGLB0', 'MWJG0', 'MMJR0', 'FMAH0', 'MMWH0', 'FGJD0', 'MRJR0',
                    'MGWT0', 'FCMH0', 'FADG0', 'MRTK0', 'FNMK0', 'MDLS0', 'FDRW0', 'FJSJ0', 'FJMG0', 'FMML0',
                    'MJAR0', 'FKMS0', 'FDMS0', 'MTAA0', 'FREW0', 'MDLF0', 'MRCS0', 'MAJC0', 'MROA0', 'MRWS1']

core_test_set = ['MDAB0', 'MFBT0', 'FELC0', 'MTAS1', 'MFEW0', 'FPAS0', 'MJMP0', 'MLNT0', 'FPKT0',
                'MLLL0', 'MTLS0', 'FJLM0', 'MBPM0', 'MKLT0', 'FNLP0', 'MCMJ0', 'MJDH0', 'FMGD0',
                'MGRT0', 'MNJM0', 'FDHC0', 'MJLN0', 'MPAM0', 'FMLD0']

TIMIT_DIR = './' # root directory for timit, it would be joined with timit/train or timit/test
TFRECORD_DIR = './data' # directory for tfrecords files

In [7]:
def prepare_timit_dataset(train_set=True, dev_set=True, test_set=True, feats_type='mfcc'):
    
    def phase_randomization(signal, delta):
        stft = librosa.stft(signal, n_fft=400, hop_length=160, win_length=400, window='hann')
        phase = np.angle(stft)
        num_cols = phase.shape[1]
        
        for i in range(num_cols):
            mu = np.random.normal(1, delta)
            phase[:, i] *= mu
        
        return phase
    
    def create_tfrecords(tfrecord_path, root_dir, fname, filter_fn):
        writer = tf.io.TFRecordWriter(os.path.join(tfrecord_path, (fname + '.tfrecords')))
        feats_list = []
        phoneme_list = []
        start = time.time()
        cnt = 0

        for path, dirs, files in os.walk(root_dir):
            for file in files:
                if filter_fn(file, path):
                    continue
                if file.endswith('WAV'):
                    fullFileName = os.path.join(path, file)
                    fnameNoSuffix = os.path.splitext(fullFileName)[0]
                    fNameTmp = fnameNoSuffix + '.WAV'
                    # convert nist file format to wav with command line program 'sox'
                    subprocess.call(['sox', fullFileName, fNameTmp], shell=True)
                    rate, sig = wav.read(fNameTmp)
                    os.remove(fNameTmp)
                    sig = sig.astype(np.float32)
                    
                    phase_randomized = phase_randomization(sig, delta=0.1)

                    freq_mask_width = np.random.randint(10, 30)  
                    freq_mask_start = np.random.randint(0, phase_randomized.shape[0] - freq_mask_width)
                    phase_randomized[freq_mask_start:freq_mask_start + freq_mask_width, :] = 0
                    
                    time_mask_width = np.random.randint(5, phase_randomized.shape[1] // 9)  
                    time_mask_start = np.random.randint(0, phase_randomized.shape[1] - time_mask_width)
                    phase_randomized[:, time_mask_start:time_mask_start + time_mask_width] = 0
                    
                    sig_phase_randomized = librosa.istft(np.abs(librosa.core.stft(sig, n_fft=400, hop_length=160, win_length=400, window='hann')) * np.exp(1j * phase_randomized))
                    
                     
                    if feats_type == 'mfcc':
                        mfcc_feat = mfcc(sig_phase_randomized, rate)
                        mfcc_feat_delta = delta(mfcc_feat, 2)
                        mfcc_feat_delta_delta = delta(mfcc_feat_delta, 2)
                        feats = np.concatenate((mfcc_feat, mfcc_feat_delta, mfcc_feat_delta_delta), axis=1)
                        
                    else: # fbank
                        filters, energy = fbank(sig_phase_randomized, rate, nfilt=40)
                        log_filters, log_energy = np.log(filters), np.log(energy)
                        logfbank_feat = np.concatenate((log_filters, log_energy.reshape(-1,1)), axis=1)
                        logfbank_feat_delta = delta(logfbank_feat, 2)
                        logfbank_feat_delta_delta = delta(logfbank_feat_delta, 2)
                        feats = np.concatenate((logfbank_feat, logfbank_feat_delta, logfbank_feat_delta_delta), axis=1)
                    
                    feats_list.append(feats)

                    # .phn
                    phoneme = []
                    with open(fnameNoSuffix + '.phn', 'r') as f:
                        for line in f.read().splitlines():
                            phn = line.split(' ')[2]
                            p_index = phn_61.index(phn)
                            phoneme.append(p_index)
                    phoneme_list.append(phoneme)

                    cnt += 1

        if fname == 'train':
            scaler = StandardScaler()
            scaler.fit(np.concatenate(feats_list, axis=0))
            print('scaler.n_samples_seen_:', scaler.n_samples_seen_)
            pickle.dump(scaler, open(os.path.join(tfrecord_path, 'scaler.pkl'), 'wb'))
            
        if not os.path.exists(os.path.join(tfrecord_path, 'scaler.pkl')):
            raise Exception('scaler.pkl not exist, call with [train_set=True]')
        else:
            scaler = pickle.load(open(os.path.join(tfrecord_path, 'scaler.pkl'), 'rb'))
        
        for feats, phoneme in zip(feats_list, phoneme_list):
            seq_exam = tf.train.SequenceExample()
            seq_exam.context.feature['feats_dim'].int64_list.value.append(feats.shape[1])
            seq_exam.context.feature['feats_seq_len'].int64_list.value.append(feats.shape[0])
            seq_exam.context.feature['labels_seq_len'].int64_list.value.append(len(phoneme))

            feats = scaler.transform(feats)
            for feat in feats:
                seq_exam.feature_lists.feature_list['features'].feature.add().float_list.value[:] = feat
            for p in phoneme:
                seq_exam.feature_lists.feature_list['labels'].feature.add().int64_list.value.append(p)
            writer.write(seq_exam.SerializeToString())

        writer.close()
        print('{} created: {} utterances - {:.0f}s'.format(fname+'.tfrecords', cnt, (time.time()-start)))
    # end create_tfrecords() definition
    
    tfrecord_path = os.path.join(TFRECORD_DIR, feats_type)
    if not os.path.isdir(tfrecord_path):
        os.makedirs(tfrecord_path)
    
    if train_set:
        create_tfrecords(tfrecord_path, os.path.join(TIMIT_DIR, 'TIMIT(wav)/TRAIN'), 'train',
                         lambda file, _: file.startswith('SA'))
    if dev_set:
        create_tfrecords(tfrecord_path, os.path.join(TIMIT_DIR, 'TIMIT(wav)/TEST'), 'dev', 
                         lambda file, path: file.startswith('SA') or os.path.split(path)[1] not in development_set)
    if test_set:
        create_tfrecords(tfrecord_path, os.path.join(TIMIT_DIR, 'TIMIT(wav)/TEST'), 'test', 
                         lambda file, path: file.startswith('SA') or os.path.split(path)[1] not in core_test_set)

In [8]:
prepare_timit_dataset(feats_type='fbank')

scaler.n_samples_seen_: 702601
train.tfrecords created: 3696 utterances - 167s
dev.tfrecords created: 392 utterances - 18s
test.tfrecords created: 176 utterances - 8s


In [3]:
import os
import shutil

shutil.rmtree("model")  

In [4]:
import zipfile

f = zipfile.ZipFile("./TIMIT(wav).zip",'r') 

for file in f.namelist():
    f.extract(file,"./") 
f.close()
