In [5]:
import numpy as np
import soundfile as sf
import binarizer as bz
import matplotlib.pyplot as plt
import pandas as pd
import vad
import os

# ITU-T supplement-23
OUT_META_DIR = '/home/richard/workspace/test_voice_generation/metadata/'
playlist = pd.DataFrame({}, columns=["file_id", "data_path"])
FS = 16000
FRAME_DUR = 0.002
NOISE_THR = -50
LEAD_SIL = 1 * FS
SMOOTHE_STEP = 1
STABILITY = 0.5 # Numnber of seconds for speech hang over time
SEGMENT_LENGTH_5 = 5 * FS
SEGMENT_LENGTH_10 = 10 * FS
NBITS = 16
NBITS_MAX = 2**(NBITS-1)
DATA_SET = 'Supp-23'
FILE_EXT = '.SRC'
DATA_DIR = '/home/richard/sambashare/media/ITU-T_test/P_Suppl_23_DB/'
total_time = 0
for dir_path, dir_name, file_names in os.walk(DATA_DIR, topdown=True):
    if len(file_names) == 0:
        continue
    if file_names[0].endswith(FILE_EXT):
        print(dir_path, dir_name, file_names)
    for idx, file_name in enumerate(file_names):
        if FILE_EXT not in file_name:
            continue
        data_path = os.path.join(dir_path,file_name)
        parent_dir = dir_path.split('/')[-1]
        data = np.memmap(data_path, dtype='<i2', mode='r')
        signal = np.array(data, copy=True).astype(float) / NBITS_MAX

        vadInst = vad.vad(signal, FS, FRAME_DUR, NBITS, NOISE_THR, SMOOTHE_STEP)
        binFlag, segments = bz.binarizer(vadInst.get_active_mask(level="sample"),STABILITY, FS)
        for sdx in range(len(segments["starts"])):
            segmented_signal = signal[segments["starts"][sdx]:segments["stops"][sdx]]
            segment_length = len(segmented_signal)
            # Sanity check
            if segments["stops"][sdx] <= segments["starts"][sdx]:
                continue
            file_id = file_name.replace(FILE_EXT, '') + '_' + str(sdx)
            output_path = os.path.join(dir_path, file_id + '.wav')
            if LEAD_SIL + segment_length < SEGMENT_LENGTH_5:
                output_signal = np.concatenate([np.zeros(LEAD_SIL).flatten(), segmented_signal, np.zeros(SEGMENT_LENGTH_5 - LEAD_SIL - segment_length).flatten()])
            elif LEAD_SIL + segment_length < SEGMENT_LENGTH_10:
                output_signal = np.concatenate([np.zeros(LEAD_SIL).flatten(), segmented_signal, np.zeros(SEGMENT_LENGTH_10 - LEAD_SIL - segment_length).flatten()])
            else:
                # raise error here
                raise Exception("Sorry segement length exceed limit")
            sf.write(output_path, output_signal, FS, 'PCM_16')
            playlist = pd.concat([\
                playlist, \
                pd.DataFrame([{\
                    "file_id": file_id, \
                    "src_data_path": data_path, \
                    "data_path": output_path, \
                    "data_set": DATA_SET, \
                    "parent_dir": parent_dir, \
                    "start": LEAD_SIL, \
                    "stop": LEAD_SIL + len(segmented_signal), \
                    }]) \
                ], ignore_index=True)
            total_time += len(segmented_signal) / FS
playlist.to_csv(os.path.join(OUT_META_DIR, 'ITU_playlist.csv'), index=False)
print("Total time: {:2.2f}".format(total_time))


/home/richard/sambashare/media/ITU-T_test/P_Suppl_23_DB/Disk1/EXP1/ORIGINAL/D [] ['._D_M02S10.SRC', '._D_F02S34.SRC', '._D_F01S44.SRC', 'D_M02S41_2.wav', '._D_M02S20.SRC', '._D_F01S41.SRC', 'D_F02S44_0.wav', 'D_F01S02_1.wav', 'D_M01S36.SRC', 'D_M01S31_1.wav', 'D_M02S42_1.wav', 'D_F01S20_1.wav', '._D_M02S13.SRC', 'D_M01S45.SRC', 'D_F01S37_2.wav', 'D_F01S38_0.wav', 'D_M02S12_1.wav', 'D_F02S04.SRC', '._D_M01S11.SRC', '._D_M02S24.SRC', 'D_M01S09_1.wav', 'D_F01S41_1.wav', 'D_F01S10_0.wav', 'D_M01S02_0.wav', 'D_F01S18_1.wav', 'D_F01S45_0.wav', 'D_M01S42_0.wav', 'D_F01S37_0.wav', 'D_M02S20_2.wav', 'D_F01S14.SRC', 'D_F02S33_0.wav', '._D_M02S34.SRC', 'D_F02S28_0.wav', '._D_M02S43.SRC', 'D_F02S19.SRC', 'D_F01S06_1.wav', '._D_M02S09.SRC', 'D_M02S16_0.wav', 'D_F02S20_0.wav', 'D_M02S45.SRC', 'D_M02S34_1.wav', 'D_F02S40.SRC', 'D_M01S07.SRC', 'D_F02S31.SRC', 'D_F01S04_1.wav', 'D_F02S37_1.wav', 'D_M02S11.SRC', 'D_M01S39_1.wav', 'D_M02S23.SRC', 'D_F02S25_0.wav', 'D_M01S14_0.wav', '._D_M02S30.SRC', '._D

In [31]:
import subprocess
import re
from scipy.signal import find_peaks

# 'English': O,E
OUT_DATA_DIR = '/home/richard/workspace/test_voice_generation/data/'
playbook = pd.read_csv('/home/richard/workspace/test_voice_generation/metadata/ITU_playlist.csv')
playbook = playbook[(playbook["parent_dir"] == "O") | (playbook["parent_dir"] == "E") ].copy(deep=True).reset_index().sample(frac=1)
display(playbook.iloc[0:10])
pattern = rb"(?<=Gain\[\]:)[^\t]+"

TARGET_LVL = -26
NBITS_MIN = int(20*np.log10(2**(-(NBITS)))) + 1
BIN_WIDTH_CORSE = 3
BINS_CORSE = np.arange(NBITS_MIN, 0, BIN_WIDTH_CORSE)
BIN_WIDTH_FINE = .5
SEARCH_RANGE_BINS = 6
BINS_FINE = np.arange(NBITS_MIN, 0, BIN_WIDTH_FINE)
TOTAL_LENGTH = 10 * FS * 60

y = np.array([])

for i in range(len(playbook)):
    data_path = playbook.iloc[i]["data_path"]
    x,fs = sf.read(data_path)
    vadInst = vad.vad(x, FS, FRAME_DUR, NBITS, NOISE_THR, SMOOTHE_STEP)
    rms = vadInst._ref_rms
    hist, bins_corse = np.histogram(rms, BINS_CORSE)
    peaks_corse, _ = find_peaks(hist, distance=2)
    search_center = bins_corse[peaks_corse[-1]] + BIN_WIDTH_CORSE / 2.0
    
    hist, bins_fine = np.histogram(rms, BINS_FINE)
    peaks_fine, _ = find_peaks(hist, distance=6)
    index = (np.abs(bins_fine - search_center)).argmin()
    # print(bins_fine[index]+BIN_WIDTH_CORSE / 2.0)
    search_range_hist = hist[index-SEARCH_RANGE_BINS:index+SEARCH_RANGE_BINS]
    maxindex = search_range_hist.argmax()
    est_lvl = bins_fine[index - SEARCH_RANGE_BINS + maxindex] + BIN_WIDTH_FINE / 2.0
    gain = 10.0 ** ((TARGET_LVL - est_lvl) / 20.0)
    x = x * gain
    y = np.concatenate([y, x])
    if len(y) >= TOTAL_LENGTH:
        break
sf.write(os.path.join(OUT_DATA_DIR, 'ITU_suppl23.wav'), y, fs, 'PCM_16')    
    
    # # plt.plot(rms)
    # f,ax = plt.subplots(2,1)
    # ax[0].hist(rms, bins=BINS_CORSE, edgecolor='black')
    # ax[1].hist(rms, bins=BINS_FINE, edgecolor='black')
    # break
    # cmd = ['/home/richard/workspace/STL/bin/actlev',  '-q', '-rms', '-lev', '-32', data_path]
    # result = subprocess.run(cmd, stdout=subprocess.PIPE)
    # print(result.stdout)
    # match = re.search(pattern, result.stdout)
    # gain = (float(match.group().decode('utf-8')))
    

Unnamed: 0,index,file_id,data_path,src_data_path,data_set,parent_dir,start,stop
820,2977,O_M01S74_0,/home/richard/sambashare/media/ITU-T_test/P_Su...,/home/richard/sambashare/media/ITU-T_test/P_Su...,Supp-23,O,16000.0,58304.0
41,101,E_F01S05_0,/home/richard/sambashare/media/ITU-T_test/P_Su...,/home/richard/sambashare/media/ITU-T_test/P_Su...,Supp-23,E,16000.0,59712.0
97,980,O_M02S20_1,/home/richard/sambashare/media/ITU-T_test/P_Su...,/home/richard/sambashare/media/ITU-T_test/P_Su...,Supp-23,O,16000.0,56992.0
199,1082,O_M01S42_0,/home/richard/sambashare/media/ITU-T_test/P_Su...,/home/richard/sambashare/media/ITU-T_test/P_Su...,Supp-23,O,16000.0,60896.0
302,1185,O_M02S27_0,/home/richard/sambashare/media/ITU-T_test/P_Su...,/home/richard/sambashare/media/ITU-T_test/P_Su...,Supp-23,O,16000.0,69216.0
35,95,E_F01S06_0,/home/richard/sambashare/media/ITU-T_test/P_Su...,/home/richard/sambashare/media/ITU-T_test/P_Su...,Supp-23,E,16000.0,54336.0
434,2591,O_F02S48_0,/home/richard/sambashare/media/ITU-T_test/P_Su...,/home/richard/sambashare/media/ITU-T_test/P_Su...,Supp-23,O,16000.0,63392.0
800,2957,O_M02S79_0,/home/richard/sambashare/media/ITU-T_test/P_Su...,/home/richard/sambashare/media/ITU-T_test/P_Su...,Supp-23,O,16000.0,56576.0
429,2586,O_F01S75_1,/home/richard/sambashare/media/ITU-T_test/P_Su...,/home/richard/sambashare/media/ITU-T_test/P_Su...,Supp-23,O,16000.0,47552.0
381,1264,O_F02S39_0,/home/richard/sambashare/media/ITU-T_test/P_Su...,/home/richard/sambashare/media/ITU-T_test/P_Su...,Supp-23,O,16000.0,50912.0


In [18]:
bins[peaks[-1]]

-29