### Writing to S3

In [22]:
from io import BytesIO
import numpy as np
from urllib.parse import urlparse
import boto3
client = boto3.client("s3")

def to_s3_npy(data: np.ndarray, s3_uri: str) -> bool:
    """Serialize ``data`` in NumPy ``.npy`` format and upload it to S3.

    The array is written to an in-memory buffer with ``np.save`` and sent
    through the module-level boto3 ``client``; nothing touches local disk.

    Args:
        data: Array to store. Saved with ``allow_pickle=True``, so object
            arrays are accepted.
        s3_uri: Destination, of the form ``s3://{BUCKET_NAME}/{KEY}``.

    Returns:
        ``True`` once the upload call has returned without raising.
    """
    bytes_ = BytesIO()
    np.save(bytes_, data, allow_pickle=True)
    # Rewind so upload_fileobj reads the buffer from its first byte.
    bytes_.seek(0)
    parsed_s3 = urlparse(s3_uri)
    # netloc is the bucket; path carries a leading "/" that is not part of the key.
    client.upload_fileobj(
        Fileobj=bytes_, Bucket=parsed_s3.netloc, Key=parsed_s3.path[1:]
    )
    return True

def from_s3_npy(s3_uri: str):
    """Download an S3 object into memory and deserialize it with ``np.load``.

    Args:
        s3_uri: Source, of the form ``s3://{BUCKET_NAME}/{KEY}``.

    Returns:
        Whatever ``np.load`` yields for the downloaded bytes.
    """
    location = urlparse(s3_uri)
    buffer = BytesIO()
    # netloc is the bucket; drop the leading "/" of the path to get the key.
    client.download_fileobj(
        Bucket=location.netloc, Key=location.path[1:], Fileobj=buffer
    )
    # Rewind so np.load reads the downloaded bytes from the start.
    buffer.seek(0)
    # NOTE(review): allow_pickle=True unpickles whatever the object contains,
    # which can execute arbitrary code -- only point this at trusted buckets.
    return np.load(buffer, allow_pickle=True)

In [23]:
# Read the "data" entry of the local validation archive. allow_pickle=True is
# presumably needed because the entry is an object array -- confirm against
# the cell that wrote the archive.
data = np.load("data/encoder_librispeech_valid.npz", allow_pickle=True)["data"]

In [27]:
# NOTE(review): to_s3_npy serializes with np.save (single-array .npy format),
# yet this key ends in ".npz" -- confirm the extension is what readers expect.
to_s3_npy(data, "s3://rtvc-data/preprocessed/encoder_librispeech_valid.npz")

True

## Preprocessing

In [3]:
import struct
import librosa
import webrtcvad
import numpy as np

from glob import glob
from scipy.ndimage.morphology import binary_dilation

In [4]:
import time

def loadbar(iteration, total, prefix="", suffix="", decimal=0,
            length=100, fill="=", extras=""):
    """Render a single-line text progress bar on stdout.

    Each call rewrites the current terminal line; once ``iteration == total``
    a final line (with ``extras`` appended) is printed and terminated with a
    newline.

    Args:
        iteration: Number of completed steps.
        total: Number of steps in the whole job. ``0`` is reported as an
            already finished job instead of raising ``ZeroDivisionError``.
        prefix: Text printed before the bar.
        suffix: Text printed after the percentage.
        decimal: Digits shown after the decimal point of the percentage.
        length: Width of the bar in characters.
        fill: Character used for the completed part of the bar.
        extras: Text appended to the final line only.
    """
    if total:
        per_val = iteration * 100 / float(total)
        filledLen = int(length * iteration // total)
    else:
        # Nothing to do counts as done; dividing by total would raise.
        per_val = 100.0
        filledLen = length

    percent = ("{0:." + str(decimal) + "f}").format(per_val)
    # Left-pad so the percentage keeps a fixed width for 1-3 digit integer parts.
    cur_percent = ' ' * (3 - len(str(round(per_val)))) + percent

    if per_val == 100:
        bar = fill * filledLen + "." * (length - filledLen)
    else:
        # ">" marks the head of the bar while work is still in progress.
        bar = fill * filledLen + ">" + "." * (length - filledLen - 1)

    # "\r" returns to the start of the line so the next call overwrites this
    # one; flush because a line without "\n" may otherwise sit in the buffer.
    print(f"\r{prefix} [{bar}] {cur_percent}% {suffix}", end="\r", flush=True)
    if iteration == total:
        print(f"\r{prefix} [{bar}] {cur_percent}% {suffix} {extras}", end="\n", flush=True)

In [5]:
# Default target level (dBFS) for normalize_volume.
TARG_dbFS = -30
# Sampling rate (Hz) used as the default by the VAD and mel-spectrogram helpers.
TARG_FS = 16000 
# Largest int16 value; scales float samples to 16-bit PCM for the VAD.
INT16_MAX = (2**15) - 1

# VAD window length in milliseconds (converted to samples via sampling_rate // 1000).
VAD_WNDW_LEN = 30  
# Number of VAD windows averaged together when smoothing the speech flags.
VAD_WNDW_AVG_WIDTH = 8
# The speech mask is dilated with a structure of this many windows plus one.
VAD_MAX_SILENCE_LEN = 6

# Mel-spectrogram window length and hop, both in milliseconds.
MEL_WNDW_LEN = 25  
MEL_WNDW_STP = 10  
# Number of mel bands requested from librosa.
MEL_N_CHANNELS = 40 

# Minimum number of mel frames an utterance needs to be kept by preprocess.
PAR_N_FRAMES=160

# NOTE(review): not referenced in the visible code; looks like a character-class
# pattern meant to match audio extensions ("fla"/"wav") -- confirm or remove.
AUDIO_RE = "[fw][la][av]"

In [6]:
# Audio file extension per dataset name; preprocess appends it to its glob pattern.
extensions = {
    "LibriSpeech": ".flac",
    "LibriTTS": ".wav"
}

In [7]:
def pretty_print(val, total, substitute=0):
    """Left-pad ``val`` with ``substitute`` up to the width of ``str(total)``.

    Args:
        val: Value to render; converted with ``str``.
        total: Reference whose string length sets the target width.
        substitute: Padding unit, repeated once per missing character.

    Returns:
        The padded string; ``str(val)`` unchanged when it is already wide enough.
    """
    text = str(val)
    missing = len(str(total)) - len(text)
    # A non-positive repeat count yields an empty pad.
    return f'{substitute}' * missing + text

In [8]:
def normalize_volume(wav, target_dBFS=TARG_dbFS, increase_only=False, decrease_only=False):
    """Scale ``wav`` so that its mean power matches ``target_dBFS``.

    Args:
        wav: Waveform as a NumPy array of float samples.
        target_dBFS: Desired level in dBFS.
        increase_only: When True, return ``wav`` untouched if reaching the
            target would require attenuation.
        decrease_only: When True, return ``wav`` untouched if reaching the
            target would require amplification.

    Returns:
        The rescaled waveform, or ``wav`` itself when no change is applied.
    """
    # Empty input has no level to normalize (and np.mean would warn on it).
    if np.size(wav) == 0:
        return wav
    mean_power = np.mean(wav ** 2)
    # All-zero audio: log10(0) is -inf, which would make the gain infinite and
    # turn every sample into NaN. Silence is returned as is.
    if not mean_power > 0:
        return wav
    dBFS_change = target_dBFS - 10 * np.log10(mean_power)
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    # Convert the dB difference into a linear amplitude gain.
    return wav * (10 ** (dBFS_change / 20))

def moving_average(array, width):
    """Zero-padded moving average that keeps the input length.

    Args:
        array: 1-D sequence of numbers (or booleans).
        width: Number of neighbouring samples averaged per output value.

    Returns:
        Float array with one averaged value per input element.
    """
    lead = np.zeros((width - 1) // 2)
    tail = np.zeros(width // 2)
    running = np.cumsum(np.concatenate((lead, array, tail)), dtype=float)
    # Running totals taken `width` apart differ by exactly one window's sum.
    window_sums = running.copy()
    window_sums[width:] = running[width:] - running[:-width]
    return window_sums[width - 1:] / width

def trim_long_silences(wav, sampling_rate=TARG_FS):
    """Remove long silent stretches from ``wav`` using WebRTC voice activity detection.

    The waveform is cut into windows of ``VAD_WNDW_LEN`` ms, each window is
    classified as speech or not, the flags are smoothed with a moving average
    and dilated, and only the samples of windows still flagged are kept.

    Args:
        wav: Waveform as a NumPy float array; samples are scaled by
            ``INT16_MAX`` for the detector, so [-1, 1] is the expected range.
        sampling_rate: Sampling rate of ``wav`` in Hz, passed to the detector.

    Returns:
        The kept samples of ``wav`` (original float values, in order).
    """
    samples_per_window = (VAD_WNDW_LEN * sampling_rate) // 1000
    # The detector is fed whole windows only, so drop the trailing partial one.
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # 16-bit PCM bytes for the detector. Clip first: samples outside [-1, 1]
    # would otherwise wrap around when cast to int16. tobytes() emits the same
    # native-order bytes as struct.pack("h") without unpacking every sample
    # into a Python argument.
    pcm_wave = np.round(np.clip(wav, -1.0, 1.0) * INT16_MAX).astype(np.int16).tobytes()

    vad = webrtcvad.Vad(mode=3)
    voice_flags = [
        # Byte offsets are sample offsets times two (2 bytes per int16 sample).
        vad.is_speech(
            pcm_wave[window_start * 2:(window_start + samples_per_window) * 2],
            sample_rate=sampling_rate,
        )
        for window_start in range(0, len(wav), samples_per_window)
    ]

    # Smooth the per-window decisions, then widen speech regions by dilation.
    audio_mask = moving_average(voice_flags, VAD_WNDW_AVG_WIDTH)
    audio_mask = np.round(audio_mask).astype(np.bool_)
    audio_mask = binary_dilation(audio_mask, np.ones(VAD_MAX_SILENCE_LEN + 1))
    # Expand the per-window mask to one flag per sample.
    audio_mask = np.repeat(audio_mask, samples_per_window)
    return wav[audio_mask]

def load_audio(file_path, sampling_rate=TARG_FS):
    """Load an audio file as a waveform resampled to ``sampling_rate`` Hz.

    ``librosa.load`` resamples to 22050 Hz unless ``sr`` is given, while the
    VAD and mel-spectrogram helpers in this notebook default to ``TARG_FS``.
    Passing the rate explicitly keeps the samples and the rate they are
    interpreted at in agreement.

    Args:
        file_path: Path of the audio file to read.
        sampling_rate: Rate in Hz to resample to.

    Returns:
        The waveform returned by ``librosa.load`` (the returned rate is dropped).
    """
    arr, _ = librosa.load(file_path, sr=sampling_rate)
    return arr

def preprocess_data(arr):
    """Normalize the volume of ``arr``, then trim its long silences.

    Both steps run with their default settings.
    """
    leveled = normalize_volume(arr)
    return trim_long_silences(leveled)

def generate_frames(preprocessed_wav, sampling_rate=TARG_FS):
    """Compute mel-spectrogram frames for a preprocessed waveform.

    Args:
        preprocessed_wav: Waveform to analyse.
        sampling_rate: Sampling rate of the waveform in Hz.

    Returns:
        The float32 mel spectrogram, transposed relative to librosa's output.
    """
    # Window and hop are configured in milliseconds; librosa wants samples.
    window_samples = int(sampling_rate * MEL_WNDW_LEN / 1000)
    hop_samples = int(sampling_rate * MEL_WNDW_STP / 1000)
    mel = librosa.feature.melspectrogram(
        y=preprocessed_wav,
        sr=sampling_rate,
        n_fft=window_samples,
        hop_length=hop_samples,
        n_mels=MEL_N_CHANNELS,
    )
    # Transposed so the first axis indexes frames (callers filter on shape[0]).
    return mel.astype(np.float32).T

In [12]:
def preprocess(dataset="LibriSpeech", split="dev-clean", limit_speakers=None, limit_files=None):
    """Turn a dataset split into (file id, speaker id, mel frames) records.

    Walks ``../data/{dataset}/{split}/<speaker>/*/*<ext>``, runs every audio
    file through load_audio -> preprocess_data -> generate_frames, and keeps
    the utterances that yield at least ``PAR_N_FRAMES`` frames. A progress bar
    is printed per speaker.

    Args:
        dataset: Key of ``extensions``; also the directory name under ``../data``.
        split: Sub-directory of the dataset to process.
        limit_speakers: Process at most this many speakers (``None`` = all).
        limit_files: Process at most this many files per speaker (``None`` = all).

    Returns:
        Object array built from the list of ``(fid, sid, frames)`` tuples.
    """
    data = []
    # glob returns paths in arbitrary order; sorting makes the limit_* slices
    # select the same speakers and files on every run.
    speaker_dirs = sorted(glob(f"../data/{dataset}/{split}/*"))[:limit_speakers]
    for spath in speaker_dirs:
        sid = spath.split("/")[-1]
        fpaths = sorted(glob(f"{spath}/*/*{extensions[dataset]}"))[:limit_files]

        # Count the files actually selected: with limit_files larger than the
        # number available the bar could otherwise never reach 100%.
        size = len(fpaths)
        if size == 0:
            # Nothing to process for this speaker (and a zero total would
            # break the percentage computation of the progress bar).
            continue

        def progress_prefix(done):
            # "Speaker 0123 - File [004/010]" style label for the progress bar.
            return (
                f"Speaker {pretty_print(int(sid), '0000')} - "
                f"File [{pretty_print(done, '000')}/{pretty_print(size, '000')}]"
            )

        loadbar(0, size, progress_prefix(0), length=50)
        for done, fpath in enumerate(fpaths, start=1):
            fid = fpath.split("/")[-1].split(".")[0]
            arr = generate_frames(preprocess_data(load_audio(fpath)))
            # Utterances shorter than PAR_N_FRAMES frames are dropped.
            if arr.shape[0] >= PAR_N_FRAMES:
                data.append((fid, sid, arr))
            loadbar(done, size, progress_prefix(done), length=50)

    return np.array(data, dtype=object)

# Smoke test on a small subset: at most 4 speakers and 10 files per speaker.
tmp = preprocess(limit_speakers=4, limit_files=10)
tmp.shape



(40, 3)

In [1]:
# # LibriSpeech: dev-clean
# data = preprocess()
# data.shape

# np.savez("data/encoder_librispeech_valid.npz", data=data)

In [1]:
# # LibriSpeech: train-clean-100
# data = preprocess(split="train-clean-100")
# data.shape