## Preprocess LibriSpeech

In [None]:
import glob
import numpy as np
import pickle as pkl
import torchaudio
import json


# Every .wav file directly under ./wav is a clip to preprocess.
wav_files = glob.glob('./wav/*.wav')

# labels.json is loaded as a sequence of symbols; invert it into a
# symbol -> position lookup used to encode transcripts.
with open('../labels.json') as f:
    labels = json.load(f)
labels_map = {symbol: index for index, symbol in enumerate(labels)}

def load_audio(path):
    """Load an audio file with torchaudio and return it as a NumPy array.

    When the first dimension of the loaded tensor has size 1 the tensor is
    squeezed; otherwise the tensor is averaged over its first dimension so
    that multiple channels collapse into one.
    """
    waveform, _sample_rate = torchaudio.load(path)
    single_channel = waveform.shape[0] == 1
    collapsed = waveform.squeeze() if single_channel else waveform.mean(axis=0)
    return collapsed.numpy()

def parse_transcript(transcript_path):
    """Read a transcript file and encode it as a list of label indices.

    Newlines are removed, then each character is looked up in ``labels_map``.
    Falsy lookups are discarded: this drops characters missing from the map
    (``None``) and also any character whose index is 0.
    """
    with open(transcript_path, 'r', encoding='utf8') as transcript_file:
        text = transcript_file.read()
    text = text.replace('\n', '')
    looked_up = (labels_map.get(char) for char in text)
    return [index for index in looked_up if index]

# Combine each waveform with its encoded transcript into one .npy file.
for wav_file in wav_files:
    # str.replace swaps every 'wav' in the path, so './wav/x.wav' becomes
    # './txt/x.txt' (both the directory and the extension change).
    txt_file = wav_file.replace('wav', 'txt')
    tag = txt_file.split('/')[-1].split('.')[0]
    wav = load_audio(wav_file)
    txt = parse_transcript(txt_file)
    # The waveform and the label list normally differ in length, so they are
    # stored in an explicit 2-element object array. np.array([wav, txt])
    # raises ValueError for ragged input on NumPy >= 1.24, and would build a
    # 2-D numeric array instead if the two lengths happened to match.
    pair = np.empty(2, dtype=object)
    pair[0] = wav
    pair[1] = txt
    # Object arrays are pickled by np.save; read back with allow_pickle=True.
    np.save(f"./wavtxt/{tag}.npy", pair)

## Compress Data

In [None]:
from collections import defaultdict
import os
import tarfile
import glob
import concurrent
import multiprocessing
import pandas as pd
from multiprocessing import Pool, cpu_count

# Size threshold in bytes, compared against the running total of input file
# sizes while archives are being filled.
CHUNK_SIZE = 128 * 10**6
root = '.'        # directory that contains the per-split .npy folders
croot = './tar'   # directory that receives the compressed archives
ds = 'train'      # name of the split being compressed

# Sorted list of every .npy file of the selected split.
files = sorted(glob.glob('%s/%s/*.npy' % (root, ds)))

def compressor(chunk, files_chunk, *, dst=None, chunk_size=None):
    """Pack ``files_chunk`` into one or more gzip-compressed tar archives.

    Files are added in order under their base names. The first archive is
    ``<dst>/<chunk>.tar.gz``. Whenever adding the next file would bring the
    running total of input sizes to ``chunk_size`` or more, the current
    archive is closed and a new one named ``<dst>/<chunk>_<part>.tar.gz`` is
    started. Every file ends up in exactly one archive; a single file larger
    than ``chunk_size`` gets an archive of its own.

    Args:
        chunk: Identifier used as the archive base name.
        files_chunk: Iterable of paths of the files to archive.
        dst: Output directory. Defaults to ``"<croot>/<ds>"``.
        chunk_size: Size threshold in bytes. Defaults to ``CHUNK_SIZE``.
    """
    if dst is None:
        dst = "{}/{}".format(croot, ds)
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    os.makedirs(dst, exist_ok=True)

    def archive_path(part):
        # Part 0 keeps the plain "<chunk>.tar.gz" name; later parts get a
        # numeric suffix so they never overwrite an earlier archive.
        suffix = "" if part == 0 else "_{}".format(part)
        return "{}/{}{}.tar.gz".format(dst, chunk, suffix)

    part = 0
    size = 0
    mytar = tarfile.open(archive_path(part), "w:gz")
    try:
        for f in files_chunk:
            file_size = os.path.getsize(f)
            # Roll over only when the current archive already holds data, so
            # an oversized file never produces an empty archive.
            if size and size + file_size >= chunk_size:
                mytar.close()
                part += 1
                size = 0
                mytar = tarfile.open(archive_path(part), "w:gz")
            mytar.add(f, arcname=os.path.basename(f))
            size += file_size
    finally:
        # Always flush and close the archive, even if adding a file failed.
        mytar.close()

def chunks(lst, n):
    """Generate consecutive slices of ``lst``, each holding at most ``n`` items."""
    starts = range(0, len(lst), n)
    yield from (lst[begin:begin + n] for begin in starts)

def compress_data():
    """Compress ``files`` in parallel, one ``compressor`` call per chunk.

    The file list is cut into consecutive chunks of
    ``len(files) // cpu_count()`` files (at least one file per chunk). Each
    chunk is handed to ``compressor`` together with its index in a process
    pool of ``cpu_count()`` workers. Does nothing when ``files`` is empty.
    """
    if not files:
        # Nothing matched the glob; there is no work to distribute.
        return
    n_procs = cpu_count()
    # Floor division yields 0 when there are fewer files than CPUs, which
    # would make chunks() fail with "range() arg 3 must not be zero".
    num_files_per_process = max(1, len(files) // n_procs)
    chunked_files = list(chunks(files, num_files_per_process))
    with Pool(n_procs) as p:
        p.starmap(compressor, enumerate(chunked_files))

compress_data()

## Upload Data to S3

In [None]:
import boto3
import glob
import pickle
import multiprocessing
import concurrent.futures
import os


# Name of the bucket that receives the uploads.
bucket = 'vuzhuangwei'

# A single S3 client, built from a boto3 session created with no arguments,
# is reused for every upload.
session = boto3.Session()
s3 = session.client("s3")

def preprocess(path):
    """Upload one local file to the bucket and print the object key used.

    The key is the local path prefixed with ``LibriSpeech-Mini/``.
    """
    object_key = 'LibriSpeech-Mini/{}'.format(path)
    s3.upload_file(Filename=path, Bucket=bucket, Key=object_key)
    print(object_key)
 
def upload_objects(folder):
    """Upload a file, or every regular file directly inside a folder.

    Uploads run concurrently on a thread pool with ``cpu_count()`` workers,
    each going through ``preprocess``. All uploads are attempted; any that
    fail are printed, then a ``RuntimeError`` is raised.

    Args:
        folder: Path of a directory whose files are uploaded, or of a single
            file to upload.

    Raises:
        RuntimeError: If at least one upload failed. The first failure is
            attached as the cause.
    """
    if os.path.isdir(folder):
        # os.path.join copes with a trailing slash ("train/"), whereas
        # '{}/*'.format(folder) produced "train//*" and therefore S3 keys
        # containing an empty path segment.
        candidates = glob.glob(os.path.join(folder, '*'))
        # Sub-directories cannot be uploaded as objects; skip them.
        paths = [path for path in candidates if os.path.isfile(path)]
    else:
        paths = [folder]

    failures = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        futures = {executor.submit(preprocess, path): path for path in paths}
        for future in concurrent.futures.as_completed(futures):
            # Inspect each future so a failed upload is not silently lost.
            error = future.exception()
            if error is not None:
                failures.append((futures[future], error))

    for path, error in failures:
        print('FAILED {}: {!r}'.format(path, error))
    if failures:
        raise RuntimeError(
            '{} of {} uploads failed'.format(len(failures), len(paths))
        ) from failures[0][1]


upload_objects("train/")