### Data Processing

In [None]:
import os, librosa, time, pickle, random, warnings
from glob import glob
import numpy as np
 
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import IPython.display as ipd
from IPython.core.display import display, clear_output
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
#%load_ext tensorboard
 
# Target sample rate in Hz; process_data passes it to librosa.load and
# segment_length/CDF divide by it to convert sample counts to seconds.
SR = 16_000
# Length of one extracted frame in seconds (process_data uses int(FRAME * SR) samples).
FRAME = 0.2
# Per-speaker audio budget in seconds (process_data stops at SR * SECONDS samples).
SECONDS = 10 #5
# Directory the pickled outputs are written to.
BASE_DIR = './preprocessed_dataset'

#### Dataset extraction

In [None]:
# Explaining process of extracting librispeech dataset
# Link: https://www.openslr.org/12

# Extracting tar files into a new folder

# The './LibriSpeech' directory doubles as the "already extracted" marker, so
# extraction only runs when it is missing.
# NOTE(review): presumably the archive unpacks into a top-level 'LibriSpeech/'
# folder in the current working directory -- confirm against the tarball.
if not os.path.exists('./LibriSpeech'):
    os.makedirs('./LibriSpeech')
    # os.system returns the command's exit status; a non-zero value means tar failed.
    status = os.system("tar -zxf './datasets/LibriSpeech/dev-clean.tar.gz'")
    if status != 0:
        # Drop the (still empty) marker directory so that a re-run retries the
        # extraction instead of silently skipping it.
        if not os.listdir('./LibriSpeech'):
            os.rmdir('./LibriSpeech')
        raise RuntimeError(
            "Extracting './datasets/LibriSpeech/dev-clean.tar.gz' failed "
            f"(exit status {status}); remove './LibriSpeech' before retrying."
        )

    # If needed you can use these also (used in actual benchmark of the paper)
    #os.system("tar -zxf './datasets/LibriSpeech/dev-other.tar.gz'")
    #os.system("tar -zxf './datasets/LibriSpeech/test-clean.tar.gz'")
    #os.system("tar -zxf './datasets/LibriSpeech/test-other.tar.gz'")

#### Data processing

In [None]:
def process_data(wav_dirs, limit=None, sr=SR, fsize=FRAME, ds="LibriSpeech",
                 seconds=SECONDS):
    """Cut the non-silent parts of each recording into fixed-length frames.

    Every file is loaded at ``sr`` Hz, split into non-silent intervals with
    ``librosa.effects.split`` and each interval is chopped into consecutive,
    non-overlapping frames of ``int(fsize * sr)`` samples. Extraction for a
    speaker stops once ``sr * seconds`` samples have been collected for them.

    Args:
        wav_dirs: Iterable of audio file paths using '/' as separator. The
            speaker id is read from a path component (position depends on
            ``ds``).
        limit: Maximum number of distinct speakers to keep; ``None`` keeps all.
        sr: Sample rate passed to ``librosa.load``.
        fsize: Frame length in seconds.
        ds: Dataset token: "LibriSpeech", "TIMIT" or "ASR".
        seconds: Per-speaker budget of extracted audio, in seconds.

    Returns:
        ``(X, person, filetoken, flens)`` where ``X`` is the list of frames,
        ``person`` the speaker id of each frame, ``filetoken`` the running
        index of the file each frame came from, and ``flens`` every
        ``[start, end]`` interval returned by ``librosa.effects.split``.
        Returns ``None`` (after printing a message) for an unknown ``ds``.
    """
    frame = int(fsize * sr)
    frame_len = sr * seconds

    print("Sample rate", sr)
    print("Frame size", frame)
    print("Frame_len", frame_len)

    # Our code supports TIMIT and Bengali ASR also
    if ds == "TIMIT":
        spkridf = -2
    elif ds == "LibriSpeech":
        spkridf = -3
    elif ds != "ASR":
        print("Invalid dataset token")
        return

    # Only "ASR" can start with "A" once the token has been validated above.
    is_asr = ds[0] == "A"

    # Length of segments
    flens = []
    speaker_id = set()
    X, person, filetoken, tlen = [], [], [], dict()

    fileId = 0
    for wdir in tqdm(wav_dirs):
        if is_asr:
            # Speaker key is the file name without its extension, translated
            # through `asr_maps`.
            # NOTE(review): `asr_maps` is not defined in this part of the
            # notebook -- it must exist as a global before using ds="ASR".
            spkr = wdir.split('/')[-1].split('.')[0]
            if spkr not in asr_maps:
                continue
            spkr = asr_maps[spkr]
        else:
            spkr = wdir.split('/')[spkridf]

        # Once `limit` speakers have been seen, ignore files of new speakers.
        if (limit is not None) and (spkr not in speaker_id):
            if len(speaker_id) == limit:
                continue
            speaker_id.add(spkr)

        if spkr not in tlen:
            tlen[spkr] = 0
        elif tlen[spkr] >= frame_len:
            # Budget for this speaker is already used up; skip loading.
            continue

        wav, _ = librosa.load(wdir, sr=sr)
        sframes = librosa.effects.split(y=wav, frame_length=frame,
                                        hop_length=frame//2, top_db=16)

        flens.extend(sframes)

        for st, ed in sframes:
            # `ed` is exclusive (the interval is wav[st:ed]), so a frame fits
            # as long as its end does not go past `ed`. Using `<=` keeps the
            # last frame of segments whose length is an exact multiple of
            # `frame`; segments shorter than one frame yield nothing.
            idx = 0
            while st + (idx + 1) * frame <= ed:
                X.append(wav[st + idx * frame:st + (idx + 1) * frame])
                person.append(spkr)
                filetoken.append(fileId)
                tlen[spkr] += frame
                idx += 1

                if tlen[spkr] >= frame_len:
                    break

            if tlen[spkr] >= frame_len:
                break

        # Only files that were actually loaded consume a file index.
        fileId += 1

    return X, person, filetoken, flens


def segment_length(segment_data):
    """Return (mean, std, median) of the segment lengths, in seconds.

    ``segment_data`` is a sequence of ``(start, end)`` sample indices; each
    length is ``end - start`` divided by the module-level sample rate ``SR``.
    """
    spans = np.array(segment_data)
    lengths = np.array([stop - start for start, stop in spans])
    return np.mean(lengths/SR), np.std(lengths/SR), np.median(lengths/SR)

In [None]:
# Cumulative distribution function
import scipy
import seaborn as sns
def CDF(segment_data):
    """Plot the empirical cumulative distribution of segment durations.

    Args:
        segment_data: Sequence of ``(start, end)`` sample indices (end
            exclusive, so the length is ``end - start``, the same convention
            as ``segment_length``).

    Segments shorter than one frame (``FRAME * SR`` samples) are left out.
    The ten shortest remaining durations (in seconds) are printed, then the
    curve is drawn on the current matplotlib figure. Returns ``None``.
    """
    # FRAME is in seconds while segment lengths are in samples, so convert
    # before comparing.
    min_samples = FRAME * SR
    durations = sorted(
        (end - start) / SR
        for start, end in np.array(segment_data)
        if (end - start) >= min_samples
    )
    print(durations[:10])

    # Empirical CDF: the i-th shortest of n durations sits at probability i/n.
    count = len(durations)
    ecdf = np.arange(1, count + 1) / count

    # plot the cdf
    plt.plot(durations, ecdf)
    plt.xlim([0, 2])
    plt.ylim([0, 1])
    plt.grid()
    plt.minorticks_on()
    # The visibility flag is passed positionally: its keyword was renamed from
    # `b` to `visible` in newer Matplotlib releases.
    plt.grid(True, which='minor', linestyle='-', alpha=0.5)

In [None]:
# Frame the recordings, then print the mean, std and median of the detected
# segment lengths in seconds.
# NOTE(review): `asr_wav_dirs` is not defined in this part of the notebook and
# is passed with ds="LibriSpeech" -- confirm the path list and token match.
X, py, fy, segs = process_data(asr_wav_dirs, fsize=FRAME, ds="LibriSpeech", seconds=SECONDS)
print(segment_length(segs))

In [None]:
# Plot the curve produced by CDF for the segments in `segs`.
CDF(segs)

#### Saving data into pickle file

In [None]:
# process_data returns four values (frames, speaker ids, file ids, segment
# intervals); the intervals are not saved, so they are discarded here.
# NOTE(review): `libri_wav_dirs` is not defined in this part of the notebook.
X, py, fy, _ = process_data(libri_wav_dirs, fsize=FRAME, ds="LibriSpeech", seconds=SECONDS)

# Make sure the output directory exists before writing into it.
os.makedirs(BASE_DIR, exist_ok=True)

# One pickle per output: frames (X), speaker labels (y) and file ids (fy).
for tag, payload in (('X', X), ('y', py), ('fy', fy)):
    with open(os.path.join(BASE_DIR, f'LIBRI_{tag}_16000_{SECONDS}_{FRAME}.pkl'), 'wb') as f:
        pickle.dump(payload, f)