# PART 1 FROM 0 TO 50k SAMPLES

In [None]:
!pip install -U bnunicodenormalizer

In [None]:
#-------------------------------
# imports
#-------------------------------
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf

import pandas as pd 
import warnings
import librosa
import numpy as np 

from tqdm.auto import tqdm
from pandarallel import pandarallel
from bnunicodenormalizer import Normalizer 
from multiprocessing import Process

# Configure pandarallel for 28 workers with progress bars switched on.
pandarallel.initialize(progress_bar=True, nb_workers=28)
# Register tqdm's pandas integration (used through `progress_apply`).
tqdm.pandas()
# Suppress every warning raised from this point on.
warnings.filterwarnings("ignore")
# Shared normalizer instance; `normalize` calls it once per word.
bnorm = Normalizer()

# CSV Data Loading

In [None]:
# Clip ids treated as error data: `filter_votes` drops every row whose path
# contains one of these ids.
errors = [
    "common_voice_bn_31727562",
    "common_voice_bn_30998934",
    "common_voice_bn_31595526",
    "common_voice_bn_31534853",
    "common_voice_bn_31518061",
    "common_voice_bn_31518373",
    "common_voice_bn_31613621",
    "common_voice_bn_31555333",
    "common_voice_bn_31772113",
    "common_voice_bn_31605391",
    "common_voice_bn_31631175",
    "common_voice_bn_31563901",
    "common_voice_bn_31691690",
    "common_voice_bn_31692010",
    "common_voice_bn_31683653",
    "common_voice_bn_31692182",
    "common_voice_bn_31519976",
    "common_voice_bn_31675793",
    "common_voice_bn_31019914",
    "common_voice_bn_31660287",
    "common_voice_bn_31660384",
    "common_voice_bn_31557261",
    "common_voice_bn_31633101",
    "common_voice_bn_31599243",
    "common_voice_bn_31521515",
    "common_voice_bn_31777802",
    "common_voice_bn_31777848",
    "common_voice_bn_31669646",
    "common_voice_bn_31566083",
    "common_voice_bn_31530331",
    "common_voice_bn_31727697",
    "common_voice_bn_31513270",
    "common_voice_bn_31686295",
    "common_voice_bn_31753693",
    "common_voice_bn_31686334",
    "common_voice_bn_31765546",
    "common_voice_bn_31765548",
    "common_voice_bn_31662742",
    "common_voice_bn_31704856",
    "common_voice_bn_31635344",
    "common_voice_bn_31618327",
    "common_voice_bn_31743074",
    "common_voice_bn_31678862",
    "common_voice_bn_31626674",
    "common_voice_bn_31626677",
    "common_voice_bn_31523889",
    "common_voice_bn_31610804",
    "common_voice_bn_31769538",
    "common_voice_bn_31533273",
    "common_voice_bn_31445621",
    "common_voice_bn_31620650",
]
#---------------
# data filtering
#---------------
def filter_votes(x):
    '''
        decides whether a metadata row is kept, based on its path and votes
        args:
            x   =   row-like mapping with "path", "up_votes" and "down_votes"
        returns:
            None    when the path contains any id listed in errors
            "unv"   when the row has at least one down vote
            otherwise the row's up_votes value, unchanged
    '''
    p=x["path"]
    # avoid error data
    if any(pe in p for pe in errors):
        return None
    # now process votes
    up=x["up_votes"]
    down=x["down_votes"]
    # a single down vote is enough to mark the clip as unverified
    if down > 0:
        return "unv"
    return up
# ------------------------- train data----------------------------------------
train_path = "../input/train-wavs-all-dl-sprint/train_wavs"
df = pd.read_csv("../input/dlsprint/train.csv")
# Prefix each file name with `train_path` and swap ".mp3" for ".wav".
df["path"] = df["path"].progress_apply(
    lambda name: os.path.join(train_path, name).replace(".mp3", ".wav")
)
print("Total Data before filtering:", len(df))

# filter_votes gives None for error clips (dropped by dropna below), "unv" for
# down-voted clips (dropped by the != "unv" mask) and the up-vote value otherwise.
df["up_votes"] = df.progress_apply(filter_votes, axis=1)
df.dropna(subset=["up_votes"], inplace=True)
train_df = df.loc[df["up_votes"] != "unv"].reset_index(drop=True)
print("Total Data after filtering:", len(train_df))
train_df = train_df[["path", "sentence"]]
# ------------------------- eval data----------------------------------------
'''val_path="../input/validation-fileswav-format/validation_files_wav"
val_df=pd.read_csv("../input/dlsprint/validation.csv")
val_df=val_df[["path","sentence"]]
val_df["path"]=val_df["path"].progress_apply(lambda x:os.path.join(val_path,x).replace(".mp3",".wav"))
print("Total validation Data :",len(val_df))'''

# Text processing
* Normalize
* create vocab
* fix-missing vocab
    * numbers
    * no space character
    * sep: special token to indicate both start and end
    * pad: pad token to make all labels the same length
* find max_label_len


In [None]:
# Part 1 of the dataset: keep only the first 50,000 rows (positions 0-49,999).
train_df = train_df.iloc[:50000]

In [None]:
# Number of rows in train_df at this point (displayed as the cell's output).
len(train_df)

In [None]:
def normalize(sen):
    """Normalize a sentence word by word with `bnorm`.

    The sentence is split on whitespace, each word is passed through `bnorm`,
    and words whose 'normalized' entry is None are dropped. The surviving
    words are joined back together with single spaces.
    """
    kept = []
    for token in sen.split():
        fixed = bnorm(token)['normalized']
        if fixed is not None:
            kept.append(fixed)
    return " ".join(kept)

#val_df["sentence"]=val_df["sentence"].parallel_apply(lambda x:normalize(x))
# Normalize every training sentence in parallel through pandarallel.
train_df["sentence"] = train_df["sentence"].parallel_apply(normalize)
#unv_df["sentence"]=unv_df["sentence"].parallel_apply(lambda x:normalize(x))


# Mask non norm vocabs while encoding

In [None]:
# Characters allowed in a label: zero-width joiner, space, ASCII punctuation,
# the danda, and Bengali signs, letters, vowel marks and digits.
vocab_norm=['\u200d',' ','!',"'",',','-','.',':',';','=','?','।',
            'ঁ','ং','ঃ',
            'অ','আ','ই','ঈ','উ','ঊ','ঋ','এ','ঐ','ও','ঔ',
            'ক','খ','গ','ঘ','ঙ',
            'চ','ছ','জ','ঝ','ঞ',
            'ট','ঠ','ড','ঢ','ণ',
            'ত','থ','দ','ধ','ন',
            'প','ফ','ব','ভ','ম',
            'য','র','ল',
            'শ','ষ','স','হ',
            'া','ি','ী','ু','ূ','ৃ','ে','ৈ','ো','ৌ','্',
            'ৎ','ড়','ঢ়','য়',
            '০','১','২','৩','৪','৫','৬','৭','৮','৯']
# NOTE(review): looks like the token list of a pretrained model's tokenizer,
# where list position = token id — confirm against that tokenizer.
# The position of each entry is what encode_label writes out, so the order
# of this list must not change.
vocab_transfer=[" ", "_", "a", "b", "c", "d", "e", "f", "g", "h", 
                "i", "j", "k", "l", "m", "n", "o", "p", "r", "s", 
                "t", "u", "v", "w", "x", "y", "z", "", "", "œ", 
                "।", "ঁ", "ং", "ঃ", "অ", "আ", "ই", "ঈ", "উ", "ঊ", 
                "ঋ", "এ", "ঐ", "ও", "ঔ", "ক", "খ", "গ", "ঘ", "ঙ", 
                "চ", "ছ", "জ", "ঝ", "ঞ", "ট", "ঠ", "ড", "ঢ", "ণ", 
                "ত", "থ", "দ", "ধ", "ন", "প", "ফ", "ব", "ভ", "ম", 
                "য", "র", "ল", "শ", "ষ", "স", "হ", "়", "া", "ি",
                "ী", "ু", "ূ", "ৃ", "ে", "ৈ", "ো", "ৌ", "্", "ৎ",
                "ৗ", "ড়", "ঢ়", "য়", "০", "১", "২", "৩", "৪", "৫", 
                "৬", "৭", "৮", "৯", "ৰ", "‌", "‍", "‎", "⁇"]  #["", "<s>", "</s>"]
# Mask every transfer-vocab entry that is not in vocab_norm with "<empty>".
# The list keeps its length and ordering, so the indices of the surviving
# characters do not move. Built in one pass instead of mutating the list while
# iterating over it; the slice assignment keeps the update in place.
vocab_transfer[:] = [c if c in vocab_norm else "<empty>" for c in vocab_transfer]

# Three extra entries are appended after the masked transfer vocabulary.
vocab = vocab_transfer + ["", "<s>", "</s>"]
print("New Vocab:")
print(vocab)

## Label Encoding

In [None]:
def encode_label(sen):
    """Encode a sentence as UTF-8 bytes of space-separated vocab indices.

    Characters that are not entries of `vocab` are skipped; every other
    character is replaced by the position of its first occurrence in `vocab`.
    """
    indices = []
    for ch in sen:
        if ch in vocab:
            indices.append(str(vocab.index(ch)))
    return " ".join(indices).encode("utf-8")
# Quick check: encode the first row's second column (the sentence) and show it.
sen = train_df.iloc[0, 1]
print("sentence:", sen)
print("encoded label:", encode_label(sen))

# TFRecord creation

In [None]:
SAMPLE_RATE  = 16000   # sample rate the wavs were created at; load_data passes it to both librosa.load and tf.audio.encode_wav
#---------------------------------------------------------------
# helper functions
#---------------------------------------------------------------
def create_dir(base,ext):
    '''
        creates a directory extending base (does nothing if it already exists)
        args:
            base    =   base path 
            ext     =   the folder to create
        returns:
            the joined path base/ext
    '''
    _path=os.path.join(base,ext)
    # exist_ok=True replaces the exists()-then-mkdir() check, which could fail
    # if another process created the folder in between; makedirs also creates
    # any missing parent folders.
    os.makedirs(_path,exist_ok=True)
    return _path

def load_data(path):
    """Load an audio file and return it re-encoded as WAV bytes.

    The file is read through librosa as mono at SAMPLE_RATE, leading and
    trailing zero samples are trimmed, and the result is encoded with
    tf.audio.encode_wav at the same SAMPLE_RATE.
    """
    samples, _ = librosa.load(path, sr=SAMPLE_RATE, mono=True)
    trimmed = np.trim_zeros(samples)
    # add a trailing axis of size 1 before encoding
    with_channel = tf.expand_dims(trimmed, axis=-1)
    encoded = tf.audio.encode_wav(with_channel, sample_rate=SAMPLE_RATE)
    return encoded.numpy()

#---------------------------------------------------------------
# data functions
#---------------------------------------------------------------
# feature functions
def _bytes_feature(list_of_bytestrings):
    """Wrap a list of byte strings in a tf.train.Feature backed by a BytesList."""
    payload = tf.train.BytesList(value=list_of_bytestrings)
    return tf.train.Feature(bytes_list=payload)

def toTfrecord(df,rnum,rec_path,sidx):
    '''
        writes every row of df into one tfrecord file named {sidx}_{rnum}.tfrecord
        args:
            df      :   the dataframe that contains the information to store
                        (column 0 is read as the audio path, column 1 as the sentence)
            rnum    :   record number
            rec_path:   save_path
            sidx    :   index used as the prefix of the tfrecord file name
        a row that fails to load or encode is printed (path, sentence, error)
        and skipped; the remaining rows are still written
    '''
    tfrecord_name=f'{sidx}_{rnum}.tfrecord'
    tfrecord_path=os.path.join(rec_path,tfrecord_name) 
    with tf.io.TFRecordWriter(tfrecord_path) as writer:    
        
        for idx in tqdm(range(len(df))):
            # reset per row so the error report below never hits an unbound
            # name and never shows values left over from the previous row
            _path=None
            sen=None
            try:
                _path=df.iloc[idx,0]
                sen=df.iloc[idx,1]

                audio=load_data(_path)
                label=encode_label(sen)
                # feature desc
                data ={ 'audio':_bytes_feature([audio]),
                        'label':_bytes_feature([label])}

                features=tf.train.Features(feature=data)
                example= tf.train.Example(features=features)
                serialized=example.SerializeToString()
                writer.write(serialized)  
            except Exception as e:
                # best effort: report the failing row and keep going
                print(_path)
                print(sen)
                print(e)


def createRecords(data,save_path,sidx,tf_size=1024):
    """Write `data` into `save_path` as consecutive tfrecord files.

    The frame is cut into consecutive pieces of at most `tf_size` rows; each
    piece gets a fresh 0-based index and is handed to `toTfrecord` together
    with its running record number and `sidx`.
    """
    print(f"Creating TFRECORDS:{save_path}")
    chunk_starts = range(0, len(data), tf_size)
    for rnum, start in enumerate(tqdm(chunk_starts)):
        chunk = data.iloc[start:start + tf_size].reset_index(drop=True)
        toTfrecord(chunk, rnum, save_path, sidx)

#---------------------------------------------------------------
# parallel processing functions
#---------------------------------------------------------------
    
def process_df(df,save_path,split=10240,num_proc=16,tf_size=1024):
    '''
        cuts df into chunks of `split` rows and writes each chunk's tfrecords
        into its own sub-folder of save_path (folder name = chunk index)
        args:
            df       :   the dataframe to store
            save_path:   directory under which one folder per chunk is created
            split    :   number of rows per chunk; each chunk is handled by one process
            num_proc :   maximum number of processes started per batch
            tf_size  :   number of rows per tfrecord file
        a single chunk is processed in the calling process; several chunks are
        processed in batches of up to num_proc processes, and each batch is
        joined before the next one starts
    '''
    dfs=[df[idx:idx+split] for idx in range(0,len(df),split)]
    max_end=len(dfs)
    if max_end==0:
        # empty frame: nothing to write (and range() below rejects a step of 0)
        return


    def run(idx):
        tf_path=create_dir(save_path,str(idx))
        createRecords(dfs[idx],tf_path,idx,tf_size)


    def execute(start,end):
        # start one process per chunk index in [start,end) and wait for all of them
        process_list=[]
        for idx in range(start,end):
            p =  Process(target= run, args = [idx])
            p.start()
            process_list.append(p)
        for process in process_list:
            process.join()


    if max_end==1:
        run(0)
    else:
        num_proc=max(1,min(max_end,num_proc))
        for start in range(0,max_end,num_proc):
            # range(start,end) already excludes `end`, so the last batch must be
            # clamped to max_end itself; max_end-1 would skip the final chunk
            end=min(start+num_proc,max_end)
            execute(start,end) 
    

In [None]:
#eval_save=create_dir(os.getcwd(),"eval")
# Output folder for the training tfrecords: "voted" under the current working directory.
train_save = create_dir(os.getcwd(), "voted")


In [None]:
#process_df(val_df,eval_save,split=1024,tf_size=256,num_proc=32)

In [None]:
# Write the training set: 1024 rows per chunk, 256 rows per tfrecord file,
# and at most 32 processes per batch.
process_df(train_df, train_save, split=1024, tf_size=256, num_proc=32)