# Update: The inference will be done on .wav files

In [3]:
# Installs the package from the gsoc-wav2vec2 repository; presumably this is what provides
# the `wav2vec2` module (RobustWav2Vec2Config, Wav2Vec2) imported in the next cell.
# NOTE(review): the install is not pinned to a commit or tag, so a change upstream can
# change what this notebook runs — consider pinning.
!pip install -q git+https://github.com/mnansary/gsoc-wav2vec2.git

In [4]:
#-------------------------------
# imports
#-------------------------------
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf

import pandas as pd 
import warnings
import librosa
import numpy as np 

from tqdm.auto import tqdm
from pandarallel import pandarallel
from multiprocessing import Process
from wav2vec2 import RobustWav2Vec2Config,Wav2Vec2

# Initialise pandarallel with 8 workers and progress bars.
# NOTE(review): no pandarallel call is visible elsewhere in this notebook — confirm it is still needed.
pandarallel.initialize(progress_bar=True,nb_workers=8)
# Register tqdm's pandas integration.
tqdm.pandas()
# Suppress every Python warning for the rest of the session (this also hides pandas
# chained-assignment and deprecation warnings raised by later cells).
warnings.filterwarnings('ignore')

#--------------------------------------------
# vocab
#--------------------------------------------
# Output vocabulary: the position of an entry in this list is its token id. The list
# length sizes the model's output layer (cfg.vocab_len) and the inference loop maps
# argmax ids back to text by indexing into it, so neither order nor length may change.
# The empty string "" is the pad token (its index is printed below and stored in
# config.pad_id); it contributes nothing when decoded ids are joined into text.
# NOTE(review): '<empty>' entries look like unused placeholder slots kept so the ids
# line up with the pretrained head — confirm against the training vocabulary.
VOCAB   =[' ', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', 
          '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', 
          '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', 
          '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '।', 'ঁ', 'ং', 'ঃ', 'অ', 'আ', 'ই', 
          'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 
          'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল',
          'শ', 'ষ', 'স', 'হ', '<empty>', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ৎ', 
          '<empty>', 'ড়', 'ঢ়', 'য়', '০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯', '<empty>', '<empty>', 
          '\u200d', '<empty>', '<empty>', '', '<s>', '</s>']

# Sanity output: vocabulary size and the id of the pad token.
print("Vocab Len:",len(VOCAB))
print("Pad Id:",VOCAB.index(""))
#--------------------------------------------
# config
#--------------------------------------------
# Default RobustWav2Vec2Config with the pad id pointed at this vocabulary's "" entry.
config = RobustWav2Vec2Config()
config.pad_id=VOCAB.index("")

In [5]:
class cfg:
    """Static settings read by create_model and the inference loop."""
    # Model input shape without the batch axis: number of audio samples per clip.
    audio_shape      =  (246000,)                   # fixed by the pretrained weights we are using -- highest audio length = 15 secs (246000 samples / 16000 Hz is about 15.4 s)
    label_shape      =  (250,)                      # this is actually fixed for the pretrained weights we are using 
    # Sampling rate in Hz.
    sample_rate      =  16000
    shuffle_buffer   =  1024
    # Number of files grouped into one forward pass by the inference loop.
    batch_size       =  1
    # Number of output units of the model's lm_head layer (one per VOCAB entry).
    vocab_len        =  len(VOCAB)                
    embed_dim        =  1024
    
def create_model(cfg):
    """Assemble the Keras inference graph: audio input -> Wav2Vec2 -> ``lm_head`` logits.

    Args:
        cfg: settings object exposing ``audio_shape`` (input shape without the
            batch axis) and ``vocab_len`` (number of units of the output layer).

    Returns:
        A ``tf.keras.Model`` whose output is the ``lm_head`` Dense layer applied
        to the Wav2Vec2 output.

    Note:
        Sets ``apply_spec_augment = False`` on the module-level ``config``
        object before the Wav2Vec2 layer is constructed from it.
    """
    audio_in = tf.keras.Input(shape=cfg.audio_shape)
    # spec augmentation is switched off on the shared config
    config.apply_spec_augment = False
    hidden = Wav2Vec2(config)(audio_in)
    head = tf.keras.layers.Dense(cfg.vocab_len, name="lm_head")
    return tf.keras.Model(inputs=audio_in, outputs=head(hidden))

# Build the network, then restore the checkpoint weights into it.
model=create_model(cfg)
# layers[1] is expected to be the Wav2Vec2 layer (layers[0] being the Input created in create_model).
# NOTE(review): the freeze is done before load_weights; presumably this keeps the
# trainable/non-trainable weight layout identical to the one the .h5 file was saved
# with — confirm before reordering or removing this line.
model.layers[1].freeze_feature_extractor()
model.load_weights("../input/final-model-dlsprint/model_final_semi_50.h5")
model.summary()

In [None]:
import os
from glob import glob
# NOTE: this rebinds `tqdm` from the tqdm.auto variant imported in the imports cell to the plain one.
from tqdm import tqdm
import pandas as pd
tqdm.pandas()

# CHANGE ACCORDINGLY
# NOTE(review): the inference loop further down batches with cfg.batch_size and reads
# files from TEST_WAVS; BATCH_SIZE and TEST_DIRECTORY are not referenced by it —
# confirm which pair is meant to be authoritative.
BATCH_SIZE = 16
TEST_DIRECTORY = '../input/test-wav-files-dl-sprint/test_files_wav'

### Please refactor your inference code into the functions below. It doesn't matter which function you ultimately use to infer on the test set; you can use any one. But be sure to implement working versions of these function formats.

In [6]:
def load_data(path, sr=None):
    """Load an audio file as a mono waveform with zero samples stripped from both ends.

    Args:
        path: location of the audio file, passed to ``librosa.load``.
        sr: target sampling rate in Hz. When omitted, ``cfg.sample_rate`` is
            used, so the function is usable as soon as this cell has run
            instead of depending on a ``SAMPLE_RATE`` constant that is only
            assigned in a later cell.

    Returns:
        numpy array of samples resampled to ``sr``, with leading and trailing
        zero-valued samples removed.
    """
    if sr is None:
        sr = cfg.sample_rate
    wave, _ = librosa.load(path, sr=sr, mono=True)
    # only exact-zero samples at the two ends are dropped; interior zeros are kept
    return np.trim_zeros(wave)

def normalize(x):
    """Standardize ``x`` to zero mean and unit variance along its last axis.

    Mean and variance are reduced over the last axis (kept as a size-1 axis so
    they broadcast), 1e-5 is added to the variance before the square root, and
    all size-1 dimensions are squeezed out of the result.
    """
    mu = tf.reduce_mean(x, axis=-1, keepdims=True)
    sigma_sq = tf.math.reduce_variance(x, axis=-1, keepdims=True)
    centered = x - mu
    scale = tf.sqrt(sigma_sq + 1e-5)
    return tf.squeeze(centered / scale)

# Infer on a directory

In [None]:
# Sample submission file; the inference loop reads its first column as the test file names.
sub=pd.read_csv("../input/dlsprint/sample_submission.csv")
sub

In [None]:
# Directory that each file name from `sub` is joined with to locate its .wav file.
TEST_WAVS="../input/test-wav-files-dl-sprint/test_files_wav"
# Accumulators filled by the loop below: decoded text and the corresponding file names
# as they appear in `sub`, appended in the same order. Re-running this cell resets both.
PREDS=[]
PATHS = []
# Sampling rate (Hz) that load_data passes to librosa.load.
SAMPLE_RATE=16000

def load_data(path):
    """Read ``path`` with librosa as mono audio at ``SAMPLE_RATE`` and strip zero samples from both ends."""
    samples = librosa.load(path, sr=SAMPLE_RATE, mono=True)[0]
    return np.trim_zeros(samples)

def normalize(x):
    """Scale ``x`` to zero mean / unit variance over its last axis and squeeze size-1 dims.

    A 1e-5 term is added to the variance before taking the square root.
    """
    reduce_args = dict(axis=-1, keepdims=True)
    centered = x - tf.reduce_mean(x, **reduce_args)
    spread = tf.sqrt(tf.math.reduce_variance(x, **reduce_args) + 1e-5)
    return tf.squeeze(centered / spread)

def ctc_greedy_decode(token_ids, vocab=VOCAB):
    """Convert one utterance's per-frame token ids into text (greedy CTC decoding).

    Consecutive repeats of the same id are merged into a single emission, then
    the remaining ids are looked up in ``vocab`` and joined. The pad id maps to
    the empty string in ``VOCAB``, so it disappears in the join while still
    separating genuinely doubled characters.

    NOTE(review): merging repeats assumes the checkpoint was trained with a CTC
    loss that uses ``config.pad_id`` as the blank label — confirm against the
    training code.

    Args:
        token_ids: iterable of integer ids, one per output frame.
        vocab: id -> string table; defaults to the notebook's ``VOCAB``.

    Returns:
        The decoded string.
    """
    pieces = []
    previous = None
    for token in token_ids:
        token = int(token)
        if token != previous:
            pieces.append(vocab[token])
        previous = token
    return "".join(pieces)


# Fixed number of samples the model input expects.
MAX_SAMPLES = cfg.audio_shape[0]

for start in tqdm(range(0, len(sub), cfg.batch_size)):
    # Positional slicing clamps at the end of the frame, so a final batch that is
    # smaller than cfg.batch_size no longer raises IndexError.
    names = sub.iloc[start:start + cfg.batch_size, 0]
    batch = []
    for name in names:
        PATHS.append(name)
        wav_path = os.path.join(TEST_WAVS, name).replace(".mp3", ".wav")
        signal = normalize(load_data(wav_path))
        # Flatten (normalize squeezes, so a 1-sample clip would come back as a scalar)
        # and cut clips longer than the model input; anything past MAX_SAMPLES is
        # not transcribed.
        signal = tf.reshape(signal, [-1])[:MAX_SAMPLES]
        # Right-pad with zeros up to the fixed input length.
        signal = tf.pad(signal, [[0, MAX_SAMPLES - int(signal.shape[0])]])
        batch.append(signal)
    logits = model(tf.stack(batch, axis=0), training=False)
    # Most likely token per frame for every utterance in the batch.
    token_ids = np.argmax(np.asarray(logits), axis=-1)
    PREDS.extend(ctc_greedy_decode(row) for row in token_ids)

In [None]:
df = pd.read_csv('../input/dlsprint/sample_submission.csv')
# Assign the whole column at once. The previous per-row `df.sentence[i] = ...` is a
# chained assignment: it may write to a temporary copy and leaves `df` unchanged when
# pandas Copy-on-Write is active. PREDS is in the same row order as the sample
# submission; a shorter PREDS raises a length-mismatch error instead of being
# silently accepted.
df["sentence"] = PREDS[:len(df)]

In [None]:
# Install and instantiate the Bengali unicode normalizer used to clean the decoded text.
# NOTE(review): the package version is not pinned — consider pinning for reproducible output.
!pip install bnunicodenormalizer
from bnunicodenormalizer import Normalizer 
# Normalizer instance called once per word by normalize() below.
bnorm = Normalizer()
def normalize(sen):
    """Normalize a sentence word by word with ``bnorm``.

    The sentence is split on whitespace, every word is passed through
    ``bnorm`` and its ``'normalized'`` entry is taken; words for which that
    entry is None are dropped and the rest are joined with single spaces.

    Note: this definition replaces the audio ``normalize`` helper defined in
    earlier cells under the same name.
    """
    kept = []
    for token in sen.split():
        cleaned = bnorm(token)['normalized']
        if cleaned is not None:
            kept.append(cleaned)
    return " ".join(kept)

In [None]:
# Apply the text normalizer to every prediction and assign the column in one step.
# The previous `df["sentence"][i] = ...` is a chained assignment, which may write to a
# temporary copy and leaves `df` unchanged when pandas Copy-on-Write is active.
df["sentence"] = df["sentence"].map(normalize)

In [None]:
# Write the final predictions to submission.csv in the working directory, without the index column.
df.to_csv("submission.csv", index=False)

### Your code must output a submission.csv file in the end with predictions on the test_files