In [None]:
!pip install -q git+https://github.com/mnansary/gsoc-wav2vec2.git

# Data Access

In [None]:
import os

from kaggle_datasets import KaggleDatasets

#------------------------------
# locations of the attached Kaggle datasets, as returned by get_gcs_path
#------------------------------
GCS_PATH       = KaggleDatasets().get_gcs_path("tfrecs-for-arijitx")
GCS_PATH_TRAIN = KaggleDatasets().get_gcs_path("buet-dl-tpu-records-part1")

#------------------------------
# changeable params
#------------------------------
# glob patterns that are expanded into tfrecord file lists further down
TRAIN_GCS_PATTERNS = [
    os.path.join(GCS_PATH_TRAIN, "voted", "*/*.tfrecord"),
]
EVAL_GCS_PATTERNS = [
    os.path.join(GCS_PATH, "eval", "*/*.tfrecord"),
]

# examples per replica; multiplied by the replica count to obtain the global batch size
PER_REPLICA_BATCH_SIZE = 16
# change as needed, keeping the Kaggle TPU session limit of 9 hours in mind
EPOCHS = 10

#------------------------------
# fixed params while creating the tfrecords
#------------------------------
# used below as the number of examples per tfrecord file when computing step counts
REC_SIZE = 256
VOCAB   =[' ', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', 
          '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', 
          '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', 
          '<empty>', '<empty>', '<empty>', '<empty>', '<empty>', '।', 'ঁ', 'ং', 'ঃ', 'অ', 'আ', 'ই', 
          'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 
          'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল',
          'শ', 'ষ', 'স', 'হ', '<empty>', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ৎ', 
          '<empty>', 'ড়', 'ঢ়', 'য়', '০', '১', '২', '৩', '৪', '৫', '৬', '৭', '৮', '৯', '<empty>', '<empty>', 
          '\u200d', '<empty>', '<empty>', '', '<s>', '</s>']

# Report the vocabulary size and the index of the empty-string entry,
# which is the id used for padding labels and for `config.pad_id`.
vocab_size = len(VOCAB)
print("Vocab Len:",vocab_size)
pad_index = VOCAB.index("")
print("Pad Id:",pad_index)

### Imports and data 

In [None]:
#-------------------------------
# imports
#-------------------------------
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import random
import tensorflow as tf
import transformers
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import display,Audio
from wav2vec2 import RobustWav2Vec2Config,Wav2Vec2,CTCLoss
tqdm.pandas()

#--------------------------
# GCS Paths and tfrecords
#-------------------------
def get_tfrecs(gcs_pattern):
    """Expand a glob pattern into a shuffled list of tfrecord paths.

    Args:
        gcs_pattern: glob pattern passed to `tf.io.gfile.glob`.

    Returns:
        The matching paths, shuffled in place with `random.shuffle`.
        The number of matches is printed as a side effect.
    """
    matches = tf.io.gfile.glob(gcs_pattern)
    random.shuffle(matches)
    print("found ",len(matches), "tfrecords")
    return matches

# training shards: one glob per configured pattern
train_recs=[]
for gcs in TRAIN_GCS_PATTERNS:
    print("Looking into gcs path:",gcs)
    train_recs.extend(get_tfrecs(gcs))

# evaluation shards
eval_recs=[]
for gcs in EVAL_GCS_PATTERNS:
    print(gcs)
    eval_recs.extend(get_tfrecs(gcs))

print("Total Eval-recs:",len(eval_recs))
print("Total Train-recs:",len(train_recs))

#------------------------------------------------
# model config: the pad id is the index of the empty-string vocab entry
#------------------------------------------------
config = RobustWav2Vec2Config()
config.pad_id=VOCAB.index("")

In [None]:
#----------------------------------------------------------
# Detect hardware, return appropriate distribution strategy
#----------------------------------------------------------
# TPUClusterResolver needs no parameters when the TPU_NAME environment variable is set
# (the original author notes this is always the case on Kaggle).
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # a ValueError here is treated as "no TPU available"
    tpu = None

if not tpu:
    # default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # connect to the TPU cluster and initialise it before building the strategy
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    tf.config.optimizer.set_jit(True)

print("REPLICAS: ", strategy.num_replicas_in_sync)

#-------------------------------------
# batching , strategy and steps
#-------------------------------------
# global batch = per-replica batch x replica count
# (with a single replica this is just PER_REPLICA_BATCH_SIZE)
BATCH_SIZE = PER_REPLICA_BATCH_SIZE*strategy.num_replicas_in_sync

# every tfrecord file is counted as REC_SIZE examples
train_examples = len(train_recs)*REC_SIZE
eval_examples  = len(eval_recs)*REC_SIZE

STEPS_PER_EPOCH = train_examples//BATCH_SIZE
# the factor 2 halves the number of validation batches run per epoch
EVAL_STEPS      = eval_examples//(2*BATCH_SIZE)

print("Batch Size:",BATCH_SIZE)
print("Steps:",STEPS_PER_EPOCH)
print("Eval Steps:",EVAL_STEPS)

In [None]:
class cfg:
    """Static shapes and sizes shared by the input pipeline and the model builder."""
    # --- audio ---
    sample_rate    = 16000
    # padded clip length in samples (246000 / 16000 ~ 15.4 s);
    # the original author notes this is fixed for the pretrained weights in use -- confirm before changing
    audio_shape    = (246000,)

    # --- labels ---
    # padded label length; likewise noted by the original author as fixed for the pretrained weights
    label_shape    = (250,)
    vocab_len      = len(VOCAB)

    # --- input pipeline ---
    shuffle_buffer = 1024
    batch_size     = BATCH_SIZE

    # --- model ---
    embed_dim      = 1024
    

In [None]:
#------------------------------
# parsing tfrecords 
#------------------------------
def normalize(x):
    """Standardise `x` to zero mean and unit variance along its last axis.

    Mean and variance are taken over the last axis with `keepdims=True`;
    size-1 dimensions are then removed from the result by `tf.squeeze`.
    """
    mean = tf.reduce_mean(x, axis=-1, keepdims=True)
    variance = tf.math.reduce_variance(x, axis=-1, keepdims=True)
    centered = x - mean
    # the small epsilon keeps the division finite when the variance is zero
    scale = tf.sqrt(variance + 1e-5)
    return tf.squeeze(centered / scale)

def read_raw_audio(audio):
    """Decode wav-encoded bytes into a flat 1-D waveform tensor.

    The audio is decoded as a single channel with no fixed sample count
    (`desired_samples=-1`); the decoded sample rate is not used.
    """
    waveform, _sample_rate = tf.audio.decode_wav(
        audio,
        desired_channels=1,
        desired_samples=-1,
    )
    flat_waveform = tf.reshape(waveform, shape=[-1])
    return flat_waveform
    
def preprocess_example(audio,label):
    """Turn one raw (audio, label) record into model-ready tensors.

    Args:
        audio: wav-encoded bytes, decoded and normalised via
            `read_raw_audio` and `normalize`.
        label: string of whitespace-separated integers, split and converted
            to an int32 tensor.

    Returns:
        Tuple `(signal, label_ids)`.
    """
    # all preprocessing ops are pinned to the CPU
    with tf.device("/CPU:0"):
        waveform = read_raw_audio(audio)
        signal = normalize(waveform)
        tokens = tf.strings.split(label)
        label_ids = tf.strings.to_number(tokens, out_type=tf.int32)
    return signal,label_ids

def data_input_fn(recs): 
    '''
    Build the shuffled, infinitely repeating, padded-batch dataset for a list of tfrecords.

    Every record carries two scalar string features:
      * 'audio' : wav-encoded bytes (decoded and normalised by `preprocess_example`)
      * 'label' : whitespace-separated integer ids (converted to int32 by `preprocess_example`)

    Args:
        recs: list of tfrecord file paths.

    Returns:
        A `tf.data.Dataset` yielding `(audio, label)` batches of `cfg.batch_size`
        examples, padded to `cfg.audio_shape[0]` samples and `cfg.label_shape[0]`
        label ids (audio padded with 0.0, labels with the pad id).
        The dataset repeats forever, so callers must bound it with a step count.
    '''
    feature = {
        'audio' : tf.io.FixedLenFeature([],tf.string),
        'label' : tf.io.FixedLenFeature([],tf.string),
    }
    pad_id = VOCAB.index("")
    autotune = tf.data.experimental.AUTOTUNE

    def _parser(example):
        parsed = tf.io.parse_single_example(example,feature)
        return preprocess_example(**parsed)

    dataset = tf.data.TFRecordDataset(recs)
    # decode/normalise records in parallel; element order is still deterministic by default
    dataset = dataset.map(_parser, num_parallel_calls=autotune)
    # drop records that fail to parse/decode here, so a single bad record
    # does not discard the whole batch that was being assembled around it
    dataset = dataset.apply(tf.data.experimental.ignore_errors())
    dataset = dataset.shuffle(cfg.shuffle_buffer,reshuffle_each_iteration=True)
    dataset = dataset.repeat()
    # the dataset is infinite, so drop_remainder never drops data;
    # it only makes the batch dimension static, which TPU execution prefers
    dataset = dataset.padded_batch(cfg.batch_size,
                                   padded_shapes=(cfg.audio_shape[0],cfg.label_shape[0]),
                                   padding_values=(0.0,pad_id),
                                   drop_remainder=True)
    # still tolerate batching failures (e.g. an example longer than the padded shape)
    dataset = dataset.apply(tf.data.experimental.ignore_errors())
    dataset = dataset.prefetch(autotune)
    return dataset

In [None]:
# both splits go through the same input pipeline (shuffled, repeating, padded batches)
train_ds, eval_ds = data_input_fn(train_recs), data_input_fn(eval_recs)

### Visualize

In [None]:
#------------------------------
# view data
#------------------------------
# take a single batch from the eval pipeline and inspect its first example
for x,y in eval_ds.take(1):
    first_signal=x[0].numpy()
    display(Audio(data=first_signal, rate=cfg.sample_rate))
    first_label=y[0].numpy()
    # map every label id back to its vocabulary entry (the pad entry is the empty string)
    sentence="".join(VOCAB[int(idx)] for idx in first_label)
    print("label:",sentence)
    print("input shape:",x.shape)
    print("output shape:",y.shape)

# Modeling

In [None]:
# Suffix appended to every converted key so that it matches a TensorFlow variable name.
SUFFIX = ":0"

# Ordered (PyTorch fragment, TensorFlow fragment) rewrites, applied top to bottom by `replace`.
# The layer-norm rules must come before the generic "weight" -> "kernel" rule,
# otherwise layer-norm weights would be renamed to "kernel" instead of "gamma".
MAPPING = (
    ("layer_norm.weight", "layer_norm/gamma"),
    ("layer_norm.bias", "layer_norm/beta"),
    ("weight", "kernel"),
    (".", "/"),
)

# Keys that bypass `replace` and map directly to a fixed TensorFlow variable name.
SPECIAL_MAPPING_WITH_HEAD = {
    "wav2vec2.encoder.pos_conv_embed.conv.weight_g": "wav2vec2/encoder/pos_conv_embed/conv/weight_g:0",
    "wav2vec2.encoder.pos_conv_embed.conv.weight_v": "wav2vec2/encoder/pos_conv_embed/conv/weight_v:0",
}



def replace(k):
    """
    Converts a PyTorch state_dict key to a TensorFlow variable name.

    Every rule in `MAPPING` is applied in order as a plain substring
    replacement, then `SUFFIX` is appended.

    Args:
        k: PyTorch state_dict key, e.g. "lm_head.weight".

    Returns:
        The converted name, e.g. "lm_head/kernel:0".
    """
    for hf_v, tf_v in MAPPING:
        k = k.replace(hf_v, tf_v)
    return k + SUFFIX


def get_tf_pretrained_model(hf_model_id,tf_model):
    """
    Copy the weights of a HuggingFace PyTorch Wav2Vec2ForCTC checkpoint into `tf_model`.

    Each state_dict key is translated to a TensorFlow variable name, either
    through `SPECIAL_MAPPING_WITH_HEAD` or through `replace`. Keys with no
    matching variable in `tf_model` are printed and skipped. Arrays for the
    special keys are transposed with axes (2, 1, 0); arrays whose target name
    contains "kernel" are fully transposed; everything else is copied as is.

    Args:
        hf_model_id: model id passed to `Wav2Vec2ForCTC.from_pretrained`.
        tf_model: Keras model whose variables receive the weights.

    Returns:
        `tf_model`, with the matched variables assigned in one batch.
    """
    torch_state = transformers.Wav2Vec2ForCTC.from_pretrained(hf_model_id).state_dict()

    # look-up table: TensorFlow variable name -> variable
    variables_by_name = {variable.name: variable for variable in tf_model.variables}

    assignments = []
    extra_keys = []
    for torch_key in tqdm(torch_state):
        is_special = torch_key in SPECIAL_MAPPING_WITH_HEAD
        tf_name = SPECIAL_MAPPING_WITH_HEAD[torch_key] if is_special else replace(torch_key)

        if tf_name not in variables_by_name:
            extra_keys.append(torch_key)
            print(f"SKIPPING {torch_key}")
            continue

        values = torch_state[torch_key].numpy()
        if is_special:
            values = np.transpose(values, axes=(2, 1, 0))
        elif "kernel" in tf_name:
            values = np.transpose(values)

        assignments.append((variables_by_name[tf_name], values))

    print("EXTRA KEYS:\n", extra_keys)

    tf.keras.backend.batch_set_value(assignments)
    return tf_model

## Build the model

In [None]:
def create_model(cfg):
    """Build the Keras model: raw audio input -> Wav2Vec2 -> dense "lm_head" over the vocabulary.

    Args:
        cfg: object providing `audio_shape` (input shape) and `vocab_len`
            (number of output units of the head).

    Returns:
        A `tf.keras.Model` mapping audio to per-frame vocabulary logits.

    Note:
        Reads the module-level `config` and switches spec augmentation off on it
        as a side effect.
    """
    waveform = tf.keras.Input(shape=cfg.audio_shape)
    # avoid using spec augmentation
    config.apply_spec_augment=False
    hidden_states = Wav2Vec2(config)(waveform)
    lm_head = tf.keras.layers.Dense(cfg.vocab_len,name="lm_head")
    return tf.keras.Model(inputs=waveform, outputs=lm_head(hidden_states))


In [None]:
with strategy.scope():
    # build the Keras graph, then copy the HuggingFace checkpoint weights into it
    untrained_model = create_model(cfg)
    model = get_tf_pretrained_model("arijitx/wav2vec2-xls-r-300m-bengali", untrained_model)
    # freeze feature extractor
    # (layers[1] is presumably the Wav2Vec2 layer that follows the Input layer -- confirm)
    wav2vec2_layer = model.layers[1]
    wav2vec2_layer.freeze_feature_extractor()
    # finally load the weights stored in the h5 checkpoint on top of the converted ones
    model.load_weights("../input/final-model-dlsprint/model_final_semi_50.h5")
model.summary()

In [None]:
    
# early stopping and plateau LR reduction are constructed here,
# but neither is registered in `callbacks` below
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=10,
    verbose=1,
    mode='auto',
)
lr_reducer=tf.keras.callbacks.ReduceLROnPlateau(patience=3)

# checkpointing: weights only, written to "final.h5", best result only
model_save=tf.keras.callbacks.ModelCheckpoint(
    "final.h5",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# only the checkpoint callback is passed to fit
callbacks = [model_save]

with strategy.scope():
    # cosine decay from 7e-7 over 10000 steps, with alpha=0.02
    lr_schedule = tf.keras.experimental.CosineDecay(
        initial_learning_rate=7e-7,
        decay_steps=10000,
        alpha=0.02,
    )

    # the loss is built for the per-replica batch shape and divided by the per-replica batch size
    per_replica_input_shape = (PER_REPLICA_BATCH_SIZE,cfg.audio_shape[0])
    loss_fn = CTCLoss(config, per_replica_input_shape, division_factor=PER_REPLICA_BATCH_SIZE)

    optimizer = tf.keras.optimizers.Adam(lr_schedule)
    model.compile(optimizer=optimizer, loss=loss_fn)

In [None]:
# The datasets repeat forever, so both training and validation are bounded by explicit step counts.
# NOTE(review): the epoch count is hard-coded here; the EPOCHS constant from the
# config cell is not consulted -- confirm which value is intended.
fit_options = dict(
    epochs=30,
    steps_per_epoch=STEPS_PER_EPOCH,
    verbose=1,
    validation_data=eval_ds,
    validation_steps=EVAL_STEPS,
    callbacks=callbacks,
)
history=model.fit(train_ds, **fit_options)

In [None]:
# one column per metric recorded by fit, one row per epoch; persisted as csv
curves=pd.DataFrame(dict(history.history))
curves.to_csv("history.csv",index=False)

In [None]:
curves