<h1 style="border:2px solid Purple;text-align:center">Introduction 🎖</h1>

This notebook is a continuation of my previous notebook on **training** a TF BERT-based baseline model, available here: https://www.kaggle.com/prvnkmr/tf-bert-baseline-lb-0-646-training

Please go through it first if you have not already done so.

This notebook focuses on running inference on the test data with the models trained in the training notebook.
Remember that we trained multiple CV (cross-validation fold) models, so we will run every one of them and take the mean of their outputs (similar to what is done in bagging).

**As always, if you like the content, please remember to upvote!! 😃😃**

# Imports

In [None]:
import pandas as pd
from pathlib import Path
import os
import numpy as np
from sklearn.model_selection import KFold
import gc

import tensorflow as tf
from tensorflow.keras import Model, Input, backend as K
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from transformers import TFBertModel, BertConfig, BertTokenizerFast

# Configs

In [None]:
class CFG:
    """Static settings shared by every cell of this inference notebook.

    The class is never instantiated; attributes are read directly off it.
    Note that merely defining the class creates ``output_dir`` on disk.
    """

    # --- experiment identity ---------------------------------------------
    model_name = 'best_model'
    seed = 42

    # --- competition input files -----------------------------------------
    data_dir = Path('../input/commonlitreadabilityprize')
    train_file = data_dir.joinpath('train.csv')
    test_file = data_dir.joinpath('test.csv')
    sample_file = data_dir.joinpath('sample_submission.csv')

    # Directory holding (or receiving) the pretrained BERT tokenizer/weights.
    pretrained_dir = '../input/tfbert-large-uncased'

    # --- column names used in the CSV files ------------------------------
    id_col = 'id'
    target_col = 'target'
    text_col = 'excerpt'

    # --- output locations ------------------------------------------------
    build_dir = Path('./build/')
    output_dir = build_dir.joinpath(model_name)
    # Side effect at class-definition time: make sure the folder exists.
    output_dir.mkdir(parents=True, exist_ok=True)
    trn_encoded_file = output_dir.joinpath('trn.enc.joblib')
    val_predict_file = output_dir.joinpath(f'{model_name}.val.txt')
    submission_file = 'submission.csv'

    # --- model / run hyper-parameters ------------------------------------
    max_len = 205      # token length every excerpt is padded/truncated to
    n_fold = 5         # number of fold models whose predictions are averaged
    # NOTE(review): n_est and n_stop are not read anywhere in this notebook;
    # presumably epochs and early-stopping patience from training - confirm.
    n_est = 10
    n_stop = 2
    batch_size = 8

# Building Model

In [None]:
class Tokenize:
    """Namespace for loading the BERT tokenizer and its model configuration."""

    @staticmethod
    def load_tokenizer():
        """Return ``(tokenizer, model_config)`` for BERT-large-uncased.

        The tokenizer is read from ``CFG.pretrained_dir`` when a ``vocab.txt``
        is already there; otherwise it is fetched by name
        (``"bert-large-uncased"``) and saved into that directory.

        The configuration is resolved the same way, keyed on ``config.json``.
        Saving the tokenizer does not write ``config.json``, so a freshly
        created directory needs the configuration fetched separately instead
        of being read back from disk.

        Returns:
            tuple: ``(BertTokenizerFast, BertConfig)``; the config has
            ``output_hidden_states`` set to ``True``.
        """
        local_dir = CFG.pretrained_dir

        if not os.path.exists(local_dir + '/vocab.txt'):
            Path(local_dir).mkdir(parents=True, exist_ok=True)
            tokenizer = BertTokenizerFast.from_pretrained("bert-large-uncased")
            tokenizer.save_pretrained(local_dir)
        else:
            print('loading the saved pretrained tokenizer')
            tokenizer = BertTokenizerFast.from_pretrained(local_dir)

        if os.path.exists(local_dir + '/config.json'):
            model_config = BertConfig.from_pretrained(local_dir)
        else:
            # No local config yet: fetch it by name and cache it so that
            # later runs can load it from the directory.
            Path(local_dir).mkdir(parents=True, exist_ok=True)
            model_config = BertConfig.from_pretrained("bert-large-uncased")
            model_config.save_pretrained(local_dir)

        model_config.output_hidden_states = True

        return tokenizer, model_config

In [None]:
class BERT:
    """Helpers for obtaining the BERT encoder and encoding raw text for it."""

    def load_bert(config):
        """Return a ``TFBertModel`` built with ``config``.

        If ``tf_model.h5`` already exists in ``CFG.pretrained_dir`` the model
        is loaded from that directory. Otherwise it is fetched by name
        (``"bert-large-uncased"``) and saved into the directory first.
        """
        weights_cached = os.path.exists(CFG.pretrained_dir + '/tf_model.h5')

        if weights_cached:
            print('loading the saved pretrained model')
            return TFBertModel.from_pretrained(CFG.pretrained_dir, config=config)

        # Nothing cached yet: fetch by name and keep a copy for later calls.
        Path(CFG.pretrained_dir).mkdir(parents=True, exist_ok=True)
        fetched = TFBertModel.from_pretrained("bert-large-uncased", config=config)
        fetched.save_pretrained(CFG.pretrained_dir)
        return fetched

    def bert_encode(texts, tokenizer, max_len=CFG.max_len):
        """Tokenize every text and return the three BERT input arrays.

        Each text is tokenized on its own, truncated/padded to ``max_len``
        with special tokens added.

        Returns:
            tuple: ``(input_ids, token_type_ids, attention_mask)`` as NumPy
            arrays, one row per input text.
        """
        encodings = [
            tokenizer(
                text,
                max_length=max_len,
                truncation=True,
                padding='max_length',
                add_special_tokens=True,
            )
            for text in texts
        ]

        # Regroup the per-text encodings into one array per field.
        fields = ('input_ids', 'token_type_ids', 'attention_mask')
        return tuple(np.array([enc[field] for enc in encodings]) for field in fields)

In [None]:
class model:
    """Namespace for building the regression model and its LR schedule."""

    @staticmethod
    def build_model(bert_model, max_len=CFG.max_len):
        """Build and compile a single-output regressor on top of ``bert_model``.

        The network takes three int32 inputs of length ``max_len``
        (``input_ids``, ``token_type_ids``, ``attention_mask``), takes the
        first output of ``bert_model``, keeps the vector at sequence
        position 0, applies dropout (rate 0.1) and a linear ``Dense(1)``.

        Args:
            bert_model: Callable BERT encoder accepting ``input_ids`` plus
                ``token_type_ids`` / ``attention_mask`` keyword arguments.
            max_len: Sequence length of each input.

        Returns:
            A compiled Keras ``Model`` (Adam, learning rate 1e-5, MSE loss,
            RMSE metric).
        """
        input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
        token_type_ids = Input(shape=(max_len,), dtype=tf.int32, name="token_type_ids")
        attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

        sequence_output = bert_model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
        # Keep only the vector at the first token position of every sequence.
        clf_output = sequence_output[:, 0, :]
        clf_output = Dropout(.1)(clf_output)
        out = Dense(1, activation='linear')(clf_output)

        model = Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=out)
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        model.compile(Adam(learning_rate=1e-5), loss='mean_squared_error', metrics=[RootMeanSquaredError()])

        return model

    @staticmethod
    def scheduler(epoch, lr, warmup=5, decay_start=10):
        """Return the learning rate to use for ``epoch`` given the current ``lr``.

        * ``epoch <= warmup``: ``lr / (warmup - epoch + 1)``.
        * ``warmup < epoch <= decay_start``: ``lr`` unchanged.
        * afterwards: ``lr * exp(-0.1)``.

        Not called anywhere in the visible inference code.
        """
        if epoch <= warmup:
            return lr / (warmup - epoch + 1)
        elif warmup < epoch <= decay_start:
            return lr
        else:
            return lr * tf.math.exp(-.1)

In [None]:
# Read the test excerpts, indexed by their id column.
test_df = pd.read_csv(CFG.test_file, index_col=CFG.id_col)

# Tokenizer for encoding the text and the BERT config reused for every fold model.
tokenizer, bert_config = Tokenize.load_tokenizer()

# Encode the excerpts into the (input_ids, token_type_ids, attention_mask) arrays.
X_test = BERT.bert_encode(
    test_df[CFG.text_col].values,
    tokenizer,
    max_len=CFG.max_len,
)

# Inference

In [None]:
class Infer:
    """Namespace for running the fold models over the encoded test set."""

    @staticmethod
    def infer():
        """Average the predictions of all ``CFG.n_fold`` fold models on ``X_test``.

        For fold ``i`` in ``1..CFG.n_fold`` a fresh BERT encoder and
        regression head are built, the weights ``bert_v13_cv{i}.h5`` are
        loaded, and the flattened predictions are added to the running mean.

        Returns:
            numpy.ndarray: 1-D float array with one averaged prediction per
            row of ``X_test[0]``.
        """
        n_samples = X_test[0].shape[0]
        infer_result = np.zeros((n_samples, ), dtype=float)

        # Only the fold number is needed here. Iterating a KFold split of the
        # test rows would fail whenever there are fewer rows than folds.
        for i in range(1, CFG.n_fold + 1):

            bert_model = BERT.load_bert(bert_config)
            clf = model.build_model(bert_model, max_len=CFG.max_len)

            clf.load_weights(Path(f'../input/tf-bert-baseline-lb-0-646-training/bert_v13_cv{i}.h5'))

            infer_result += clf.predict(X_test).flatten() / CFG.n_fold

            # Release this fold's model before the next one is built so the
            # fold models do not accumulate in memory.
            del clf, bert_model
            K.clear_session()
            gc.collect()

        return infer_result

In [None]:
# Run every fold model over the encoded test set and keep the averaged predictions.
infer_result = Infer.infer()

# Submission

In [None]:
# Start from the sample submission (indexed by id) and swap in our predictions.
# NOTE(review): the values are assigned positionally, so this assumes the
# sample file lists ids in the same order as test.csv - confirm.
sub = pd.read_csv(CFG.sample_file, index_col=CFG.id_col).assign(
    **{CFG.target_col: infer_result}
)
sub.to_csv(path_or_buf=CFG.submission_file)
sub.head()