<h1 style="border:2px solid Purple;text-align:center">Introduction 🎖</h1>

This notebook is aimed at anyone getting started with the CommonLit Readability competition using TensorFlow and BERT. We will walk through the entire training pipeline, so sit tight.

It is simple and intuitive to follow. I hope it is useful for everyone!

**Please remember to upvote the notebook if you liked the content!** 😃😃

<h1 style="border:2px solid Purple;text-align:center">Topics Covered 📌</h1>

**1. Importing Libraries 📚**

**2. Defining Configs 💬**

**3. Data Pipeline 📂**

**4. Training Pipeline 🎯**

**5. Evaluation 🖊**

<h1 style="border:2px solid Purple;text-align:center">1. Importing Libraries 📚</h1>

In [None]:
%matplotlib inline
from copy import copy
import gc
import joblib
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import sys
from warnings import simplefilter

import tensorflow as tf
from tensorflow.keras import Model, Input, backend as K
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from transformers import TFBertModel, BertConfig, BertTokenizerFast

<h1 style="border:2px solid Purple;text-align:center">2. Defining Configs 💬</h1>

In [None]:
class CFG:
    """Single place for every path, column name and hyper-parameter of the run.

    Defining the class also creates ``output_dir`` on disk so later cells can
    write into it without checking first.
    """

    # ---- run identity -------------------------------------------------
    model_name = 'best_model'
    seed = 42

    # ---- dataset columns ----------------------------------------------
    id_col, target_col, text_col = 'id', 'target', 'excerpt'

    # ---- input locations ----------------------------------------------
    data_dir = Path('../input/commonlitreadabilityprize')
    train_file = data_dir.joinpath('train.csv')
    test_file = data_dir.joinpath('test.csv')
    sample_file = data_dir.joinpath('sample_submission.csv')
    # Directory holding (or receiving) the pretrained tokenizer / weights.
    pretrained_dir = '../input/tfbert-large-uncased'

    # ---- output locations ---------------------------------------------
    build_dir = Path('./build/')
    output_dir = build_dir.joinpath(model_name)
    trn_encoded_file = output_dir.joinpath('trn.enc.joblib')
    val_predict_file = output_dir.joinpath(f'{model_name}.val.txt')
    submission_file = 'submission.csv'
    output_dir.mkdir(parents=True, exist_ok=True)

    # ---- training hyper-parameters -------------------------------------
    max_len = 205     # token length every excerpt is padded / truncated to
    batch_size = 8
    n_fold = 5        # number of cross-validation splits
    n_est = 10        # epochs per fold
    n_stop = 2        # early-stopping patience, in epochs

<h1 style="border:2px solid Purple;text-align:center">3. Data Pipeline 📂</h1>

In [None]:
# Read both competition splits, using the id column as the index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=CFG.id_col)
    for csv_path in (CFG.train_file, CFG.test_file)
)

# Regression targets as a plain NumPy array.
y = train_df[CFG.target_col].values

# Quick sanity check of the sizes, then a peek at the first training rows.
print(test_df.shape, y.shape, train_df.shape)
train_df.head()

In [None]:
class Tokenize:
    """Loader for the BERT tokenizer and its model configuration."""

    @staticmethod
    def load_tokenizer():
        """Return ``(tokenizer, model_config)`` for ``bert-large-uncased``.

        The tokenizer is read from ``CFG.pretrained_dir`` when a ``vocab.txt``
        is already there; otherwise it is fetched from the hub checkpoint and
        saved into that directory for later runs.

        The configuration is read from ``CFG.pretrained_dir`` only when a
        ``config.json`` exists there. Saving the tokenizer does not write a
        model config, so on a fresh directory the config is taken from the
        hub checkpoint instead of failing on the missing file.

        Returns:
            tuple: the ``BertTokenizerFast`` instance and a ``BertConfig``
            with ``output_hidden_states`` switched on.
        """
        vocab_path = os.path.join(CFG.pretrained_dir, 'vocab.txt')
        if os.path.exists(vocab_path):
            print('loading the saved pretrained tokenizer')
            tokenizer = BertTokenizerFast.from_pretrained(CFG.pretrained_dir)
        else:
            Path(CFG.pretrained_dir).mkdir(parents=True, exist_ok=True)
            tokenizer = BertTokenizerFast.from_pretrained("bert-large-uncased")
            tokenizer.save_pretrained(CFG.pretrained_dir)

        # Only trust the local directory for the config if it actually has one.
        config_path = os.path.join(CFG.pretrained_dir, 'config.json')
        config_source = CFG.pretrained_dir if os.path.exists(config_path) else "bert-large-uncased"
        model_config = BertConfig.from_pretrained(config_source)
        model_config.output_hidden_states = True

        return tokenizer, model_config

In [None]:
class BERT:
    """Helpers for loading the pretrained BERT encoder and encoding text for it."""

    @staticmethod
    def load_bert(config):
        """Load the TensorFlow BERT encoder, caching it in ``CFG.pretrained_dir``.

        Args:
            config: configuration object forwarded to
                ``TFBertModel.from_pretrained``.

        Returns:
            The loaded ``TFBertModel``. When no ``tf_model.h5`` is present in
            ``CFG.pretrained_dir`` the ``bert-large-uncased`` checkpoint is
            fetched and saved there first.
        """
        weights_path = os.path.join(CFG.pretrained_dir, 'tf_model.h5')
        if os.path.exists(weights_path):
            print('loading the saved pretrained model')
            return TFBertModel.from_pretrained(CFG.pretrained_dir, config=config)

        Path(CFG.pretrained_dir).mkdir(parents=True, exist_ok=True)
        bert_model = TFBertModel.from_pretrained("bert-large-uncased", config=config)
        bert_model.save_pretrained(CFG.pretrained_dir)
        return bert_model

    @staticmethod
    def bert_encode(texts, tokenizer, max_len=None):
        """Tokenize ``texts`` into fixed-length model inputs.

        Args:
            texts: iterable of strings to encode.
            tokenizer: callable tokenizer accepting a list of strings plus the
                ``max_length`` / ``truncation`` / ``padding`` /
                ``add_special_tokens`` keyword arguments.
            max_len: length every sequence is padded or truncated to.
                ``None`` (the default) means ``CFG.max_len``, looked up at
                call time so a changed config is always honoured.

        Returns:
            tuple of three arrays ``(input_ids, token_type_ids,
            attention_mask)``, each of shape ``(len(texts), max_len)``.
        """
        if max_len is None:
            max_len = CFG.max_len

        # The tokenizer expects a list, not a NumPy array.
        texts = list(texts)
        if not texts:
            # Keep the 2-D layout so downstream shape checks still work.
            return tuple(np.zeros((0, max_len), dtype=int) for _ in range(3))

        # One batched call instead of a Python-level loop over every text.
        encoded = tokenizer(texts, max_length=max_len, truncation=True,
                            padding='max_length', add_special_tokens=True)

        return (np.array(encoded['input_ids']),
                np.array(encoded['token_type_ids']),
                np.array(encoded['attention_mask']))

<h1 style="border:2px solid Purple;text-align:center">4. Training Pipeline 🎯</h1>

In [None]:
# Tokenizer plus the BERT configuration that the training loop reuses.
tokenizer, bert_config = Tokenize.load_tokenizer()

# Encode the excerpts of both splits (train first, then test) into
# (input_ids, token_type_ids, attention_mask) triples.
X, X_tst = (
    BERT.bert_encode(frame[CFG.text_col].values, tokenizer, max_len=CFG.max_len)
    for frame in (train_df, test_df)
)

# Regression targets aligned with the rows of X.
y = train_df[CFG.target_col].values

print(X[0].shape, X_tst[0].shape, y.shape)

In [None]:
# Persist the encoded training inputs (the tuple returned by BERT.bert_encode)
# to CFG.trn_encoded_file.
joblib.dump(X, CFG.trn_encoded_file)

In [None]:
class model:
    """Regression model on top of BERT and the learning-rate schedule used to train it."""

    # Peak learning rate, shared by the optimizer and the schedule so the two
    # cannot drift apart.
    BASE_LR = 1e-5

    @staticmethod
    def build_model(bert_model, max_len=None, learning_rate=BASE_LR):
        """Build and compile the Keras regression model.

        The output at the first token position (the [CLS] position for BERT
        inputs) of the encoder's sequence output goes through dropout and a
        single linear unit.

        Args:
            bert_model: BERT encoder whose first output is the sequence output.
            max_len: length of the three integer input sequences. ``None``
                (the default) means ``CFG.max_len``, looked up at call time.
            learning_rate: initial learning rate handed to Adam.

        Returns:
            A compiled Keras ``Model`` taking
            ``[input_ids, token_type_ids, attention_mask]`` and trained with
            MSE loss, reporting RMSE as a metric.
        """
        if max_len is None:
            max_len = CFG.max_len

        input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
        token_type_ids = Input(shape=(max_len,), dtype=tf.int32, name="token_type_ids")
        attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

        sequence_output = bert_model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
        clf_output = sequence_output[:, 0, :]
        clf_output = Dropout(.1)(clf_output)
        out = Dense(1, activation='linear')(clf_output)

        # Named `regressor` so the local does not shadow this class's own name.
        regressor = Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=out)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        regressor.compile(Adam(learning_rate=learning_rate), loss='mean_squared_error',
                          metrics=[RootMeanSquaredError()])

        return regressor

    @staticmethod
    def scheduler(epoch, lr, warmup=5, decay_start=10, base_lr=BASE_LR):
        """Learning-rate schedule: warm-up, plateau, then exponential decay.

        Keras passes the optimizer's *current* rate as ``lr``. Scaling that
        value every epoch would compound the warm-up divisor and leave the
        rate far below ``base_lr``, so the schedule is computed from the
        fixed ``base_lr`` instead.

        Args:
            epoch: zero-based epoch index.
            lr: current learning rate supplied by the callback; accepted for
                API compatibility and not used in the computation.
            warmup: last epoch of the warm-up phase.
            decay_start: last epoch at which the rate is held at ``base_lr``.
            base_lr: peak learning rate reached after warm-up.

        Returns:
            float: ``base_lr / (warmup - epoch + 1)`` during warm-up,
            ``base_lr`` on the plateau, and ``base_lr * exp(-0.1 * k)`` for
            the k-th epoch past ``decay_start``.
        """
        if epoch <= warmup:
            return base_lr / (warmup - epoch + 1)
        if epoch <= decay_start:
            return base_lr
        # Return a plain Python float, not a tensor.
        return float(base_lr * np.exp(-0.1 * (epoch - decay_start)))

In [None]:
class Train:
    """Cross-validated fine-tuning driver."""

    @staticmethod
    def train():
        """Fine-tune one model per fold and collect its predictions.

        Reads the notebook-level ``X``, ``X_tst``, ``y`` and ``bert_config``.
        For each of the ``CFG.n_fold`` splits a fresh BERT encoder and
        regression head are built, trained, saved to
        ``{CFG.model_name}_cv{i}.h5`` and used to predict both the held-out
        rows and the test set.

        Returns:
            tuple ``(p, p_tst)``: out-of-fold predictions aligned with ``y``
            and test predictions averaged over the folds.
        """
        ls = LearningRateScheduler(model.scheduler)
        # Stops a fold after CFG.n_stop epochs without improvement of the
        # monitored metric (validation loss by default) and restores the best
        # weights. It only takes effect because it is passed to fit() below.
        es = EarlyStopping(patience=CFG.n_stop, restore_best_weights=True)

        cv = KFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)

        p = np.zeros_like(y, dtype=float)
        p_tst = np.zeros((X_tst[0].shape[0], ), dtype=float)
        for i, (i_trn, i_val) in enumerate(cv.split(X[0]), 1):
            print(f'Training CV #{i}:')
            # A different but reproducible seed per fold.
            tf.random.set_seed(CFG.seed + i)

            bert_model = BERT.load_bert(bert_config)
            clf = model.build_model(bert_model, max_len=CFG.max_len)
            if i == 1:
                # summary() prints by itself and returns None.
                clf.summary()

            val_inputs = [x[i_val] for x in X]
            clf.fit([x[i_trn] for x in X], y[i_trn],
                    validation_data=(val_inputs, y[i_val]),
                    epochs=CFG.n_est,
                    batch_size=CFG.batch_size,
                    callbacks=[ls, es])
            clf.save_weights(f'{CFG.model_name}_cv{i}.h5')

            p[i_val] = clf.predict(val_inputs).flatten()
            # Pass a list of inputs, matching the fit/validation calls.
            p_tst += clf.predict(list(X_tst)).flatten() / CFG.n_fold

            # Release the fold's graph and weights before building the next one.
            K.clear_session()
            del clf, bert_model
            gc.collect()

        return p, p_tst

In [None]:
# Run the cross-validated training: `p` receives the out-of-fold predictions
# for the training rows, `p_test` the fold-averaged test predictions.
p, p_test = Train.train()

<h1 style="border:2px solid Purple;text-align:center">5. Evaluation 🖊</h1>

In [None]:
# Out-of-fold RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated and removed in
# recent scikit-learn releases.
cv_rmse = np.sqrt(mean_squared_error(y, p))
print(f'CV RMSE: {cv_rmse:.6f}')

# Keep the out-of-fold predictions on disk, one value per line.
np.savetxt(CFG.val_predict_file, p, fmt='%.6f')

Hope you liked the notebook!