# Abstract

Train and prototype your models quickly by using TPUs. This notebook shows an easy and quick way to train 🤗 Transformers on TPUs.




# Versions
[Inference Notebook](https://www.kaggle.com/code/bharadwajvedula/pppm-tpu-high-speed-bert-inference)
* Version 1: Base Uncased Bert on TPUs **CV:- 0.8379 LB:- 0.7201**
* Version 2: Base Uncased Bert on TPUs trained as classification problem **CV:- 0.728 LB:- 0.**
* Version 4: Base Uncased Bert on TPUs fixing CV issues **CV:- 0. LB:- 0.**

# Imports

In [None]:
import numpy as np 
import pandas as pd 

from scipy.stats import pearsonr

from transformers import TFAutoModel, AutoTokenizer

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.losses import SparseCategoricalCrossentropy 
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model, save_model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau , EarlyStopping
from tensorflow.keras.optimizers import Adam, SGD

import os
# Create the output directory; exist_ok avoids the check-then-create race
# and makes re-running the cell harmless.
os.makedirs("./result", exist_ok=True)


# Configs

In [None]:
class config:
    """Central settings: data locations, tokenizer, model and training values."""

    # --- input data ---
    train_dataset_path = "../input/p2pmcleaneddataset/train.csv"
    test_dataset_path = "../input/p2pmcleaneddataset/test.csv"
    sample_submission_path = "../input/p2pmcleaneddataset/sample_submission.csv"
    cpc_path = "../input/cpc-codes/titles.csv"

    # --- output ---
    # Directory the per-fold models are saved under.
    save_dir = "./result"

    # Handed to the tf.data map/prefetch calls in the training loop.
    AUTOTUNE = tf.data.AUTOTUNE

    # --- tokenizer ---
    truncation = True
    padding = 'max_length'
    max_length = 150

    # --- model ---
    model_name = "bert-base-uncased"

    # --- training ---
    learning_rate = 1e-5
    batch_size = 512
    epochs = 12

# TPU config

In [None]:
# Detect hardware and pick the matching tf.distribute strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError here is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy is the stable replacement for the deprecated
    # tf.distribute.experimental.TPUStrategy.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

# PreProcessing

In [None]:
# Read the four input tables, in order, from the paths held in ``config``.
df_train, df_test, df_ss, df_cpc = map(
    pd.read_csv,
    (
        config.train_dataset_path,
        config.test_dataset_path,
        config.sample_submission_path,
        config.cpc_path,
    ),
)

# Remove these columns from the training frame.
df_train = df_train.drop(columns=['context', 'anchor_length', 'target_length'])

In [None]:
# Lookup table from CPC code to its title.
context_mapping = dict(zip(df_cpc['code'], df_cpc['title']))

# Attach the title matching each test row's CPC code; codes missing from the
# table become NaN (pandas ``Series.map`` behaviour for dict arguments).
df_test['context_text'] = df_test['context'].map(context_mapping)


In [None]:
# Model input per row: anchor, target and context title, separated by a
# literal "[SEP]" marker surrounded by single spaces.
df_train['text'] = df_train['anchor'] + " [SEP] " + df_train['target'] + " [SEP] " + df_train['context_text']
df_test['text'] = df_test['anchor'] + " [SEP] " + df_test['target'] + " [SEP] " + df_test['context_text']

In [None]:
# Display the training frame (notebook cell output).
df_train

In [None]:
# Display the fold-assignment column that the KFold training loop filters on.
df_train['kfold']

# Tokenizer

In [None]:
# Load the pretrained tokenizer for the checkpoint named in config.model_name.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

# Model

In [None]:
def create_model():
    """Build the regression model: a pretrained transformer plus a dense head.

    The transformer named by ``config.model_name`` receives the token ids and
    the attention mask. The vector at sequence position 0 of its first output
    goes through dropout and two ReLU dense layers to a single output unit.

    Returns:
        An uncompiled ``Model`` with the inputs ``input_ids`` and
        ``attention_mask`` (each of length ``config.max_length``) and one
        output value per example.
    """
    # Shapes are given as 1-tuples; ``(n)`` without the comma is just the int n.
    input_ids_layer = Input(shape=(config.max_length,), dtype=tf.int32, name='input_ids')
    attention_mask_layer = Input(shape=(config.max_length,), dtype=tf.int32, name='attention_mask')

    # Load the pretrained weights once.
    transformer = TFAutoModel.from_pretrained(config.model_name)

    # First model output, sliced to the vector at sequence position 0.
    cls_token = transformer(
        input_ids=input_ids_layer,
        attention_mask=attention_mask_layer,
    )[0][:, 0, :]

    x = Dropout(0.3)(cls_token)
    x = Dense(256, activation="relu")(x)
    x = Dense(128, activation="relu")(x)
    # NOTE(review): a ReLU output unit has zero gradient whenever its
    # pre-activation is negative, so it can get stuck predicting 0. If the
    # targets are bounded (presumably [0, 1] - confirm), consider a sigmoid
    # or linear output instead.
    prediction = Dense(1, activation="relu")(x)

    model = Model(inputs=[input_ids_layer, attention_mask_layer], outputs=prediction)

    return model

# Dataset Prep Function

In [None]:
@tf.function
def map_function(encodings, target):
    """Select the model inputs from one tokenized example and cast its label.

    Args:
        encodings: Mapping that contains at least the keys ``input_ids`` and
            ``attention_mask``.
        target: Label value; it is cast to ``tf.float32``.

    Returns:
        A ``(features, label)`` pair, where ``features`` is a dict holding
        only ``input_ids`` and ``attention_mask``.
    """
    features = {name: encodings[name] for name in ('input_ids', 'attention_mask')}
    label = tf.cast(target, tf.float32)
    return features, label

# Competition Metrics
Grabbed this snippet from [here](https://www.kaggle.com/code/mohamadmerchant/us-phrase-matching-tf-keras-train-tpu#Competition-Metrics)

In [None]:
class Pearsonr(tf.keras.callbacks.Callback):
    """Keras callback that reports the Pearson correlation on validation data.

    After every epoch the model predicts on ``val_data``; the correlation
    between those predictions and ``y_val`` is printed and, when a ``logs``
    dict is supplied, stored under the key ``val_pearsonr``.

    Args:
        model: Model used for the predictions.
        val_data: Validation input passed to ``model.predict``.
        y_val: Ground-truth values, in the same order as ``val_data``.
    """

    def __init__(self, model, val_data, y_val):
        # Initialise the base-class state before adding our own.
        super().__init__()
        self.val_data = val_data
        self.y_val = y_val
        # Attach the model through the Callback API rather than by assigning
        # the attribute directly.
        self.set_model(model)

    def on_epoch_end(self, epoch, logs=None):
        """Compute, print and log the validation Pearson correlation."""
        val_preds = self.model.predict(self.val_data, verbose=1)
        # pearsonr returns (statistic, p-value); only the statistic is kept.
        val_pearsonr = pearsonr(self.y_val, val_preds.ravel())[0]

        print(f"val_pearsonr: {val_pearsonr:.4f}\n")
        # ``logs`` is optional in the Keras hook signature.
        if logs is not None:
            logs["val_pearsonr"] = val_pearsonr

# KFold Training

In [None]:
def _tokenize(texts):
    """Tokenize a pandas Series of strings with the settings from ``config``."""
    return tokenizer(
        texts.tolist(),
        truncation=config.truncation,
        padding=config.padding,
        max_length=config.max_length,
    )


def _build_dataset(encodings, targets):
    """Turn tokenizer output and targets into a batched, prefetched tf.data pipeline."""
    # A plain dict is the form the Hugging Face docs pass to from_tensor_slices.
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), targets))
    return (
        dataset
        .map(map_function, num_parallel_calls=config.AUTOTUNE)
        .batch(config.batch_size)
        .prefetch(config.AUTOTUNE)
    )


# Cross-validation: for each fold, train on the other folds, validate on the
# held-out fold, record the validation Pearson correlation and save the model.
histories = []
scores = []
for fold in range(1,6):
    print(f"====== FOLD RUNNING {fold}======")

    # Rows of the current fold are the validation split; all others train.
    is_val = df_train['kfold'] == fold
    X_train = df_train.loc[~is_val, 'text']
    y_train = df_train.loc[~is_val, 'score']

    X_test = df_train.loc[is_val, 'text']
    y_test = df_train.loc[is_val, 'score']

    print("Generating Tokens")
    train_embeddings = _tokenize(X_train)
    validation_embeddings = _tokenize(X_test)

    print("Generating Datasets")
    # NOTE(review): the training pipeline has no shuffle step, so batches are
    # drawn in dataframe order every epoch - confirm this is intended.
    train = _build_dataset(train_embeddings, y_train)
    val = _build_dataset(validation_embeddings, y_test)

    #Clearing backend session
    K.clear_session()
    print("Backend Cleared")

    print("Model Creation")
    # Model creation and compilation happen inside the strategy scope.
    with strategy.scope():
        model = create_model()
        model.compile(
            optimizer=Adam(learning_rate=config.learning_rate),
            metrics=['mae'],
            loss=['mae'],
        )
    early_stopping = EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        patience=4,
        verbose=0,
        mode="min",
        restore_best_weights=True,
    )
    pearson_corr = Pearsonr(model, val, y_test)

    hist = model.fit(
        train,
        validation_data=val,
        epochs=config.epochs,
        callbacks=[early_stopping, pearson_corr],
    )

    # prediction on val
    print("prediction on validation data")
    preds = model.predict(val, verbose=1)
    # Flatten the (n, 1) predictions to (n,) before correlating.
    score = pearsonr(preds.reshape(preds.shape[0],), y_test)[0]
    scores.append(score)

    #saving model
    print("saving model")

    # Route the save through the local host's filesystem.
    localhost_save_option = tf.saved_model.SaveOptions(experimental_io_device="/job:localhost")
    model.save(f'{config.save_dir}/{config.model_name}_{fold}', options=localhost_save_option)

    del model

    histories.append(hist)

# ``scores`` holds per-fold Pearson correlations, so label the mean as such.
print("the final average pearson correlation is ", np.mean(scores))





























Thanks for viewing! Drop your suggestions in the comments below. 🙂