# Abstract

Train and prototype your models quickly by using TPUs. 
This notebook shows an easy and quick way to train 🤗 Transformers models on TPUs.

# Versions 
 * Version 2: Basic RoBERTa on TPU (CV: 0.425, LB: 0.543)
 * Version 3 : Correction in KFold
#### Future Work 
* Improve the input pipeline by incorporating TFRecords


# Imports

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error as mse

import h5py

import tensorflow as tf 
from tensorflow.keras.layers import Input,LSTM,Bidirectional,Embedding,Dense, Conv1D, Dropout , MaxPool1D , MaxPooling1D, GlobalAveragePooling2D , GlobalAveragePooling1D
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.models import Model,load_model,save_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau,ModelCheckpoint, EarlyStopping ,LearningRateScheduler
from tensorflow.keras import backend as K

from transformers import RobertaTokenizerFast , TFRobertaModel



# ⚙️ Parameters

In [None]:
# Sequence length (in tokens) fed to the model, and the batch size setting.
max_len, batch_size = 256, 32
# Sentinel that lets tf.data pick parallelism / prefetch sizes dynamically.
AUTOTUNE = tf.data.AUTOTUNE

# Hugging Face checkpoint names; the notebook selects the entry at index 1.
MODEL = [
    'bert-base-uncased',
    'roberta-base',
]

import os

# Directory that receives every artefact written by this notebook
# (tokenizer files and the per-fold saved models).
save_dir="./result"

# exist_ok=True keeps this cell re-runnable: without it a second execution
# raises FileExistsError because the directory is already there.
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Competition files, listed in the order: sample submission, train, test.
paths = [
    "/kaggle/input/commonlitreadabilityprize/sample_submission.csv",
    "/kaggle/input/commonlitreadabilityprize/train.csv",
    "/kaggle/input/commonlitreadabilityprize/test.csv",
]

# Read each CSV into its own DataFrame, unpacking in the same order as `paths`.
df_ss, df_train, df_test = (pd.read_csv(csv_path) for csv_path in paths)

In [None]:
# Discard the columns this notebook does not use; the test frame is not asked
# to drop 'standard_error'.
df_train = df_train.drop(columns=['url_legal', 'license', 'standard_error'])
df_test = df_test.drop(columns=['url_legal', 'license'])

In [None]:
# Display the columns left in each frame (rendered as the cell's output).
df_train.columns, df_test.columns

In [None]:
# Model input (the 'excerpt' column) and regression target from the training frame.
X, y = df_train['excerpt'], df_train['target']

# The 'excerpt' column of the test frame.
X_test = df_test['excerpt']

# Define Tokenizer

In [None]:
# Load the fast RoBERTa tokenizer for the checkpoint named in MODEL[1].
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL[1])
# Persist the tokenizer files under the shared output directory. The path is
# derived from save_dir so it cannot drift from where the models are saved.
tokenizer.save_pretrained(f"{save_dir}/roberta-tokenizer")

# 📊 Dataset Prep Function

In [None]:
@tf.function
def map_function(encodings , target):
    """Convert one (encodings, target) element into Keras (features, label) form.

    Args:
        encodings: Mapping that provides 'input_ids' and 'attention_mask'.
        target: Regression target; it is cast to float32.

    Returns:
        A tuple ``(features, label)`` where ``features`` is a dict holding only
        the 'input_ids' and 'attention_mask' entries and ``label`` is the
        float32 target.
    """
    features = {
        key: encodings[key] for key in ('input_ids', 'attention_mask')
    }
    return features, tf.cast(target, tf.float32)

# 🧠 Modelling

In [None]:
# Detect hardware and pick the matching distribution strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raises ValueError when no TPU can be located.
    tpu = None

if tpu:
    # Connect to the TPU workers and initialise the TPU system before building
    # the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy is the stable replacement for the deprecated
    # tf.distribute.experimental.TPUStrategy alias.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
def create_model(roberta_model):
    """Build and compile a single-output regression model on a RoBERTa encoder.

    Args:
        roberta_model: Object exposing a callable ``roberta`` attribute that
            accepts ``input_ids`` and ``attention_mask`` keyword arguments
            (the notebook passes a ``TFRobertaModel``).

    Returns:
        A compiled Keras ``Model`` with two int32 inputs of length ``max_len``
        ('input_ids' and 'attention_mask') and one linear output unit,
        trained with MSE loss and tracking RootMeanSquaredError.
    """
    ids_in = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    mask_in = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

    # Take the first element of the encoder output, then keep only the vector
    # at sequence position 0 for every example.
    encoder_outputs = roberta_model.roberta(input_ids=ids_in, attention_mask=mask_in)
    first_position = encoder_outputs[0][:, 0, :]

    regularised = Dropout(0.2)(first_position)
    predictions = Dense(1, activation='linear')(regularised)

    model = Model(inputs=[ids_in, mask_in], outputs=predictions)
    model.compile(
        loss="mse",
        optimizer=Adam(learning_rate=1e-5),
        metrics=RootMeanSquaredError(),
    )
    return model



In [None]:
# Load the pretrained RoBERTa weights and build/compile the regression model
# inside the distribution strategy's scope.
with strategy.scope():
  roberta_model = TFRobertaModel.from_pretrained(MODEL[1])
  model = create_model(roberta_model)

# Print the layer-by-layer summary of the assembled model.
model.summary()

# 🔄 Kfold Training

In [None]:
def _build_dataset(texts, targets, shuffle=False):
    """Tokenize ``texts`` and return a batched ``tf.data`` pipeline with ``targets``.

    Relies on the notebook-level ``tokenizer``, ``max_len``, ``map_function``
    and ``AUTOTUNE``.

    Args:
        texts: List of raw excerpt strings.
        targets: List of regression targets aligned with ``texts``.
        shuffle: When True, shuffle with a buffer covering the whole dataset.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, target)`` batches of 16.
    """
    # Pad to exactly max_len (not merely to the longest excerpt) so every
    # tensor matches the fixed-length Input layers built in create_model.
    encodings = tokenizer(
        texts, truncation=True, padding='max_length', max_length=max_len
    )
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), targets))
    if shuffle:
        # A buffer as large as the dataset gives a uniform shuffle; a smaller
        # buffer only shuffles locally.
        dataset = dataset.shuffle(len(targets))
    return (
        dataset
        .map(map_function, num_parallel_calls=AUTOTUNE)
        .batch(16)
        .prefetch(AUTOTUNE)
    )


scores = []
kfold = KFold(n_splits=5, shuffle=True, random_state=2021)
for iterations, (train_idx, val_idx) in enumerate(kfold.split(X, y), start=1):
    print("************** iteration",iterations,"**************")

    # KFold yields positional indices, so select with iloc. The validation
    # fold gets its own names so the real test excerpts in X_test survive.
    X_train = X.iloc[train_idx].tolist()
    y_train = y.iloc[train_idx].tolist()
    X_val = X.iloc[val_idx].tolist()
    y_val = y.iloc[val_idx].tolist()

    # Drop Keras state left over from the previous fold before building anew.
    K.clear_session()
    print("Backend Cleared")

    #tokenization
    print('tokenization')
    train = _build_dataset(X_train, y_train, shuffle=True)
    val = _build_dataset(X_val, y_val)

    # Start every fold from the pretrained checkpoint. Reusing one model
    # across folds would let it validate on excerpts it was trained on in an
    # earlier fold, which makes the cross-validation score meaningless.
    with strategy.scope():
        model = create_model(TFRobertaModel.from_pretrained(MODEL[1]))

    early_stopping = EarlyStopping(
        monitor="val_root_mean_squared_error",
        min_delta=0,
        patience=5,
        verbose=0,
        mode="min",
        restore_best_weights=True,
    )
    # min_lr must be below the initial learning rate (1e-5) and patience must
    # be shorter than early stopping's, otherwise this callback never acts.
    reduce_lr = ReduceLROnPlateau(
        monitor="val_root_mean_squared_error",
        factor=0.2,
        patience=2,
        min_lr=1e-7,
    )

    hist = model.fit(
        train,
        validation_data=val,
        epochs=20,
        callbacks=[early_stopping, reduce_lr],
    )

    #prediction
    print("predicting")
    # Flatten the (n, 1) predictions and score once, as (y_true, y_pred).
    y_pred = model.predict(val).ravel()
    rmse = np.sqrt(mse(y_val, y_pred))
    print(rmse)
    scores.append(rmse)

    #saving model
    print("saving model")
    # Route the SavedModel I/O through the local host, which is required when
    # the variables live on a TPU worker.
    localhost_save_option = tf.saved_model.SaveOptions(experimental_io_device="/job:localhost")
    model.save(f'{save_dir}/roberta_{iterations}', options=localhost_save_option)

print("the final average rmse is ", np.mean(scores))

# References
1. [Notebook 1 ](https://www.kaggle.com/miklgr500/jigsaw-tpu-bert-with-huggingface-and-keras)


Thanks for viewing! Please drop your suggestions in the comments below. 🙂