## XLM Roberta Squad2

In this notebook I train a model starting from an XLM-RoBERTa checkpoint that was already fine-tuned on the SQuAD 2 dataset, using Hugging Face together with Keras. The idea is to freeze the embeddings and train only the final layer parameters to create a baseline.


I will be doing very little EDA, as other fellow Kagglers have done an excellent job of explaining the dataset.

**I will also be using extra datasets provided by other Kagglers.**

**Note**: I am not using the Datasets library by HF because I love to work directly with pandas and tf.data, so there are a couple of changes in the preprocessing script that everyone has been using from the HF notebook.

## Stack

1. Pandas for Data Preprocessing

2. tf.data from Tensorflow for Data Pipelines

3. Hugging Face for pretrained model and tokenizer

4. XLM Roberta Squad2 as the pretrained model

5. Keras for Model training



# Table of Contents

1. [Imports](#section1)
2. [Configuration](#section2)
3. [Define Data Pipelines](#section3)
4. [Setting Tokenizer](#section4)
6. [Postprocessing Pipeline](#section6)
8. [Creating Model](#section8)
9. [Define folds](#section9)
10. [Multi Fold Training](#section10)

<a id="section1"></a>
## Imports

In [None]:

import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import random
import gc

from transformers import AutoTokenizer
from transformers import TFAutoModel

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow.keras.backend as K
import tensorflow_addons as tfa

from sklearn.model_selection import StratifiedKFold

import logging

In [None]:
print(tf.__version__)

<a id="section2"></a>
## Configuration

In [None]:
# Central experiment configuration; read by the cells below and passed to wandb.init.
config = {
    'seed' : 42,  # passed to seed_everything and to StratifiedKFold
    'model': '../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2',  # path given to AutoTokenizer / TFAutoModel
    'group': 'XLMRSquad2',  # wandb run group
    'batch_size': 4,  # multiplied by the number of replicas once the strategy is created
    'max_length': 384,  # tokenizer max_length and model input length
    'doc_stride': 128,  # tokenizer stride between overflowing windows
    'device' : 'TPU',  # 'TPU' selects TPUStrategy; any other value selects MirroredStrategy
    'epochs' : 5,
    'n_best_scores': 20,  # number of top start/end indexes considered in postprocess
    'max_answer_length': 50,  # longest answer span, counted in tokens, kept by postprocess
    
    'use_transfer_learning' : False,  # when True, the freezing cell marks the backbone layer non-trainable
    'use_extra_data': True,  # append the external data frames to every training fold
    'num_folds' : 8,
    
    'lr' : 1e-6,  # Adam learning rate
    'scheduler' : None,  # only the value 'cosine_decay' is checked for in get_keras_model
    'use_wandb': True,
    'wandb_mode' : 'online',  # 'offline' sets WANDB_MODE=offline; otherwise the key is read from Kaggle secrets
    
    'use_dropout': False,  # add a Dropout layer before each Conv1D head
    'dropout_value':0.1
}

In [None]:
def seed_everything(seed = 42):
    """Seed Python's RNG, NumPy, PYTHONHASHSEED and TensorFlow with ``seed``."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)
    
# Creating a logger 📃
def init_logger(log_file:str ='training.log'):
    """Return the notebook logger, emitting to the console and to ``log_file``.

    The function is safe to call more than once (for example when a notebook
    cell is re-run): handlers attached by an earlier call are removed and
    closed first, so every record is emitted exactly once per destination.

    Args:
        log_file: Path of the file the ``FileHandler`` writes to.

    Returns:
        The ``logging.Logger`` named ``'Chaii-XLMR-TPU-MultiFold'`` at DEBUG level.
    """
    # Specify the format
    formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s')

    # getLogger returns the same object for the same name on every call.
    logger = logging.getLogger('Chaii-XLMR-TPU-MultiFold')
    logger.setLevel(logging.DEBUG)

    # Drop handlers left over from a previous call; otherwise each call would
    # add another pair and every message would be duplicated.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # Console output
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(formatter)

    # File output
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    return logger


LOGGER = init_logger()
LOGGER.info("Logger Initialized")

seed_everything(config['seed'])
LOGGER.info("Seed Setting done")



In [None]:
## To make this work with TPU

def get_device(device):
    """Create the ``tf.distribute`` strategy used for model building and training.

    Args:
        device: ``'TPU'`` to try a ``TPUStrategy``; any other value selects
            ``MirroredStrategy``.

    Returns:
        The created strategy. If the TPU cannot be initialised (``ValueError``),
        a ``MirroredStrategy`` is returned instead of leaving the strategy
        undefined. The caller's ``device`` setting is not modified.
    """
    strategy = None

    if device == 'TPU':
        try:  # detect TPUs
            tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()  # TPU detection
            strategy = tf.distribute.TPUStrategy(tpu)
        except ValueError:
            print('Cannot initialize TPU')

    # Either a non-TPU device was requested or TPU initialisation failed.
    if strategy is None:
        strategy = tf.distribute.MirroredStrategy()

    print("Number of accelerators: ", strategy.num_replicas_in_sync)
    return strategy

strategy = get_device(config['device'])
# Scale the configured batch size by the number of replicas in the strategy.
config['batch_size'] *= strategy.num_replicas_in_sync

LOGGER.info(f"Effective batch size is {config['batch_size']}")

In [None]:
# Optional Weights & Biases tracking; the whole cell is skipped when 'use_wandb' is False.
if config['use_wandb']:
    import wandb
    from wandb.keras import WandbCallback
    from kaggle_secrets import UserSecretsClient

    if config['wandb_mode'] == 'offline':
        os.environ["WANDB_MODE"] = "offline"
        # Placeholder key made of 40 'X' characters, used only for the offline login.
        key='X'*40
        wandb.login(key=key)
    else:
        # Any other mode: read the API key stored under the Kaggle secret "wandb_api".
        user_secrets = UserSecretsClient()
        wandb_api = user_secrets.get_secret("wandb_api")
        wandb.login(key=wandb_api)

    # Start the run and record the full config dict with it.
    run = wandb.init(project='chaii', 
                     group =config['group'], 
                     job_type='train-tl',
                     config = config)

    LOGGER.info("Wandb is initialized")

<a id="section3"> </a>
## Define Data Pipelines

In [None]:
df = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/train.csv', encoding='utf-8')
df.head()

In [None]:
df.language.value_counts(normalize=True)*100

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
class ChaiiDataset:
    """Tokenize question/context data frames and wrap them in ``tf.data`` pipelines.

    Long contexts are split into overlapping windows by the tokenizer
    (``return_overflowing_tokens=True``), so one input row can yield several
    features.
    """

    def __init__(self, max_length, stride, tokenizer):
        """Store the tokenizer and its windowing parameters.

        Args:
            max_length: ``max_length`` passed to the tokenizer.
            stride: ``stride`` passed to the tokenizer.
            tokenizer: Callable tokenizer exposing ``padding_side`` and
                ``cls_token_id``.
        """
        self.max_length = max_length
        self.doc_stride = stride
        self.pad_on_right = tokenizer.padding_side == "right"
        self.tokenizer = tokenizer

    def run_tokenizer(self, data):
        """Tokenize the ``question`` and ``context`` columns of ``data``.

        The question goes first when the tokenizer pads on the right, otherwise
        the context goes first; only the context side is truncated.
        """
        if self.pad_on_right:
            first_column, second_column, truncation = "question", "context", "only_second"
        else:
            first_column, second_column, truncation = "context", "question", "only_first"

        return self.tokenizer(
            list(data[first_column].values),
            list(data[second_column].values),
            truncation=truncation,
            max_length=self.max_length,
            stride=self.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

    def prepare_train_features(self, train_data):
        """Tokenize ``train_data`` and add token-level answer labels.

        Args:
            train_data: Frame with ``question``, ``context``, ``answer_text``
                and ``answer_start`` columns.

        Returns:
            The tokenizer output with ``start_positions`` and ``end_positions``
            added. A feature whose window does not contain the whole answer, or
            whose source row has a missing answer, is labelled with the CLS
            token index for both positions.
        """
        tokenized_train_data = self.run_tokenizer(train_data)

        sample_mapping = tokenized_train_data['overflow_to_sample_mapping']
        offset_mapping = tokenized_train_data['offset_mapping']

        # Pull the label columns out once instead of rebuilding a list per feature.
        answer_texts = train_data["answer_text"].values
        answer_starts = train_data["answer_start"].values
        context_sequence_id = 1 if self.pad_on_right else 0

        start_positions = []
        end_positions = []

        for i, offsets in tqdm(enumerate(offset_mapping), total=len(offset_mapping)):
            input_ids = tokenized_train_data["input_ids"][i]
            cls_index = input_ids.index(self.tokenizer.cls_token_id)

            sequence_ids = tokenized_train_data.sequence_ids(i)
            sample_index = sample_mapping[i]
            answer_text = answer_texts[sample_index]
            start_char = answer_starts[sample_index]

            # A row without an answer (missing start or text) is labelled with CLS.
            if pd.isna(start_char) or pd.isna(answer_text):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
                continue

            end_char = start_char + len(answer_text)

            # First and last token of the context inside this window.
            token_start_index = 0
            while sequence_ids[token_start_index] != context_sequence_id:
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != context_sequence_id:
                token_end_index -= 1

            answer_in_window = (
                offsets[token_start_index][0] <= start_char
                and offsets[token_end_index][1] >= end_char
            )
            if not answer_in_window:
                start_positions.append(cls_index)
                end_positions.append(cls_index)
                continue

            # Walk inwards to the tokens that bound the answer characters.
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            start_positions.append(token_start_index - 1)

            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            end_positions.append(token_end_index + 1)

        tokenized_train_data["start_positions"] = start_positions
        tokenized_train_data["end_positions"] = end_positions

        return tokenized_train_data

    def prepare_eval_features(self, eval_data):
        """Tokenize ``eval_data`` for inference.

        Adds ``example_id`` (the ``id`` of the source row of each feature) and
        replaces the offsets of every non-context token with ``None``.
        ``overflow_to_sample_mapping`` is removed from the returned encoding.
        """
        tokenized_eval_data = self.run_tokenizer(eval_data)
        sample_mapping = tokenized_eval_data.pop("overflow_to_sample_mapping")

        example_ids = eval_data["id"].values
        context_index = 1 if self.pad_on_right else 0

        tokenized_eval_data["example_id"] = []

        for i in tqdm(range(len(tokenized_eval_data["input_ids"]))):
            sequence_ids = tokenized_eval_data.sequence_ids(i)

            sample_index = sample_mapping[i]
            tokenized_eval_data["example_id"].append(example_ids[sample_index])

            tokenized_eval_data["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_eval_data["offset_mapping"][i])
            ]

        return tokenized_eval_data

    def prepare_tf_data_pipeline(self, data, batch_size = 16, type='train'):
        """Build a batched ``tf.data.Dataset`` from ``data``.

        Args:
            data: Input frame (labelled for ``'train'``/``'valid'``).
            batch_size: Batch size of the returned dataset.
            type: ``'train'`` (labelled, shuffled, prefetched), ``'valid'``
                (labelled, prefetched) or ``'eval'`` (inputs only). The name
                shadows the builtin but is kept because callers pass it by keyword.

        Returns:
            The dataset for ``'train'``/``'valid'``; for ``'eval'`` a tuple
            ``(dataset, tokenized_data)``.

        Raises:
            ValueError: If ``type`` is not one of the three supported values.
        """
        # Reject unknown modes before doing any tokenization work.
        if type not in ('train', 'valid', 'eval'):
            raise ValueError(
                f"type must be 'train', 'valid' or 'eval', got {type!r}"
            )

        def map_func(X_ids,  X_att, labels_s, labels_e):
            return {'input_ids':X_ids,  'attention_mask': X_att}, {'start_positions': labels_s, 'end_positions': labels_e}

        def map_func_eval(X_ids, X_att):
            return {'input_ids':X_ids,  'attention_mask': X_att}

        if type == 'eval':
            tokenized_data = self.prepare_eval_features(data)
            dataset_eval_raw = tf.data.Dataset.from_tensor_slices(
                (tokenized_data['input_ids'], tokenized_data['attention_mask'])
            )
            dataset_eval = dataset_eval_raw.map(map_func_eval).batch(batch_size)
            return dataset_eval, tokenized_data

        tokenized_data = self.prepare_train_features(data)
        labelled = tf.data.Dataset.from_tensor_slices(
            (
                tokenized_data['input_ids'],
                tokenized_data['attention_mask'],
                tokenized_data['start_positions'],
                tokenized_data['end_positions'],
            )
        ).map(map_func)

        # Only the training stream is shuffled.
        if type == 'train':
            labelled = labelled.shuffle(1024)

        return labelled.batch(batch_size).prefetch(buffer_size=AUTOTUNE)


<a id="section4"></a>
## Setting Tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(config['model'])
print(tokenizer)
LOGGER.info('Tokenizer loaded')

<a id="section6"> </a>
## Postprocessing Pipeline

In [None]:
        
def postprocess(raw_dataframe, features, pred_start, pred_end, n_best=None, max_answer_length=None):
    """Turn per-feature start/end scores into one answer string per example.

    Args:
        raw_dataframe: Frame with ``id`` and ``context`` columns.
        features: Mapping with ``example_id`` (one id per feature) and
            ``offset_mapping`` (per feature, a ``(start_char, end_char)`` pair
            per token, or ``None`` for tokens that must not be selected).
        pred_start: Per-feature score arrays for the answer start token.
        pred_end: Per-feature score arrays for the answer end token.
        n_best: How many top start and end indexes to combine per feature.
            Defaults to ``config['n_best_scores']``.
        max_answer_length: Longest allowed span in tokens. Defaults to
            ``config['max_answer_length']``.

    Returns:
        Dict mapping each example id to the context substring of its
        highest-scoring valid span (score = start score + end score), or
        ``""`` when no valid span exists.
    """
    if n_best is None:
        n_best = config['n_best_scores']
    if max_answer_length is None:
        max_answer_length = config['max_answer_length']

    # Group feature indexes by the example they were cut from.
    feature_indexes = {}
    for index, example_id in enumerate(features['example_id']):
        feature_indexes.setdefault(example_id, []).append(index)

    # One pass over the frame instead of one boolean scan per example;
    # setdefault keeps the first row when an id occurs more than once.
    contexts = {}
    for example_id, context in zip(raw_dataframe['id'], raw_dataframe['context']):
        contexts.setdefault(example_id, context)

    postprocessed_predictions = {}

    for example_id, indexes in feature_indexes.items():
        context = contexts[example_id]
        best_score = None
        best_text = ""

        for feature_index in indexes:
            start_scores = pred_start[feature_index]
            end_scores = pred_end[feature_index]
            offsets = features["offset_mapping"][feature_index]

            start_indexes = np.argsort(start_scores)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_scores)[-1 : -n_best - 1 : -1].tolist()

            for start_index in start_indexes:
                if start_index >= len(offsets) or offsets[start_index] is None:
                    continue
                for end_index in end_indexes:
                    if end_index >= len(offsets) or offsets[end_index] is None:
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    score = start_scores[start_index] + end_scores[end_index]
                    # Strict comparison keeps the earliest candidate on ties.
                    if best_score is None or score > best_score:
                        best_score = score
                        best_text = context[offsets[start_index][0]: offsets[end_index][1]]

        postprocessed_predictions[example_id] = best_text

    return postprocessed_predictions


In [None]:
dataset = ChaiiDataset(config['max_length'], config['doc_stride'], tokenizer)

<a id="section8"> </a>
## Create Model

In [None]:

def get_keras_model():
    """Build and compile the span-extraction model under ``strategy.scope()``.

    The pretrained encoder from ``config['model']`` feeds two identical heads
    (optional Dropout, a 1x1 ``Conv1D``, ``Flatten``, softmax). The outputs are
    named ``start_positions`` and ``end_positions``, and the model is compiled
    with sparse categorical cross-entropy and Adam at ``config['lr']``.

    Returns:
        The compiled ``keras.Model``.

    Raises:
        NotImplementedError: If ``config['scheduler']`` is ``'cosine_decay'``.
            No optimizer was ever defined for that setting, so it is rejected
            up front instead of failing after the model has been loaded.
    """
    if config['scheduler'] == 'cosine_decay':
        raise NotImplementedError(
            "config['scheduler'] == 'cosine_decay' is not implemented; "
            "use None for a constant Adam learning rate"
        )

    def span_head(features, name):
        # One softmax over the sequence positions, produced from a 1x1 convolution.
        if config['use_dropout']:
            features = tf.keras.layers.Dropout(config['dropout_value'])(features)
        logits = tf.keras.layers.Conv1D(1, 1)(features)
        logits = tf.keras.layers.Flatten()(logits)
        return tf.keras.layers.Activation('softmax', name=name)(logits)

    # Model, optimizer and compile all happen inside the strategy scope.
    with strategy.scope():
        backbone = TFAutoModel.from_pretrained(config['model'], from_pt=True)

        input_ids = layers.Input(shape=(config['max_length'],), name='input_ids', dtype=tf.int32)
        attention_mask = layers.Input(shape=(config['max_length'],), name='attention_mask', dtype=tf.int32)
        embedding = backbone(input_ids, attention_mask=attention_mask)[0]

        # The start head is created before the end head, as in the original layout.
        start_probs = span_head(embedding, 'start_positions')
        end_probs = span_head(embedding, 'end_positions')

        model = keras.Model(inputs=[input_ids, attention_mask],
                            outputs=[start_probs, end_probs])

        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        optimizer = tf.keras.optimizers.Adam(learning_rate=config['lr'])
        model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)

    return model

### Freezing Embedding layer

In [None]:
# Optionally mark the pretrained encoder layer as non-trainable.
# NOTE(review): no `model` variable is assigned above this cell in the visible
# notebook (models are created later through get_keras_model, which also
# compiles them), so enabling 'use_transfer_learning' presumably raises a
# NameError here. Confirm, and consider moving the freezing to where the model
# is built, before compile.
if config['use_transfer_learning']:
    for layer in model.layers:
        # Matches layers whose name contains 'tfxlm_roberta_model'.
        if 'tfxlm_roberta_model' in layer.name:
            layer.trainable = False


In [None]:
def scheduler(epoch, lr):
    """Return ``lr`` unchanged for epochs 0 and 1, otherwise ``lr * 0.125``."""
    return lr if epoch < 2 else lr * 0.125

def get_callbacks(fold):
    """Return the Keras callbacks for one fold.

    The list holds a checkpoint that saves the weights with the lowest
    ``val_loss`` to ``best_model_fold_{fold}.h5``, followed by a
    ``LearningRateScheduler`` driven by ``scheduler``.
    """
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        f'best_model_fold_{fold}.h5',
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        save_weights_only=True,
        verbose=1,
    )
    return [checkpoint, tf.keras.callbacks.LearningRateScheduler(scheduler)]


def get_jaccard(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated word sets.

    Two strings without any words score 0.5.
    """
    words1 = set(str1.split())
    words2 = set(str2.split())
    if not words1 and not words2:
        return 0.5
    shared = len(words1 & words2)
    return float(shared) / (len(words1) + len(words2) - shared)
    
    
def get_jaccard_score(y_true, y_pred):
    """Return the mean ``get_jaccard`` score over paired truths and predictions.

    Args:
        y_true: Ground-truth answer strings.
        y_pred: Predicted answer strings, in the same order.

    Returns:
        Sum of the pairwise scores divided by ``len(y_pred)``; ``0.0`` when
        there are no predictions (instead of dividing by zero).
    """
    if len(y_pred) == 0:
        return 0.0

    total = sum(get_jaccard(gt, pred) for gt, pred in zip(y_true, y_pred))
    return total / len(y_pred)

<a id="section9"> </a>
## Define Folds

In [None]:
# Tag every row with the index of the fold in which it is held out,
# stratifying the split on the language column.
df["fold"] = -1
oof_start = np.zeros((len(df), config['max_length']))
skf = StratifiedKFold(
    n_splits=config['num_folds'],
    shuffle=True,
    random_state=config['seed'],
)

fold_splits = skf.split(X=df, y=df['language'])
for indx, (train_index, test_index) in enumerate(fold_splits):
    df.loc[test_index, 'fold'] = indx

LOGGER.info("Data divided into folds")

<a id="section10"> </a>
## Multi fold Training

### Read Test data

In [None]:
%%time
df_test = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/test.csv')
test_dataset, test_tokenized = dataset.prepare_tf_data_pipeline(df_test, type='eval', batch_size=config['batch_size'])
LOGGER.info("Test dataset is loaded.")

In [None]:
df_extra_data = pd.read_csv('../input/chaiiexternalextradata/mlqa_hindi.csv')
df_extra_data_1 = pd.read_csv('../input/chaiiexternalextradata/xquad.csv')
LOGGER.info("Extra dataset is loaded")

In [None]:
pred_list = {}
jaccard_list = []

In [None]:
global_model = get_keras_model()
global_model.save_weights("base_model.h5")
LOGGER.info("Global model loaded")

### Start Training

In [None]:
# Train, validate and predict once per fold, always starting from the saved base weights.
for idx in range(config['num_folds']):
    K.clear_session()
    print(f"############ Fold: {idx} ############")
    LOGGER.debug("Fold "+str(idx))
    df_train = df[df['fold'] != idx]
    df_valid = df[df['fold'] == idx]

    if config['use_extra_data']:
        df_train = pd.concat([df_train, df_extra_data, df_extra_data_1])

    print(f"### Preparing Data for Fold: {idx}")
    LOGGER.debug("Data Prep Started for Fold "+str(idx))

    train_dataset = dataset.prepare_tf_data_pipeline(df_train, batch_size=config['batch_size'])
    valid_dataset_for_epoch = dataset.prepare_tf_data_pipeline(df_valid, type='valid', batch_size=config['batch_size'])

    callbacks = get_callbacks(idx)
    if config['use_wandb'] and config['device']!='TPU':
        callbacks.append(WandbCallback(save_model=False))

    model = global_model
    model.load_weights('base_model.h5')

    # The model object is shared by all folds, and load_weights restores only
    # the layer weights. Without a fresh optimizer, the learning rate already
    # decayed by the LearningRateScheduler callback (and the Adam state) of the
    # previous fold would carry over, so later folds would start far below
    # config['lr'].
    with strategy.scope():
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=tf.keras.optimizers.Adam(learning_rate=config['lr']),
        )

    print(f"### Traning Data Prep done for Fold: {idx}")
    LOGGER.debug("Data Prep done for Fold "+str(idx))

    print(f"### Model training for Fold: {idx}")
    LOGGER.debug("Model training for Fold "+str(idx))

    history = model.fit(train_dataset, epochs=config['epochs'], callbacks=callbacks, verbose = 1, validation_data = valid_dataset_for_epoch)
    # Evaluate with the checkpoint that had the lowest validation loss.
    model.load_weights(f'best_model_fold_{idx}.h5')

    print(f"### Predicting on Valid set for Fold: {idx}")
    valid_dataset, valid_tokenized = dataset.prepare_tf_data_pipeline(df_valid, type='eval',  batch_size=config['batch_size'])
    pred_valid_start, pred_valid_end = model.predict(valid_dataset, verbose = 1, workers=4)
    valid_preds = postprocess(df_valid.reset_index(drop=True), valid_tokenized, pred_valid_start, pred_valid_end)

    y_valid_gt = []
    y_valid_pred = []
    # `example_id` rather than `id`, so the builtin is not shadowed at notebook scope.
    for example_id, answer in valid_preds.items():
        y_valid_gt.append(df_valid[df_valid['id'] == example_id]['answer_text'].iloc[0])
        y_valid_pred.append(answer)

    jaccard_fold = get_jaccard_score(y_valid_gt, y_valid_pred)
    jaccard_list.append(jaccard_fold)

    print(f"### Jaccard for Fold {idx}: ", jaccard_fold)

    pred_test_start, pred_test_end = model.predict(test_dataset, verbose = 1, workers=4)
    pred_list['fold_'+str(idx)] = {'start': pred_test_start, 'end': pred_test_end }

    # Drop per-fold references before the next fold allocates its data.
    del model
    del valid_dataset
    del valid_tokenized
    del train_dataset

    gc.collect()

In [None]:
mean_jaccard = np.mean(jaccard_list)
print(f"Mean jaccard is {mean_jaccard}")


In [None]:
# Collect the test-set scores of every fold and average them element-wise.
start_, end_ = [], []
for i in range(config['num_folds']):
    fold_preds = pred_list['fold_' + str(i)]
    start_.append(fold_preds['start'])
    end_.append(fold_preds['end'])

pred_test_start = np.mean(start_, axis=0)
pred_test_end = np.mean(end_, axis=0)

## Saving Predictions

In [None]:
# Convert the averaged scores to answer strings and write the submission file.
test_preds = postprocess(df_test, test_tokenized, pred_test_start, pred_test_end)

# Build the two columns directly from the id -> answer mapping. This replaces
# passing the answers as the index and the ids as data, and it still yields a
# valid header-only frame when there are no predictions.
pred_test_df = pd.DataFrame({
    'id': list(test_preds.keys()),
    'PredictionString': list(test_preds.values()),
})
pred_test_df.to_csv('submission.csv', index=False)

**If you learnt something from this kernel please don't forget to upvote**

My other kernels:

https://www.kaggle.com/harveenchadha/chaii-muril-hf-keras-wb