# Refactor TensorFlow 2.0 - Bert Yes/No Answers
## This post is a refactored version of https://www.kaggle.com/mmmarchetti/tensorflow-2-0-bert-yes-no-answers
## with a small change to get_short_pred().
### Importing Libraries

In [1]:
import os
import sys

import json
import tensorflow as tf

# Pre-/post-processing settings gathered in a single dict.
# In the visible part of this notebook only 'max_seq_length' (-> SEQ_LENGTH),
# 'n_best_size' and 'max_answer_length' (span search) are read directly.
# NOTE(review): the remaining keys are presumably consumed by bert_utils --
# confirm against that module.
FLAGS = {
   'skip_nested_contexts': True,
   'max_position': 50,
   'max_contexts': 48,
   'max_query_length': 64,
   'max_seq_length': 512,
   'doc_stride': 128,
   'include_unknowns': -1.0,
   'n_best_size': 20,
   'max_answer_length': 30,
}
# Short alias used by the rest of the notebook.
F = FLAGS

# The presence of a '/kaggle' directory decides which environment we are in;
# anything else is treated as the GCP notebook VM.
RUN_ON = 'kaggle' if os.path.exists('/kaggle') else 'gcp'
# #### Configurations. change by user.
if RUN_ON == 'gcp':
    # On GCP: switch to the working directory, make the baseline helper
    # modules importable, and read model inputs from a GCS bucket.
    os.chdir('/home/jupyter/kaggle/working')
    sys.path.extend(['../input/bert-joint-baseline/'])
    INPUT_PATH = 'gs://tyu-kaggle/input/'
    MODEL_LOAD_DIR = './bert_trained/'
else:
    INPUT_PATH = '../input/'
    MODEL_LOAD_DIR = '../input/berttrained3/'

# #### Configurations. No need to change.
BERT_CONFIG_PATH = os.path.join(INPUT_PATH, 'bert-joint-baseline/bert_config.json')
VOCAB_PATH = os.path.join(INPUT_PATH, 'bert-joint-baseline/vocab-nq.txt')

# The competition data is always read from the local relative path,
# independent of INPUT_PATH.
NQ_TEST_JSONL_PATH = '../input/tensorflow2-question-answering/simplified-nq-test.jsonl'
NQ_TRAIN_JSONL_PATH = '../input/tensorflow2-question-answering/simplified-nq-train.jsonl'
SAMPLE_SUBMISSION_PATH = '../input/tensorflow2-question-answering/sample_submission.csv'
# we pad example_id, so we must redo preprocess...
NQ_TEST_TFRECORD_PATH = 'gs://tyu-kaggle/working/nq-test.tfrecords' if RUN_ON == 'gcp' else 'nq-test.tfrecords'

# NOTE(review): MODEL_LOAD_PATH is defined here, but the restore step further
# down uses CKPT_PATH -- confirm which set of weights is intended.
MODEL_LOAD_PATH = MODEL_LOAD_DIR + 'weights.h5'
CKPT_PATH = os.path.join(INPUT_PATH, 'bert-joint-baseline/model_cpkt-1')

# Size heuristic: a test file under 20,000,000 bytes is labelled 'public'.
TEST_DS_TYPE = 'public' if os.path.getsize(NQ_TEST_JSONL_PATH) < 20000000 else 'private'

SEQ_LENGTH = F['max_seq_length']  # bert_config['max_position_embeddings']

# Label order used to name the five answer-type logits.
ANSWER_TYPE_ORDER = ['UNKNOWN', 'YES', 'NO', 'SHORT', 'LONG']

# GFile is used so the config can be read from either a local path or gs://.
with tf.io.gfile.GFile(BERT_CONFIG_PATH, 'r') as f:
    bert_config = json.load(f)
print(json.dumps(bert_config, indent=4))

!ls -lh $MODEL_LOAD_DIR
!cat $MODEL_LOAD_DIR/fingerprint.json

{
    "attention_probs_dropout_prob": 0.1,
    "hidden_act": "gelu",
    "type_vocab_size": 2,
    "max_position_embeddings": 512,
    "initializer_range": 0.02,
    "hidden_size": 1024,
    "hidden_dropout_prob": 0.1,
    "intermediate_size": 4096,
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "vocab_size": 30522
}
total 1.3G
-rw-r--r-- 1 jupyter jupyter  121 Jan 21 02:55 dataset-metadata.json
-rw-r--r-- 1 jupyter jupyter   58 Jan 21 02:52 fingerprint.json
-rw-r--r-- 1 jupyter jupyter 1.3G Jan 20 18:43 weights.h5
{
    "message": "train 3 epoch",
    "modify_time": ""
}


In [2]:
import gzip

import numpy as np
import pandas as pd
import tensorflow.keras.backend as K

import bert_utils
import modeling
import tokenization

from tqdm.auto import tqdm
import importlib

importlib.reload(bert_utils)
K.clear_session()

tf.__version__


'2.1.0'

### Classes & Functions

In [3]:
class TDense(tf.keras.layers.Layer):
    """Dense projection whose kernel is stored as ``[output_size, input_dim]``.

    The layer computes ``x @ kernel^T + bias`` over the last axis of ``x``.

    Args:
        output_size: number of output units.
        kernel_initializer: initializer handed to ``add_weight`` for the kernel.
        bias_initializer: initializer handed to ``add_weight`` for the bias.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, output_size, kernel_initializer=None,
                 bias_initializer="zeros", **kwargs):
        super().__init__(**kwargs)
        self.output_size = output_size
        self.kernel_initializer = kernel_initializer
        self.bias_initializer = bias_initializer

    def build(self, input_shape):
        """Create the ``kernel`` and ``bias`` weights.

        Raises:
            TypeError: if the layer dtype is neither floating nor complex.
            ValueError: if the last input dimension is unknown.
        """
        layer_dtype = tf.as_dtype(self.dtype or tf.keras.backend.floatx())
        if not layer_dtype.is_floating and not layer_dtype.is_complex:
            raise TypeError(
                "Unable to build `TDense` layer with non-floating point "
                "(and non-complex) dtype %s" % (layer_dtype,))

        input_shape = tf.TensorShape(input_shape)
        in_features = tf.compat.dimension_value(input_shape[-1])
        if in_features is None:
            raise ValueError(
                "The last dimension of the inputs to `TDense` should be "
                "defined. Found `None`.")

        self.input_spec = tf.keras.layers.InputSpec(min_ndim=2, axes={-1: in_features})
        # Kernel first, then bias: same names and creation order as before.
        self.kernel = self.add_weight(
            "kernel",
            shape=[self.output_size, in_features],
            initializer=self.kernel_initializer,
            dtype=self.dtype,
            trainable=True)
        self.bias = self.add_weight(
            "bias",
            shape=[self.output_size],
            initializer=self.bias_initializer,
            dtype=self.dtype,
            trainable=True)
        super().build(input_shape)

    def call(self, x):
        """Project ``x`` with the transposed kernel and add the bias."""
        projected = tf.matmul(x, self.kernel, transpose_b=True)
        return projected + self.bias


class Squeeze(tf.keras.layers.Layer):
    """Keras layer wrapper around ``tf.squeeze``."""

    def call(self, x, axis=None, name=None):
        """Return ``x`` with the size-1 dimension(s) selected by ``axis`` removed."""
        squeezed = tf.squeeze(x, axis=axis, name=name)
        return squeezed


def mk_model(config):
    """Build the BERT question-answering Keras model.

    Args:
        config: BERT configuration mapping; ``config['max_position_embeddings']``
            sets the sequence length of the three token-level inputs.

    Returns:
        A ``tf.keras.Model`` named ``'bert-baseline'`` with inputs
        ``[unique_id, input_ids, input_mask, segment_ids]`` and outputs
        ``[unique_id, start_logits, end_logits, ans_type]`` (``unique_id`` is
        passed straight through).
    """
    seq_len = config['max_position_embeddings']

    def make_input(name, width, dtype):
        return tf.keras.Input(shape=(width,), dtype=dtype, name=name)

    unique_id = make_input('unique_id', 1, tf.int64)
    input_ids = make_input('input_ids', seq_len, tf.int32)
    input_mask = make_input('input_mask', seq_len, tf.int32)
    segment_ids = make_input('segment_ids', seq_len, tf.int32)

    encoder = modeling.BertModel(config=config, name='bert')
    pooled_output, sequence_output = encoder(input_word_ids=input_ids,
                                             input_mask=input_mask,
                                             input_type_ids=segment_ids)

    # One 2-unit head over every token position, split into start / end logits.
    span_logits = TDense(2, name='logits')(sequence_output)
    raw_start, raw_end = tf.split(span_logits, axis=-1, num_or_size_splits=2, name='split')
    start_logits = Squeeze(name='start_logits')(raw_start, axis=-1)
    end_logits = Squeeze(name='end_logits')(raw_end, axis=-1)

    # 5-way answer-type head on the pooled output.
    ans_type = TDense(5, name='ans_type')(pooled_output)

    model_inputs = [unique_id, input_ids, input_mask, segment_ids]
    model_outputs = [unique_id, start_logits, end_logits, ans_type]
    return tf.keras.Model(model_inputs, model_outputs, name='bert-baseline')

In [4]:
# Detect hardware, return appropriate distribution strategy
try:
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', TPU.cluster_spec().as_dict()['worker'])
except ValueError:
    # The resolver raising ValueError is treated as "no TPU available".
    TPU = None

if TPU:
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    strategy = tf.distribute.experimental.TPUStrategy(TPU)
    BATCH_SIZE = 64
    # drop_remainder must be True if running on TPU, maybe a bug
    # so we pad some examples.
    # The padded copy is the original test file with its last 3 lines appended
    # again; the duplicated rows are removed after prediction.
    # NOTE(review): this writes next to the input file -- assumes that
    # directory is writable in the TPU environment.
    nq_test_jsonl_path2 = NQ_TEST_JSONL_PATH + '.pad'
    !cp $NQ_TEST_JSONL_PATH $nq_test_jsonl_path2
    !tail -n 3 $NQ_TEST_JSONL_PATH >> $nq_test_jsonl_path2
    # From here on NQ_TEST_JSONL_PATH refers to the padded file.
    NQ_TEST_JSONL_PATH = nq_test_jsonl_path2
else:
    # No TPU: fall back to the current default strategy with a smaller batch.
    strategy = tf.distribute.get_strategy()
    BATCH_SIZE = 16

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  ['10.254.212.146:8470']
INFO:tensorflow:Initializing the TPU system: tyu


INFO:tensorflow:Initializing the TPU system: tyu


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


REPLICAS:  8


In [5]:
# Build the model and restore its weights inside the distribution strategy
# scope selected earlier.
with strategy.scope():
    model = mk_model(bert_config)
    model.summary()
    # Weights come from the TF checkpoint at CKPT_PATH (not MODEL_LOAD_PATH).
    cpkt = tf.train.Checkpoint(model=model)
    # assert_consumed() is expected to fail loudly if the checkpoint and the
    # model do not match completely -- see the tf.train.Checkpoint docs.
    cpkt.restore(CKPT_PATH).assert_consumed()

Model: "bert-baseline"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 512)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
bert (BertModel)                ((None, 1024), (None 335141888   input_ids[0][0]                  
                                                                 input_mask[0][0]     

In [6]:
def url_exists(url):
    """Return whether ``url`` exists, for a local path or a ``gs://`` object.

    Anything whose scheme is not ``gs`` is checked on the local filesystem
    using the path component of the parsed URL.
    """
    from urllib.parse import urlparse

    parsed = urlparse(url)
    if parsed.scheme != 'gs':
        return os.path.exists(parsed.path)

    # Imported lazily so the GCS client is only needed for gs:// URLs.
    from google.cloud import storage

    bucket = storage.Client().get_bucket(parsed.netloc)
    # Blob names carry no leading '/', so strip it from the URL path.
    return bucket.blob(parsed.path[1:]).exists()


# Default parsing schema for one serialized feature record: a scalar
# ``unique_id`` plus three SEQ_LENGTH-long int64 vectors. ``example_id`` is
# deliberately not part of this schema.
FEATURE_DESCRIPTION = {"unique_id": tf.io.FixedLenFeature([], tf.int64)}
FEATURE_DESCRIPTION.update({
    feature_name: tf.io.FixedLenFeature([SEQ_LENGTH], tf.int64)
    for feature_name in ("input_ids", "input_mask", "segment_ids")
})


def _decode_record(record, feature_description=None):
    """Parse one serialized ``tf.Example`` into a dict of tensors.

    Args:
        record: serialized example.
        feature_description: parsing schema; falls back to
            ``FEATURE_DESCRIPTION`` when omitted (or falsy).

    Returns:
        The parsed dict. Every feature except ``example_id`` / ``unique_id``
        is cast to ``tf.int32``.
    """
    schema = feature_description or FEATURE_DESCRIPTION
    example = tf.io.parse_single_example(serialized=record, features=schema)
    # tf.Example stores integers as int64 while the TPU wants int32; the two
    # id fields are left as int64.
    id_keys = ('example_id', 'unique_id')
    for key in list(example.keys()):
        if key in id_keys:
            continue
        example[key] = tf.cast(example[key], dtype=tf.int32)
    return example

In [7]:
# Convert the test JSONL into a TFRecord file of model features, but only if
# that file does not exist yet (local path or gs:// URL).
if not url_exists(NQ_TEST_TFRECORD_PATH):
    # tf2baseline.F.max_seq_length = 512
    eval_writer = bert_utils.FeatureWriter(filename=NQ_TEST_TFRECORD_PATH,
                                           is_training=False)
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_PATH,
                                           do_lower_case=True)
    # NOTE(review): `features` is not used in this cell -- possibly a leftover.
    features = []
    # Each converted feature is handed to the writer via output_fn.
    convert = bert_utils.ConvertExamples2Features(tokenizer=tokenizer,
                                                  is_training=False,
                                                  output_fn=eval_writer.process_feature,
                                                  collect_stat=False)
    n_examples = 0
    # tqdm_notebook = tqdm.tqdm_notebook  # if not on_kaggle_server else None
    for examples in bert_utils.nq_examples_iter(input_file=NQ_TEST_JSONL_PATH,
                                                is_training=False,
                                                tqdm=tqdm):
        for example in examples:
            # convert() returns a count that is accumulated into n_examples.
            n_examples += convert(example)
    eval_writer.close()
    print('number of test examples: %d, written to file: %d' % (n_examples, eval_writer.num_features))

Reading: ../input/tensorflow2-question-answering/simplified-nq-test.jsonl.pad


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


number of test examples: 9090, written to file: 9090


In [8]:
# Input pipeline: serialized records -> parsed feature dicts -> batches.
raw_ds = tf.data.TFRecordDataset(NQ_TEST_TFRECORD_PATH)
decoded_ds = raw_ds.map(_decode_record)
# On TPU the final incomplete batch is dropped (drop_remainder=True), so fewer
# rows may come back than were written to the TFRecord file.
batched_ds = decoded_ds.batch(batch_size=BATCH_SIZE, drop_remainder=(TPU is not None))

# `result` holds the model outputs in the order
# [unique_id, start_logits, end_logits, ans_type].
result = model.predict(batched_ds, verbose=1)

    142/Unknown - 38s 267ms/step

In [9]:
# add example_id to beginning.
# example_id is read straight from the raw records because it is not part of
# the features fed to the model.
example_id_ds = raw_ds.map(lambda x: tf.io.parse_single_example(
    serialized=x,
    features={"example_id": tf.io.FixedLenFeature([], tf.int64)}
)['example_id'])
# Truncate to the number of predicted rows (trailing records may have been
# dropped by batching), then prepend the ids to the prediction tuple.
result = (np.array(list(example_id_ds)[:len(result[0])]), *result)

## 1- Understanding the code
#### For a better understanding, I will briefly explain here.
#### The `answer_type` value (set in the last lines of the prediction loop below) stores the predicted answer type, which, according to the [github project repository](https://github.com/google-research/language/blob/master/language/question_answering/bert_joint/run_nq.py), can be one of:
UNKNOWN = 0
YES = 1
NO = 2
SHORT = 3
LONG = 4


In [10]:
def read_candidates_from_one_split(input_path):
    """Read long-answer candidates from a single (optionally gzipped) jsonl file.

    Args:
        input_path: path to a ``.jsonl`` or ``.jsonl.gz`` file, opened via
            ``tf.io.gfile.GFile``. Each non-blank line must be a JSON object
            with ``example_id`` and ``long_answer_candidates`` keys.

    Returns:
        dict mapping each ``example_id`` to its ``long_answer_candidates`` list.
    """
    candidates_dict = {}

    def collect(lines):
        # Shared by both branches; whitespace-only lines (for example a
        # trailing newline at end of file) are skipped instead of crashing
        # json.loads.
        for line in lines:
            if not line.strip():
                continue
            e = json.loads(line)
            candidates_dict[e["example_id"]] = e["long_answer_candidates"]

    print("Reading examples from: %s" % input_path)
    if input_path.endswith(".gz"):
        # GzipFile does not close a fileobj it was handed, so the underlying
        # GFile gets its own context manager to make sure it is released.
        with tf.io.gfile.GFile(input_path, "rb") as raw_file, \
                gzip.GzipFile(fileobj=raw_file) as input_file:
            collect(input_file)
    else:
        with tf.io.gfile.GFile(input_path, "r") as input_file:
            collect(input_file)
    return candidates_dict


def read_candidates(input_pattern):
    """Read candidates from every file matching ``input_pattern``.

    Files are processed one after another in this process; entries from later
    files overwrite earlier ones that share an ``example_id``.

    Returns:
        dict merging the per-file results of ``read_candidates_from_one_split``.
    """
    merged = {}
    for matched_path in tf.io.gfile.glob(input_pattern):
        per_file = read_candidates_from_one_split(matched_path)
        merged.update(per_file)
    return merged


print("getting candidates...")
# The original (un-padded) test file is named explicitly here rather than via
# NQ_TEST_JSONL_PATH, which may have been re-pointed to the padded copy.
candidates_dict = read_candidates('../input/tensorflow2-question-answering/simplified-nq-test.jsonl')

getting candidates...
Reading examples from: ../input/tensorflow2-question-answering/simplified-nq-test.jsonl


In [11]:
print("getting result_df...")
# One row per model instance, indexed by (example_id, unique_id); each logits
# column holds a Python list per row.
result_df = pd.DataFrame({
    "example_id": result[0].squeeze().tolist(),
    "unique_id": result[1].squeeze().tolist(),
    "start_logits": result[2].tolist(),
    "end_logits": result[3].tolist(),
    "answer_type_logits": result[4].tolist()
}).set_index(['example_id', 'unique_id'])
# we pad some instances when using TPU, deduplicate it here.
if TPU is not None:
    print('result_df len before dedup: ' + str(len(result_df)))
    # Keep the first occurrence of every (example_id, unique_id) pair.
    result_df = result_df[~result_df.index.duplicated()]
    print('result_df len after  dedup: ' + str(len(result_df)))

getting result_df...
result_df len before dedup: 9088
result_df len after  dedup: 9070


In [12]:
# Re-read the raw records, this time extracting only the ids and the
# token_map needed to translate model positions back to document tokens.
token_map_ds = raw_ds.map(lambda x: tf.io.parse_single_example(
    serialized=x,
    features={
        "example_id": tf.io.FixedLenFeature([], tf.int64),
        "unique_id": tf.io.FixedLenFeature([], tf.int64),
        # token_map: token to origin map.
        "token_map": tf.io.FixedLenFeature([SEQ_LENGTH], tf.int64)
    }
))
print("getting token_map_df...")
# Materialise the dataset, convert every tensor cell to numpy, and index the
# frame by (example_id, unique_id) so it can be joined with result_df.
token_map_df = pd.DataFrame.from_records(list(token_map_ds)).applymap(
    lambda x: x.numpy()
).set_index(['example_id', 'unique_id'])
# we pad some instances when using TPU, deduplicate it here.
if TPU is not None:
    print('token_map_df len before: ' + str(len(token_map_df)))
    token_map_df = token_map_df[~token_map_df.index.duplicated()]
    # Fixed label: this line reports the length *after* de-duplication.
    print('token_map_df len after: ' + str(len(token_map_df)))

getting token_map_df...
token_map_df len before: 9090
token_map_df len before: 9070


In [13]:
# Attach each instance's token_map to its logits; both frames are indexed by
# (example_id, unique_id).
joined = result_df.join(token_map_df, on=['example_id', 'unique_id'])

In [14]:
def best_score_start_end_of_instance(res: pd.Series, n_best_size=None, max_answer_length=None):
    """Find the best-scoring short-answer span inside one model instance.

    :param res: mapping with at least the keys 'start_logits', 'end_logits'
        and 'token_map' (all of the same length). Positions whose token_map
        entry is -1 are treated as invalid and can never start or end a span.
    :param n_best_size: only the top-k start and top-k end positions (ranked
        among the valid tokens) are considered. Defaults to F['n_best_size'].
    :param max_answer_length: a span is allowed only when
        ``0 < end - start < max_answer_length``.
        Defaults to F['max_answer_length'].
    :return: (best_score_of_instance, start_short_idx, end_short_idx), where
        the indexes are positions in the logits arrays; three NaNs when no
        start/end combination is allowed.
    """
    if n_best_size is None:
        n_best_size = F['n_best_size']
    if max_answer_length is None:
        max_answer_length = F['max_answer_length']

    invalid_token_msk = np.array(res['token_map']) == -1
    s_logits, e_logits = pd.Series(res['start_logits']), pd.Series(res['end_logits'])
    # Plain ndarrays for broadcasting: indexing a Series with
    # [:, np.newaxis] is not supported by current pandas.
    s_values, e_values = s_logits.to_numpy(), e_logits.to_numpy()

    # Rank the logits of valid tokens only; invalid ones become NaN, get a NaN
    # rank, and are excluded through invalid_token_msk below.
    s_not_top_k = (s_logits.mask(invalid_token_msk)
                   .rank(method='min', ascending=False) > n_best_size).to_numpy()
    e_not_top_k = (e_logits.mask(invalid_token_msk)
                   .rank(method='min', ascending=False) > n_best_size).to_numpy()
    s_allowed = ~(s_not_top_k | invalid_token_msk)
    e_allowed = ~(e_not_top_k | invalid_token_msk)

    # span_len[i, j] = end j - start i. The end index must be strictly greater
    # than the start index and the span shorter than max_answer_length.
    # NOTE(review): the strict inequality excludes spans that start and end on
    # the same position; kept as in the original -- confirm this is intended.
    span_len = (np.arange(e_values.size)[np.newaxis, :]
                - np.arange(s_values.size)[:, np.newaxis])
    allowed = (s_allowed[:, np.newaxis] & e_allowed[np.newaxis, :]
               & (span_len > 0) & (span_len < max_answer_length))

    if not allowed.any():  # every start-end combination has been ruled out.
        return np.nan, np.nan, np.nan

    # Span score relative to the score of position 0 (the [CLS] slot).
    cls_token_score = s_values[0] + e_values[0]
    span_score = s_values[:, np.newaxis] + e_values[np.newaxis, :] - cls_token_score
    # Disallowed combinations are pushed to -inf so they can never win.
    score = np.where(allowed, span_score, -np.inf)
    s_short_idx, e_short_idx = divmod(score.argmax(), e_values.size)

    # Normalise by the mean start/end logit over the valid tokens so scores
    # are comparable across instances.
    valid_tokens = ~invalid_token_msk
    ins_score = score.max() - (s_values[valid_tokens].mean() + e_values[valid_tokens].mean())
    return ins_score, s_short_idx, e_short_idx


# One prediction row per example, indexed by example_id.
pred_df = pd.DataFrame(columns=['example_id', 'score', 'answer_type',
                                'short_span_start', 'short_span_end',
                                'long_span_start', 'long_span_end', ]
                       ).set_index('example_id')
# fill pred_df
for example_id, group_df in tqdm(joined.groupby('example_id')):
    # group_df: one row per instance (unique_id); all rows share one example_id.
    # columns = ['answer_type_logits', 'end_logits', 'start_logits', 'token_map']
    group_df = group_df.copy().reset_index(level='example_id', drop=True)
    # get best score/start/end and answer type for every instance within same example.
    for u_id, res in group_df.iterrows():
        answer_type_logits = pd.Series(res['answer_type_logits'], index=ANSWER_TYPE_ORDER)
        group_df.loc[u_id, 'ins_answer_type'] = answer_type_logits.idxmax()
        ins_score, ins_start, ins_end = best_score_start_end_of_instance(res)
        group_df.loc[u_id, 'ins_score'] = ins_score
        if pd.isna(ins_score):
            # No valid span in this instance: token_map cannot be indexed with
            # NaN, so record the missing span explicitly.
            group_df.loc[u_id, 'ins_short_span_start'] = np.nan
            group_df.loc[u_id, 'ins_short_span_end'] = np.nan
        else:
            group_df.loc[u_id, 'ins_short_span_start'] = res['token_map'][ins_start]
            # the end of the span is exclusive, hence + 1.
            group_df.loc[u_id, 'ins_short_span_end'] = res['token_map'][ins_end] + 1
    # If no instance produced a score, the example gets no row in pred_df.
    if group_df['ins_score'].isna().all():
        continue
    # Otherwise pick the instance whose best score is highest within the example.
    best_u_id = group_df['ins_score'].idxmax()
    short_span_start, short_span_end = group_df.loc[best_u_id, ['ins_short_span_start', 'ins_short_span_end']]
    pred_df.loc[example_id, 'score'] = group_df.loc[best_u_id, 'ins_score']
    pred_df.loc[example_id, 'short_span_start'] = short_span_start
    pred_df.loc[example_id, 'short_span_end'] = short_span_end
    # search for long answer span: the first top-level candidate that fully
    # contains the short span.
    for cand in candidates_dict[str(example_id)]:
        if cand['top_level'] and cand['start_token'] <= short_span_start and short_span_end <= cand['end_token']:
            pred_df.loc[example_id, 'long_span_start'] = cand['start_token']
            pred_df.loc[example_id, 'long_span_end'] = cand['end_token']
            break
    pred_df.loc[example_id, 'answer_type'] = group_df.loc[best_u_id, 'ins_answer_type']
        # break


HBox(children=(FloatProgress(value=0.0, max=346.0), HTML(value='')))




## 2- Main Change
#### Here is the small but main change: we added an `if` check on the predicted answer type, and use it to filter which answers are written to the submission file.

### Filtering the Answers

In [69]:
def get_short_pred(pred_row: pd.Series, score_threshold: float = 22.5):
    """Build the short-answer prediction string for one example.

    :param pred_row: row with 'score', 'answer_type', 'short_span_start' and
        'short_span_end'.
    :param score_threshold: minimum score for emitting a span when the answer
        type is 'SHORT' or 'LONG'.
    :return: 'YES' / 'NO' for yes-no answers, 'start:end' for an accepted
        span, otherwise the empty string.
    """
    # A missing score means there is no short/long answer at all. pd.isna is
    # used because NaN values read back from a DataFrame are not guaranteed to
    # be the np.nan singleton, so an identity test can miss them.
    if pd.isna(pred_row['score']):
        return ''
    answer_type = pred_row['answer_type']
    if answer_type in ('YES', 'NO'):
        return answer_type
    if answer_type in ('SHORT', 'LONG') and pred_row['score'] >= score_threshold:
        return '%d:%d' % (pred_row['short_span_start'], pred_row['short_span_end'])
    # 'UNKNOWN', a below-threshold span, or any unexpected answer type.
    return ''


def get_long_pred(pred_row: pd.Series, score_threshold: float = 17.5):
    """Build the long-answer prediction string for one example.

    :param pred_row: row with 'score', 'answer_type', 'long_span_start' and
        'long_span_end'.
    :param score_threshold: minimum score for emitting the long span.
    :return: 'start:end' when the answer type is YES/NO/SHORT/LONG, the score
        reaches the threshold and a long span was found; otherwise ''.
    """
    # pd.isna instead of `is np.NAN`: NaN values coming out of a DataFrame are
    # not guaranteed to be the np.nan singleton.
    if pd.isna(pred_row['score']):
        return ''
    if pred_row['answer_type'] not in ('YES', 'NO', 'SHORT', 'LONG'):
        # 'UNKNOWN' or any unexpected answer type.
        return ''
    # long_span_start stays missing when no top-level candidate contained the
    # short span; formatting NaN with %d would raise.
    if pred_row['score'] < score_threshold or pd.isna(pred_row['long_span_start']):
        return ''
    return '%d:%d' % (pred_row['long_span_start'], pred_row['long_span_end'])


### Creating a DataFrame

In [70]:
# Work on a copy so pred_df keeps the raw per-example predictions.
prediction_df = pred_df.copy()
# Turn every row into its long / short submission string.
prediction_df['long_pred'] = pred_df.apply(get_long_pred, axis='columns')
prediction_df['short_pred'] = pred_df.apply(get_short_pred, axis='columns')
# String ids are needed below to build the '<id>_long' / '<id>_short' keys.
prediction_df.index = prediction_df.index.map(lambda x: str(x))

### Generating the Submission File

In [71]:
# Start from the sample submission and overwrite the rows we have
# predictions for.
sample_submission = pd.read_csv(SAMPLE_SUBMISSION_PATH).set_index('example_id')

for eid, row in prediction_df.iterrows():
    # Each example owns two submission rows: '<example_id>_long' and
    # '<example_id>_short'.
    sample_submission.loc[eid + '_long', 'PredictionString'] = row['long_pred']
    sample_submission.loc[eid + '_short', 'PredictionString'] = row['short_pred']

In [72]:
sample_submission.reset_index().to_csv('submission.csv', index=False)

In [73]:
prediction_df[prediction_df['score']>800]

Unnamed: 0_level_0,score,answer_type,short_span_start,short_span_end,long_span_start,long_span_end,long_pred,short_pred
example_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-9106782542406690435,985.147,SHORT,5013,5015,4976,5144,4976:5144,5013:5015
-9076824655968712429,928.005,UNKNOWN,1592,1596,1571,1793,,
-9030298026304897796,955.776,SHORT,629,636,620,727,620:727,629:636
-8752372642178983917,964.582,UNKNOWN,1382,1384,1380,1790,,
-8232502904250181946,964.491,SHORT,1276,1282,1255,1373,1255:1373,1276:1282
...,...,...,...,...,...,...,...,...
8838160678370847582,973.083,SHORT,3626,3630,2611,3640,2611:3640,3626:3630
8847717112932520020,964.771,SHORT,1595,1598,1503,1613,1503:1613,1595:1598
8898704006223012351,810.931,SHORT,292,293,290,346,290:346,292:293
8901598759301757038,966.46,UNKNOWN,2021,2026,2020,2200,,


In [74]:
prediction_df.tail(60)

Unnamed: 0_level_0,score,answer_type,short_span_start,short_span_end,long_span_start,long_span_end,long_pred,short_pred
example_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5926321958271744357,692.269,UNKNOWN,909,911,904,998,,
5962215690907729115,478.793,YES,728,741,688,742,688:742,YES
6026862874448345027,959.704,SHORT,1622,1624,1576,1674,1576:1674,1622:1624
6034422085903453063,945.256,UNKNOWN,760,762,759,795,,
6170709165341391390,48.2712,SHORT,24,27,22,49,,
6200146665861807611,486.414,SHORT,469,498,468,503,468:503,
6217999213225495325,854.772,SHORT,3473,3476,3458,3631,3458:3631,3473:3476
6244104306165846098,632.418,UNKNOWN,1703,1711,1673,1768,,
6307387084414718104,433.498,UNKNOWN,162,165,161,487,,
6386368216346051300,433.56,LONG,9,26,8,88,8:88,


In [75]:
blank_answers = sample_submission[sample_submission['PredictionString'] == '']
blank_answers

Unnamed: 0_level_0,PredictionString
example_id,Unnamed: 1_level_1
-1011141123527297803_short,
-1028916936938579349_long,
-1028916936938579349_short,
-1055197305756217938_short,
-1114334749483663139_long,
...,...
9212083134098244596_short,
934950704129184964_long,
934950704129184964_short,
958723574737344087_long,


#### Yes answers

In [76]:
yes_answers = sample_submission[sample_submission['PredictionString'] == 'YES']
yes_answers

Unnamed: 0_level_0,PredictionString
example_id,Unnamed: 1_level_1
-6363382149580295498_short,YES
5509115004294491239_short,YES
5962215690907729115_short,YES


#### No answers

In [77]:
no_answers = sample_submission[sample_submission['PredictionString'] == 'NO']
no_answers

Unnamed: 0_level_0,PredictionString
example_id,Unnamed: 1_level_1
-3461207570097431362_short,NO


In [78]:
short_answers = sample_submission.iloc[np.array(range(0, len(sample_submission), 2)) + 1]
short_answers[short_answers['PredictionString'] != '']

Unnamed: 0_level_0,PredictionString
example_id,Unnamed: 1_level_1
-1074129516932871805_short,3153:3155
-1521618734431802363_short,1130:1131
-1636829004676923357_short,2159:2165
-1883092661126100958_short,172:174
-196591140811642071_short,795:798
...,...
8838160678370847582_short,3626:3630
8847717112932520020_short,1595:1598
8898704006223012351_short,292:293
8994564890124107395_short,378:392


In [79]:
long_answers = sample_submission.iloc[np.array(range(0, len(sample_submission), 2)) + 0]
long_answers[long_answers['PredictionString'] != '']

Unnamed: 0_level_0,PredictionString
example_id,Unnamed: 1_level_1
-1011141123527297803_long,223:277
-1055197305756217938_long,221:335
-1074129516932871805_long,3119:3210
-1237358188352001279_long,583:683
-1521618734431802363_long,1118:1250
...,...
8994564890124107395_long,350:466
900126738178451873_long,640:706
9204032098950736962_long,2086:2221
9212083134098244596_long,168:241


### I am only sharing modifications that I believe may help. I left out tuning and any significant code changes I made.

### We'll be grateful if someone gets a better understanding and can share what really impacts the assessment. No need to share code, just knowledge.
### Thank you!