### Importing Libraries

In [1]:
import os
import sys

# Run from the working directory and make the directory that (presumably)
# holds the helper modules imported in the next cell importable.
os.chdir('/home/jupyter/kaggle/working')
sys.path.append('../input/bert-joint-baseline/')

In [2]:
import collections
import gzip
import json
import bert_utils
import modeling
import numpy as np
import pandas as pd
import tensorflow as tf
import tokenization

import importlib

# Re-execute the already imported bert_utils module (presumably so that edits
# made to it are picked up without restarting the kernel).
importlib.reload(bert_utils)

# Last expression of the cell: displays the TensorFlow version in use.
tf.__version__


'2.2.0-dev20200112'

### Classes & Functions

In [3]:
class TDense(tf.keras.layers.Layer):
    """Dense layer whose kernel is stored as ``[output_size, input_dim]``.

    The forward pass computes ``x @ kernel^T + bias``, i.e. the kernel is the
    transpose of the layout ``tf.keras.layers.Dense`` uses.

    Args:
        output_size: size of the last dimension of the output.
        kernel_initializer: initializer forwarded to ``add_weight`` for the kernel.
        bias_initializer: initializer forwarded to ``add_weight`` for the bias.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self,
                 output_size,
                 kernel_initializer=None,
                 bias_initializer="zeros",
                 **kwargs):
        super().__init__(**kwargs)
        self.output_size = output_size
        self.kernel_initializer = kernel_initializer
        self.bias_initializer = bias_initializer

    def build(self, input_shape):
        """Create the kernel and bias once the input feature size is known.

        Raises:
            TypeError: if the layer dtype is neither floating point nor complex.
            ValueError: if the last input dimension is undefined.
        """
        layer_dtype = tf.as_dtype(self.dtype or tf.keras.backend.floatx())
        if not layer_dtype.is_floating and not layer_dtype.is_complex:
            raise TypeError("Unable to build `TDense` layer with "
                            "non-floating point (and non-complex) "
                            "dtype %s" % (layer_dtype,))

        input_shape = tf.TensorShape(input_shape)
        last_dim = tf.compat.dimension_value(input_shape[-1])
        if last_dim is None:
            raise ValueError("The last dimension of the inputs to "
                             "`TDense` should be defined. "
                             "Found `None`.")

        self.input_spec = tf.keras.layers.InputSpec(min_ndim=2, axes={-1: last_dim})
        # Kernel rows are output units; call() multiplies with transpose_b=True.
        self.kernel = self.add_weight(
            name="kernel",
            shape=[self.output_size, last_dim],
            initializer=self.kernel_initializer,
            dtype=self.dtype,
            trainable=True)
        self.bias = self.add_weight(
            name="bias",
            shape=[self.output_size],
            initializer=self.bias_initializer,
            dtype=self.dtype,
            trainable=True)
        super().build(input_shape)

    def call(self, x):
        """Return ``x @ kernel^T + bias``."""
        projected = tf.matmul(x, self.kernel, transpose_b=True)
        return projected + self.bias


class DummyObject:
    """Plain attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        vars(self).update(kwargs)


def mk_model(config):
    """Build the Keras model: BERT encoder plus span and answer-type heads.

    Args:
        config: BERT configuration mapping; ``config['max_position_embeddings']``
            is used as the input sequence length.

    Returns:
        A ``tf.keras.Model`` named ``'bert-baseline'`` taking
        ``[example_id, unique_id, input_ids, input_mask, segment_ids]`` and
        producing ``[example_id, unique_id, start_logits, end_logits, ans_type]``
        (the two id inputs are passed through unchanged).
    """
    seq_len = config['max_position_embeddings']

    def _token_input(name):
        # All per-token inputs share the same shape and dtype.
        return tf.keras.Input(shape=(seq_len,), dtype=tf.int32, name=name)

    example_id = tf.keras.Input(shape=(1,), dtype=tf.int64, name='example_id')
    unique_id = tf.keras.Input(shape=(1,), dtype=tf.int64, name='unique_id')
    input_ids = _token_input('input_ids')
    input_mask = _token_input('input_mask')
    segment_ids = _token_input('segment_ids')

    BERT = modeling.BertModel(config=config, name='bert')
    pooled_output, sequence_output = BERT(input_word_ids=input_ids,
                                          input_mask=input_mask,
                                          input_type_ids=segment_ids)

    # Span head: two logits per token, split into start / end and squeezed.
    logits = TDense(2, name='logits')(sequence_output)
    start_logits, end_logits = tf.split(logits, axis=-1, num_or_size_splits=2, name='split')
    start_logits = tf.squeeze(start_logits, axis=-1, name='start_squeeze')
    end_logits = tf.squeeze(end_logits, axis=-1, name='end_squeeze')

    # Answer-type head: five logits computed from the pooled output.
    ans_type = TDense(5, name='ans_type')(pooled_output)

    model_inputs = [example_id, unique_id, input_ids, input_mask, segment_ids]
    model_outputs = [example_id, unique_id, start_logits, end_logits, ans_type]
    return tf.keras.Model(model_inputs, model_outputs, name='bert-baseline')


def url_exists(url):
    """Return whether a local path or a ``gs://`` object exists.

    Args:
        url: a local filesystem path, or a ``gs://bucket/blob`` URL.

    Returns:
        ``True`` if the target exists, ``False`` otherwise.
    """
    from urllib import parse
    res = parse.urlparse(url)
    if res.scheme == 'gs':
        from google.cloud import storage
        # urlparse keeps the leading '/' on the path while blob names have
        # none, so strip it exactly once (it used to be stripped twice, which
        # dropped the first character of the blob name).
        bucket_name, blob_name = res.netloc, res.path[1:]
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(bucket_name)
        return bucket.blob(blob_name).exists()
    return os.path.exists(res.path)


In [4]:
# url_exists('gs://tyu-kaggle/input/bert-joint-baseline/bert_config.json')
# !gsutil ls -R gs://tyu-kaggle/input/

In [5]:
# Pre-/post-processing settings, exposed as attributes of FLAGS.
FLAGS = DummyObject(**{
    'skip_nested_contexts': True,
    'max_position': 50,
    'max_contexts': 48,
    'max_query_length': 64,
    'max_seq_length': 512,
    'doc_stride': 128,
    'include_unknowns': -1.0,
    'n_best_size': 20,
    'max_answer_length': 30,
})

SEQ_LENGTH = FLAGS.max_seq_length  # config['max_position_embeddings']

# The presence of /kaggle tells a Kaggle kernel apart from a GCP notebook.
if os.path.exists('/kaggle'):
    RUN_ON = 'kaggle'
else:
    RUN_ON = 'gcp'

INPUT_PATH = 'gs://tyu-kaggle/input/' if RUN_ON == 'gcp' else '../input/'
CPKT_PATH = os.path.join(INPUT_PATH, 'bert-joint-baseline/model_cpkt-1')
VOCAB_PATH = os.path.join(INPUT_PATH, 'bert-joint-baseline/vocab-nq.txt')
NQ_TEST_TFRECORD_PATH = os.path.join(INPUT_PATH, 'bert-joint-baseline/nq-test.tfrecords')

NQ_TEST_JSONL_PATH = '../input/tensorflow2-question-answering/simplified-nq-test.jsonl'
NQ_TRAIN_JSONL_PATH = '../input/tensorflow2-question-answering/simplified-nq-train.jsonl'

# A test file smaller than 20 MB is treated as the public test set.
if os.path.getsize(NQ_TEST_JSONL_PATH) < 20000000:
    TEST_DS_TYPE = 'public'
else:
    TEST_DS_TYPE = 'private'

# Parsing spec for the TFRecords: two scalar ids followed by three
# fixed-length token-level int64 vectors.
FEATURE_DESCRIPTION = {
    **{name: tf.io.FixedLenFeature([], tf.int64)
       for name in ("example_id", "unique_id")},
    **{name: tf.io.FixedLenFeature([SEQ_LENGTH], tf.int64)
       for name in ("input_ids", "input_mask", "segment_ids")},
}

with open('../input/bert-joint-baseline/bert_config.json', 'r') as f:
    config = json.loads(f.read())
print(json.dumps(config, indent=4))

{
    "attention_probs_dropout_prob": 0.1,
    "num_attention_heads": 16,
    "hidden_size": 1024,
    "num_hidden_layers": 24,
    "initializer_range": 0.02,
    "hidden_act": "gelu",
    "vocab_size": 30522,
    "hidden_dropout_prob": 0.1,
    "intermediate_size": 4096,
    "type_vocab_size": 2,
    "max_position_embeddings": 512
}


In [6]:
# Detect hardware and pick the matching distribution strategy and batch size.
try:
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', TPU.cluster_spec().as_dict()['worker'])
except ValueError:
    # No TPU could be resolved: fall back to the default strategy below.
    TPU = None

if TPU:
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    strategy = tf.distribute.experimental.TPUStrategy(TPU)
    BATCH_SIZE = 32
    # drop_remainder must be True when running on TPU (maybe a bug), so the
    # last partial batch would be lost. To compensate, work on a copy of the
    # test file with its last 3 lines appended again as padding.
    # NOTE(review): whether 3 extra examples are enough to fill the final batch
    # depends on the dataset - confirm for other inputs.
    nq_test_jsonl_path2 = NQ_TEST_JSONL_PATH + '.pad'
    !cp $NQ_TEST_JSONL_PATH $nq_test_jsonl_path2
    !tail -n 3 $NQ_TEST_JSONL_PATH >> $nq_test_jsonl_path2
    # From here on NQ_TEST_JSONL_PATH refers to the padded copy.
    NQ_TEST_JSONL_PATH = nq_test_jsonl_path2
else:
    strategy = tf.distribute.get_strategy()
    BATCH_SIZE = 16

print("REPLICAS: ", strategy.num_replicas_in_sync)

# Build the model and restore its weights inside the strategy scope.
with strategy.scope():
    model = mk_model(config)
    model.summary()
    cpkt = tf.train.Checkpoint(model=model)
    # assert_consumed() is expected to fail if the checkpoint and the model do
    # not match completely.
    cpkt.restore(CPKT_PATH).assert_consumed()

Running on TPU  ['192.168.21.2:8470']
INFO:tensorflow:Initializing the TPU system: tyu
INFO:tensorflow:Clearing out eager caches
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, 

INFO:tensorflow:Initializing the TPU system: tyu
INFO:tensorflow:Clearing out eager caches
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)
INFO:tensorflow:*** Availab

In [7]:
# small_config = config.copy()
# small_config['vocab_size']=16
# small_config['hidden_size']=64
# small_config['max_position_embeddings'] = 32
# small_config['num_hidden_layers'] = 4
# small_config['num_attention_heads'] = 4
# small_config['intermediate_size'] = 256
# small_config


In [8]:
import tqdm
# `tqdm.tqdm_notebook` is deprecated (see the warning printed further down in
# this notebook); `tqdm.notebook.tqdm` is the supported equivalent.
from tqdm.notebook import tqdm as tqdm_notebook

# Convert the test jsonl into BERT input features and write them as TFRecords.
# if not url_exists(NQ_TEST_JSONL_PATH):
if True:
    # tf2baseline.FLAGS.max_seq_length = 512
    eval_writer = bert_utils.FeatureWriter(filename=NQ_TEST_TFRECORD_PATH,
                                           is_training=False)
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_PATH,
                                           do_lower_case=True)
    # Not used in the visible cells; kept in case a later cell refers to it.
    features = []
    # Every produced feature is handed to the writer through output_fn.
    convert = bert_utils.ConvertExamples2Features(tokenizer=tokenizer,
                                                  is_training=False,
                                                  output_fn=eval_writer.process_feature,
                                                  collect_stat=False)
    n_examples = 0
    for examples in bert_utils.nq_examples_iter(input_file=NQ_TEST_JSONL_PATH,
                                                is_training=False,
                                                tqdm=tqdm_notebook):
        for example in examples:
            # Accumulates whatever count convert() returns for each example.
            n_examples += convert(example)
    eval_writer.close()
    print('number of test examples: %d, written to file: %d' % (n_examples, eval_writer.num_features))


Reading: ../input/tensorflow2-question-answering/simplified-nq-test.jsonl.pad

number of test examples: 9090, written to file: 9090


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [9]:
def _decode_record(record, feature_description=None):
    """Parse one serialized tf.Example and down-cast its non-id features.

    Args:
        record: serialized example, as yielded by a ``TFRecordDataset``.
        feature_description: parsing spec; ``FEATURE_DESCRIPTION`` is used when
            this is ``None`` (or otherwise falsy).

    Returns:
        Dict of parsed tensors. Every entry except ``example_id`` and
        ``unique_id`` is cast to ``tf.int32``.
    """
    if not feature_description:
        feature_description = FEATURE_DESCRIPTION
    example = tf.io.parse_single_example(serialized=record, features=feature_description)

    # tf.Example only stores int64 while the TPU only supports int32, so cast
    # everything except the two ids, which stay int64.
    id_keys = ('example_id', 'unique_id')
    for key in list(example.keys()):
        if key in id_keys:
            continue
        example[key] = tf.cast(example[key], dtype=tf.int32)

    return example


In [10]:
# Read the TFRecords written earlier, decode each record and batch them.
raw_ds = tf.data.TFRecordDataset(NQ_TEST_TFRECORD_PATH)
decoded_ds = raw_ds.map(_decode_record)
# The trailing partial batch is dropped only when running on TPU.
batched_ds = decoded_ds.batch(BATCH_SIZE, drop_remainder=TPU is not None)

# One array per model output, in the order defined by mk_model.
result = model.predict(batched_ds, verbose=1)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: can't assign to () (tmpcgb_oj3h.py, line 18)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: can't assign to () (tmpcgb_oj3h.py, line 18)
    284/Unknown - 41s 144ms/step

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: can't assign to () (tmpcgb_oj3h.py, line 18)


In [11]:
# Persist the raw model outputs. `result` holds five arrays, in the order of
# the model outputs: example_id, unique_id, start_logits, end_logits and the
# answer-type logits. The previous list had only four names (and a typo), so
# every array was stored under the wrong key and the last one was dropped.
np.savez_compressed('bert-joint-baseline-output.npz',
                    **dict(zip(['example_id', 'unique_id', 'start_logits',
                                'end_logits', 'answer_type_logits'],
                               result)))

In [12]:
# Token span described by its start and end token indices.
Span = collections.namedtuple("Span", "start_token_idx end_token_idx")


In [13]:
class ScoreSummary:
    """Mutable record for the scores of one candidate answer; all fields start as None."""

    def __init__(self):
        for field in ('predicted_label', 'short_span_score',
                      'cls_token_score', 'answer_type_logits'):
            setattr(self, field, None)


In [14]:
class EvalExample:
    """Everything known about one example at evaluation time.

    Attributes are the example id, its long-answer candidates, and two dicts
    (``results`` and ``features``) that start empty and are filled in later,
    keyed by unique id.
    """

    def __init__(self, example_id, candidates):
        self.example_id, self.candidates = example_id, candidates
        self.results, self.features = {}, {}


In [15]:
def get_best_indexes(logits, n_best_size):
    """Return the positions of the highest logits, best first.

    Position 0 is never a candidate: ranking starts at index 1. At most
    ``n_best_size`` positions are returned.
    """
    ranked = sorted(enumerate(logits[1:], 1), key=lambda pair: pair[1], reverse=True)
    return [position
            for rank, (position, _) in enumerate(ranked)
            if rank < n_best_size]


def top_k_indices(logits, n_best_size, token_map):
    """Return up to ``n_best_size`` best positions that map to a source token.

    Positions are ranked by logit in ascending order (position 0 is excluded),
    positions whose ``token_map`` entry is -1 are discarded, and the tail of
    what remains is returned - so the best position comes last.
    """
    ascending = 1 + np.argsort(logits[1:])
    maps_to_token = token_map[ascending] != -1
    return ascending[maps_to_token][-n_best_size:]


## 1- Understanding the code
#### For a better understanding, I will briefly explain here.
#### The `answer_type` entry, set in the last lines of the code block below, stores the predicted answer type, which, according to the [GitHub project repository](https://github.com/google-research/language/blob/master/language/question_answering/bert_joint/run_nq.py), can be:
1. UNKNOWN = 0
2. YES = 1
3. NO = 2
4. SHORT = 3
5. LONG = 4


In [16]:
def compute_predictions(example):
    """Pick the best short/long answer for one example.

    Every (start, end) pair drawn from the top ``FLAGS.n_best_size`` start and
    end positions of each feature is scored as
    ``start_logit + end_logit - cls_start_logit - cls_end_logit``; pairs with
    ``start >= end`` or spanning ``FLAGS.max_answer_length`` tokens or more are
    discarded. The best pair becomes the short answer; the long answer is the
    first top-level candidate that contains it.

    Args:
        example: object with ``example_id``, ``candidates``, ``results`` and
            ``features`` attributes (``results`` and ``features`` are keyed by
            unique id).

    Returns:
        A ``ScoreSummary`` whose ``predicted_label`` holds the prediction dict.
        When no valid span exists, the spans are ``-1``, the score is
        ``-10000.0``, ``answer_type_logits`` is ``[]`` and ``answer_type`` is
        ``0`` (UNKNOWN).

    Raises:
        ValueError: if a result has no feature with the same unique id.
    """
    predictions = []
    n_best_size = FLAGS.n_best_size
    max_answer_length = FLAGS.max_answer_length
    i = 0
    for unique_id, result in example.results.items():
        if unique_id not in example.features:
            raise ValueError("No feature found with unique_id:", unique_id)
        token_map = np.array(example.features[unique_id]["token_map"])  # .int64_list.value
        start_indexes = top_k_indices(result.start_logits, n_best_size, token_map)
        if len(start_indexes) == 0:
            continue
        end_indexes = top_k_indices(result.end_logits, n_best_size, token_map)
        if len(end_indexes) == 0:
            continue
        # All (start, end) combinations, one pair per row.
        indexes = np.array(list(np.broadcast(start_indexes[None], end_indexes[:, None])))
        is_valid = ((indexes[:, 0] < indexes[:, 1]) &
                    (indexes[:, 1] - indexes[:, 0] < max_answer_length))
        indexes = indexes[is_valid]

        # These only depend on the feature, not on the pair, so compute once.
        cls_token_score = result.start_logits[0] + result.end_logits[0]
        centered_type_logits = result.answer_type_logits - result.answer_type_logits.mean()

        for start_index, end_index in indexes:
            summary = ScoreSummary()
            summary.short_span_score = (
                    result.start_logits[start_index] +
                    result.end_logits[end_index])
            summary.cls_token_score = cls_token_score
            summary.answer_type_logits = centered_type_logits
            start_span = token_map[start_index]
            end_span = token_map[end_index] + 1

            # Span logits minus the cls logits seems to be close to the best.
            score = summary.short_span_score - summary.cls_token_score
            predictions.append((score, i, summary, start_span, end_span))
            i += 1  # to break ties

    # Default empty prediction.
    score = -10000.0
    short_span = Span(-1, -1)
    long_span = Span(-1, -1)
    summary = ScoreSummary()

    if predictions:
        # `i` is unique per tuple, so the comparison never reaches the summary.
        score, _, summary, start_span, end_span = max(predictions)
        short_span = Span(start_span, end_span)
        start = short_span.start_token_idx
        end = short_span.end_token_idx
        for c in example.candidates:
            if c["top_level"] and c["start_token"] <= start and c["end_token"] >= end:
                long_span = Span(c["start_token"], c["end_token"])
                break

    # Without any valid span the default summary carries no logits; calling
    # .tolist() on None used to raise AttributeError here.
    if summary.answer_type_logits is None:
        answer_type_logits = []
        answer_type = 0  # UNKNOWN
    else:
        answer_type_logits = summary.answer_type_logits.tolist()
        answer_type = int(np.argmax(summary.answer_type_logits))

    summary.predicted_label = {
        "example_id": int(example.example_id),
        "long_answer": {
            "start_token": int(long_span.start_token_idx),
            "end_token": int(long_span.end_token_idx),
            "start_byte": -1,
            "end_byte": -1
        },
        "long_answer_score": float(score),
        "short_answers": [{
            "start_token": int(short_span.start_token_idx),
            "end_token": int(short_span.end_token_idx),
            "start_byte": -1,
            "end_byte": -1
        }],
        "short_answer_score": float(score),
        "yes_no_answer": "NONE",
        "answer_type_logits": answer_type_logits,
        # Index of the largest answer-type logit.
        "answer_type": answer_type
    }

    return summary

In [17]:
def compute_pred_dict(candidates_dict, dev_features, raw_results, tqdm=None):
    """Join candidates, features and raw results, then predict per example.

    The three inputs are tagged (0 = example, 1 = raw result, 2 = feature) and
    sorted together by id; every feature/result is attached to the example
    that precedes it in that order.

    Args:
        candidates_dict: maps example id to its long-answer candidates.
        dev_features: iterable of mappings with a ``'unique_id'`` entry.
        raw_results: iterable of objects with a ``unique_id`` attribute.
        tqdm: optional progress-bar wrapper applied to the list of examples.

    Returns:
        Dict mapping example id to the ``predicted_label`` computed by
        ``compute_predictions``.
    """
    result_rows = [(int(raw.unique_id), 1, raw) for raw in raw_results]
    example_rows = [(int(example_id), 0, candidates)
                    for example_id, candidates in candidates_dict.items()]
    feature_rows = [(int(feature['unique_id']), 2, feature) for feature in dev_features]

    # Join examples with features and raw results.
    examples = []
    print('merging examples...')
    merged = sorted(example_rows + result_rows + feature_rows)
    print('done.')
    for idx, type_, datum in merged:
        if type_ == 0:
            examples.append(EvalExample(idx, datum))
            continue
        current = examples[-1]
        target = current.features if type_ == 2 else current.results
        target[idx] = datum

    # Construct prediction objects.
    print('Computing predictions...')
    if tqdm is not None:
        examples = tqdm(examples)
    return {e.example_id: compute_predictions(e).predicted_label for e in examples}


In [18]:
def _collect_candidates(lines, candidates_dict):
    """Parse one JSON object per line and record its long-answer candidates."""
    for line in lines:
        e = json.loads(line)
        candidates_dict[e["example_id"]] = e["long_answer_candidates"]


def read_candidates_from_one_split(input_path):
    """Read candidates from a single jsonl file.

    Args:
        input_path: path to a ``.jsonl`` file, or a gzip-compressed one when
            the name ends with ``.gz``.

    Returns:
        Dict mapping each ``example_id`` to its ``long_answer_candidates``.
    """
    candidates_dict = {}
    print("Reading examples from: %s" % input_path)
    if input_path.endswith(".gz"):
        # GzipFile does not close the file object it wraps, so the GFile gets
        # its own context manager to make sure it is closed as well.
        with tf.io.gfile.GFile(input_path, "rb") as raw_file, \
                gzip.GzipFile(fileobj=raw_file) as input_file:
            _collect_candidates(input_file, candidates_dict)
    else:
        with tf.io.gfile.GFile(input_path, "r") as input_file:
            _collect_candidates(input_file, candidates_dict)
    return candidates_dict


In [19]:
def read_candidates(input_pattern):
    """Read the candidates of every file matching ``input_pattern``.

    Files are read one after another; entries from later files overwrite
    entries with the same example id from earlier ones.
    """
    merged = {}
    for matched_path in tf.io.gfile.glob(input_pattern):
        merged.update(read_candidates_from_one_split(matched_path))
    return merged


In [20]:
# One row per model output row, indexed by (example_id, unique_id).
result_df = pd.DataFrame({
    "example_id": result[0].squeeze().tolist(),
    "unique_id": result[1].squeeze().tolist(),
    "start_logits": result[2].tolist(),
    "end_logits": result[3].tolist(),
    "answer_type_logits": result[4].tolist()
})
result_df = result_df.set_index(['example_id', 'unique_id'])

# Some instances were padded (duplicated) for the TPU run; keep only the first
# occurrence of each index entry.
if TPU is not None:
    print('result_df len before: %d' % len(result_df))
    is_first_occurrence = ~result_df.index.duplicated()
    result_df = result_df[is_first_occurrence]
    print('result_df len after : %d' % len(result_df))

result_df len before: 9088
result_df len after : 9070


In [21]:
# candidates_dict['8644948107288181312']

In [22]:
# Re-pack the model outputs (all but example_id) row by row as RawResult.
all_results = [bert_utils.RawResult(*row) for row in zip(*result[1:])]
print("Going to candidates file")
candidates_dict = read_candidates('../input/tensorflow2-question-answering/simplified-nq-test.jsonl')
# One row per example, holding its list of long-answer candidates.
candidates_df = pd.DataFrame.from_records(
    list(candidates_dict.items()),
    columns=['example_id', 'candidates'],
    index='example_id',
)
candidates_df.index = candidates_df.index.map(int)
print("setting up eval features")

Going to candidates file
Reading examples from: ../input/tensorflow2-question-answering/simplified-nq-test.jsonl
setting up eval features


In [23]:
def _parse_token_map(serialized_example):
    """Parse only the ids and the token map out of one serialized example."""
    return tf.io.parse_single_example(
        serialized=serialized_example,
        features={
            "example_id": tf.io.FixedLenFeature([], tf.int64),
            "unique_id": tf.io.FixedLenFeature([], tf.int64),
            # token_map: token to origin map.
            "token_map": tf.io.FixedLenFeature([SEQ_LENGTH], tf.int64)
        }
    )


token_map_ds = raw_ds.map(_parse_token_map)

In [24]:
# `tqdm.tqdm_notebook` is deprecated (it triggers the warning shown further
# down); `tqdm.notebook.tqdm` is the supported equivalent.
from tqdm.notebook import tqdm as tqdm_notebook

# Materialize every parsed record of the token-map dataset.
eval_features = list(token_map_ds)
print("compute_pred_dict")

compute_pred_dict


In [25]:
# One row per feature with its token map, indexed by (example_id, unique_id).
# Each parsed tensor is converted to its NumPy value first.
token_map_df = pd.DataFrame.from_records(list(token_map_ds)).applymap(
    lambda x: x.numpy()
).set_index(['example_id', 'unique_id'])

# we pad some instances when using TPU, deduplicate it here.
if TPU is not None:
    print('token_map_df len before: ' + str(len(token_map_df)))
    token_map_df = token_map_df[~token_map_df.index.duplicated()]
    # This line used to say "before" although it reports the length after
    # deduplication.
    print('token_map_df len after : ' + str(len(token_map_df)))

token_map_df len before: 9090
token_map_df len before: 9070


In [26]:
# token_map_df.head(60)

In [27]:
# Attach the token map (per feature) and the candidates (per example) to the
# logits of every feature.
joined = (
    result_df
    .join(token_map_df, on=['example_id', 'unique_id'])
    .join(candidates_df, on='example_id')
)

# Empty per-example prediction table, filled in by the scoring loop.
PRED_COLUMNS = ['score', 'short_span_start', 'short_span_end',
                'long_span_start', 'long_span_end', 'answer_type']
pred_df = pd.DataFrame(index=candidates_df.index, columns=PRED_COLUMNS)

In [28]:
# One group per example, holding all of its features.
gg = joined.groupby(by='example_id')

In [29]:
# For every example: score all of its features, keep the best short span and
# look up the top-level candidate that contains it.
for example_id, example_df in gg:
    # example_df: each row got a unique id (unique_id), all rows share the same example_id.
    # columns = ['answer_type_logits', 'end_logits', 'start_logits', 'token_map', 'candidates']
    # reset_index() without inplace returns a fresh frame, so the assignments
    # below no longer write into a slice of `joined` (this was the source of
    # the SettingWithCopy warning).
    example_df = example_df.reset_index(level='example_id', drop=True)
    for u_id, res in example_df.iterrows():
        msk_invalid_token = np.array(res['token_map']) == -1
        # filter logits corresponding to context token and rank top-k.
        s_logits = pd.Series(res['start_logits'])
        s_msk_not_top_k = s_logits.mask(msk_invalid_token)\
                              .rank(method='min', ascending=False) > FLAGS.n_best_size
        s_indexes = np.ma.masked_array(np.arange(s_logits.size),
                                       mask=s_msk_not_top_k | msk_invalid_token)
        e_logits = pd.Series(res['end_logits'])
        e_msk_not_top_k = e_logits.mask(msk_invalid_token)\
                              .rank(method='min', ascending=False) > FLAGS.n_best_size
        e_indexes = np.ma.masked_array(np.arange(e_logits.size),
                                       mask=e_msk_not_top_k | msk_invalid_token)
        # s_e_msk has shape [n_start, n_end]; the end index must be greater
        # than the start index, otherwise the pair is masked.
        s_e_msk = e_indexes[np.newaxis, :] <= s_indexes[:, np.newaxis]
        # the answer must be shorter than max_answer_length, otherwise mask it.
        s_e_msk |= (e_indexes[np.newaxis, :] - s_indexes[:, np.newaxis] >= FLAGS.max_answer_length)
        # full mask.
        s_e_msk = s_e_msk.filled(True)

        if s_e_msk.all():  # every start-end combination has been masked.
            example_df.loc[u_id, 'score'] = np.nan
            example_df.loc[u_id, 's_short_span'] = np.nan
            example_df.loc[u_id, 'e_short_span'] = np.nan
        else:
            # broadcast to shape [n_start, n_end] and apply mask=s_e_msk.
            # Index the underlying arrays: multi-dimensional indexing of a
            # Series is deprecated in pandas.
            s_logits_bc = np.ma.array(
                np.broadcast_to(s_logits.values[:, np.newaxis], shape=[s_logits.size, e_logits.size]),
                mask=s_e_msk)
            e_logits_bc = np.ma.array(
                np.broadcast_to(e_logits.values[np.newaxis, :], shape=[s_logits.size, e_logits.size]),
                mask=s_e_msk)
            short_span_score = s_logits_bc + e_logits_bc
            cls_token_score = s_logits[0] + e_logits[0]
            score = short_span_score - cls_token_score
            # argmax() works on the flattened array; recover (row, column).
            s_short_idx, e_short_idx = divmod(score.argmax(), e_logits.size)

            example_df.loc[u_id, 'score'] = score.max()
            example_df.loc[u_id, 's_short_span'] = res['token_map'][s_short_idx]
            example_df.loc[u_id, 'e_short_span'] = res['token_map'][e_short_idx] + 1 # end span should be exclusive
        answer_type_logits = pd.Series(res['answer_type_logits'],
                                       index=['UNKNOWN', 'YES', 'NO', 'SHORT', 'LONG'])
        example_df.loc[u_id, 'answer_type'] = answer_type_logits.idxmax()

    # Only fill pred_df when at least one feature produced a score. Testing
    # this explicitly avoids relying on idxmax() returning the very same NaN
    # object as np.nan (the former `is not np.NAN` identity check).
    example_scores = example_df['score']
    if example_scores.notna().any():
        best_u_id = example_scores.idxmax()
        short_span_start, short_span_end = example_df.loc[best_u_id, ['s_short_span', 'e_short_span']]
        pred_df.loc[example_id, 'score'] = example_df.loc[best_u_id, 'score']
        pred_df.loc[example_id, 'short_span_start'] = short_span_start
        pred_df.loc[example_id, 'short_span_end'] = short_span_end
        # search for the long answer span: first top-level candidate that
        # contains the short span.
        for cand in example_df.iloc[0]['candidates']:
            if cand['top_level'] and cand['start_token'] <= short_span_start and short_span_end <= cand['end_token']:
                pred_df.loc[example_id, 'long_span_start'] = cand['start_token']
                pred_df.loc[example_id, 'long_span_end'] = cand['end_token']
                break
        pred_df.loc[example_id, 'answer_type'] = example_df.loc[best_u_id, 'answer_type']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [30]:
# Spot check: display the logits stored for one hard-coded
# (example_id, unique_id) pair.
result_df.loc[(6801986500551995902, 6801986500551995902)]

answer_type_logits    [1.0000427961349487, -1.928816318511963, -3.04...
end_logits            [2.6496543884277344, -9.423047065734863, -9.98...
start_logits          [2.9686124324798584, -8.327146530151367, -7.87...
Name: (6801986500551995902, 6801986500551995902), dtype: object

In [31]:
# Number of distinct features. result_df has already been deduplicated, so
# this replaces the hard-coded 9070, which was only valid for one particular
# test file.
# NOTE(review): like the constant it replaces, this assumes the padded
# duplicates are the trailing entries of eval_features / all_results.
n_unique_features = len(result_df)
nq_pred_dict = compute_pred_dict(candidates_dict,
                                 eval_features[:n_unique_features],
                                 all_results[:n_unique_features],
                                 tqdm=tqdm_notebook)
predictions_json = {"predictions": list(nq_pred_dict.values())}
print("writing json")
with tf.io.gfile.GFile('predictions.json', "w") as f:
    json.dump(predictions_json, f, indent=4)

merging examples...
done.
Computing predictions...

writing json


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=346.0), HTML(value='')))

In [32]:
# Peek at a single (example_id, prediction) pair: print the first item and
# stop right away.
for x in nq_pred_dict.items():
    print(x)
    break

(6801986500551995902, {'yes_no_answer': 'NONE', 'answer_type': 4, 'long_answer': {'start_byte': -1, 'start_token': 377, 'end_byte': -1, 'end_token': 415}, 'long_answer_score': -0.29973649978637695, 'answer_type_logits': [1.2898046970367432, -1.6390544176101685, -2.754089117050171, 0.5325103402137756, 2.5708281993865967], 'short_answer_score': -0.29973649978637695, 'short_answers': [{'start_byte': -1, 'start_token': 411, 'end_byte': -1, 'end_token': 414}], 'example_id': 6801986500551995902})


## 2- Main Change
#### Here is the small but main change: we added an `if` chain that checks the predicted answer type and uses it to filter the answers that are written to the submission file.

### Filtering the Answers

In [33]:
def create_short_answer(entry):
    """Format the short-answer prediction string for one prediction dict.

    Answer type 0 gives ``""``, 1 gives ``'YES'`` and 2 gives ``'NO'``. For any
    other type the result is ``""`` when ``short_answer_score`` is below 1.5,
    otherwise the space-separated ``start:end`` spans of all short answers
    whose start token is not negative.
    """
    if entry['answer_type'] == 0:
        return ""
    if entry['answer_type'] == 1:
        return 'YES'
    if entry['answer_type'] == 2:
        return 'NO'
    if entry["short_answer_score"] < 1.5:
        return ""
    spans = ["%s:%s" % (candidate["start_token"], candidate["end_token"])
             for candidate in entry["short_answers"]
             if candidate["start_token"] > -1]
    return " ".join(spans)


def get_short_span(pred_row: pd.Series):
    """Format the short-answer prediction string for one row of ``pred_df``.

    Args:
        pred_row: row with ``score``, ``answer_type``, ``short_span_start``
            and ``short_span_end`` entries.

    Returns:
        ``''`` when there is no score, the type is ``'UNKNOWN'`` or the score
        is below 1.5; ``'YES'`` / ``'NO'`` for those types; ``'start:end'``
        for ``'SHORT'`` / ``'LONG'``; ``None`` for any other answer type.
    """
    # A missing score means there is no short/long answer. pd.isna() is used
    # because an identity test against a NaN object only matches that very
    # object, not NaN values in general.
    if pd.isna(pred_row['score']):
        return ''
    answer_type = pred_row['answer_type']
    # answer_type can not be missing if score is not missing.
    if answer_type == 'UNKNOWN':
        return ''
    if answer_type in ['YES', 'NO']:
        return answer_type
    if answer_type in ['SHORT', 'LONG']:
        if pred_row['score'] < 1.5:
            return ''
        return '%d:%d' % (pred_row['short_span_start'], pred_row['short_span_end'])
    return None


def create_long_answer(entry):
    """Format the long-answer prediction string for one prediction dict.

    Returns ``''`` for answer type 0 or a ``long_answer_score`` below 1.5,
    ``'start:end'`` when the long answer has a non-negative start token, and
    ``None`` otherwise.
    """
    if entry['answer_type'] == 0:
        return ''
    if entry["long_answer_score"] < 1.5:
        return ""
    if entry["long_answer"]["start_token"] > -1:
        return "%s:%s" % (entry["long_answer"]["start_token"],
                          entry["long_answer"]["end_token"])
    return None


def get_long_span(pred_row: pd.Series):
    """Format the long-answer prediction string for one row of ``pred_df``.

    Args:
        pred_row: row with ``score``, ``answer_type``, ``long_span_start`` and
            ``long_span_end`` entries.

    Returns:
        ``''`` when there is no score, the type is ``'UNKNOWN'``, the score is
        below 1.5 or no long span was found; ``'start:end'`` otherwise;
        ``None`` for an unrecognized answer type.
    """
    # A missing score means there is no short/long answer. pd.isna() is used
    # because an identity test against a NaN object only matches that very
    # object, not NaN values in general.
    if pd.isna(pred_row['score']):
        return ''
    answer_type = pred_row['answer_type']
    # answer_type can not be missing if score is not missing.
    if answer_type == 'UNKNOWN':
        return ''
    if answer_type in ['YES', 'NO', 'SHORT', 'LONG']:
        if pred_row['score'] < 1.5 or pd.isna(pred_row['long_span_start']):
            return ''
        return '%d:%d' % (pred_row['long_span_start'], pred_row['long_span_end'])
    return None

### Creating a DataFrame

In [34]:
# Predictions of the DataFrame-based pipeline, with formatted answer strings.
prediction_df = pred_df.copy()
prediction_df['long_span'] = pred_df.apply(get_long_span, axis=1)
prediction_df['short_span'] = pred_df.apply(get_short_span, axis=1)
prediction_df.index = prediction_df.index.map(str)

# Predictions of the original pipeline, read back from the JSON written above.
test_answers_df = pd.read_json("../working/predictions.json")
for var_name in ['long_answer_score', 'short_answer_score', 'answer_type']:
    test_answers_df[var_name] = test_answers_df['predictions'].apply(
        lambda q, key=var_name: q[key])
test_answers_df["long_answer"] = test_answers_df["predictions"].apply(create_long_answer)
test_answers_df["short_answer"] = test_answers_df["predictions"].apply(create_short_answer)
test_answers_df["example_id"] = test_answers_df["predictions"].apply(
    lambda q: str(q["example_id"]))

# example_id -> formatted answer string, used when building the submission.
long_answers = dict(zip(test_answers_df["example_id"], test_answers_df["long_answer"]))
short_answers = dict(zip(test_answers_df["example_id"], test_answers_df["short_answer"]))

In [35]:
# Both result sets sorted by example id so they can be compared row by row:
# `my` comes from the DataFrame pipeline, `you` from the original one.
my = prediction_df.sort_index()
COMPARED_COLUMNS = ['example_id', 'short_answer_score', 'answer_type', 'short_answer', 'long_answer']
you = (
    test_answers_df[COMPARED_COLUMNS]
    .set_index('example_id')
    .sort_index()
)

In [46]:
def compare(theirs: pd.Series, mine: pd.Series, score_tol: float = 1e-3):
    """Check whether two prediction rows for the same example agree.

    Args:
        theirs: Row with ``short_answer_score``, an integer ``answer_type``
            index, ``short_answer`` and ``long_answer``.
        mine: Row with ``score``, a string ``answer_type``, ``short_span``
            and ``long_span``.
        score_tol: Largest absolute score difference still treated as equal.

    Returns:
        ``True`` when score, answer type and both spans match, else ``False``.
    """
    answer_type_names = ['UNKNOWN', 'YES', 'NO', 'SHORT', 'LONG']

    # Compare the magnitude of the gap: a signed difference would silently
    # accept any case where mine['score'] exceeds theirs. A NaN on either
    # side makes the comparison False, so it is not flagged here.
    if abs(theirs['short_answer_score'] - mine['score']) > score_tol:
        return False
    # theirs stores the answer type as an index into answer_type_names.
    if answer_type_names[theirs['answer_type']] != mine['answer_type']:
        return False
    if theirs['short_answer'] != mine['short_span']:
        return False
    if theirs['long_answer'] != mine['long_span']:
        # A missing long answer (None) on their side is equivalent to an
        # empty span string on mine.
        if not (theirs['long_answer'] is None and mine['long_span'] == ''):
            return False
    return True

# Print 'OK' for every example whose two rows agree; otherwise print both
# rows so the difference can be inspected.
for i in prediction_df.index:
    their_row, my_row = you.loc[i], my.loc[i]
    if not compare(their_row, my_row):
        print(you.loc[i])
        print(my.loc[i])
        continue
    print('OK')

OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
OK
O

### Generating the Submission File

In [39]:
sample_submission = pd.read_csv("../input/tensorflow2-question-answering/sample_submission.csv")

# Row selectors for the "<example_id>_long" / "<example_id>_short" entries.
is_long_row = sample_submission["example_id"].str.contains("_long")
is_short_row = sample_submission["example_id"].str.contains("_short")

# Look up each row's answer by its example_id with the suffix stripped.
long_prediction_strings = sample_submission[is_long_row].apply(
    lambda submission_row: long_answers[submission_row["example_id"].replace("_long", "")],
    axis=1)
short_prediction_strings = sample_submission[is_short_row].apply(
    lambda submission_row: short_answers[submission_row["example_id"].replace("_short", "")],
    axis=1)

sample_submission.loc[is_long_row, "PredictionString"] = long_prediction_strings
sample_submission.loc[is_short_row, "PredictionString"] = short_prediction_strings

In [70]:
# Build a second submission from prediction_df: index a copy by example_id,
# write each example's long and short span, then restore the column layout.
sample_submission2 = sample_submission.copy().set_index('example_id')
for eid, row in prediction_df.iterrows():
    for key_suffix, span_field in (('_long', 'long_span'), ('_short', 'short_span')):
        sample_submission2.loc[eid + key_suffix, 'PredictionString'] = row[span_field]
sample_submission2 = sample_submission2.reset_index()

In [89]:
# Display the submission frame rebuilt from prediction_df.
sample_submission2

Unnamed: 0_level_0,PredictionString
example_id,Unnamed: 1_level_1
-1011141123527297803_long,223:277
-1011141123527297803_short,224:226
-1028916936938579349_long,
-1028916936938579349_short,
-1055197305756217938_long,221:335
...,...
930196817123445627_short,2583:2586
934950704129184964_long,496:616
934950704129184964_short,515:517
958723574737344087_long,938:2597


In [92]:
# Put both submissions side by side, aligned on example_id.
# Both frames are indexed explicitly here (without being mutated):
# join(..., on='example_id') matches against the *index* of the other frame,
# which is a plain RangeIndex after the reset_index() above, so the previous
# form only worked when set_index had been run by hand beforehand.
jj = sample_submission.set_index('example_id').join(
    sample_submission2.set_index('example_id'), rsuffix='_my')

In [95]:
# Flag rows where the two submissions agree, then show the ones that do not.
# The 'cmp' column has to be created in this cell; with the assignment
# commented out the filter below raises KeyError on a fresh kernel.
jj['cmp'] = jj['PredictionString'] == jj['PredictionString_my']
jj[~jj['cmp']]

Unnamed: 0_level_0,PredictionString,PredictionString_my,cmp
example_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-697706369125523422_long,,,False
4723874854788782295_long,,,False


In [40]:
# Write the submission frame to submission.csv, omitting the index column.
sample_submission.to_csv('submission.csv', index=False)

In [91]:
# Preview the first rows of the submission frame.
sample_submission.head(20)

Unnamed: 0_level_0,PredictionString
example_id,Unnamed: 1_level_1
-1011141123527297803_long,223:277
-1011141123527297803_short,224:226
-1028916936938579349_long,
-1028916936938579349_short,
-1055197305756217938_long,221:335
-1055197305756217938_short,222:226
-1074129516932871805_long,3119:3210
-1074129516932871805_short,3153:3155
-1114334749483663139_long,
-1114334749483663139_short,


**Yes Answers**

In [42]:
# Rows whose prediction is the literal string 'YES'.
is_yes = sample_submission['PredictionString'].eq('YES')
yes_answers = sample_submission.loc[is_yes]
yes_answers

Unnamed: 0,example_id,PredictionString
27,-1651666484583736653_short,YES
113,-3461207570097431362_short,YES
303,-8260765274544672220_short,YES
309,-871487000194429353_short,YES
553,5962215690907729115_short,YES


**No Answers**

In [43]:
# Rows whose prediction is the literal string 'NO'.
is_no = sample_submission['PredictionString'].eq('NO')
no_answers = sample_submission.loc[is_no]
no_answers

Unnamed: 0,example_id,PredictionString
469,418890410382116795_short,NO


**Blank Answers**

In [44]:
# Rows whose prediction is the empty string.
is_blank = sample_submission['PredictionString'].eq('')
blank_answers = sample_submission.loc[is_blank]
blank_answers.head()

Unnamed: 0,example_id,PredictionString
2,-1028916936938579349_long,
3,-1028916936938579349_short,
8,-1114334749483663139_long,
9,-1114334749483663139_short,
10,-1152268629614456016_long,


In [45]:
# Number of non-null entries per column among the blank-prediction rows.
blank_answers.count()

example_id          205
PredictionString    205
dtype: int64

### I am only sharing modifications that I believe may help. I left out tuning and any significant code changes I made.

### We'll be grateful if someone gets a better understanding and can share what really impacts the assessment. No need to share code, just knowledge.
### Thank you!