In [None]:
# Changes (v4):
# - Bug fix: When creating batches, a few tokenized samples were omitted due to a missing 'append' statement and, therefore, not evaluated (thus, number of evaluated tokenized samples changed from 328115 to 328435)
# - In 'evaluate_sample_batch(...)' the case 'or answer["property"]["name"] == answer_text' has been added. 
# This is an edge case where the model predicts a property that is only partially contained in the presented context of the tokenized sample, but is the correct answer (although the tokenized sample has been marked as 'answer_out_of_span').

# Changes (v4.1):
# - option for verbose output when evaluating results added

# Install and Import Dependencies

## Install Dependencies

In [None]:
! pip install transformers datasets huggingface_hub

## Import Dependencies

In [None]:
# for loading the input dataset from disk
from datasets import load_dataset
from transformers import TFAutoModelForQuestionAnswering, AutoTokenizer

# for prediction (inference)
import tensorflow as tf

import numpy as np
from tqdm import tqdm
import random

from datetime import datetime
import os

## Check GPU Support

In [None]:
# Check the TensorFlow version and whether a GPU is available (at least one physical device should be listed)
tf.__version__, tf.config.list_physical_devices("GPU")

# Settings

In [None]:
# Path to the directory from which the checkpoints are loaded
checkpoint_directory = "/home/user/directory_containing_best_checkpoint/"

# Path to the directory in which the evaluation results are stored
evaluation_path = "/home/user/directory_for_evaluation_results/"

# Path to the directory from which the validation samples are loaded
input_path = "/home/user/directory_containing_rephrased_samples/"

# Number of samples within one batch fed into the network for inference
batch_size = 1024
# Number of predicted answers per tokenized sample that should be considered, e.g., if an 'n_best_size' of 20 is chosen, only the top 20 answers with the highest score are considered
n_best_size = 20
# Optional (can be 'None') parameter that defines the maximum length (number of tokens) an answer may have
max_answer_length = None

# Optional (can be 'None') parameter that defines the maximum number of tokenized samples that should be evaluated (this parameter is used for debugging)
limit = None

# Parameters for tokenizing:
max_length = 512  # Maximum number of features (i.e. indices) of a tokenized sample, consisting of tokenized question and context
doc_stride = 128  # Allowed overlap of the tokenized context if a QA-sample must be split into multiple tokenized samples due to length limitations

# A QA-sample might be split into multiple tokenized samples, in which case only one or a few tokenized samples contain the answer in their sub-context and the rest are no answerable samples.
# The number of no answerable samples is limited by the following threshold parameter ('None' disables the limit):
max_no_answers_per_possible_answer = 3 # for each possible answer pick X no answerable samples

# Checkpoint of the base model on Hugging Face
base_model = "microsoft/codebert-base"

# Flag indicating whether the context should be shuffled before inference or not
shuffle_context = False

# Option for verbose output when evaluating results. If this flag is 'True', the notebook prints detailed results for each QA-sample
verbose_evaluation_output = False

In [None]:
# State used for randomly picking X no answerable samples from all tokenized samples of a QA-sample
# Fixed seed so that the random picks are reproducible
random.seed(42)
# Maps the 'question_id' of a QA-sample to the list of picked sample indices (filled by 'pick_no_answerable_samples(...)')
chosen_sample_indices = dict()
# If 'True', 'tokenize_validation_samples(...)' picks no answerable samples randomly; otherwise it reuses the picks stored in 'chosen_sample_indices'
first_preprocessing_iteration = True

# Input Pipeline

In [None]:
# Prepare the tokenizer of the base model
tokenizer = AutoTokenizer.from_pretrained(base_model)
# 'True' if the tokenizer pads on the right side; 'tokenize_validation_samples(...)' uses this flag to decide whether the question or the context is passed first
pad_on_right = tokenizer.padding_side == "right" 

In [None]:
def get_answer_start(context: str,answer_text: str):
    """
    Returns the position of the first character of the passed answer text in the specified context or None if the answer is not in the context.
    The answer must match a complete whitespace-separated property of the context; substrings of a property do not count as a match.
    
    Parameters
    ----------
    context : str
        context (properties separated by whitespace)
    answer_text : str
        answer text
    
    Returns
    -------
    Position (character index in 'context') of the first character of the first property that equals the answer text,
    or None if no property of the context equals the answer text
    """
    
    position = 0
    context_length = len(context)
    
    # Scan the context directly instead of summing up the lengths of 'context.split()':
    # this keeps the returned index correct even if the context starts with whitespace or contains consecutive separators
    while position < context_length:
        # skip separators
        if context[position].isspace():
            position += 1
            continue
        
        # determine the end of the property starting at 'position'
        property_end = position
        while property_end < context_length and not context[property_end].isspace():
            property_end += 1
        
        if context[position:property_end] == answer_text:
            return position
        
        # continue behind the property
        position = property_end
    return None

def expand_samples(examples):
    """
    Flattens a batch created by 'Data Preparation V3', in which several question-answer pairs sharing the same context are collated within one sample,
    into a batch where each row represents exactly one question-answer pair.
    If context shuffling is enabled (see 'shuffle_context' in Settings), the properties of the context are shuffled for each row.
    The answer start of each question-answer pair is recalculated (in place) with respect to the (possibly shuffled) context.
    
    Parameters
    ----------
    examples : dict()
        Dictionary having the format:
        {
            'id': [string],
            'title': [string],
            'context': [string],
            'questions':
                [
                    {
                        'id': string
                        'question': string
                        'question_length': int
                        'answers': 
                            {
                                'text': [string],
                                'answer_start': [int]
                            }
                    }
                ]
        }
        
    Returns
    -------
    Python dictionary having the format:
    {
        'id': [string],
        'title': [string],
        'context': [string],
        'question': [string],
        'question_id': [string],
        'answers': 
            [
                {
                    'text': [string],
                    'answer_start': [int]
                }
            ]
    }
    """
    expanded = {column: [] for column in ("id", "title", "context", "question", "question_id", "answers")}
  
    for row, sample_id in enumerate(examples["id"]):
        for qa_pair in examples["questions"][row]:
            expanded["id"].append(sample_id)
            expanded["title"].append(examples["title"][row])
            
            # optionally (see settings) shuffle the properties of the context
            if shuffle_context:
                shuffled_properties = examples["context"][row].split(" ")
                random.shuffle(shuffled_properties)
                context = " ".join(shuffled_properties)
            else:
                context = examples["context"][row]
            expanded["context"].append(context)
                
            expanded["question"].append(qa_pair["question"])
            expanded["question_id"].append(qa_pair["id"])
            
            # the stored answer start is replaced, since it no longer fits if the context has been shuffled
            qa_answers = qa_pair["answers"]
            qa_answers["answer_start"][0] = get_answer_start(context,qa_answers["text"][0])
            expanded["answers"].append(qa_answers)
    return expanded

In [None]:
def mask_offset_mapping(sequence_ids, context_index, offset_mapping):
    """
    Creates a masked copy of the passed 'offset_mapping' in which only the entries of context tokens are kept.
    Every entry that belongs to a token outside the context is replaced by 'None'.
    
    Parameters
    ----------
    sequence_ids :
        Vector that holds one value per token; a token counts as part of the context if its value equals 'context_index'
    context_index : int
        Value that marks a token as part of the context in 'sequence_ids'
    offset_mapping
        Vector that holds one entry (start and end index on character level) per token
    
    Returns
    -------
    New list with the same length as 'offset_mapping'; entries of tokens that are not part of the context are 'None'
    """
    masked = []
    for token_position, offsets in enumerate(offset_mapping):
        if sequence_ids[token_position] == context_index:
            masked.append(offsets)
        else:
            masked.append(None)
    return masked

def calc_start_end_index(cls_index, offset_mapping, answers):
    """
    Calculates start and end index on token level of the passed answer based on the specified 'offset_mapping'.
    The method returns start and end index (token level) as well as flags indicating whether answer is out of span (third return parameter) and
    whether the sample is erroneous since the calculated start index is greater than end index (fourth return parameter). If the answer is out of span,
    start and end index have the value of the passed 'cls_index'.
    
    Parameters
    ----------
    cls_index : int
        Index of the [CLS] token that is used as start and end index if the answer is out of span
    offset_mapping 
        Vector that contains for each token its start and end index on character level in the original input (consisting of question, context, and special characters). 
        All entries that represent a token that is not part of the context must be 'None' (use the method 'mask_offset_mapping(...)' to mask the 'offset_mapping' vector before using this method).
    answers: dict()
        Dictionary having the format:
            {
                'text': [string],
                'answer_start': [int]
            }
        An empty 'answer_start' list or an 'answer_start' of 'None' (answer not found in context) is treated as 'answer out of span'.
    
    Returns
    -------
    start_index : int
        Calculated start index on token level (has the value of 'cls_index' if the answer is out of span)
    end_index : int
        Calculated end index on token level (has the value of 'cls_index' if the answer is out of span)
    answer_out_of_span : bool
        Flag that indicates whether the answer is completely or partially out of span (True) or within span (False)
    erroneous : bool
        Flag that indicates whether the result of the calculation is erroneous. This might be the case if the answer 
        does not start or end with the first/last character of a token, but elsewhere within the span of a token. 
    """
    
    answer_starts = answers["answer_start"]
    
    # if no answer is given or its position in the context is unknown ('get_answer_start(...)' returns None if the answer is not in the context)
    if not answer_starts or answer_starts[0] is None:
        # return 'answer out of span' result
        return cls_index, cls_index, True, False
    
    # load answer start (on character level)
    start_char = answer_starts[0]
    
    # calculate answer end (on character level)
    end_char = start_char + len(answers["text"][0])
    
    number_of_tokens = len(offset_mapping)
    
    # move start index (token level) to the first token of the context
    start_token = 0
    while start_token < number_of_tokens and offset_mapping[start_token] is None:
        start_token += 1
    
    # if the tokenized sample does not contain any context token, it cannot contain the answer
    if start_token == number_of_tokens:
        # return 'answer out of span' result
        return cls_index, cls_index, True, False
    
    # move end index (token level) to the last token of the context (terminates, since at least one context token exists)
    end_token = number_of_tokens - 1
    while offset_mapping[end_token] is None:
        end_token -= 1
        
    # check whether answer is within context span:
    # if start_char is smaller than index of first character of first context token OR end_char is greater than index of last character of last context token
    if start_char < offset_mapping[start_token][0] or end_char > offset_mapping[end_token][1]:
        # return 'answer out of span' result
        return cls_index, cls_index, True, False
    
    # move start index (token level) to the token whose start index on character level matches answer start
    while start_token < number_of_tokens and offset_mapping[start_token] is not None and start_char != offset_mapping[start_token][0]:
        start_token += 1
        
    # move end index (token level) to the token whose end index on character level matches answer end
    while end_token > 0 and offset_mapping[end_token] is not None and end_char != offset_mapping[end_token][1]:
        end_token -= 1
        
    # check whether start_token is greater than end_token
    # This could be the case if tokenization is not fine-grained enough, i.e. the answer does not start or end with the first/last character of a token, but elsewhere within the span of a token 
    if start_token > end_token:
        # return error result
        return cls_index, cls_index, True, True
    
    return start_token, end_token, False, False

def prepare_no_answerable_collection(sample_mapping):
    """
    Builds the collection that is later filled with the indices of the not answerable tokenized samples of each original sample.
    The collection maps every 'sample_index' occurring in 'sample_mapping' to its own empty list.
    If limiting 'no answerable samples' is disabled ('max_no_answers_per_possible_answer' is 'None'), no collection is created.
    
    Parameter
    ---------
    sample_mapping
        List of indices where each item represents a tokenized sample and its value is the index of the original QA-sample
    
    Returns
    -------
    Dictionary that maps each original 'sample_index' to an empty list,
    or 'None' if the feature of limiting 'no answerable samples' is disabled (see 'max_no_answers_per_possible_answer').
    """
    
    # feature disabled: there is nothing to collect
    if max_no_answers_per_possible_answer is None:
        return None
    
    # one separate empty list per original QA-sample (duplicates in 'sample_mapping' collapse to a single key)
    return {original_index: [] for original_index in sample_mapping}
    
def pick_no_answerable_samples(no_answerable_collection, original_sample_index, tokenized_examples, question_id:str):
    """
    Selects up to 'max_no_answers_per_possible_answer' no answerable samples of the original sample with the specified 'original_sample_index'.
    If the original sample has fewer no answerable samples than this limit, all of them are selected; otherwise the selection is random.
    For each selected sample the ignore flag in the passed 'tokenized_examples' collection is reset, and the selected indices are
    stored in the global 'chosen_sample_indices' dictionary under the passed 'question_id'.
    
    Parameters
    ----------
    no_answerable_collection 
        Collection that contains for each original sample a list with indices of no answerable samples (see prepare_no_answerable_collection(...)).
        Must not be 'None'.
    original_sample_index : int
        Index of the original QA-sample
    tokenized_examples
        Tokenized samples structure created by tokenizer
    question_id : str
        Question ID of the original QA-sample
    """
    candidates = no_answerable_collection[original_sample_index]
    
    if len(candidates) >= max_no_answers_per_possible_answer:
        # enough candidates available: draw randomly
        picked_indices = random.sample(candidates,max_no_answers_per_possible_answer)
    else:
        # fewer candidates than requested: take all of them
        picked_indices = candidates

    # reset the 'ignore' flag of every picked sample
    for picked_index in picked_indices: 
        tokenized_examples["ignore"][picked_index] = False

    # memorize the picks under the unique 'question_id' so that subsequent iterations (epoch > 0) can reuse them
    if question_id in chosen_sample_indices: 
        print("Warning! Collision detected for: ",question_id,", sample index is ",original_sample_index)
    chosen_sample_indices[question_id] = picked_indices

def remove_ignore_flag_on_selected_answerable_samples(question_id: str, tokenized_examples):
    """
    Resets the ignore flag in the passed 'tokenized_examples' collection for every sample index that is stored
    in the global 'chosen_sample_indices' dictionary for the passed 'question_id'.
    
    Parameters
    ----------
    question_id : str
        Question ID of the original QA-sample
    tokenized_examples
        Tokenized samples structure created by tokenizer
    """
    previously_picked = chosen_sample_indices[question_id]
    for sample_position in previously_picked:
        # this sample has been picked before, so it must not be ignored
        tokenized_examples["ignore"][sample_position] = False
                 
def tokenize_validation_samples(examples):
    """
    Tokenizes a batch of QA-samples with truncation, padding, and overflow (stride) and annotates each tokenized sample with meta data.
    Each QA-sample may result in multiple tokenized samples if the combination of tokenized question and context exceeds 'max_length'.
    In this case, the sub-contexts of the tokenized samples originating from the same QA-sample overlap ('doc_stride').
    
    Every tokenized sample gets exactly one entry in each added column, so that all columns stay aligned with 'input_ids'.
    Samples that should not be evaluated are not dropped here but flagged in the 'ignore' column (see remove_flagged_samples(...)):
    - erroneous samples (see calc_start_end_index(...)) are always flagged
    - no answerable samples are flagged if 'max_no_answers_per_possible_answer' is set, except for the samples picked per QA-sample
    
    Parameters
    ----------
    examples : dict()
        Batch of expanded QA-samples (see expand_samples(...)) with the columns 'id', 'context', 'question', 'question_id', and 'answers'
    
    Returns
    -------
    Tokenized samples structure created by the tokenizer, without 'overflow_to_sample_mapping', with masked 'offset_mapping', and extended by the columns
    'start_positions', 'end_positions', 'original_sample_id', 'original_context', 'original_answer_text', 'original_answer_start',
    'original_question', 'original_question_id', 'answer_out_of_span', 'cls_index', and 'ignore'
    """
    
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    
    # List of indices, where each item represents a tokenized sample and its value is the index of the original QA-sample, e.g. [0,0,1,2] which means that the first two tokenized samples belong to the first original QA-sample
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    
    # prepare a collection that contains for each original sample an empty list (we will fill these lists with indices of 'no answerable samples')
    no_answerable_collection = prepare_no_answerable_collection(sample_mapping)
    
    # Prepare output dataset structure
    # 'start_positions' (int) contains the token index of the start of the answer for each tokenized sample. Its value is the 'cls_index' if it is a 'no answerable' sample.
    tokenized_examples["start_positions"] = []
    # 'end_positions' (int) contains the token index of the end of the answer for each tokenized sample. Its value is the 'cls_index' if it is a 'no answerable' sample.
    tokenized_examples["end_positions"] = []
    
    # ID of the sample (string), e.g. "7fed77b9abe24a2db869c8b9919a1e9b"
    tokenized_examples["original_sample_id"] = []
    # Context of the sample (string), e.g. "users[*].id users[*].name _links.href _links.rel"
    tokenized_examples["original_context"] = []
    # Answer (string) of the sample, e.g. "users[*].name"
    tokenized_examples["original_answer_text"] = []
    # Start index of the answer of the sample (int), e.g. 12
    tokenized_examples["original_answer_start"] = []
    # Question of the sample (string), e.g. "The name of a user"
    tokenized_examples["original_question"] = []
    # ID of the question (string), e.g. "7fed77b9abe24a2db869c8b9919a1e9b_6sasd7b9abe24a2db869c8b9919a1e9b" (syntax: ID_Question ID)
    tokenized_examples["original_question_id"] = []
    # Flag whether answer is out of span (i.e. not contained within truncated context) (boolean)
    tokenized_examples["answer_out_of_span"] = []
    # Index of the CLS token (int) 
    tokenized_examples["cls_index"] = []
    # Flag that indicates whether sample should be removed in the subsequent processing step (see remove_flagged_samples(...))
    tokenized_examples["ignore"] = []
    
    # Iterate over all tokenized samples and extract their offset_mapping
    for i, offset_mapping in enumerate(tokenized_examples["offset_mapping"]):
        
        # Mask offset mapping
        masked_offset_mapping = mask_offset_mapping(
            sequence_ids = tokenized_examples.sequence_ids(i),
            context_index = 1 if pad_on_right else 0,
            offset_mapping = offset_mapping)
        
        # Load the index of the original QA-sample
        sample_index = sample_mapping[i]
        
        # Load the answer
        answers = examples["answers"][sample_index]
        
        # Load cls_index
        cls_index = tokenized_examples["input_ids"][i].index(tokenizer.cls_token_id)
        
        # Calculate start and end index (on token level)
        start_index, end_index, answer_out_of_span, erroneous = calc_start_end_index(cls_index,masked_offset_mapping,answers)
        
        if erroneous:
            # Erroneous samples must not be evaluated. They are flagged as 'ignore' instead of being skipped:
            # skipping would make all added columns shorter than 'input_ids' and shift the indices stored in 'no_answerable_collection'.
            # As they are not added to 'no_answerable_collection', their flag is never removed again.
            ignore = True
        elif answer_out_of_span and max_no_answers_per_possible_answer is not None:
            # Number of 'no answerable samples' is limited: preliminarily flag tokenized sample as 'ignore' ...
            ignore = True
            # ... and add it to the 'no answerable samples' collection, from which some samples are picked below
            no_answerable_collection[sample_index].append(i)
        else:
            # Answerable sample, or no limit for 'no answerable samples': do not ignore sample
            ignore = False
        tokenized_examples["ignore"].append(ignore)
            
        # append meta data to tokenized examples
        tokenized_examples["original_sample_id"].append(examples["id"][sample_index])
        tokenized_examples["original_context"].append(examples["context"][sample_index])
        tokenized_examples["original_answer_text"].append(answers["text"][0])
        tokenized_examples["original_answer_start"].append(answers["answer_start"][0])
        tokenized_examples["original_question"].append(examples["question"][sample_index])
        tokenized_examples["original_question_id"].append(examples["question_id"][sample_index])
        tokenized_examples["start_positions"].append(start_index)
        tokenized_examples["end_positions"].append(end_index)
        tokenized_examples["cls_index"].append(cls_index)
        tokenized_examples["answer_out_of_span"].append(answer_out_of_span) 
        tokenized_examples["offset_mapping"][i] = masked_offset_mapping
        
        
    # If number of 'no answerable samples' should be limited ...
    if max_no_answers_per_possible_answer is not None:
        # For each original QA-sample
        for sample_index in no_answerable_collection:
            question_id = examples["question_id"][sample_index]
            # If it is the first iteration (epoch), we will pick 'no answerable samples' randomly
            if first_preprocessing_iteration:
                pick_no_answerable_samples(no_answerable_collection,sample_index,tokenized_examples,question_id)
            # If it is the second, third, ... iteration (epoch > 0), select 'no answerable samples' by their unique 'question_id' key
            else:
                remove_ignore_flag_on_selected_answerable_samples(question_id,tokenized_examples)
                
    return tokenized_examples

In [None]:
def remove_flagged_samples(examples):
    """
    Copies all tokenized samples that are not flagged as 'ignore' into a new dataset structure and returns it.
    The 'ignore' column itself is not part of the result.
    
    Parameters
    ----------
    examples : dict()
        Tokenized samples
    
    Returns
    -------
    Tokenized samples without samples that have been marked as ignored
    """
    
    kept_columns = (
        # token indices of the answer start/end and the model input
        "start_positions",
        "end_positions",
        "input_ids",
        "attention_mask",
        "offset_mapping",
        # ID of the sample (string), e.g. "7fed77b9abe24a2db869c8b9919a1e9b"
        "original_sample_id",
        # Context of the sample (string), e.g. "users[*].id users[*].name _links.href _links.rel"
        "original_context",
        # Answer (string) of the sample, e.g. "users[*].name"
        "original_answer_text",
        # Start index of the answer of the sample (int), e.g. 12
        "original_answer_start",
        # Question of the sample (string), e.g. "The name of a user"
        "original_question",
        # ID of the question (string), syntax: ID_Question ID
        "original_question_id",
        # Flag whether answer is out of span (i.e. not contained within truncated context) (boolean)
        "answer_out_of_span",
        # Index of the CLS token (int)
        "cls_index",
    )
    filtered = {column: [] for column in kept_columns}
    
    for row in range(len(examples["input_ids"])):
        if examples["ignore"][row]:
            # flagged sample: drop it
            continue
        for column in kept_columns:
            filtered[column].append(examples[column][row])
    return filtered

In [None]:
# Load the validation dataset in streaming mode from all JSON files in the 'validation' sub-directory of 'input_path'
validation_dataset = load_dataset('json', data_files={'validation':input_path+"validation/*.json"}, streaming=True)

In [None]:
# Connect the data source with the 'expand_samples' function (applied to batches of 1024 samples); the collated 'questions' column is removed from the result
expanded_dataset = validation_dataset["validation"].map(expand_samples, batched=True, remove_columns=["questions"], batch_size=1024)

In [None]:
# Connect the expanded results with the tokenizer (applied to batches of 128 QA-samples); the columns of the untokenized QA-samples are removed from the result
tokenized_dataset = expanded_dataset.map(tokenize_validation_samples, batched=True, remove_columns=["id","title","context","question","answers","question_id"], batch_size=128)

In [None]:
# Finalize the dataset: drop all tokenized samples that are flagged as 'ignore' (applied to batches of 1024 tokenized samples) and remove the 'ignore' column
final_dataset = tokenized_dataset.map(remove_flagged_samples, batched=True, remove_columns=["ignore"], batch_size=1024)

# Evaluation

In [None]:
def _extend_property_backward(context, current_property, start_char_index):
    """Prepends the characters located directly in front of the span (up to the previous separator) to the full name of the property and marks it as partial."""
    back_counter = start_char_index-1
    while back_counter >= 0 and context[back_counter] != " ":
        current_property["name"] = context[back_counter] + current_property["name"]
        current_property["partial"] = True # only True (i.e. partial), if we have to go backward
        back_counter-=1

def identify_properties(context,start_char_index,end_char_index):
    """
    Identifies the properties that are (partially) covered by the span starting at 'start_char_index' and ending at 'end_char_index' in the specified context.
    The method returns the list of properties that are fully or partially covered. 
    Each property contained in this list is a dictionary and has the following structure:
    {
        'name': full property name in XPath style (str)
        'partial_name': concatenated characters of the property name that are covered (str)
        'length': number of characters of the property that are covered (int)
        'partial': flag indicating whether property is fully (False) or partially (True) covered
        'start_char_index': start index of the property on character level in the context (int)
        'end_char_index': end index of the property on character level in the context (int)
    }
    Note: If the property is only partially covered, the fields 'start_char_index' and 'end_char_index' point to the start and end of the partial, not the full property
    
    Parameters
    ----------
    context : str
        Original context (properties separated by a single space character)
    start_char_index : int
        Start index (inclusive) of the span on character level in the original context
    end_char_index : int
        End index (exclusive) of the span on character level in the original context
    
    Returns
    -------
    List of identified properties (in the order of their occurrence in the span)
    """
    properties = []
    current_property = None
    
    # iterate over each character of the span
    for index in range(start_char_index, end_char_index):
        c = context[index]
       
        # if character is a property separator
        if c == " ":
            # if there is still an unfinished property
            if current_property is not None:
                # finalize property
                current_property["end_char_index"] = index
                
                # if the property starts with the first character of the span, characters belonging to the property might be located before the span
                if current_property["start_char_index"] == start_char_index:
                    # go backward in context and determine full property name
                    _extend_property_backward(context, current_property, start_char_index)
                properties.append(current_property)
                current_property = None
        elif current_property is None:
            # this is the first character of a new property: prepare a new property
            current_property = {
                "name": c,
                "partial_name": c,
                "length": 1,
                "partial": False,
                "start_char_index": index,
                "end_char_index": None
            }
        else:
            # else, append character to current property
            current_property["name"]+= c
            current_property["partial_name"]+= c
            current_property["length"]+=1
    
    # after iterating over all characters of span
    # check whether there is an unfinished property:
    if current_property:
        current_property["end_char_index"] = end_char_index
        
        # go forward in context and determine full property name
        forward_counter = end_char_index
        while forward_counter < len(context) and context[forward_counter] != " ":
            current_property["name"] = current_property["name"] + context[forward_counter]
            current_property["partial"] = True # only True (i.e. partial), if we have to go forward
            forward_counter+=1
            
        # Special case: If the unfinished property starts with the first character of the span,
        # we had not the chance to go backward yet (since going backward is otherwise only done when finalizing a property, see code above).
        # The start index is checked (instead of 'properties' being empty), because the span might begin with separators:
        # in that case the characters in front of the span belong to a different property and must not be prepended.
        if current_property["start_char_index"] == start_char_index:
            _extend_property_backward(context, current_property, start_char_index)
                
        properties.append(current_property)
        
    return properties

def determine_best_property(properties):
    """
    Selects the single 'best' property from the passed list of properties, or returns 'None' if there is no unambiguous choice.

    The selection is done in two stages:

    1. Fully covered properties (i.e. 'partial' is False) take precedence. If exactly one fully covered property exists,
       it is returned. If two or more exist, the method returns 'None' due to this conflict.
    2. Only if no fully covered property exists, the partially covered property with the longest sequence of covered
       characters (field 'length') is returned. If several partial properties share this longest length, the method
       returns 'None' due to this conflict.

    Parameters
    ----------
    properties : [dict()]
        List of properties (use identify_properties(...) to identify these properties)

    Returns
    -------
    The 'best' property or 'None' (also 'None' if the input list is empty)
    """
    if not properties:
        # nothing to choose from
        return None

    # Stage 1: fully covered properties
    fully_covered = []
    for candidate in properties:
        if candidate["partial"]:
            continue
        if fully_covered:
            # a second fully covered property has been found (--> conflict)
            return None
        fully_covered.append(candidate)

    if fully_covered:
        # exactly one fully covered property
        return fully_covered[0]

    # Stage 2: at this point every property is partial, so the longest covered sequence decides
    longest_length = max(candidate["length"] for candidate in properties)
    longest_candidates = [candidate for candidate in properties if candidate["length"] == longest_length]

    if len(longest_candidates) > 1:
        # several partial properties share the longest length (--> conflict)
        return None
    return longest_candidates[0]
            
def predict(batched_samples, model):
    """
    Converts the passed batch of tokenized samples into input tensors that are fed into the passed transformer model for prediction.
    The method returns the model's output as well as the number of input samples.
    
    Parameters
    ----------
    batched_samples : [dict()]
        Batch of samples where each sample is a dictionary having the fields 'attention_mask' and 'input_ids'
    model
        Model used for inference; it must offer a 'predict(inputs, verbose=...)' method that accepts a dictionary of tensors
        
    Returns
    -------
    The output of the model (first return parameter) and the number of input samples (second return parameter)
    
    Raises
    ------
    ValueError
        If 'batched_samples' is empty (there is nothing the model could be fed with)
    """
    samples = list(batched_samples)
    if not samples:
        raise ValueError("predict(...) requires at least one tokenized sample, but the passed batch is empty")
    
    # Build each input tensor in one step (one row per sample) instead of concatenating sample by sample,
    # which would copy the growing tensor again for every additional sample.
    # The attention mask is a binary tensor so that the model knows to which token it has to attend to (typically 0 for padded indices)
    batch = {
        "attention_mask": tf.constant([sample["attention_mask"] for sample in samples]),
        "input_ids": tf.constant([sample["input_ids"] for sample in samples]),
    }
    
    output = model.predict(batch, verbose=0) #with default batch_size=32 (see Tensorflow 2.10)
    return output, len(samples)


def are_indices_out_of_context(start_index, end_index, offset_mapping):
    """
    Checks whether the span defined by the passed start and end index (token level) leaves the context.
    The boundaries of the context are encoded in the passed offset_mapping:
    every entry that represents a token that is not part of the context has the value 'None'.
    
    Parameters
    ----------
    start_index : int
        Start index (on token level) of the span
    end_index : int
        End index (on token level) of the span
    offset_mapping
        Vector that contains for each token its start and end index on character level in the original input (consisting of question, context, and special characters). 
        All entries that represent a token that is not part of the context must be 'None' (use the method 'mask_offset_mapping(...)' to mask the 'offset_mapping' vector before using this method).
        
    Returns
    -------
    'True' if the span does not lie completely within the context, else 'False'
    """
    token_count = len(offset_mapping)
    
    # indices beyond the end of the mapping can never point into the context (not expected to happen in practice)
    if start_index >= token_count or end_index >= token_count:
        return True
    
    # a masked ('None') entry at either boundary means that this boundary lies outside the context
    return any(offset_mapping[boundary] is None for boundary in (start_index, end_index))

def is_end_before_start(start_index, end_index):
    """
    Checks whether the span is inverted, i.e. whether its end lies before its start.
    
    Parameters
    ----------
    start_index : int
        Start index (on token level) of the span
    end_index : int
        End index (on token level) of the span
        
    Returns
    -------
    'True' if the passed 'end_index' is smaller than (i.e. before) the passed 'start_index', else 'False'
    (a span with identical start and end index is not inverted).
    """
    return start_index > end_index

def is_answer_too_long(start_index, end_index,max_length):
    """
    Checks whether the number of tokens in the span defined by the passed start and end index (token level) exceeds the passed maximum length.
    
    Parameters
    ----------
    start_index : int
        Start index (on token level) of the span
    end_index : int
        End index (on token level) of the span
    max_length : int
        Maximum length (number of tokens). A falsy value (e.g. 'None' or 0) disables the check.
        
    Returns
    -------
    'True', if the number of tokens in the span exceeds the passed length, else 'False' (always 'False' if the check is disabled).
    """
    if not max_length:
        # no limit configured
        return False
    
    # both boundaries are inclusive, hence '+ 1'
    token_count = end_index - start_index + 1
    return token_count > max_length
    

def get_answers(sample, predicted_start_logits, predicted_end_logits):
    """
    Interprets the predicted start and end logits of one tokenized sample and returns a ranked list of answers.
    
    Every combination of the 'n_best_size' highest-scoring start positions and the 'n_best_size' highest-scoring end positions
    is considered as a candidate span. A candidate is discarded if it is out of context, if its end lies before its start,
    if it is longer than 'max_answer_length', or if it does not point clearly to exactly one property.
    In addition, the NULL answer (scored at the position 'cls_index') is always part of the result.
    
    Parameters
    ----------
    sample : dict()
        Tokenized sample having the fields 'offset_mapping', 'cls_index', and 'original_context'
    predicted_start_logits
        Predicted start logits of the sample (indexed by token position)
    predicted_end_logits
        Predicted end logits of the sample (indexed by token position)
        
    Returns
    -------
    List of answers sorted by score in descending order. Each answer is a dictionary having the fields 'score', 'span',
    'start_char_index', 'end_char_index', and 'property'. For the NULL answer, 'span' and 'property' are 'None' and both
    index fields hold 'cls_index'.
    """
    offset_mapping = sample["offset_mapping"]
    cls_index = sample["cls_index"]
    context = sample["original_context"]
    
    # np.argsort returns the token positions sorted by logit in ascending order. Reversing this order and keeping
    # the first 'n_best_size' entries yields the highest-scoring positions (best position first).
    top_start_positions = np.argsort(predicted_start_logits)[::-1][:n_best_size].tolist()
    top_end_positions = np.argsort(predicted_end_logits)[::-1][:n_best_size].tolist()
    
    ranked_answers = []
    
    for start_token in top_start_positions:
        for end_token in top_end_positions:
            # Do not consider spans...
            # Case 1:) ...that are out of context (all tokens outside the context are 'None' in "offset_mapping", see tokenize_validation_samples)
            # Case 2:) ...where the end is before the start index
            # Optional case 3:) ...that are too long
            is_rejected = (
                are_indices_out_of_context(start_token, end_token, offset_mapping)
                or is_end_before_start(start_token, end_token)
                or is_answer_too_long(start_token, end_token, max_answer_length)
            )
            if is_rejected:
                continue
            
            # translate token positions into character positions of the context
            span_start = offset_mapping[start_token][0]
            span_end = offset_mapping[end_token][1]
            
            # Case 4:) ...that do not point clearly to a property (without conflicts)
            best_property = determine_best_property(identify_properties(context, span_start, span_end))
            if best_property is None:
                continue
            
            ranked_answers.append(
                {
                    "score": predicted_start_logits[start_token] + predicted_end_logits[end_token],
                    "span": context[span_start:span_end],
                    "start_char_index": span_start,
                    "end_char_index": span_end,
                    "property": best_property
                }
            )
    
    # the NULL answer is always a valid answer
    ranked_answers.append(
        {
            "score": predicted_start_logits[cls_index] + predicted_end_logits[cls_index],
            "span": None,
            "start_char_index": cls_index,
            "end_char_index": cls_index,
            "property": None
        }
    )
    
    # highest score first (the sort is stable, so answers with equal score keep their insertion order)
    ranked_answers.sort(key=lambda answer: answer["score"], reverse=True)
    return ranked_answers

    



In [None]:
def evaluate_sample_batch(batched_samples, model):
    """
    Runs the passed model on the passed batch of tokenized samples and creates one prediction object per tokenized sample.
    
    Parameters
    ----------
    batched_samples : [dict()]
        Batch of tokenized samples (see 'predict(...)' and 'get_answers(...)' for the fields that are read)
    model
        Model that is used for prediction (passed on to 'predict(...)')
        
    Returns
    -------
    List of prediction objects (one per tokenized sample of the batch). Each prediction object is a dictionary having the fields
    'id', 'theoretical_answer', 'theoretical_answer_out_of_span', 'rank', 'correct_answer', and 'answers'.
    'rank' and 'correct_answer' remain 'None' if none of the predicted answers is correct.
    """
    output, sample_count = predict(batched_samples, model)
    
    predictions = []
    for sample_index in range(sample_count):
        sample = batched_samples[sample_index]
        
        # correct answer of the original QA sample and the flag indicating whether this correct answer is out of span for the tokenized sample
        answer_out_of_span = sample["answer_out_of_span"]
        answer_text = sample["original_answer_text"]
        
        # ranked list of predicted answers, derived from the predicted start and end logits of this sample
        answers = get_answers(sample, output.start_logits[sample_index], output.end_logits[sample_index])
        
        prediction = {
            # the ID of the QA sample (only for documentation) #TODO: better use original_question_id here (however, does not affect results)
            "id": sample["original_sample_id"],
            # the correct answer of the entire QA sample
            "theoretical_answer": answer_text,
            # flag indicating whether the correct answer of the entire QA sample is out of span for this tokenized sample
            "theoretical_answer_out_of_span": answer_out_of_span,
            # the rank (i.e. 1-based position) of the correct answer in the ranked list of predicted answers (determined below)
            "rank": None,
            # the predicted answer that matches the correct answer (determined below)
            "correct_answer": None,
            # the ranked list of predicted answers
            "answers": answers
        }
        
        if verbose_evaluation_output:
            print("Question ID",sample["original_question_id"])
            print("Question",sample["original_question"])
            print("Correct Answer",answer_text)
        
        # search the ranked list (starting with the highest ranked answer) for the first correct answer
        for position, answer in enumerate(answers):
            predicted_property = answer["property"]
            
            if verbose_evaluation_output:
                shown_answer = "<   >" if predicted_property is None else predicted_property["name"]
                print("Answer ["+str(position)+"]: "+shown_answer)
            
            if predicted_property is None:
                # the NULL answer is only correct if the correct answer is out of span for this tokenized sample
                is_correct = answer_out_of_span
            else:
                # A predicted property is correct if its name equals the correct answer. This also holds for samples marked
                # as 'answer_out_of_span' (2023-04-18, v4): edge case where the model predicts a property that is only
                # partially contained in the presented context of the tokenized sample, but is the correct answer.
                is_correct = predicted_property["name"] == answer_text
            
            if is_correct:
                prediction["correct_answer"] = answer
                prediction["rank"] = position+1
                break
        
        # predictions contains one prediction object for each tokenized sample of the batch
        predictions.append(prediction)
    
    return predictions

In [None]:
# ranks k for which the metrics "@k" are reported
top_k = list(range(1, 11))

# one evaluation log per run; the file name is the timestamp of the run
now = datetime.now().strftime("%Y_%d_%m-%H:%M:%S")
evaluation_log = os.path.join(evaluation_path, now+".csv")

# captions of the columns that are repeated for every rank k (the order must match the order in which the result rows are written)
column_captions = [
    "Rank@",
    "Accuracy@",
    "Accuracy Answerable Samples@",
    "Accuracy Non Answerable Samples@",
    "Correct Predictions@",
    "Correct Predictions Answerable Samples@",
    "Correct Predictions Non Answerable Samples@",
    "Total Predictions@",
    "Total Predictions Answerable Samples@",
    "Total Predictions Non Answerable Samples@",
]

# header row: checkpoint path followed by one group of columns per rank (every cell is terminated by ';')
line = "Checkpoint Path;"
for k in top_k:
    line += "".join(caption+str(k)+";" for caption in column_captions)

with open(evaluation_log, 'a') as f:
    f.write(line+"\n")

In [None]:
# Evaluate every checkpoint (i.e. every sub-directory of 'checkpoint_directory') and append one result row per checkpoint to the evaluation log
for directory in os.listdir(checkpoint_directory):
    checkpoint_path = os.path.join(checkpoint_directory, directory)
    if not os.path.isdir(checkpoint_path):
        # only directories are treated as checkpoints
        continue

    # one result record per rank k
    results = []
    for k in top_k:
        results.append({
            "rank": k,
            "accuracy": 0,
            "accuracy_answerable_samples": 0,
            "accuracy_non_answerable_samples": 0,
            "correct_predictions": 0,
            "correct_predictions_answerable_samples": 0,
            "correct_predictions_non_answerable_samples": 0,
            "total_predictions": 0,
            "total_predictions_answerable_samples": 0,
            "total_predictions_non_answerable_samples": 0
        })

    # load model from checkpoint
    model = TFAutoModelForQuestionAnswering.from_pretrained(checkpoint_path)
    print("Checkpoint '",checkpoint_path,"' loaded:")
    # summary() prints the summary itself and returns None, so its result must not be printed again
    model.summary()

    all_predictions = []
    counter = 0
    batch = []

    # Run prediction:
    for validation_sample in tqdm(final_dataset):
        # stop once 'limit' samples have been collected ('>=' so that not one sample more than 'limit' is evaluated);
        # a falsy 'limit' (e.g. 'None' or 0) disables the limit
        if limit and counter >= limit:
            break

        # the sample must always be appended before the batch size is checked, otherwise a few tokenized samples get lost (bug fixed in v4)
        batch.append(validation_sample)
        counter+=1

        # evaluate the batch as soon as it is full
        if len(batch) == batch_size:
            all_predictions += evaluate_sample_batch(batch, model)
            batch.clear()

    # evaluate the remaining samples (last, incomplete batch)
    if batch:
        all_predictions += evaluate_sample_batch(batch, model)
        batch.clear()

    # Evaluate results:
    for prediction in all_predictions:
        # tokenized samples whose correct answer is out of span are counted as 'non answerable'
        if prediction["theoretical_answer_out_of_span"]:
            subset = "non_answerable_samples"
        else:
            subset = "answerable_samples"

        for result in results:
            result["total_predictions"] += 1
            result["total_predictions_"+subset] += 1

            # a prediction is correct at rank k if the correct answer is among the top k predicted answers
            if prediction["rank"] is not None and prediction["rank"] <= result["rank"]:
                result["correct_predictions"] += 1
                result["correct_predictions_"+subset] += 1

    # Finalize results (an accuracy remains 0 if there is no prediction in the respective subset)
    for result in results:
        if result["total_predictions"] > 0:
            result["accuracy"] = result["correct_predictions"] / result["total_predictions"]
        if result["total_predictions_non_answerable_samples"] > 0:
            result["accuracy_non_answerable_samples"] = result["correct_predictions_non_answerable_samples"] / result["total_predictions_non_answerable_samples"]
        if result["total_predictions_answerable_samples"] > 0:
            result["accuracy_answerable_samples"] = result["correct_predictions_answerable_samples"] / result["total_predictions_answerable_samples"]

    print("Total samples: ",len(all_predictions))
    for result in results:
        print("Accuracy@K ",result["rank"],": ",result["accuracy"])

    # fields of a result record in the order in which they are written to the evaluation log
    csv_fields = [
        "rank",
        "accuracy",
        "accuracy_answerable_samples",
        "accuracy_non_answerable_samples",
        "correct_predictions",
        "correct_predictions_answerable_samples",
        "correct_predictions_non_answerable_samples",
        "total_predictions",
        "total_predictions_answerable_samples",
        "total_predictions_non_answerable_samples",
    ]

    with open(evaluation_log, 'a') as f:
        line = checkpoint_path+";"
        for result in results:
            line += "".join(str(result[field])+";" for field in csv_fields)
        f.write(line+"\n")

    # plain assignment is sufficient: this code runs at module level, where a 'global' statement has no effect
    first_preprocessing_iteration = False
        