In [None]:
# Changes (v3.1):
# - We added and renamed several counters (metrics)
# - Bug-Fix: v3 counted all properties with too deep paths and without descriptions, but only parameters (i.e., leaf properties) should be considered.
# - We removed the verification whether the context is empty in create_question_answer_samples_for_payload

# Changes (v3.2):
# - We analyse the description length of all schema properties
# - As well as the description length in the final QA samples and the length in the schema and the number of parameters per schema

# Install and Import Dependencies

## Install Dependencies

In [None]:
!pip install transformers

## Import Dependencies

In [None]:
import json
import random
import uuid
import operator
import os

# tqdm is used to visualize the progress while processing input files
from tqdm import tqdm

# for embedding current time into log file name 
from datetime import datetime

# We will use a pre-trained tokenizer to determine the length of strings
from transformers import AutoTokenizer

# for description length analysis (added in v3.2)
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import statistics

# Settings

In [None]:
# ---------------------------------------------------------------------------
# Input / output
# ---------------------------------------------------------------------------
# Directory the API tree model files (*.json) are read from, and the output directory
input_path = "/home/user/input_directory/"
output_path = "/home/user/output_directory/"

# Load only the first 'n' API tree models; None disables the limit
api_limit = None

# Keys of APIs that are dropped after loading and parsing (e.g. because they contain too many payloads)
excluded_api_keys = []

# ---------------------------------------------------------------------------
# Context creation
# ---------------------------------------------------------------------------
sort_by_name = True  # sort the context by name
max_depth = 8  # upper bound for the XPath depth in both context and answer

# Special tokens that are stripped from XPaths while the context string is created
to_be_removed = ["<?>","<str>","<num>","<int>","<bool>","{_}","$."]

# Options for building endpoint paths (same names and defaults as the build_path parameters)
http_verb_position = "suffix"
remove_path_parameter_indicator = False
remove_path_fragment_indicator = False

# ---------------------------------------------------------------------------
# Question creation
# ---------------------------------------------------------------------------
remove_uris = True  # strip URIs from descriptions
min_question_length = 3  # a question needs at least this many tokens
max_question_length = 96  # a question may have at most this many tokens

# Base model used for training later on; this notebook only uses its pre-trained tokenizer
base_model = "microsoft/codebert-base"

# ---------------------------------------------------------------------------
# Sample creation
# ---------------------------------------------------------------------------
# Upper bound of Question-Answer pairs per sample; an additional sample is created once it is exceeded
max_questions_per_sample = 32

# Number of containers the samples are distributed to
number_of_chunks = 10

# How often the generated sample set is repeated ('1' = every sample is created exactly once)
original_retakes = 1
shuffled_retakes = 0

# Fixed seed so that random operations are reproducible
random.seed(42)

# ---------------------------------------------------------------------------
# Analysis (added in v3.2)
# ---------------------------------------------------------------------------
pre_description_length_analysis = False

In [None]:
# --- Operation counters ----------------------------------------------------
# processed operations / operations that violate the constraints required to generate a question-answer pair
cnt_operations = cnt_invalid_operations = 0

# (added in v3.2) operations whose description violates at least one constraint
cnt_operation_with_description_constraint_violation = 0

# operations excluded because they have no description
cnt_operations_without_descriptions = 0

# operations whose path is deeper than the maximum depth
cnt_operations_with_too_deep_path = 0

# operations whose description is too short
cnt_operations_with_too_short_descriptions = 0

# operations whose description cannot be truncated, because either
#   1.) the description is still too long after removing trailing sentences
#       (i.e., truncate_question returns an empty description), or
#   2.) the truncated description does not contain enough tokens
cnt_operations_with_descriptions_that_could_not_be_truncated = 0

# operations whose description has been truncated
cnt_operations_with_truncated_descriptions = 0

# --- Sample / API counters -------------------------------------------------
cnt_split_samples = 0

# APIs that did not yield a single QA sample
cnt_apis_without_samples = 0

# Load and Parse API Tree Models

In [None]:
def remove_data_types_from_xpath(xpath: str, tokens=None):
    """
    Removes special tokens from the passed XPath (input string) and returns the modified string.

    Parameters
    ----------
    xpath : str
        input string
    tokens : list of str, optional
        tokens to strip from the XPath. If omitted (None), the module-level
        setting 'to_be_removed' is used, which is the original behaviour.

    Returns
    -------
    Modified input string
    """
    if tokens is None:
        # resolved at call time so that later changes of the setting are honoured
        tokens = to_be_removed
    for token in tokens:
        xpath = xpath.replace(token, "")
    return xpath


class ApiInterfaceNode:
    """
    Generic node of an API tree.

    Attributes (all node types)
    ---------------------------
    key : str
        key identifying the node among the children of its parent
    value
        optional value of the node
    node_type : str
        role of the node; the constructor handles 'api', 'pathSegment', 'method',
        'response', 'payload', and 'property'
    id : str
        identifier of the node taken from the raw node, with '-' replaced by '.'
    elements : [ApiInterfaceNode]
        children of the node
    parent : ApiInterfaceNode
        parent of the node (None for the root)
    raw_node
        original JSON structure of the node and its sub tree as Python dictionary

    Attributes (only for node type 'api')
    -------------------------------------
    api_key, api_name, api_version_key, api_version_name : str
        key and name of the API and of the API version

    Attributes (only for node type 'method')
    ----------------------------------------
    method_summary, method_description : str
        summary and description of the method

    Attributes (only for node type 'response')
    ------------------------------------------
    response_description : str
        description of the response

    Attributes (only for node type 'property')
    ------------------------------------------
    property_name, property_data_type, property_format, property_pattern, property_description : str
        name, data type, format, pattern, and description of the property
    property_xpath : str
        XPath of the property without whitespace and without the special tokens
        removed by remove_data_types_from_xpath

    Methods
    -------
    is_type(node_type : str):
        Tells whether the node has the passed type
    __str__():
        Returns the attributes of the node as JSON object (string)
    """
    def __init__(self, api_documentation_raw_node, parent_node):
        """
        Builds this node and, recursively, its whole sub tree from the raw API tree model
        (parsed JSON structure as Python dictionary).

        Parameters
        ----------
        api_documentation_raw_node
            parsed JSON structure of the API tree model as Python dictionary
        parent_node : ApiInterfaceNode
            parent of the node to be created, None for the root of the tree
        """
        raw = api_documentation_raw_node
        self.raw_node = raw

        # attributes every node has
        self.key = raw["key"]
        self.value = raw["value"]
        self.node_type = raw["type"]
        self.id = raw["id"].replace("-", ".")

        # children are created recursively and receive this node as their parent
        self.elements = [ApiInterfaceNode(child, self) for child in raw["elements"]]
        self.parent = parent_node

        # type-specific attributes; 'pathSegment' and 'payload' nodes do not have any
        if self.node_type == "api":
            self.api_key = raw["apiKey"]
            self.api_name = raw["apiName"]
            self.api_version_key = raw["versionKey"]
            self.api_version_name = raw["versionName"]
        elif self.node_type == "method":
            self.method_summary = raw["summary"]
            self.method_description = raw["description"]
        elif self.node_type == "response":
            self.response_description = raw["description"]
        elif self.node_type == "property":
            self.property_name = raw["name"]
            self.property_data_type = raw["dataType"]
            # drop all whitespace first, then strip the special tokens
            compact_xpath = raw["xpath"].replace(' ', '').replace('\t', '').replace('\n', '')
            self.property_xpath = remove_data_types_from_xpath(compact_xpath)
            self.property_format = raw["format"]
            self.property_pattern = raw["pattern"]
            self.property_description = raw["description"]

    def is_type(self, node_type: str):
        """
        Tells whether the node has the passed type.

        Parameters
        ----------
        node_type : str
            type the node's type is compared with

        Returns
        -------
        True if both types are equal, else False
        """
        return self.node_type == node_type

    def __str__(self):
        """
        Serializes the attributes of the node (generic ones first, type-specific ones afterwards).
        Children are only represented by their number.

        Returns
        -------
        JSON object as string
        """
        attributes = {
            "key": self.key,
            "value": self.value,
            "id": self.id,
            "type": self.node_type,
            "number_of_elements": len(self.elements),
        }

        # 'pathSegment' and 'payload' nodes contribute no further attributes
        if self.node_type == "api":
            attributes.update(
                apiKey=self.api_key,
                apiName=self.api_name,
                versionKey=self.api_version_key,
                versionName=self.api_version_name,
            )
        elif self.node_type == "method":
            attributes.update(summary=self.method_summary, description=self.method_description)
        elif self.node_type == "response":
            attributes.update(description=self.response_description)
        elif self.node_type == "property":
            attributes.update(
                name=self.property_name,
                dataType=self.property_data_type,
                xpath=self.property_xpath,
                format=self.property_format,
                pattern=self.property_pattern,
                description=self.property_description,
            )

        return json.dumps(attributes)

def load_and_parse_api(path: str):
    """
    Reads a single API tree model file (UTF-8 encoded JSON) and converts its structure
    into a tree of ApiInterfaceNodes.

    Parameters
    ----------
    path : str
        Path of the API tree model file

    Returns
    -------
    ApiInterfaceNode representing the root (no parent) of the loaded and parsed API tree model
    """
    with open(path, mode="r", encoding="utf-8") as api_file:
        raw_tree = json.loads(api_file.read())
    return ApiInterfaceNode(raw_tree, parent_node=None)

def load_and_parse_apis_from_directory(path: str, limit: int = None):
    """
    Loads and parses multiple API tree model files located in the specified directory.
    Only files ending with '.json' are considered.

    The file names are sorted before the limit is applied, because os.listdir returns
    entries in arbitrary order; sorting makes the selected subset reproducible. The
    limit is applied after the '.json' filter, so that other files in the directory
    do not reduce the number of loaded API tree models.

    Parameters
    ----------
    path : str
        Path to directory where the API tree model files are located

    limit : int
        Optional limit. If specified (and not 0), only the first 'n' API tree model
        files (in sorted order) are loaded and parsed

    Returns
    -------
    List of ApiInterfaceNodes where each node represents the root of a loaded and parsed API tree model
    """
    json_filenames = sorted(name for name in os.listdir(path) if name.endswith(".json"))
    if limit:
        json_filenames = json_filenames[:limit]

    return [load_and_parse_api(os.path.join(path, name)) for name in tqdm(json_filenames)]

def extract_nodes(node: ApiInterfaceNode, node_type: str):
    """
    Collects every node of the passed (sub) tree whose type equals the passed node type.
    The passed node itself is checked first, followed by its children in their original
    order (depth-first).

    Parameters
    ----------
    node : ApiInterfaceNode
      API tree model (input)
    node_type : str
      The type of the nodes that should be extracted

    Returns
    -------
    List of ApiInterfaceNodes matching the passed node type
    """
    matches = [node] if node.node_type == node_type else []
    for child in node.elements:
        matches.extend(extract_nodes(child, node_type))
    return matches

def extract_nodes_in_apis(nodes: [ApiInterfaceNode], node_type: str):
    """
    Collects the nodes matching the passed node type from every API tree model in the passed list.
    The result keeps the order of the passed list.

    Parameters
    ----------
    nodes : [ApiInterfaceNode]
      List of API tree models (input)
    node_type : str
      The type of the nodes that should be extracted

    Returns
    -------
    List of ApiInterfaceNodes matching the passed node type
    """
    return [match for root in nodes for match in extract_nodes(root, node_type)]


In [None]:
# Read all API tree models from the input directory (optionally limited by 'api_limit')
apis = load_and_parse_apis_from_directory(path=input_path, limit=api_limit)

## Prepare Pre-Trained Tokenizer for Length Calculation

In [None]:
# Load the pre-trained tokenizer that belongs to the configured base model
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [None]:
def get_length(input: str):
    """
    Determines the length of the passed string as the number of tokens produced by the
    module-level tokenizer (special tokens are not added).
    Empty input ('None' or '') has length '0' and is not passed to the tokenizer.

    Parameters
    ----------
    input : str
        Input string whose length should be calculated

    Returns
    -------
    Number of tokens
    """
    if not input:
        return 0
    token_ids = tokenizer.encode(input, add_special_tokens=False)
    return len(token_ids)

In [None]:
# Test tokenizer: show the number of tokens of a typical endpoint string
test_string = "users.{userId}.get"
get_length(test_string)

In [None]:
# Testing length limitations: count the tokens of a very long string (the numbers 0..99998 separated by spaces)
s = " ".join(str(number) for number in range(99999))
get_length(s)

# Description Length Analysis (added in v3.2)

In [None]:
# extract the operation nodes (node type 'method') of all loaded APIs
operation_nodes = extract_nodes_in_apis(nodes=apis, node_type="method")

In [None]:
# Report how many operation nodes were extracted
print("Number of operation nodes: ",len(operation_nodes))

In [None]:
if pre_description_length_analysis:
    # texts that are analysed in the following cells
    operation_summaries = []
    operation_descriptions = []

    # operations having only a summary / only a description / both
    num_operations_with_summary = 0
    num_operations_with_descriptions = 0
    num_operations_with_summary_and_descriptions = 0

    for operation in operation_nodes:
        has_summary = bool(operation.method_summary)
        has_description = bool(operation.method_description)

        # the three counters are mutually exclusive
        if has_summary and has_description:
            num_operations_with_summary_and_descriptions += 1
        elif has_summary:
            num_operations_with_summary += 1
        elif has_description:
            num_operations_with_descriptions += 1

        if has_summary:
            operation_summaries.append(operation.method_summary)
        if has_description:
            operation_descriptions.append(operation.method_description)
        

In [None]:
if pre_description_length_analysis:
    # one output line per counter: label followed by its value
    for label, count in (
        ("Number of operations: ", len(operation_nodes)),
        ("Number of operations with summary only: ", num_operations_with_summary),
        ("Number of operations with description only: ", num_operations_with_descriptions),
        ("Number of operations with summary and description: ", num_operations_with_summary_and_descriptions),
    ):
        print(label, count)

In [None]:
if pre_description_length_analysis:
    # lengths in tokens (pre-trained tokenizer) and in words (NLTK word_tokenize)
    operation_description_token_len, operation_description_word_len = [], []
    operation_summary_token_len, operation_summary_word_len = [], []

    # descriptions are processed first, summaries afterwards; one progress bar each
    for texts, token_lengths, word_lengths in (
        (operation_descriptions, operation_description_token_len, operation_description_word_len),
        (operation_summaries, operation_summary_token_len, operation_summary_word_len),
    ):
        for text in tqdm(texts):
            token_lengths.append(get_length(text))
            word_lengths.append(len(word_tokenize(text)))

In [None]:
def _print_length_statistics(label, lengths):
    """
    Prints mean, median, standard deviation, minimum, and maximum of the passed lengths,
    separated by '|' and preceded by the passed label.

    statistics.stdev requires at least two values and mean/min/max require at least one,
    so sparse inputs are handled explicitly instead of aborting the cell:
    an empty list prints 'n/a (no data)', a single value prints 'nan' as standard deviation.

    Parameters
    ----------
    label : str
        text printed in front of the statistics
    lengths : [int]
        values to be summarized
    """
    if not lengths:
        print(label, "n/a (no data)")
        return
    spread = statistics.stdev(lengths) if len(lengths) > 1 else float("nan")
    print(label, statistics.mean(lengths), "|", statistics.median(lengths), "|", spread, "|", min(lengths), "|", max(lengths))


if pre_description_length_analysis:
    _print_length_statistics("Mean|median|stdev|min|max for description token length: ", operation_description_token_len)
    _print_length_statistics("Mean|median|stdev|min|max for description word length: ", operation_description_word_len)
    _print_length_statistics("Mean|median|stdev|min|max for summary token length: ", operation_summary_token_len)
    _print_length_statistics("Mean|median|stdev|min|max for summary word length: ", operation_summary_word_len)

# Create QA-Samples

In [None]:
class QuestionAnswer:
    """
    A single question-answer pair: unique identifier, question text, length of the question
    (number of tokens), answer text, and the character-based index at which the answer
    starts within the context.

    Attributes
    ----------
    id : str
        unique identifier of the pair (random UUID in hex notation)
    question : str
        question text
    question_length : int
        number of tokens of the question text
    answer : str
        answer text
    answer_start : int
        position (index) of the first character of the answer text within the original context

    Methods
    -------
    as_dict():
        Returns the pair as Python dictionary following the structure for QA pairs recommended in https://huggingface.co/course/chapter7/7?fw=pt
    """

    def __init__(self, question, question_length, answer, answer_position_in_index):
        """
        Creates a question-answer pair with a freshly generated identifier.

        Parameters
        ----------
        question : str
            question text
        question_length : int
            number of tokens of the question text
        answer : str
            answer text
        answer_position_in_index : int
            position (index) of the first character of the answer text within the original context
        """
        self.id = uuid.uuid4().hex
        self.question, self.question_length = question, question_length
        self.answer, self.answer_start = answer, answer_position_in_index

    def as_dict(self):
        """
        Returns this pair as Python dictionary following the structure for QA pairs recommended in https://huggingface.co/course/chapter7/7?fw=pt
        {
            'id': ....,
            'question': ....,
            'question_length': .....,
            'answers':{
                'text': [....],
                'answer_start': [....]
            }
        }
        'answers.text' and 'answers.answer_start' are lists, because a question may have several
        answers in classical NL QA. Here every question has exactly one answer, hence both lists
        contain exactly one item.

        Returns
        -------
        Python dictionary
        """
        return {
            "id": self.id,
            "question": self.question,
            "question_length": self.question_length,
            "answers": {
                "text": [self.answer],
                "answer_start": [self.answer_start],
            },
        }

class QuestionAnswerSample:
    """
    A question-answer sample: unique identifier, title, context, and the question-answer
    pairs that belong to the context.

    Attributes
    ----------
    id : str
        unique identifier of the sample (random UUID in hex notation)
    title : str
        title of the sample
    context : str
        context (text)
    questionAnswers : [QuestionAnswer]
        question-answer pairs extracted from the context

    Methods
    -------
    __str__():
        Returns the sample including its question-answer pairs as JSON structure following the structure for QA pairs recommended in https://huggingface.co/course/chapter7/7?fw=pt
    """

    def __init__(self, context, questionAnswers: list, title = None):
        """
        Creates a sample with a freshly generated identifier.
        Note that the passed question-answer pairs are modified: the identifier of each pair
        is prefixed with the identifier of the sample and '_'.

        Parameters
        ----------
        context : str
            context (text)
        questionAnswers : [QuestionAnswer]
            list of question-answer pairs
        title : str
            optional title of the sample
        """
        self.id = uuid.uuid4().hex
        id_prefix = self.id + "_"
        for pair in questionAnswers:
            pair.id = id_prefix + pair.id

        self.title = title
        self.context = context
        self.questionAnswers = questionAnswers

    def __str__(self):
        """
        Returns the sample including its question-answer pairs as JSON structure following the structure for QA pairs recommended in https://huggingface.co/course/chapter7/7?fw=pt
        {
            "id": ....,
            "title": ....,
            "context": ....,
            "questions": [ //see QuestionAnswer.as_dict()]
        }
        """
        return json.dumps({
            "id": self.id,
            "title": self.title,
            "context": self.context,
            "questions": [pair.as_dict() for pair in self.questionAnswers],
        })


## Methods for Creating Context

In [None]:
def build_path(operation_node: ApiInterfaceNode, p_http_verb_position = "suffix" , p_remove_path_parameter_indicator = False, p_remove_path_fragment_indicator = False):
    """
    Builds the dot-separated endpoint path of the passed operation node by walking from the
    parent of the operation node upwards, as long as the next ancestor is a 'pathSegment' node.
    The parent of the operation node always contributes its key, regardless of its type.
    Example result: 'users.{userId}.get' (HTTP verb as suffix).

    Each key is cleaned before it is used: '/' is removed, ' ' and '!' are replaced by '-'.
    Keys that are empty after cleaning are skipped.

    Parameters
    ----------
    operation_node : ApiInterfaceNode
        the operation node; its key is used as HTTP verb
    p_http_verb_position : str
        'suffix' (default) appends the key of the operation node to the path, 'prefix' puts
        it in front of the path. With any other value the key is omitted
    p_remove_path_parameter_indicator : bool
        if set to True (default value is False), '{' and '}' are removed from the keys
    p_remove_path_fragment_indicator : bool
        if set to True (default value is False), '#' is replaced by '.' in the keys

    Returns
    -------
    String containing the endpoint path
    """
    # segments are collected from the operation upwards and reversed at the end
    path_segments = []
    node = operation_node.parent

    if p_http_verb_position == "suffix":
        path_segments.append(operation_node.key)

    while node is not None:
        key = node.key.replace("/","").replace(" ","-").replace("!","-")

        if p_remove_path_parameter_indicator:
            key = key.replace("{","").replace("}","")
        if p_remove_path_fragment_indicator:
            key = key.replace("#",".")

        if key:
            path_segments.append(key)

        # Continue only with 'pathSegment' ancestors. The root of a tree has no parent,
        # so the parent must be checked for None before its type is read.
        parent = node.parent
        if parent is not None and parent.node_type == "pathSegment":
            node = parent
        else:
            node = None

    if p_http_verb_position == "prefix":
        path_segments.append(operation_node.key)

    path_segments.reverse()
    return ".".join(path_segments)
        

def filter_endpoints(endpoints: list, max_depth: int):
    """
    Returns a new list containing only those endpoints whose depth does not exceed the
    specified maximum depth. The depth is the number of dot-separated parts,
    e.g., 'users.{userId}.get' has a depth of 3.
    If 'max_depth' is None, all endpoints are kept.

    Parameters
    ----------
    endpoints : [str]
        List of endpoints (strings)
    max_depth : int
        maximum depth (can be None)

    Returns
    -------
    Filtered list of endpoints
    """
    if max_depth is None:
        return list(endpoints)
    # n dots separate n + 1 parts
    return [endpoint for endpoint in endpoints if endpoint.count(".") + 1 <= max_depth]


def build_context_string(endpoints: list, p_sort_by_name: bool = False, p_shuffle: bool = False):
    """
    Builds the context string from the passed endpoints: duplicates are dropped (the first
    occurrence is kept), the remaining items are optionally sorted (ascending order) or
    shuffled, and finally joined with a single space as separator.
    An empty list of endpoints results in an empty string, i.e., ''.
    Sorting and shuffling exclude each other; requesting both fails with an AssertionError.

    Parameters
    ----------
    endpoints : [string]
        list of endpoints
    p_sort_by_name : bool
        if set to True (default value is False), the endpoint items are sorted in ascending order
    p_shuffle : bool
        if set to True (default value is False), the endpoint items are shuffled

    Returns
    -------
    String containing the concatenated endpoint items
    """
    assert not (p_shuffle and p_sort_by_name), "Cannot shuffle and sort endpoints at the same time"

    # dict keys are unique and keep their insertion order
    unique_endpoints = list(dict.fromkeys(endpoints))

    if p_shuffle:
        random.shuffle(unique_endpoints)
    if p_sort_by_name:
        unique_endpoints.sort()

    return " ".join(unique_endpoints)

## Methods for Creating Questions

In [None]:
def remove_inline_uris(text: str):
    """
    Removes URIs from the passed string and returns the modified string.
    The text is split at single spaces; every token containing 'http:' or 'https:'
    (case-insensitive) is replaced by a single space, all other tokens are kept unchanged.

    Each token is replaced at its own position. A replacement over the whole text would
    also hit longer URIs starting with a shorter one (e.g. 'http://x' inside 'http://x/y')
    and leave fragments such as '/y' behind.

    Parameters
    ----------
    text : str
        input string that should be scanned for URIs

    Returns
    -------
    Modified string
    """
    cleaned_tokens = []
    for token in text.split(' '):
        lowered = token.lower()
        if "http:" in lowered or "https:" in lowered:
            # a space as placeholder keeps the result identical to a plain replacement of the token
            cleaned_tokens.append(" ")
        else:
            cleaned_tokens.append(token)
    return " ".join(cleaned_tokens)

def truncate_question(question: str, max_question_length: int):
    """
    Shortens the passed question (string) if its number of tokens exceeds the passed maximum
    question length. A question that is too long is split at '.', and the resulting sentences
    are concatenated again (each followed by '.') as long as the estimated total length stays
    below the maximum question length. The first sentence that does not fit ends the
    concatenation, so the result may be an empty string.

    Parameters
    ----------
    question : str
        Original question that should be truncated
    max_question_length : int
        Maximum number of tokens

    Returns
    -------
    The (possibly truncated) question, its length (number of tokens), and whether it has been truncated (True) or not (False)
    """
    original_length = get_length(question)

    # short enough: hand the question back untouched
    if original_length <= max_question_length:
        return question, original_length, False

    truncated = ""
    estimated_length = 0

    # NOTE(review): empty fragments (e.g. after a trailing '.') have length 0 and are appended as '.'
    for sentence in question.split("."):
        sentence_length = get_length(sentence)

        # stop at the first sentence that would reach the maximum question length
        if estimated_length + sentence_length >= max_question_length:
            break

        truncated += sentence + "."
        # one extra token is reserved for the re-added '.'
        estimated_length += sentence_length + 1

    # the returned length is measured on the final string, not taken from the estimate
    return truncated, get_length(truncated), True

In [None]:
## Test truncate_question: the example is cut if it has more than 25 tokens
question = "This is a short example. We want to test, whether truncate_question works as expected. Hello World"
truncate_question(question, max_question_length=25)

In [None]:
# Token count of the first words of the example question used in the previous cell
get_length("This is a short")

## Methods for Creating Question-Answer Pairs and Samples

In [None]:
def get_answer_start(context: str,answer_text: str):
    """
    Looks for the first whitespace-separated item of the context that is equal to the passed
    answer text and returns the position of its first character. Items that merely contain
    the answer text do not match.
    The position is calculated from the item lengths plus one character per separator.

    Parameters
    ----------
    context : str
        context
    answer_text:
        answer text

    Returns
    -------
    Position of the first character of the answer text or None if the answer is not in the context
    """
    offset = 0
    for candidate in context.split():
        if candidate == answer_text:
            return offset
        # skip the item and the space that follows it
        offset += len(candidate) + 1
    return None

In [None]:
## Test get_answer_start: 'user' must be found as the separate last item, not as prefix of the other items
context = "user.address user.address.street user.address.city user"
get_answer_start(context=context, answer_text="user")

In [None]:
# Print every character of the test context together with its index
for i,char in enumerate(context):
    print(i," ",char)

In [None]:
def create_question_answer_pair(operation_node: ApiInterfaceNode, context: str, path: str, p_min_question_length: int = None , p_max_question_length: int = None, p_remove_uris: bool = False, p_max_depth: int = None):
    """
    Creates and returns the Question-Answer pair for the passed operation node, or None if a constraint is violated.
    
    The question is the operation's description or summary, whichever is longer according to get_length (the description wins ties).
    The answer is the passed endpoint path. Every call updates the module-level cnt_* counters that record
    how many operations were processed and why operations were rejected.
    
    Parameters
    ----------
    operation_node : ApiInterfaceNode
        the operation node
    context : str
        context, required for calculating the position (character-based index) of the answer
    path : str
        the endpoint path of the operation
    p_min_question_length : int
        Optional parameter (default is None) that specifies the minimum length (number of tokens) that a question must have. If the parameter is None, the minimum length is one token
    p_max_question_length : int
        Optional parameter (default is None) that specifies the maximum length (number of tokens) that a question may have. If the parameter is None, the maximum length is unlimited
    p_remove_uris : bool
        Optional parameter (default is False) indicating whether URIs should be removed from questions
    p_max_depth : int
        Optional parameter (default is None) that specifies the maximum depth that an answer (i.e. the endpoint) may have. If the parameter is None, there is no depth limitation
    
    Returns
    -------
    Created Question-Answer pair, or None if the description or the path does not satisfy the constraints
    """
    global cnt_operations
    global cnt_operations_without_descriptions
    global cnt_operations_with_too_short_descriptions
    global cnt_operations_with_truncated_descriptions
    global cnt_operations_with_descriptions_that_could_not_be_truncated
    global cnt_operation_with_description_constraint_violation
    global cnt_operations_with_too_deep_path
    global cnt_invalid_operations
    
    cnt_operations += 1
    
    # use the longer one of description and summary as question; the description wins ties
    use_summary = get_length(operation_node.method_description) < get_length(operation_node.method_summary)
    description = operation_node.method_summary if use_summary else operation_node.method_description
    
    if not description:
        # nothing to build a question from
        cnt_operations_without_descriptions += 1
    elif p_min_question_length and get_length(description) < p_min_question_length:
        # too short even before any clean-up; drop the description so the clean-up steps are skipped
        cnt_operations_with_too_short_descriptions += 1
        description = None
    
    description_is_valid = False
    if description:
        # Step 1 (optional): remove inline URIs
        if p_remove_uris:
            description = remove_inline_uris(description)
        
        # Step 2: collapse runs of spaces into a single space
        while '  ' in description:
            description = description.replace('  ',' ')
        
        # Step 3 (optional): truncate the description so that it does not exceed the maximum length
        if p_max_question_length is None:
            length = get_length(description)
        else:
            description, length, truncated = truncate_question(description, p_max_question_length)
            if truncated:
                cnt_operations_with_truncated_descriptions += 1
        
        # Step 4: the cleaned-up (and possibly truncated) description must still contain enough tokens
        if length < (p_min_question_length or 1):
            cnt_operations_with_descriptions_that_could_not_be_truncated += 1
        else:
            description_is_valid = True
    
    # count overall issues with description
    valid = description_is_valid
    if not valid:
        cnt_operation_with_description_constraint_violation += 1
    
    # depth of the endpoint path = number of '.'-separated segments
    if p_max_depth is not None and path.count(".") + 1 > p_max_depth:
        cnt_operations_with_too_deep_path += 1
        valid = False
    
    if not valid:
        cnt_invalid_operations += 1
        return None
    
    # the endpoint path is the answer; it must be part of the context
    answer_start = get_answer_start(context, path)
    assert answer_start is not None, "Answer '"+path+"' of question generated from ID '"+operation_node.id+"' is not in context"
    
    return QuestionAnswer(description, length, path, answer_start)
    

In [None]:
def create_question_answer_samples_for_api(
    api_node: ApiInterfaceNode, 
    p_min_question_length: int = None, 
    p_max_question_length: int = None, 
    p_remove_uris: bool = False, 
    p_max_depth: int = None, 
    p_http_verb_position = "suffix" , 
    p_remove_path_parameter_indicator = False, 
    p_remove_path_fragment_indicator = False,  
    p_max_questions_per_sample: int = None, 
    p_sort_by_name: bool = False, 
    p_shuffle_context: bool = False):
    """
    Creates and returns one or multiple Question-answer samples for the passed API node. The decision whether one or multiple samples are created depends on the 'p_max_questions_per_sample' threshold as well as the size (number of endpoints) of the API.
    
    All samples of an API share the same context, which is built from the endpoint paths of all operations of the API.
    
    Parameters
    ----------
    api_node : ApiInterfaceNode
        API node
    p_min_question_length : int
        Optional parameter (default is None) that specifies the minimum length (number of tokens) that a question must have. If the parameter is None, the minimum length is one token
    p_max_question_length : int
        Optional parameter (default is None) that specifies the maximum length (number of tokens) that a question may have. If the parameter is None, the maximum length is unlimited
    p_remove_uris : bool
        Optional parameter (default is False) indicating whether URIs should be removed from questions
    p_max_depth : int
        Optional parameter (default is None) that specifies the maximum depth that an XPath (as answer as well as in context) may have. If the parameter is None, there is no depth limitation
    p_http_verb_position : str
        Optional parameter (default is "suffix") that specifies the position of the HTTP verb in the endpoint path. Allowed values are "prefix" and "suffix".
    p_remove_path_parameter_indicator : bool
        Optional parameter (default is False) that specifies whether path parameter indicators should be removed from path segments
    p_remove_path_fragment_indicator : bool
        Optional parameter (default is False) that specifies whether fragment indicators should be removed from path segments 
    p_max_questions_per_sample : int
        Optional parameter (default is None) that specifies the maximum number of Question-Answer pairs per sample. The method distributes the Question-Answer pairs to multiple samples if this limit is exceeded. There is no upper limit, if the parameter is None
    p_sort_by_name : bool
        if set to True (default value is False), the method will sort XPaths in context in ascending order
    p_shuffle_context : bool
        if set to True (default value is False), the method will shuffle XPaths in context
        
    Returns
    -------
    List of created Question-Answer samples (even if one sample is created, it is a list), or None if no Question-Answer pair could be created for the API
    """
    global cnt_split_samples
    global cnt_apis_without_samples
    
    operation_nodes = extract_nodes(api_node, node_type="method")
    
    # build context
    endpoints = [
        build_path(operation_node, p_http_verb_position, p_remove_path_parameter_indicator, p_remove_path_fragment_indicator)
        for operation_node in operation_nodes
    ]
    if p_max_depth:
        endpoints = filter_endpoints(endpoints,p_max_depth)
    context = build_context_string(endpoints,p_sort_by_name,p_shuffle_context)
    
    # create question-answer pair for each operation
    question_answer_pairs = []
    for operation_node in operation_nodes:
        # build endpoint path
        path = build_path(operation_node, p_http_verb_position, p_remove_path_parameter_indicator, p_remove_path_fragment_indicator)
        # create question-answer pair; the constraints are taken from this function's parameters,
        # not from the notebook-level settings, so that callers can override them
        question_answer_pair = create_question_answer_pair(
            operation_node,
            context,
            path,
            p_min_question_length,
            p_max_question_length,
            p_remove_uris,
            p_max_depth)
        if question_answer_pair:
            question_answer_pairs.append(question_answer_pair)
    
    # no operation of this API satisfies the constraints
    if not question_answer_pairs:
        cnt_apis_without_samples+=1
        return None
    
    # no limit: all pairs go into a single sample
    if not p_max_questions_per_sample:
        return [QuestionAnswerSample(context, question_answer_pairs, api_node.id)]
    
    # Distribute the pairs to samples of at most p_max_questions_per_sample pairs each.
    # Pairs are taken from the end of the list, i.e., the first sample holds the last pairs in reverse order.
    remaining_pairs = question_answer_pairs[::-1]
    samples = [
        QuestionAnswerSample(context, remaining_pairs[start:start + p_max_questions_per_sample], api_node.id)
        for start in range(0, len(remaining_pairs), p_max_questions_per_sample)
    ]
    
    if len(samples) > 1:
        cnt_split_samples += 1
    return samples

In [None]:
# total number of generated question-answer pairs over all APIs
question_cnt = 0

# one entry per API for which at least one sample could be generated
results = []

# each entry: (shuffle context?, number of times the sample set of an API is generated)
retake_plan = ((False, original_retakes), (True, shuffled_retakes))

for api in tqdm(apis):
    if int(api.api_key) in excluded_api_keys:
        print("Skip ",api.api_name," (",api.api_key,")")
        continue

    # collect the samples of all retakes, original (unshuffled) retakes first
    samples = []
    for shuffle_context, retakes in retake_plan:
        for _ in range(retakes):
            samples_of_api = create_question_answer_samples_for_api(
                api,
                min_question_length,
                max_question_length,
                remove_uris,
                max_depth,
                http_verb_position,
                remove_path_parameter_indicator,
                remove_path_fragment_indicator,
                max_questions_per_sample,
                sort_by_name,
                shuffle_context)
            if samples_of_api:
                samples.extend(samples_of_api)

    question_cnt_per_api = sum(len(sample.questionAnswers) for sample in samples)

    # APIs without any sample are not part of the results
    if samples:
        question_cnt += question_cnt_per_api
        results.append({
            "samples": samples,
            "api_key": api.api_key,
            "api_name": api.api_name,
            "api_version_key": api.api_version_key,
            "api_version_name": api.api_version_name,
            "question_cnt_per_api": question_cnt_per_api
        })

In [None]:
# APIs with the most question-answer pairs first
sorted_results = sorted(results, key=operator.itemgetter("question_cnt_per_api"), reverse=True)

In [None]:
# Distribute the samples of each API to the chunk that currently contains the fewest questions.
# All samples of one API end up in the same chunk. Since the APIs are processed in descending order
# of their question count, this greedy assignment keeps the chunks balanced.
chunks = [[] for _ in range(number_of_chunks)]

# number of questions currently assigned to each chunk (same index as in 'chunks')
chunk_question_counts = [0] * number_of_chunks

with open(output_path+datetime.now().strftime("%Y-%m-%dT%H-%M-%S")+".log.csv","w") as log_file:
    log_file.write("API Key;API Name;API Version Key; API Version;#Samples;#Questions;Out File\n")
    for result in sorted_results:
        # Chunk with the fewest questions; the lowest index wins ties.
        # The size of a chunk is the number of questions it already holds, not a multiple of the
        # question count of the API that is about to be added.
        smallest_chunk_index = min(range(number_of_chunks), key=chunk_question_counts.__getitem__)

        chunks[smallest_chunk_index] += result["samples"]
        chunk_question_counts[smallest_chunk_index] += result["question_cnt_per_api"]
        filename = str(smallest_chunk_index)+".json"

        log_file.write(";".join(str(value) for value in (
            result["api_key"],
            result["api_name"],
            result["api_version_key"],
            result["api_version_name"],
            len(result["samples"]),
            result["question_cnt_per_api"],
            filename))+"\n")

In [None]:
# number of processed operations
print("# Operations: ",cnt_operations)

# number of operations that do not satisfy the constraints required to generate a question-answer pair
print("# Invalid Operations: ",cnt_invalid_operations)

# number of generated question-answer pairs, which is equal to the number of operations that satisfy all constraints for being a question-answer pair
print("# Questions: ",question_cnt)
assert question_cnt == cnt_operations-cnt_invalid_operations, "Mismatch between number of generated question-answer pairs and invalid operations"

# number of APIs from which no question-answer pairs can be generated, since the API does not contain any endpoint that satisfies the constraints
print("# API without any QA samples: ", cnt_apis_without_samples)

# (added in v3.2) number of operations whose descriptions do not satisfy at least one constraint
print("# Operations with description violation: ", cnt_operation_with_description_constraint_violation)
# number of operations that must be excluded due to missing descriptions
print("# Operations without descriptions: ", cnt_operations_without_descriptions)
# number of operations whose path exceeds the maximum depth
print("# Operations with too deep XPaths: ", cnt_operations_with_too_deep_path)
# number of operations whose descriptions are too short
print("# Operations with too short descriptions (even before truncation): ", cnt_operations_with_too_short_descriptions)

# number of operations whose description cannot be truncated, since:
# 1.) The description is still too long, even after removing trailing sentences (i.e., truncate_question returns an empty description)
# 2.) The description could be truncated, but the resulting description does not contain enough tokens
print("# Operations with descriptions could not be truncated (because they were too long or too short after truncation): ", cnt_operations_with_descriptions_that_could_not_be_truncated)

# An operation can be invalid for more than one reason (e.g. no description and too deep path),
# therefore the sum of the reasons is only an upper bound for the number of invalid operations
assert cnt_invalid_operations <= cnt_operations_without_descriptions+cnt_operations_with_too_deep_path+cnt_operations_with_too_short_descriptions+cnt_operations_with_descriptions_that_could_not_be_truncated, "Mismatch between number of invalid operations and numbers specifying reasons for being invalid"

# operations with descriptions that are truncated
print("# Operations with descriptions that could be truncated: ", cnt_operations_with_truncated_descriptions)


# number of APIs whose question-answer pairs did not fit into a single sample
print("# Original samples that have been split into multiple samples: ", cnt_split_samples)

In [None]:
# shuffle samples
# shuffle the samples within each chunk (in place)
for chunk in chunks[:number_of_chunks]:
    random.shuffle(chunk)

In [None]:
# print chunks
# print the size of each chunk and its share of all generated questions
for i in range(number_of_chunks):
    q_cnt = sum(len(sample.questionAnswers) for sample in chunks[i])
    print(i,": ",len(chunks[i])," samples / ",q_cnt," questions (", (q_cnt/question_cnt)*100,"%)")

# Analyze Question and Paragraph Length (added in v3.2)

In [None]:
# One entry per question-answer pair (not per sample): values that describe the sample's context
# are repeated for each question of the sample.
question_word_len = []      # words per question (nltk word_tokenize)
question_token_len = []     # length per question according to get_length
parameter_xpath_len = []
parameters_per_schema = []  # space-separated entries in the context of the question's sample
schema_token_len = []       # length of the context of the question's sample according to get_length

# distinct contexts over all samples
unique_schema_set = set()

for chunk_index in range(number_of_chunks):
    for sample in tqdm(chunks[chunk_index]):
        # context metrics are identical for all questions of a sample
        context_token_len = get_length(sample.context)
        context_entry_cnt = len(sample.context.split(" "))

        for question_answer in sample.questionAnswers:
            question_word_len.append(len(word_tokenize(question_answer.question)))
            question_token_len.append(get_length(question_answer.question))
            parameters_per_schema.append(context_entry_cnt)
            schema_token_len.append(context_token_len)

            unique_schema_set.add(sample.context)

In [None]:
# summary statistics of the per-question metrics collected above
for label, values in (
    ("Mean|median|stdev|min|max for question token length: ", question_token_len),
    ("Mean|median|stdev|min|max for question word length: ", question_word_len),
    ("Mean|median|stdev|min|max for number of parameters per schema: ", parameters_per_schema),
    ("Mean|median|stdev|min|max for schema token length: ", schema_token_len),
):
    print(
        label,
        statistics.mean(values), "|",
        statistics.median(values), "|",
        statistics.stdev(values), "|",
        min(values), "|",
        max(values))

In [None]:
import matplotlib.pyplot as plt

# histogram (100 bins) of the number of context entries, one data point per question
plt.hist(
    parameters_per_schema,
    bins=100,
    color="blue",
    edgecolor="black",
)

In [None]:
# Same metrics as above, but with one entry per distinct context instead of one entry per question
unique_context_token_len = []        # length of each distinct context according to get_length
parameters_per_unique_context = []   # space-separated entries of each distinct context

for unique_context in tqdm(unique_schema_set):
    unique_context_token_len.append(get_length(unique_context))
    parameters_per_unique_context.append(len(unique_context.split(" ")))
    

In [None]:
print("Number of unique schemas: ",len(unique_schema_set))
# total number of context entries over all distinct contexts
print(sum(parameters_per_unique_context))

# summary statistics of the per-context metrics
for label, values in (
    ("Mean|median|stdev|min|max for schema token length: ", unique_context_token_len),
    ("Mean|median|stdev|min|max for number of parameters per schema: ", parameters_per_unique_context),
):
    print(
        label,
        statistics.mean(values), "|",
        statistics.median(values), "|",
        statistics.stdev(values), "|",
        min(values), "|",
        max(values))

In [None]:
# histogram (100 bins) of the number of context entries, one data point per distinct context
plt.hist(
    parameters_per_unique_context,
    bins=100,
    color="blue",
    edgecolor="black",
)

In [None]:
# write chunks
for i in range(number_of_chunks):
  with open(output_path+str(i)+".json","w") as file:
    for sample in chunks[i]:
      file.write(str(sample))
      file.write("\n")

In [None]:
#write validation set
validation_index = 2
with open(output_path+"validation.json","w") as file:
    for sample in chunks[validation_index]:
        file.write(str(sample))
        file.write("\n")

In [None]:
#write test set
test_index = 9
with open(output_path+"test.json","w") as file:
    for sample in chunks[test_index]:
        file.write(str(sample))
        file.write("\n")

In [None]:
# write train set: all chunks that are reserved neither for validation nor for testing.
# The indices are derived from validation_index, test_index and number_of_chunks (set in the cells above)
# so that the train set cannot overlap with the held-out sets when one of these settings is changed.
held_out_indices = {validation_index, test_index}
train_indices = [i for i in range(number_of_chunks) if i not in held_out_indices]

train_samples = []
for i in train_indices:
    train_samples += chunks[i]
# mix the samples of the different chunks
random.shuffle(train_samples)

with open(output_path+"train.json","w") as file:
    for sample in train_samples:
        file.write(str(sample))
        file.write("\n")