### Taxonomy Increments Analysis

This notebook analyzes the taxonomy increments. For each taxonomy iteration, it summarizes the categories together with their metadata, so that the sources behind each category can be traced precisely.

In [21]:
import os
import json

# Locations of the three JSON inputs, relative to the working directory of the
# notebook. Despite the `_dir` suffix, each of these names holds a file path.
current_dir = os.getcwd()
tax_inc_dir = "./taxonomy_increments.json"
cluster_inc_dir = "../clustering/cluster_increments.json"
extraction_dir = "../extraction/extraction.json"


def _read_json(relative_path):
    """Parse the UTF-8 encoded JSON file at `relative_path`, resolved against `current_dir`."""
    with open(os.path.join(current_dir, relative_path), "r", encoding="utf-8") as handle:
        return json.load(handle)


cluster_increments = _read_json(cluster_inc_dir)
taxonomy_increments = _read_json(tax_inc_dir)
extraction = _read_json(extraction_dir)

In [22]:
# The clustering process has two iterations and only in the second iteration the
# clusters are final, which is why only that iteration is summarized.
# Note that Clustering Iteration != Taxonomy Iteration
FINAL_CLUSTERING_ITERATION = 2


def _build_metadata_mapping(papers):
    """Index the literature metadata by paper title.

    Args:
        papers: Iterable of dicts, one per paper of the literature research.

    Returns:
        Dict mapping each paper title to a dict with the keys "year", "authors",
        "doi", "category" and "domain". A field that is absent from a paper is
        replaced by a "No <field>" placeholder string; a paper without a title is
        stored under the key "No title".
    """
    mapping = {}
    for paper in papers:
        mapping[paper.get("title", "No title")] = {
            "year": paper.get("year", "No year"),
            "authors": paper.get("authors", "No authors"),
            "doi": paper.get("doi", "No doi"),
            "category": paper.get("category", "No category"),
            "domain": paper.get("domain", "No domain"),
        }
    return mapping


def _summarize_clusters(increments, paper_metadata, final_iteration=FINAL_CLUSTERING_ITERATION):
    """Summarize every cluster and subcluster of the final clustering iteration.

    Args:
        increments: Iterable of clustering increments (dicts with "iteration" and
            "clusters").
        paper_metadata: Title-keyed metadata as returned by `_build_metadata_mapping`.
        final_iteration: Only increments whose "iteration" equals this value are used.

    Returns:
        Tuple `(summaries, subcluster_count)`. `summaries` maps both cluster ids and
        subcluster ids to their summary dict. Subcluster summaries additionally carry
        "parent_id" and keep "domains", "years" and "categories" as lists with one
        entry per type (duplicates included); cluster summaries hold the same fields
        as sets aggregated over all of their subclusters.
    """
    summaries = {}
    subcluster_count = 0
    for increment in increments:
        if increment.get("iteration", -1) != final_iteration:
            continue

        for cluster in increment.get("clusters", []):
            cluster_id = cluster.get("id", "unknown")
            cluster_sources = set()
            cluster_domains = set()
            cluster_years = set()
            cluster_categories = set()

            for subcluster in cluster.get("subclusters", []):
                subcluster_count += 1
                subcluster_sources = set()
                subcluster_domains = []
                subcluster_years = []
                subcluster_categories = []

                for type_dict in subcluster.get("types", []):
                    source = type_dict.get("source", "unknown")
                    # Look the paper up once; a source without metadata yields the
                    # "unknown" / -1 fallbacks below.
                    paper = paper_metadata.get(source, {})
                    subcluster_sources.add(source)
                    subcluster_domains.append(paper.get("domain", "unknown"))
                    subcluster_years.append(paper.get("year", -1))
                    subcluster_categories.append(paper.get("category", "unknown"))

                cluster_sources.update(subcluster_sources)
                cluster_domains.update(subcluster_domains)
                cluster_years.update(subcluster_years)
                cluster_categories.update(subcluster_categories)

                summaries[subcluster.get("id", "unknown")] = {
                    "name": subcluster.get("name", "unknown"),
                    "parent_id": cluster_id,
                    "info": subcluster.get("info", "No info"),
                    "sources": subcluster_sources,
                    "domains": subcluster_domains,
                    "years": subcluster_years,
                    "categories": subcluster_categories,
                }

            # Stored after its subclusters, so on an id collision the cluster wins.
            summaries[cluster_id] = {
                "name": cluster.get("name", "unknown"),
                "info": cluster.get("info", "No info"),
                "sources": cluster_sources,
                "domains": cluster_domains,
                "years": cluster_years,
                "categories": cluster_categories,
            }
    return summaries, subcluster_count


# First we prepare the metadata from the literature research.
# The metadata of a paper can then be looked up by using its title as a key.
metadata_mapping = _build_metadata_mapping(extraction)

# Summaries of the final clusters and subclusters, keyed by their id.
cluster_summaries, amount_subclusters = _summarize_clusters(cluster_increments, metadata_mapping)
print("Amount of subclusters:", amount_subclusters)

def print_category_summary(categories, tab_depth=1):
    """
    Recursively prints the given categories and the clusters they are based on.

    Every category is printed with its name and description. For each id listed in
    its "based_on_clusters", the matching entry of the module-level
    `cluster_summaries` is printed one level deeper (an unknown id falls back to
    placeholder values). Subcategories found under "categories" are printed
    recursively with an increased indentation.

    Args:
        categories: Iterable of category dicts.
        tab_depth: Number of tab characters put in front of each category line.
    """
    indent = "\t" * tab_depth
    nested_indent = "\t" * (tab_depth + 1)

    for category in categories:
        print(indent, "++++++++++++++++")
        print(indent, "Category Name:", category.get("name", "Unknown Category"))
        print(indent, "Description:", category.get("description", "No description"))

        cluster_ids = category.get("based_on_clusters", [])
        if not cluster_ids:
            print(indent, "Based On Clusters: None, added manually")
        else:
            print(indent, "Based On Clusters:")
            for cluster_id in cluster_ids:
                summary = cluster_summaries.get(cluster_id, {})
                print(nested_indent, "****************")
                print(nested_indent, "Cluster Name:", summary.get("name", "Unknown Cluster"))
                print(nested_indent, "Sources:", summary.get("sources", []))
                print(nested_indent, "Domains:", summary.get("domains", []))
                print(nested_indent, "Years:", sorted(summary.get("years", [])))
                print(nested_indent, "Literature Categories:", summary.get("categories", []))

        subcategories = category.get("categories", [])
        if subcategories:
            print(indent, "Subcategories:")
            print_category_summary(subcategories, tab_depth + 1)

def get_category_metadata(categories, dim_metadata):
    """
    Recursively collects the metadata of the given categories and their subcategories.

    For every cluster id in a category's "based_on_clusters", the "sources",
    "domains", "years" and "categories" of the matching entry in the module-level
    `cluster_summaries` are merged into `dim_metadata`. Ids without a summary
    contribute nothing.

    Args:
        categories: Iterable of category dicts.
        dim_metadata: Dict with the keys "sources", "domains", "years" and
            "categories"; each value is updated in place via `.update(...)`.

    Returns:
        None. The result is accumulated in `dim_metadata`.
    """
    tracked_fields = ("sources", "domains", "years", "categories")

    for category in categories:
        for cluster_id in category.get("based_on_clusters") or ():
            summary = cluster_summaries.get(cluster_id, {})
            for field in tracked_fields:
                dim_metadata[field].update(summary.get(field, []))

        subcategories = category.get("categories")
        if subcategories:
            get_category_metadata(subcategories, dim_metadata)

def print_taxonomy_summary(iteration):
    """
    Prints a summary for the given taxonomy iteration

    Every entry of the module-level `taxonomy_increments` whose "iteration" equals
    `iteration` is printed: first the iteration description, then for each dimension
    its name, description, the number of distinct sources, domains, years and
    literature categories gathered by `get_category_metadata`, and finally the
    category tree via `print_category_summary`.

    Args:
        iteration: Value compared against the "iteration" field of each increment.

    Returns:
        None. Everything is written to stdout. If no increment matches, a single
        notice line is printed instead of a summary.
    """
    matching_increments = [
        tax_it for tax_it in taxonomy_increments if tax_it.get("iteration") == iteration
    ]
    if not matching_increments:
        # Make an unknown iteration visible instead of silently printing nothing.
        print(f"No taxonomy increment found for iteration {iteration}")
        return

    for tax_it in matching_increments:
        iteration_info = tax_it.get("info", "No info")
        print(f"Iteration description: {iteration_info}")
        print("Taxonomy:")

        # A missing (or null) "taxonomy" entry is treated as an empty taxonomy
        # rather than raising a TypeError when iterating over None.
        for dim in tax_it.get("taxonomy") or []:
            dim_name = dim.get("name", "Unknown Dimension")
            dim_desc = dim.get("description", "No description")
            dim_categories = dim.get("categories", [])
            # Sets, so each value is counted once across the whole dimension.
            dim_metadata = {
                "sources": set(),
                "domains": set(),
                "years": set(),
                "categories": set(),
            }
            get_category_metadata(dim_categories, dim_metadata)
            print("________________________")
            print("Dimension Name:", dim_name)
            print("Description:", dim_desc)
            print("Overall Sources:", len(dim_metadata["sources"]))
            print("Overall Domains:", len(dim_metadata["domains"]))
            print("Overall Years:", len(dim_metadata["years"]))
            print("Overall literature Categories:", len(dim_metadata["categories"]))
            # NOTE: this counts the top-level categories of the dimension.
            print("Number of Clusters:", len(dim_categories))
            print_category_summary(dim_categories)

Amount of subclusters: 96


In [23]:
# Print the summary of the taxonomy increment recorded as iteration 1.
print_taxonomy_summary(iteration=1)

Iteration description: No info
Taxonomy:
________________________
Dimension Name: Graph Representation
Description: Classifies questions based on how the information that is required to answer the question is organized in the Knowledge Graph.
Overall Sources: 6
Overall Domains: 3
Overall Years: 5
Overall literature Categories: 2
Number of Clusters: 2
	 ++++++++++++++++
	 Category Name: Single Fact
	 Description: To fully answer the question, only a single fact has to be retrieved from the Knowledge Graph.
	 Based On Clusters:
		 ****************
		 Cluster Name: Single Fact
		 Sources: {'Ripple Down Rules for question answering', 'DBLP-QuAD: A Question Answering Dataset over the DBLP Scholarly Knowledge Graph', 'The SciQA Scientific Question Answering Benchmark for Scholarly Knowledge', 'Large-scale Simple Question Answering with Memory Networks', 'Question Answering on Scholarly Knowledge Graphs', 'LC-QuAD 2.0: A Large Dataset for Complex Question Answering over Wikidata and Dbpedia'}

In [24]:
# Print the summary of the taxonomy increment recorded as iteration 2.
print_taxonomy_summary(iteration=2)

Iteration description: No info
Taxonomy:
________________________
Dimension Name: Graph Representation
Description: Classifies questions based on how the information that is required to answer the question is organized in the Knowledge Graph.
Overall Sources: 6
Overall Domains: 3
Overall Years: 5
Overall literature Categories: 2
Number of Clusters: 2
	 ++++++++++++++++
	 Category Name: Single Fact
	 Description: To fully answer the question, only a single fact has to be retrieved from the Knowledge Graph.
	 Based On Clusters:
		 ****************
		 Cluster Name: Single Fact
		 Sources: {'Ripple Down Rules for question answering', 'DBLP-QuAD: A Question Answering Dataset over the DBLP Scholarly Knowledge Graph', 'The SciQA Scientific Question Answering Benchmark for Scholarly Knowledge', 'Large-scale Simple Question Answering with Memory Networks', 'Question Answering on Scholarly Knowledge Graphs', 'LC-QuAD 2.0: A Large Dataset for Complex Question Answering over Wikidata and Dbpedia'}

In [25]:
# Print the summary of the taxonomy increment recorded as iteration 3.
print_taxonomy_summary(iteration=3)

Iteration description: No info
Taxonomy:
________________________
Dimension Name: Graph Representation
Description: Classifies questions based on how the information that is required to answer the question is organized in the Knowledge Graph.
Overall Sources: 6
Overall Domains: 3
Overall Years: 5
Overall literature Categories: 2
Number of Clusters: 2
	 ++++++++++++++++
	 Category Name: Single Fact
	 Description: To fully answer the question, only a single fact has to be retrieved from the Knowledge Graph.
	 Based On Clusters:
		 ****************
		 Cluster Name: Single Fact
		 Sources: {'Ripple Down Rules for question answering', 'DBLP-QuAD: A Question Answering Dataset over the DBLP Scholarly Knowledge Graph', 'The SciQA Scientific Question Answering Benchmark for Scholarly Knowledge', 'Large-scale Simple Question Answering with Memory Networks', 'Question Answering on Scholarly Knowledge Graphs', 'LC-QuAD 2.0: A Large Dataset for Complex Question Answering over Wikidata and Dbpedia'}