## Cluster Analysis

This notebook analyzes the manually created clusters of question types from the literature survey.

In [None]:
import os
import json

# Load the JSON files
# All three inputs are addressed relative to the notebook's working directory.
current_dir = os.getcwd()
cluster_inc_dir = os.path.join("./cluster_increments.json")
extract_dir = os.path.join("../extraction/extraction.json")
references_dir = os.path.join("../literature_survey/research_process.json")


def _read_json(relative_path):
    """Parse the UTF-8 JSON file at `relative_path`, resolved against `current_dir`."""
    with open(os.path.join(current_dir, relative_path), "r", encoding="utf-8") as handle:
        return json.load(handle)


cluster_increments = _read_json(cluster_inc_dir)
extraction_mapping = _read_json(extract_dir)
research_process = _read_json(references_dir)

### First Iteration - Clusters and Types

In the following output, the clusters of the first iteration are shown with their respective types. 

In [None]:
import pandas as pd

# Every (type name, paper title) pair produced by the extraction step.
types_from_extraction = {}
for paper_data in extraction_mapping:
    title = paper_data.get("title", "unknown")
    # Named `type_entry` rather than `type`: a module-level loop variable called
    # `type` would shadow the builtin for every later cell run in this kernel.
    for type_entry in paper_data.get("types", []):
        name = type_entry.get("type", "unknown")
        types_from_extraction[(name, title)] = 1

# Walk the first clustering iteration: collect the type names of each cluster
# and record every (name, source) pair so duplicates and gaps can be reported.
types_from_clustering = {}
type_clusters = []
name_source_mapping = set()
for iteration in cluster_increments:
    if iteration.get("iteration", -1) != 1:
        continue

    for cluster in iteration.get("clusters", []):
        types = []
        for item in cluster.get("types", []):
            name = item.get("name", "unknown")
            source = item.get("source", "unknown")
            types.append(name)
            types_from_clustering[(name, source)] = 1
            if (name, source) in name_source_mapping:
                print(f"Duplicate: {name} - {source}")
            name_source_mapping.add((name, source))
        type_clusters.append(types)

# One table row per cluster; the cluster id is its position in the iteration.
rows = [
    {"cluster id": idx, "types": " ||| ".join(types_list), "count": len(types_list)}
    for idx, types_list in enumerate(type_clusters)
]

print(f"Total amount of types from extraction: {len(types_from_extraction)}")

# Missing types: extracted (name, title) pairs that no cluster references.
missing_types = set(types_from_extraction.keys()) - name_source_mapping
# Sorted so the report is identical between runs; comparing the string forms
# keeps the sort working even if a name or title is not a string.
for missing_type in sorted(missing_types, key=lambda pair: tuple(map(str, pair))):
    print(f"Missing type: {missing_type}")

print(f"Total amount of clusters: {len(rows)}")
print(f"Total amount of types: {sum(row['count'] for row in rows)}")

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# Explicit columns keep `sort_values` working when there are no clusters at all
# (a frame built from an empty list would otherwise have no "count" column).
df_types = pd.DataFrame(rows, columns=["cluster id", "types", "count"])
df_types = df_types.sort_values(by="count", ascending=False)
df_types

### Iteration 1 - Clusters with Description
In the following output, the clusters of the first iteration are shown with their respective types and descriptions. 

In [None]:
# Description of every extracted type, keyed by (paper title, type name).
type_descriptions = {
    (entry.get("title", "No title"), type_entry.get("type", "unknown")):
        type_entry.get("description", "No description")
    for entry in extraction_mapping
    for type_entry in entry.get("types", [])
}

# Print each first-iteration cluster member together with its description and
# keep the same information in `clusters_with_descriptions`.
clusters_with_descriptions = []
first_iteration_entries = (
    entry for entry in cluster_increments if entry.get("iteration", -1) == 1
)

for entry in first_iteration_entries:
    for cluster in entry.get("clusters", []):
        print("______________________")
        members = []
        for member in cluster.get("types", []):
            print("     -------------------")
            name = member.get("name", "unknown")
            source = member.get("source", "unknown")
            # Clustering entries refer to their paper by title in `source`.
            description = type_descriptions.get((source, name), "No Mapping Found")
            print(f"    Name: {name}")
            print(f"    Source: {source}")
            print(f"    Description: {description}")
            if description == "No Mapping Found":
                print(f"No mapping found for {source} - {name}")
            members.append({"name": name, "source": source, "description": description})

        clusters_with_descriptions.append({"cluster_types_with_description": members})



### Iteration 2 - Clusters with Description and Parent Clusters

In the following output, the clusters of the second iteration are shown with their respective types, descriptions, and parent clusters.

In [None]:
# Description of every extracted type, keyed by (paper title, type name).
type_descriptions = {}
for paper in extraction_mapping:
    paper_title = paper.get("title", "No title")
    for type_dict in paper.get("types", []):
        the_type = type_dict.get("type", "unknown")
        description = type_dict.get("description", "No description")
        type_descriptions[(paper_title, the_type)] = description


# Print every second-iteration parent cluster with its subclusters and collect,
# per parent cluster, the list of its subclusters with described types.
grouped_clusters_with_description = []
amount_of_clusters = 0
for iteration in cluster_increments:
    if iteration.get("iteration", -1) != 2:
        continue

    for cluster in iteration.get("clusters", []):
        # Count each parent cluster so the summary line after the loop reports
        # the real total (one per entry of `grouped_clusters_with_description`).
        amount_of_clusters += 1

        cluster_info = cluster.get("info", "No info")
        subclusters = cluster.get("subclusters", [])
        print("______________________")
        print(f"Cluster ID: {cluster.get('id', 'unknown')}")
        print(f"Cluster Name: {cluster.get('name', 'unknown')}")
        print(f"Cluster info: {cluster_info}")
        print(f"Subclusters: {len(subclusters)}: ")
        subcluster_data = []
        for subcluster in subclusters:
            subcluster_name = subcluster.get("name", "unknown")
            subcluster_info = subcluster.get("info", "No info")
            subcluster_id = subcluster.get("id", "unknown")
            print(f"     Subcluster ID: {subcluster_id}")
            print(f"     Name: {subcluster_name}")
            print(f"     Info: {subcluster_info}")
            print("     -------------------")
            cluster_data = []

            for item in subcluster.get("types", []):
                name = item.get("name", "unknown")
                source = item.get("source", "unknown")
                # Clustering entries refer to their paper by title in `source`.
                description = type_descriptions.get((source, name), "No Mapping Found")
                if description == "No Mapping Found":
                    print(f"No mapping found for {source} - {name}")
                cluster_data.append({"name": name, "source": source, "description": description})
            subcluster_data.append({
                "cluster_types_with_description": cluster_data,
                "subcluster_id": subcluster_id,
                "subcluster_name": subcluster_name,
                "subcluster_info": subcluster_info,
            })

        grouped_clusters_with_description.append(subcluster_data)
print(f"Amount of clusters: {amount_of_clusters}")
    

### Summary Analysis of Clusters

In the following output, a total summary analysis is shown, including the number of clusters, types, and descriptions. The analysis also includes the number of clusters per type and the number of clusters per parent cluster.
It helps to understand where each cluster originates and what its characteristics are.

In [None]:
from collections import defaultdict

def print_with_occurence(prefix, data):
    """Print `prefix` followed by every distinct item of `data` with its count.

    Items appear in order of first occurrence, formatted as "<count>x <item>"
    and separated by ", ". Empty `data` prints just "<prefix>: ".
    """
    tally = {}
    for entry in data:
        tally[entry] = tally.get(entry, 0) + 1
    summary = ", ".join(f"{count}x {entry}" for entry, count in tally.items())
    print(f"{prefix}: {summary}")
    
# -----------
# Prepare the data for each iteration
first_iteration = None
second_iteration = None
for iteration in cluster_increments:
    iteration_count = iteration.get("iteration", -1)
    if iteration_count == 1:
        first_iteration = iteration
    if iteration_count == 2:
        second_iteration = iteration

# Consistency check: report every first-iteration cluster that has no
# counterpart among the second-iteration subclusters. A subcluster counts as a
# counterpart when it holds the same number of types and contains every type
# name of the first-iteration cluster.
# An iteration that is absent from the data is treated as having no clusters,
# so the check degrades gracefully instead of failing on `None`.
first_iteration_clusters = (first_iteration or {}).get("clusters", [])

# Type names of each second-iteration subcluster, gathered once up front.
second_iteration_subcluster_names = [
    [entry.get("name", "unknown") for entry in subcluster.get("types", [])]
    for parent_cluster in (second_iteration or {}).get("clusters", [])
    for subcluster in parent_cluster.get("subclusters", [])
]

for first_iteration_cluster in first_iteration_clusters:
    first_iteration_types = first_iteration_cluster.get("types", [])
    first_iteration_names = [entry.get("name", "unknown") for entry in first_iteration_types]

    has_match = any(
        len(candidate_names) == len(first_iteration_names)
        and all(name in candidate_names for name in first_iteration_names)
        for candidate_names in second_iteration_subcluster_names
    )
    if not has_match:
        print(f"First Iteration Types: {first_iteration_names}")
        print("______________________")
# ----------
# ----------
# Create a mapping of metadata for each paper
# Keyed by paper title; each absent field gets its own "No ..." placeholder.
# When two papers share a title, the later entry replaces the earlier one.
metadata_mapping = {
    paper.get("title", "No title"): {
        "year": paper.get("year", "No year"),
        "authors": paper.get("authors", "No authors"),
        "doi": paper.get("doi", "No doi"),
        "category": paper.get("category", "No category"),
        "domain": paper.get("domain", "No domain"),
    }
    for paper in extraction_mapping
}
# ----------
# ----------
# Print the metadata for each cluster
# For every second-iteration parent cluster: how many subclusters and types it
# holds, and which domains, years, categories and sources those types come from.
for iteration in cluster_increments:
    if iteration.get("iteration", -1) != 2:
        continue

    for cluster in iteration.get("clusters", []):
        cluster_id = cluster.get("id", "unknown")
        cluster_name = cluster.get("name", "unknown")
        cluster_info = cluster.get("info", "No info")
        print("______________________")
        print(f"Parent-Cluster ID: {cluster_id}")
        print(f"Parent-Cluster Name: {cluster_name}")
        print(f"Parent-Cluster Info: {cluster_info}")

        subclusters = cluster.get("subclusters", [])
        types = []
        subcluster_names = []
        sources = {}  # source title -> its domain/year/category, one entry per distinct source
        domains = set()
        years = set()
        categories = set()
        for subcluster in subclusters:
            subcluster_names.append(subcluster.get("name", "unknown"))
            for type_dict in subcluster.get("types", []):
                name = type_dict.get("name", "unknown")
                source = type_dict.get("source", "unknown")
                # Single lookup per type; a source without metadata falls back
                # to the placeholder values below.
                paper_metadata = metadata_mapping.get(source, {})
                domain = paper_metadata.get("domain", "unknown")
                year = paper_metadata.get("year", -1)
                category = paper_metadata.get("category", "unknown")
                types.append(name)
                sources[source] = {
                    "domain": domain,
                    "year": year,
                    "category": category
                }
                domains.add(domain)
                years.add(year)
                categories.add(category)

        print(f"Amount of subclusters: {len(subclusters)}")
        print(f"Amount of types in all subclusters: {len(types)}")
        print(f"Amount of different domains: {len(domains)}")
        print(f"Amount of different years: {len(years)}")
        print(f"Amount of different categories: {len(categories)}")
        print(f"Amount of different sources: {len(sources)}")
        print(f"Names of Subclusters: {subcluster_names}")

        # Occurrences are counted per distinct source, not per type.
        src_domains = [details["domain"] for details in sources.values()]
        src_years = [details["year"] for details in sources.values()]
        src_categories = [details["category"] for details in sources.values()]
        src_names = list(sources.keys())
        print_with_occurence("Domains", src_domains)
        print_with_occurence("Years", src_years)
        print_with_occurence("Categories", src_categories)
        print("Sources: ", src_names)
# -----------
