In [2]:
import json
import os

In [3]:
def max_nesting_levels(obj):
    """Return the depth of the deepest dict/list nesting inside ``obj``.

    Anything that is neither a dict nor a list has depth 0. A dict or list
    counts as one level plus the depth of its deepest member, so an empty
    container has depth 1.
    """
    if isinstance(obj, dict):
        children = obj.values()
    elif isinstance(obj, list):
        children = obj
    else:
        return 0
    # default=0 covers empty containers; without it max() raises ValueError
    # on an empty iterable.
    return 1 + max((max_nesting_levels(child) for child in children), default=0)

In [4]:
def total_values_count(obj):
    """Count the leaf values reachable from ``obj``.

    A value that is neither a dict nor a list counts as one leaf. Dicts and
    lists contribute the sum of the leaves of their members, so an empty
    container counts as zero.
    """
    if isinstance(obj, list):
        members = obj
    elif isinstance(obj, dict):
        members = obj.values()
    else:
        # Not a dict or a list: this is a single leaf value.
        return 1
    count = 0
    for member in members:
        count += total_values_count(member)
    return count

In [5]:
def total_nested_keys_count(obj):
    """Count every dict key found in ``obj`` at any depth.

    Values that are neither a dict nor a list contribute nothing. A list
    contributes the keys found in its members; a dict contributes its own
    keys plus the keys found in its values.
    """
    if isinstance(obj, list):
        return sum(map(total_nested_keys_count, obj))
    if isinstance(obj, dict):
        keys_below = sum(map(total_nested_keys_count, obj.values()))
        return keys_below + len(obj)
    # Not a dict or a list: there are no keys to count.
    return 0

In [57]:
def analyze_obj(obj, level=0):
    """Summarise every top-level key of the dict ``obj``.

    Returns a dict that maps each key to a record with three counters:
    'num_values' (leaf values under the key), 'num_nesting_levels' and
    'num_nested_keys'. The last two are computed only when the value is a
    dict; for any other value, lists included, both are reported as 0.

    ``level`` is accepted but not used by the body.
    """
    summary = {}
    for key, value in obj.items():
        value_is_dict = isinstance(value, dict)
        summary[key] = {
            'num_values': total_values_count(value),
            'num_nesting_levels': max_nesting_levels(value) if value_is_dict else 0,
            'num_nested_keys': total_nested_keys_count(value) if value_is_dict else 0,
        }
    return summary

In [6]:
# Root of the dataset tree. It defaults to the original workstation location;
# set the SAMOTHRACE_DATASET_DIR environment variable to point the notebook
# at a copy of the dataset stored elsewhere.
_dataset_root = os.environ.get(
    'SAMOTHRACE_DATASET_DIR',
    '/home/workstation/samothrace-pseudonymization/dataset',
)

# The trailing '' makes os.path.join() end each path with a separator, which
# is required because paths are built elsewhere in this notebook by plain
# string concatenation (e.g. base_dir + resource).
base_dir = os.path.join(_dataset_root, 'extracted', '')  # input JSON, one folder per resource
counted_output_dir = os.path.join(_dataset_root, 'analysis', 'counted', '')  # per-file counts
grouped_output_dir = os.path.join(_dataset_root, 'analysis', 'grouped', '')  # groups of identical counts


In [59]:
# Analyse every extracted JSON document and store the per-key counts under
# counted_output_dir, mirroring the <resource>/<file> layout of base_dir.
resources = os.listdir(base_dir)
for resource in resources:
    resource_dir = os.path.join(base_dir, resource)
    if not os.path.isdir(resource_dir):
        # A stray file next to the resource folders would make os.listdir()
        # raise NotADirectoryError, so skip anything that is not a folder.
        continue
    output_dir = os.path.join(counted_output_dir, resource)
    files = os.listdir(resource_dir)
    for file in files:
        if not file.endswith('.json'):
            continue
        # Read and close the input before the output is opened.
        with open(os.path.join(resource_dir, file)) as f:
            data = json.load(f)
        analysis = analyze_obj(data)
        # exist_ok=True replaces the exists()-then-makedirs() sequence, which
        # can fail if the folder appears between the two calls.
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, file), 'w') as output:
            json.dump(analysis, output)

In [8]:
# Sanity check: inside each resource folder, report the first counted file
# whose analysis differs from that of the first file listed.
resources = os.listdir(counted_output_dir)
for resource in resources:
    resource_dir = counted_output_dir + resource
    files = os.listdir(resource_dir)
    first_data = None
    for position, file in enumerate(files):
        with open(resource_dir + '/' + file) as f:
            data = json.load(f)
        if position == 0:
            # The first file of the folder is the reference for the rest.
            first_data = data
            continue
        if data == first_data:
            continue
        print(data)
        print(first_data)
        print('Different data for ' + resource + '/' + file)
        # One mismatch per resource is enough; move on to the next folder.
        break

{'id': {'num_values': 1, 'num_nesting_levels': 0, 'num_nested_keys': 0}, 'meta': {'num_values': 1, 'num_nesting_levels': 2, 'num_nested_keys': 1}, 'status': {'num_values': 1, 'num_nesting_levels': 0, 'num_nested_keys': 0}, 'class': {'num_values': 1, 'num_nesting_levels': 1, 'num_nested_keys': 1}, 'type': {'num_values': 3, 'num_nesting_levels': 0, 'num_nested_keys': 0}, 'subject': {'num_values': 1, 'num_nesting_levels': 1, 'num_nested_keys': 1}, 'period': {'num_values': 2, 'num_nesting_levels': 1, 'num_nested_keys': 2}, 'reason': {'num_values': 3, 'num_nesting_levels': 3, 'num_nested_keys': 4}, 'serviceProvider': {'num_values': 1, 'num_nesting_levels': 1, 'num_nested_keys': 1}, 'resourceType': {'num_values': 1, 'num_nesting_levels': 0, 'num_nested_keys': 0}}
{'id': {'num_values': 1, 'num_nesting_levels': 0, 'num_nested_keys': 0}, 'meta': {'num_values': 1, 'num_nesting_levels': 2, 'num_nested_keys': 1}, 'status': {'num_values': 1, 'num_nesting_levels': 0, 'num_nested_keys': 0}, 'class': 

In [24]:
# Group the counted files of each resource by identical analysis content and
# write one JSON list of file names per group under grouped_output_dir.
# NOTE: group files left behind by an earlier run are not removed here.
resources = os.listdir(counted_output_dir)
for resource in resources:
    different_data = {}  # group name -> names of the files sharing one analysis
    temp_data = {}  # group name -> the analysis that defines the group
    index = 0
    files = os.listdir(counted_output_dir + resource)
    for file in files:
        with open(counted_output_dir + resource + '/' + file) as f:
            data = json.load(f)
        for key, value in temp_data.items():
            if data == value:
                different_data[key].append(file)
                # Leave the search only once a matching group is found. The
                # previous version broke out after looking at the first
                # group regardless of the comparison, so files equal to any
                # later group were filed under a new, duplicate group.
                break
        else:
            # No existing group matched: this file starts a new one.
            group_name = 'group_' + str(index)
            different_data[group_name] = [file]
            temp_data[group_name] = data
            index += 1
    os.makedirs(grouped_output_dir + resource, exist_ok=True)
    for key, value in different_data.items():
        with open(grouped_output_dir + resource + '/' + key + '.json', 'w') as output:
            json.dump(value, output)

In [15]:
# Disabled helper: the definition below is wrapped in a string literal, so it
# is never executed and defines nothing.
# NOTE(review): as written, the membership test uses value[nested_value] but
# the entries are stored under the literal nested_value key. With the numeric
# counts produced by analyze_obj the test never matches, so the result holds
# a single key whose list is overwritten on every iteration. Store under
# value[nested_value] instead if this helper is ever re-enabled.
""" def group_dict_by_nested_values(input_dict, nested_value):
    grouped_dict = {}
    for key, value in input_dict.items():
        if value[nested_value] not in grouped_dict:
            grouped_dict[nested_value] = [key]
        else:
            grouped_dict[nested_value].append(key)
    return grouped_dict """

In [17]:
# Disabled driver: the code below is wrapped in a string literal, so it does
# not run. If enabled, it would load the first counted file listed in each
# resource folder, group its keys by 'num_nesting_levels' with
# group_dict_by_nested_values, and print the result.
""" analyzed_resources = os.listdir(counted_output_dir)

for resource in analyzed_resources:
    files = os.listdir(counted_output_dir + resource)
    for file in files:
        with open(counted_output_dir + resource + '/' + file) as f:
            data = json.load(f)
            grouped_data = group_dict_by_nested_values(data, 'num_nesting_levels')
            print(grouped_data)
            break    """
                

{'num_nesting_levels': ['resourceType']}
{'num_nesting_levels': ['resourceType']}
{'num_nesting_levels': ['resourceType']}
{'num_nesting_levels': ['resourceType']}
{'num_nesting_levels': ['resourceType']}
