## Normalize Datatypes

In [2]:
import json
import re

1. Lowercase
2. Replace '_' with a space
3. Delete a trailing '(xxx)' if it is the last part of the datatype

In [3]:
# Load updated_modalities_result_2.json, the input whose data-type titles are normalized below
with open('updated_modalities_result_2.json', 'r', encoding='utf-8') as file:
    updated_result = json.load(file)

# Helper: drop one trailing, balanced "(...)" group from a title
def _strip_trailing_parenthetical(title):
    """
    Remove the final parenthesised group of ``title`` when the title ends with it.

    Only the last balanced group (nested parentheses included) and the
    whitespace in front of it are removed; earlier groups are kept.
    Titles that do not end with ')' or whose parentheses are unbalanced
    are returned unchanged.
    """
    if not title.endswith(')'):
        return title
    depth = 0
    # Walk backwards to find the '(' that matches the final ')'.
    for idx in range(len(title) - 1, -1, -1):
        char = title[idx]
        if char == ')':
            depth += 1
        elif char == '(':
            depth -= 1
            if depth == 0:
                return title[:idx].rstrip()
    return title


# Function to normalize data-type titles (lowercase, '_' -> ' ', drop trailing "(...)")
def update_datatypes_to_lowercase(data):
    """
    Normalize every data-type title found under a 'modalities' key, in place.

    For each ``modality['data_types'][i]['title']`` string the title is
    lowercased, underscores are replaced by spaces, and a trailing
    parenthesised group is removed. Dicts and lists are traversed
    recursively; entries of an unexpected type are skipped instead of
    raising.

    Args:
        data: Parsed JSON structure (dict, list or scalar). Mutated in place.

    Returns:
        None.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            if key == 'modalities' and isinstance(value, list):
                for modality in value:
                    if not isinstance(modality, dict):
                        continue
                    data_types = modality.get('data_types')
                    if not isinstance(data_types, list):
                        continue
                    for data_type in data_types:
                        if not isinstance(data_type, dict):
                            continue
                        title = data_type.get('title')
                        if not isinstance(title, str):
                            continue
                        title = title.lower().replace('_', ' ')
                        # Strip only the last "(...)" group; a greedy regex would
                        # delete everything from the first "(" onwards.
                        data_type['title'] = _strip_trailing_parenthetical(title)
            else:
                update_datatypes_to_lowercase(value)
    elif isinstance(data, list):
        for item in data:
            update_datatypes_to_lowercase(item)

# Update the datatypes titles in the updated_result data
# (the function rewrites the titles inside updated_result in place and returns nothing)
update_datatypes_to_lowercase(updated_result)

# Save the updated result back to a file
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes
with open('updated_modalities_result_2_lowercase.json', 'w', encoding='utf-8') as file:
    json.dump(updated_result, file, ensure_ascii=False, indent=4)

In [5]:
# Reload the normalized JSON written by the previous cell
with open('updated_modalities_result_2_lowercase.json', 'r', encoding='utf-8') as file:
    updated_result = json.load(file)

# Set to store unique data types
unique_data_types = set()

# Function to extract data types from the data
def extract_data_types(data):
    """
    Collect every data-type title found under a 'modalities' key.

    Titles are added to the module-level ``unique_data_types`` set. Dicts
    and lists are traversed recursively. Modalities or data types that are
    not dicts, and data types without a string 'title', are skipped rather
    than raising.

    Args:
        data: Parsed JSON structure (dict, list or scalar).

    Returns:
        None; results accumulate in ``unique_data_types``.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            if key == 'modalities' and isinstance(value, list):
                for modality in value:
                    if not isinstance(modality, dict):
                        continue
                    data_types = modality.get('data_types')
                    if not isinstance(data_types, list):
                        continue
                    for data_type in data_types:
                        if not isinstance(data_type, dict):
                            continue
                        title = data_type.get('title')
                        # Only strings are collected so the later sorted() call cannot
                        # fail on mixed or missing values.
                        if isinstance(title, str):
                            unique_data_types.add(title)
            else:
                extract_data_types(value)
    elif isinstance(data, list):
        for item in data:
            extract_data_types(item)

# Extract data types from the updated_result data
# (fills the module-level unique_data_types set)
extract_data_types(updated_result)

# Sorted list of the collected titles; a later cell reuses it as the conversion input
sorted_data_types = sorted(unique_data_types)
print("The amount of unique datatypes: ",len(sorted_data_types))

# Print all unique data types
for data_type in sorted_data_types:
    print(data_type)

The amount of unique datatypes:  15271
16s rdna sequences
1d benchmark data
1d numerical data
1d traces
1d/2d-nmr spectra
2-hop subgraphs
2d and 3d annotations
2d and 3d multi-object tracking annotations
2d and 3d object detection annotations
2d animation scenes
2d animations of social interactions
2d annotations
2d benchmark data
2d binary masks
2d biomedical images
2d birds-eye-view centroid
2d bounding box
2d bounding box labels
2d bounding boxes
2d boxes
2d brain slices
2d cad designs
2d coordinates
2d coordinates of reprojected joint positions
2d face images
2d facial expression videos
2d facial images
2d floor plans
2d fundus images
2d heterogeneous material blocks
2d histogram
2d human figures
2d human keypoints
2d human pose
2d human poses
2d images
2d instance segments
2d interaction images
2d joint annotations
2d joint coordinates
2d joint location annotations
2d keypoint annotations
2d keypoints
2d labels
2d lane annotations
2d laser scans
2d lidar data
2d lidar scans
2d lig

In [6]:
import re
import inflect

In [7]:
# Titles to singularize; this is an alias of sorted_data_types, not a copy
data_types_for_conversion = sorted_data_types
for dt in data_types_for_conversion:
    print(dt)

16s rdna sequences
1d benchmark data
1d numerical data
1d traces
1d/2d-nmr spectra
2-hop subgraphs
2d and 3d annotations
2d and 3d multi-object tracking annotations
2d and 3d object detection annotations
2d animation scenes
2d animations of social interactions
2d annotations
2d benchmark data
2d binary masks
2d biomedical images
2d birds-eye-view centroid
2d bounding box
2d bounding box labels
2d bounding boxes
2d boxes
2d brain slices
2d cad designs
2d coordinates
2d coordinates of reprojected joint positions
2d face images
2d facial expression videos
2d facial images
2d floor plans
2d fundus images
2d heterogeneous material blocks
2d histogram
2d human figures
2d human keypoints
2d human pose
2d human poses
2d images
2d instance segments
2d interaction images
2d joint annotations
2d joint coordinates
2d joint location annotations
2d keypoint annotations
2d keypoints
2d labels
2d lane annotations
2d laser scans
2d lidar data
2d lidar scans
2d light-field microscope images
2d mammograp

In [8]:
# Shared inflect engine; convert_singular calls p.singular_noun on it
p = inflect.engine()
# NOTE(review): `strict` is not read anywhere else in this notebook -- confirm inflect honors this attribute
p.strict = False

# Explicit plural -> singular overrides, checked by convert_singular before inflect is consulted.
# Identity entries (e.g. 'data', 'metadata') keep the word unchanged.
# NOTE(review): process_term splits terms on non-word characters before the lookup, so the
# hyphenated keys ('rgb-d', 'x-ray') look unreachable through that path -- confirm.
custom_map = {
    'children': 'child',
    'spectra': 'spectrum',
    'indices': 'index',
    'vertices': 'vertex',
    'analyses': 'analysis',
    'data': 'data',
    'metadata': 'metadata',
    'rgb-d': 'rgb-d',
    'x-ray': 'x-ray',
}

# Hyphenated terms that normalize_symbols shields from hyphen splitting by swapping them for
# placeholders. "x-ray" and "x-rays" share one placeholder, and restore_special_terms replaces
# placeholders in dict order, so both are written back as "x-ray".
PROTECTED_TERMS = {
    "rgb-d": "RGBD_PLACEHOLDER",
    "x-ray": "XRAY_PLACEHOLDER",
    "x-rays": "XRAY_PLACEHOLDER", 
}

In [9]:
def protect_special_terms(term: str) -> str:
    """
    Swap every protected term for its placeholder.

    Each key of PROTECTED_TERMS is matched case-insensitively as a whole
    word and replaced with the associated placeholder string.
    """
    shielded = term
    for special, token in PROTECTED_TERMS.items():
        whole_word = r'(?i)\b' + re.escape(special) + r'\b'
        shielded = re.sub(whole_word, token, shielded)
    return shielded

def restore_special_terms(term: str) -> str:
    """
    Put the protected terms back in place of their placeholders.

    Placeholders are replaced in PROTECTED_TERMS order, so when several
    terms share a placeholder the first listed term is the one restored.
    """
    restored = term
    for special, token in PROTECTED_TERMS.items():
        if token in restored:
            restored = restored.replace(token, special)
    return restored


def normalize_symbols(term: str) -> str:
    """
    Normalize punctuation and a few known spelling variants in a data-type title.

    Steps, in order: drop apostrophes; turn "<n>d-" into "<n>d "; unify degree
    notation to "°"; collapse "--" and "//"; rewrite "meta data"/"meta-data" as
    "metadata"; rewrite "6dof"/"6-dof" as "6 dof" and "6dpose" as "6d pose";
    finally replace hyphens that sit between two non-digit word characters
    with a space, leaving the PROTECTED_TERMS entries intact.

    Args:
        term: The title to normalize.

    Returns:
        The normalized title.
    """
    term = re.sub(r"[‘’']", "", term)  # remove apostrophes
    term = re.sub(r'(\d+[dD])-', r'\1 ', term)  # 6d- => 6d
    term = term.replace('º', '°').replace(' degrees', '°').replace('-degree', '°')  # 90º => 90°
    term = term.replace('--', '-').replace('//', '/')  # -- => -, // => /
    term = re.sub(r'(?i)meta[\s-]*data', 'metadata', term)  # meta data => metadata
    term = re.sub(r'(?i)\b6dof\b', '6 dof', term)  # 6dof => 6 dof
    term = re.sub(r'(?i)\b6-dof\b', '6 dof', term)  # 6-dof => 6 dof
    term = re.sub(r'(?i)\b6dpose\b', '6d pose', term)  # 6dpose => 6d pose

    term = protect_special_terms(term)

    # a-b => a b, only when both neighbours of the hyphen are non-digit word characters.
    # Zero-width lookarounds are used so that (1) a hyphen after the very first character
    # is handled like any other, (2) chained hyphens such as "a-b-c" are all replaced
    # because no neighbour is consumed by a match, and (3) digit-adjacent hyphens
    # ("2-hop", "top-5") are kept regardless of where they occur in the title.
    term = re.sub(r'(?<=[^\W\d])-(?=[^\W\d])', ' ', term)

    term = restore_special_terms(term)
    return term


def convert_singular(word: str) -> str:
    """
    Return the singular form of a single lowercase word.

    Lookup order: the explicit overrides in custom_map, then a small list of
    uncountable nouns that are returned unchanged, then inflect's
    singular_noun. When inflect reports no singular form the word is
    returned as-is.

    Args:
        word: The word to singularize; an empty string is returned unchanged.

    Returns:
        The singular form, or the input word when no conversion applies.
    """
    # Nothing to singularize; also avoids handing an empty string to inflect.
    # NOTE(review): newer inflect releases validate arguments and appear to reject '' -- confirm.
    if not word:
        return word

    if word in custom_map:
        return custom_map[word]

    uncountables = {'scenery', 'equipment', 'information'}
    if word in uncountables:
        return word

    # singular_noun returns a falsy value when it has no singular form to offer
    singular = p.singular_noun(word)
    return singular if singular else word

def reconstruct_term(parts, processed):
    """
    Rebuild a term, substituting `processed` for the final element of `parts`.

    The casing of the original final element is mirrored onto `processed`:
    all-uppercase stays uppercase, a leading capital yields a capitalized
    word, anything else is used as given. With empty `parts`, `processed`
    is returned untouched.
    """
    if not parts:
        return processed

    tail = parts[-1]
    if tail.isupper():
        replacement = processed.upper()
    elif tail[:1].isupper():
        replacement = processed.capitalize()
    else:
        replacement = processed

    leading = parts[:-1]
    return ''.join(leading + [replacement])

def process_term(term: str) -> str:
    """
    Normalize a term and singularize its last word.

    The term is passed through normalize_symbols, split on runs of non-word
    characters (separators are kept), and the last non-empty word is
    converted with convert_singular. Anything after that word, such as
    trailing punctuation or a unit sign, is re-attached unchanged.

    Args:
        term: The data-type title to process.

    Returns:
        The processed title; a term without any word characters is returned
        as normalized.
    """
    term = normalize_symbols(term)
    # With one capture group, re.split alternates word, separator, word, ...
    # so the word tokens sit at the even indices and the list length is odd.
    parts = re.split(r'(\W+)', term)

    # A term ending in a non-word character yields '' as the final element;
    # walk back over the word slots to find the real last word.
    last_word_idx = next(
        (idx for idx in range(len(parts) - 1, -1, -2) if parts[idx]),
        None,
    )
    if last_word_idx is None:
        return term

    processed = convert_singular(parts[last_word_idx].lower())
    head = reconstruct_term(parts[:last_word_idx + 1], processed)
    return head + ''.join(parts[last_word_idx + 1:])

def batch_process(terms):
    """
    Run process_term over every term and return an original -> converted dict.
    """
    converted = {}
    for original in terms:
        converted[original] = process_term(original)
    return converted

In [10]:
# Build the original -> converted mapping for every collected title and print each pair
conversion_mapping = batch_process(data_types_for_conversion)
print("result:(original => converted)")
for orig, conv in conversion_mapping.items():
    print(f"{orig} => {conv}")

result:(original => converted)
16s rdna sequences => 16s rdna sequence
1d benchmark data => 1d benchmark data
1d numerical data => 1d numerical data
1d traces => 1d trace
1d/2d-nmr spectra => 1d/2d nmr spectrum
2-hop subgraphs => 2-hop subgraph
2d and 3d annotations => 2d and 3d annotation
2d and 3d multi-object tracking annotations => 2d and 3d multi object tracking annotation
2d and 3d object detection annotations => 2d and 3d object detection annotation
2d animation scenes => 2d animation scene
2d animations of social interactions => 2d animations of social interaction
2d annotations => 2d annotation
2d benchmark data => 2d benchmark data
2d binary masks => 2d binary mask
2d biomedical images => 2d biomedical image
2d birds-eye-view centroid => 2d birds eye view centroid
2d bounding box => 2d bounding box
2d bounding box labels => 2d bounding box label
2d bounding boxes => 2d bounding box
2d boxes => 2d box
2d brain slices => 2d brain slice
2d cad designs => 2d cad design
2d coordin

In [11]:
def update_data_types_in_json(data, mapping):
    """
    Rewrite data-type titles in place according to `mapping`.

    Walks dicts and lists recursively. Under every 'modalities' key holding a
    list, each modality dict's 'data_types' list is scanned and any entry
    whose 'title' is a key of `mapping` gets the mapped value. Values of
    other shapes are left untouched.
    """
    if isinstance(data, list):
        for element in data:
            update_data_types_in_json(element, mapping)
        return
    if not isinstance(data, dict):
        return

    for field, payload in data.items():
        if field != 'modalities' or not isinstance(payload, list):
            # Not a modality list: keep descending.
            update_data_types_in_json(payload, mapping)
            continue
        for entry in payload:
            if not isinstance(entry, dict):
                continue
            type_list = entry.get('data_types')
            if not isinstance(type_list, list):
                continue
            for record in type_list:
                if not (isinstance(record, dict) and 'title' in record):
                    continue
                current = record['title']
                if current in mapping:
                    record['title'] = mapping[current]

In [12]:
# Lookup table for the JSON update. Stripped/lowercased keys are kept as a fallback, but the
# exact original keys are added on top: update_data_types_in_json looks titles up verbatim,
# so a title with surrounding whitespace would never match a stripped-only key.
conversion_mapping_norm = {k.strip().lower(): v for k, v in conversion_mapping.items()}
conversion_mapping_norm.update(conversion_mapping)
update_data_types_in_json(updated_result, conversion_mapping_norm)

In [13]:
# Persist the JSON with singularized data-type titles; non-ASCII characters are written unescaped
with open("updated_datatypes_result_2.json", "w", encoding="utf-8") as f:
    json.dump(updated_result, f, ensure_ascii=False, indent=4)
print("Updated JSON data saved to updated_datatypes_result_2.json")

Updated JSON data saved to updated_datatypes_result_2.json


In [14]:
# Reload the file written by the previous cell to check the result of the conversion
with open('updated_datatypes_result_2.json', 'r', encoding='utf-8') as file:
    updated_result = json.load(file)

# Set to store unique data types (reset so the count below reflects only the converted file)
unique_data_types = set()

# Function to extract data types from the data
# NOTE: this re-defines the extract_data_types from the earlier cell; consider keeping a single definition.
def extract_data_types(data):
    """
    Collect every data-type title found under a 'modalities' key.

    Titles are added to the module-level ``unique_data_types`` set. Dicts
    and lists are traversed recursively. Modalities or data types that are
    not dicts, and data types without a string 'title', are skipped rather
    than raising.

    Args:
        data: Parsed JSON structure (dict, list or scalar).

    Returns:
        None; results accumulate in ``unique_data_types``.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            if key == 'modalities' and isinstance(value, list):
                for modality in value:
                    if not isinstance(modality, dict):
                        continue
                    data_types = modality.get('data_types')
                    if not isinstance(data_types, list):
                        continue
                    for data_type in data_types:
                        if not isinstance(data_type, dict):
                            continue
                        title = data_type.get('title')
                        # Only strings are collected so the later sorted() call cannot
                        # fail on mixed or missing values.
                        if isinstance(title, str):
                            unique_data_types.add(title)
            else:
                extract_data_types(value)
    elif isinstance(data, list):
        for item in data:
            extract_data_types(item)

# Extract data types from the updated_result data
# (fills the module-level unique_data_types set)
extract_data_types(updated_result)

# Sorted titles after conversion; the printed count can be compared with the earlier one
sorted_data_types = sorted(unique_data_types)
print("The amount of unique datatypes: ",len(sorted_data_types))

# Print all unique data types
for data_type in sorted_data_types:
    print(data_type)

The amount of unique datatypes:  14652
16s rdna sequence
1d benchmark data
1d numerical data
1d trace
1d/2d nmr spectrum
2-hop subgraph
2d and 3d annotation
2d and 3d multi object tracking annotation
2d and 3d object detection annotation
2d animated segment
2d animation scene
2d animations of social interaction
2d annotation
2d benchmark data
2d binary mask
2d biomedical image
2d birds eye view centroid
2d bounding box
2d bounding box label
2d box
2d brain slice
2d cad design
2d coordinate
2d coordinates of reprojected joint position
2d face image
2d facial expression video
2d facial image
2d floor plan
2d fundus image
2d heterogeneous material block
2d histogram
2d human figure
2d human keypoint
2d human pose
2d image
2d instance segment
2d interaction image
2d joint annotation
2d joint coordinate
2d joint location annotation
2d keypoint
2d keypoint annotation
2d label
2d lane annotation
2d laser scan
2d lidar data
2d lidar scan
2d light field microscope image
2d mammographic image
2d

In [15]:
# Save the original -> converted mapping (conversion_mapping, as returned by batch_process) to a JSON file
with open("data_type_mapping.json", "w", encoding="utf-8") as f:
    json.dump(conversion_mapping, f, ensure_ascii=False, indent=4)