In [None]:
import json

# Load the video records. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's locale-dependent default.
with open('video_basics.json', encoding='utf-8') as f:
    data = json.load(f)

# Create tag tree from existing tags

In [None]:
# Collect the distinct, normalised tags found across every video.
all_tags = set()
for video in data.values():
    for raw_tag in video['tags']:
        # Drop non-ASCII characters first, then lowercase what is left.
        ascii_lowered = ''.join(ch for ch in raw_tag if ord(ch) < 128).lower()
        # Keep only purely alphabetic words, and never the word "asmr" itself.
        words = [
            word
            for word in ascii_lowered.split(' ')
            if word.isalpha() and word != 'asmr'
        ]
        # A tag with no surviving words carries no information; skip it.
        if words:
            all_tags.add(' '.join(words))

# Create tree structure for tags
# If a tag is a substring of another tag, it is a parent of that tag
# If a tag contains another tag as a substring, it is a child of that tag
# If no other tag is a substring of a tag (it has no parent), it is a root tag
def create_tree(tags):
    """Arrange tags into a forest based on substring containment.

    A tag ``a`` is a candidate parent of tag ``b`` when ``a`` is a
    character-level substring of ``b``. Every tag is attached to exactly one
    parent: the longest of its candidates, with ties between equally long
    candidates broken alphabetically so the result does not depend on set
    iteration order. Tags with no candidate parent become roots.

    Args:
        tags: Iterable of tag strings. Duplicates are ignored.

    Returns:
        A nested dict mapping each root tag to the dict of its children,
        recursively; a tag without children maps to an empty dict.
    """
    # De-duplicate (a repeated tag would otherwise be registered as its own
    # child and make the subtree recursion endless) and order shortest-first,
    # so that in any pair only the earlier tag can be a substring of the later.
    ordered = sorted(set(tags), key=lambda t: (len(t), t))

    tag_dict = {tag: {'children': set(), 'parents': set()} for tag in ordered}

    # Record every candidate parent (substring) of each tag.
    for i, shorter in enumerate(ordered):
        for longer in ordered[i + 1:]:
            if shorter in longer:
                tag_dict[longer]['parents'].add(shorter)

    # Attach each non-root tag to a single parent: the longest candidate,
    # alphabetically first among candidates of equal length.
    for tag in ordered:
        candidates = tag_dict[tag]['parents']
        if not candidates:
            continue
        chosen = min(candidates, key=lambda p: (-len(p), p))
        tag_dict[tag]['parents'] = {chosen}
        tag_dict[chosen]['children'].add(tag)

    # Roots are the tags left without any parent; expand each into its subtree.
    return {
        tag: create_subtree(tag, tag_dict)
        for tag in ordered
        if not tag_dict[tag]['parents']
    }

def create_subtree(tag, tag_dict):
    """Return the nested dict of descendants below ``tag``.

    Args:
        tag: Key into ``tag_dict`` whose descendants are expanded.
        tag_dict: Mapping of tag -> record holding a ``'children'`` collection
            of tags.

    Returns:
        A dict mapping each child of ``tag`` to its own subtree, built
        recursively; empty when ``tag`` has no children.
    """
    return {
        child: create_subtree(child, tag_dict)
        for child in tag_dict[tag]['children']
    }

# Build the nested tag hierarchy from the normalised tag set collected above.
tree = create_tree(all_tags)

# Show the nested dict as the cell output.
# NOTE(review): `display` is not defined or imported in the visible code;
# presumably the IPython/Jupyter built-in. Confirm before running as a script.
display(tree)