In [1]:
from datetime import datetime
from IPython.display import JSON

import ply.lex as lex
import ply.yacc as yacc

In [2]:
# Notebook kernels do not define __file__, so give this module one explicitly.
# NOTE(review): presumably needed because PLY inspects the calling module when
# building its tables -- confirm before removing.
__file__ = "parsing.ipynb"

In [3]:
# Names of the keys used to build the output dictionary.

# Element (tag) names. The lexer's tag patterns are built from these values
# and token_to_key maps grammar symbols onto them.
ALSO_CALLED = 'also-called'
DESCRIPTOR = 'descriptor'
FULL_SUMMARY = 'full-summary'
GROUP = 'group'
HEALTH_TOPIC = 'health-topic'
HEALTH_TOPICS = 'health-topics'
INFORMATION_CATEGORY = 'information-category'
LANGUAGE_MAPPED_TOPIC = 'language-mapped-topic'
MESH_HEADING = 'mesh-heading'
ORGANIZATION = 'organization'
OTHER_LANGUAGE = 'other-language'
PRIMARY_INSTITUTE = 'primary-institute'
QUALIFIER = 'qualifier'
RELATED_TOPIC = 'related-topic'
SEE_REFERENCE = 'see-reference'
SITE = 'site'
STANDARD_DESCRIPTION = 'standard-description'

# Attribute keys (plus the prolog/doctype and element-text keys) of the output
DATE_CREATED = 'date-created'
DATE_GENERATED = 'date-generated'
ENCODING = 'encoding'
EXTERNAL_ID = 'external-id'
ID = 'id'
LANGUAGE = 'language'
LANGUAGE_MAPPED_URL = 'language-mapped-url'
META_DESC = 'meta-desc'
NAME = 'name'
TEXT = 'text'
TITLE = 'title'
TOTAL = 'total'
URL = 'url'
VERNACULAR_NAME = 'vernacular-name'
# Note: this is the only key spelled with an underscore instead of a hyphen.
VERSION = 'xml_version'

# Lexer

In [4]:
# Map each supported HTML/XML character entity to the character it stands for.
# replace_html_entities() applies this table to attribute values and text.
entity_replacements = {
    '&amp;': '&',
    '&quot;': '"',
    '&#39;': "'",
    '&lt;': '<',
    '&gt;': '>',
    '&apos;': "'"
}


In [5]:
# List of token names.
# Every name here has a matching t_<name> rule below and is used as a terminal
# symbol in the grammar.
tokens = [
    # Generic '>' that closes a start tag, and the XML prolog / DOCTYPE pieces
    'StartTagClose',
    'XmlDeclStartTag',
    'XmlDeclEndTag',
    'DocTypeDeclStartTag',
    # Start-tag openers ('<name') and end tags ('</name>') for each element
    'HealthTopicsStartTagOpen',
    'HealthTopicsEndTag',
    'HealthTopicStartTagOpen',
    'HealthTopicEndTag',
    'AlsoCalledStartTagOpen',
    'AlsoCalledEndTag',
    'FullSummaryStartTagOpen',
    'FullSummaryEndTag',
    'GroupStartTagOpen',
    'GroupEndTag',
    'LanguageMappedTopicStartTagOpen',
    'LanguageMappedTopicEndTag',
    'MeshHeadingStartTagOpen',
    'MeshHeadingEndTag',
    'DescriptorStartTagOpen',
    'DescriptorEndTag',
    'QualifierStartTagOpen',
    'QualifierEndTag',
    'OtherLanguageStartTagOpen',
    'OtherLanguageEndTag',
    'PrimaryInstituteStartTagOpen',
    'PrimaryInstituteEndTag',
    'SeeReferenceStartTagOpen',
    'SeeReferenceEndTag',
    'SiteStartTagOpen',
    'SiteEndTag',
    'InformationCategoryStartTagOpen',
    'InformationCategoryEndTag',
    'OrganizationStartTagOpen',
    'OrganizationEndTag',
    'StandardDescriptionStartTagOpen',
    'StandardDescriptionEndTag',
    'RelatedTopicStartTagOpen',
    'RelatedTopicEndTag',
    # DOCTYPE name and external identifier
    'DocTypeDeclName',
    'DocTypeDeclExternalId',
    # Attribute keys (each rule matches the name together with its '=')
    'VersionKey',
    'EncodingKey',
    'DateGeneratedKey',
    'TotalKey',
    'IdKey',
    'DateCreatedKey',
    'LanguageKey',
    'TitleKey',
    'UrlKey',
    'MetaDescKey',
    'VernacularNameKey',
    'LanguageMappedUrlKey',
    # Typed, quoted attribute values
    'Timestamp',
    'Integer',
    'Date',
    'Language',
    'Uri',
    'String',
    # A run of element text
    'Line'
]

# Regular expression rules for simple tokens
# Start-tag patterns deliberately stop before the closing '>' so that the
# attributes can be lexed as separate tokens; the '>' itself is StartTagClose.
# NOTE(review): '<health-topic' is a prefix of '<health-topics'. PLY is
# documented to try string-defined patterns longest-regex-first, which is
# presumably what keeps the two apart -- confirm before converting these
# rules to functions.
t_HealthTopicsStartTagOpen = rf'<{HEALTH_TOPICS}'
t_HealthTopicsEndTag = rf'</{HEALTH_TOPICS}>'
t_HealthTopicStartTagOpen = rf'<{HEALTH_TOPIC}'
t_HealthTopicEndTag = rf'</{HEALTH_TOPIC}>'
t_AlsoCalledStartTagOpen = rf'<{ALSO_CALLED}'
t_AlsoCalledEndTag = rf'</{ALSO_CALLED}>'
t_FullSummaryStartTagOpen = rf'<{FULL_SUMMARY}'
t_FullSummaryEndTag = rf'</{FULL_SUMMARY}>'
t_GroupStartTagOpen = rf'<{GROUP}'
t_GroupEndTag = rf'</{GROUP}>'
t_LanguageMappedTopicStartTagOpen = rf'<{LANGUAGE_MAPPED_TOPIC}'
t_LanguageMappedTopicEndTag = rf'</{LANGUAGE_MAPPED_TOPIC}>'
t_MeshHeadingStartTagOpen = rf'<{MESH_HEADING}'
t_MeshHeadingEndTag = rf'</{MESH_HEADING}>'
t_DescriptorStartTagOpen = rf'<{DESCRIPTOR}'
t_DescriptorEndTag = rf'</{DESCRIPTOR}>'
t_QualifierStartTagOpen = rf'<{QUALIFIER}'
t_QualifierEndTag = rf'</{QUALIFIER}>'
t_OtherLanguageStartTagOpen = rf'<{OTHER_LANGUAGE}'
t_OtherLanguageEndTag = rf'</{OTHER_LANGUAGE}>'
t_PrimaryInstituteStartTagOpen = rf'<{PRIMARY_INSTITUTE}'
t_PrimaryInstituteEndTag = rf'</{PRIMARY_INSTITUTE}>'
t_SeeReferenceStartTagOpen = rf'<{SEE_REFERENCE}'
t_SeeReferenceEndTag = rf'</{SEE_REFERENCE}>'
t_SiteStartTagOpen = rf'<{SITE}'
t_SiteEndTag = rf'</{SITE}>'
t_InformationCategoryStartTagOpen = rf'<{INFORMATION_CATEGORY}'
t_InformationCategoryEndTag = rf'</{INFORMATION_CATEGORY}>'
t_OrganizationStartTagOpen = rf'<{ORGANIZATION}'
t_OrganizationEndTag = rf'</{ORGANIZATION}>'
t_StandardDescriptionStartTagOpen = rf'<{STANDARD_DESCRIPTION}'
t_StandardDescriptionEndTag = rf'</{STANDARD_DESCRIPTION}>'
t_RelatedTopicStartTagOpen = rf'<{RELATED_TOPIC}'
t_RelatedTopicEndTag = rf'</{RELATED_TOPIC}>'

# Prolog tags
# In each rule the docstring is the token's regular expression (read by PLY),
# so it must stay the first statement of the function.
def t_XmlDeclStartTag(t):
    r'<\?xml'
    return t

def t_XmlDeclEndTag(t):
    r'\?>'
    return t

# Generic '>' that ends any start tag (and the DOCTYPE declaration).
def t_StartTagClose(t):
    r'>'
    return t

def t_DocTypeDeclStartTag(t):
    r'<!DOCTYPE'
    return t

# NOTE(review): the pattern starts with a literal space although spaces are
# listed in t_ignore. This presumably only matches because PLY compiles
# patterns with re.VERBOSE by default (unescaped whitespace is dropped) --
# confirm before changing the flags passed to lex.lex().
def t_DocTypeDeclName(t):
    r' health-topics'
    return t

# Matches the complete external identifier (public id + DTD URL) as one token.
def t_DocTypeDeclExternalId(t):
    r'PUBLIC\s+"-//NLM//DTD\s+health-topics\s+//EN"\s+"https://medlineplus.gov/xml/mplus_topics.dtd"'
    return t

# Keys of attributes
# Each rule matches the attribute name together with its '=' sign, so the
# grammar sees a key token immediately followed by a quoted value token.
# These are all defined before t_Line, whose character class also accepts
# word characters and '='.
def t_VersionKey(t):
    r'version='
    return t

def t_EncodingKey(t):
    r'encoding='
    return t

def t_DateGeneratedKey(t):
    r'date-generated='
    return t

def t_TotalKey(t):
    r'total='
    return t

def t_IdKey(t):
    r'id='
    return t

def t_DateCreatedKey(t):
    r'date-created='
    return t

def t_LanguageKey(t):
    r'language='
    return t

def t_TitleKey(t):
    r'title='
    return t

def t_UrlKey(t):
    r'url='
    return t

def t_MetaDescKey(t):
    r'meta-desc='
    return t

def t_VernacularNameKey(t):
    r'vernacular-name='
    return t

def t_LanguageMappedUrlKey(t):
    r'language-mapped-url='
    return t

# Value types of attributes
def t_Date(t):
    r'"\d{2}/\d{2}/\d{4}"'
    # Quoted MM/DD/YYYY attribute value: drop the quotes and hand the parser a
    # datetime.date instead of the raw text.
    unquoted = t.value.replace('"', '')
    parsed = datetime.strptime(unquoted, "%m/%d/%Y")
    t.value = parsed.date()
    return t

def t_Timestamp(t):
    r'"\d{2}/\d{2}/\d{4}\s\d{2}:\d{2}:\d{2}"'
    # Quoted "MM/DD/YYYY HH:MM:SS" attribute value: strip the quotes and
    # convert the text into a datetime object.
    unquoted = t.value.replace('"', '')
    t.value = datetime.strptime(unquoted, "%m/%d/%Y %H:%M:%S")
    return t

def t_Integer(t):
    r'"\d+"'
    # Quoted run of digits: the token value becomes a Python int.
    digits = t.value.replace('"', '')
    t.value = int(digits)
    return t

def t_Language(t):
    r'"(English|Spanish)"'
    # Only the two language names in the pattern are accepted; the token value
    # is the name without its surrounding quotes.
    quoted = t.value
    t.value = quoted.replace('"', '')
    return t

def t_Uri(t):
    r'"(?:https?):\/\/[^\s/$.?#].[^\s"<>]*[^"<>]*"'
    # Quoted http(s) URL: remove every double quote, then decode the HTML
    # entities (e.g. '&amp;' inside a query string).
    unquoted = t.value.replace('"', '')
    t.value = replace_html_entities(unquoted)
    return t

def t_String(t):
    r'"[^"]+"'
    # Any other non-empty quoted value: strip the quotes and decode the HTML
    # entities it contains.
    unquoted = t.value.replace('"', '')
    t.value = replace_html_entities(unquoted)
    return t

def t_Line(t):
    r'[\w ¿?¡!:;,#&=+°•\.\-\'\"%\*\{\}\[\]\(\)/\t]+'
    # A run of element text (no '<', '>' or newline); entities are decoded
    # before the value reaches the parser.
    raw_text = t.value
    t.value = replace_html_entities(raw_text)
    return t

# Define a rule so we can track line numbers
def t_newline(t):
    r'\n+'
    # PLY exposes the active lexer on every token as `t.lexer`; the previous
    # `t._ply_lexer` attribute is not part of PLY's token API and raised
    # AttributeError on the first newline. Nothing is returned, so the
    # newlines themselves are discarded.
    t.lexer.lineno += len(t.value)

# Post processing function to replace html entities
def replace_html_entities(t_value):
    """Decode the entities listed in ``entity_replacements`` exactly once.

    ``'&amp;'`` is handled last. Decoding it first (as the table order used to
    dictate) turns an escaped entity such as ``'&amp;lt;'`` into ``'&lt;'``
    and then decodes it a second time into ``'<'``.

    Args:
        t_value: text of a token (attribute value or element text).

    Returns:
        The text with every known entity replaced by its character.
    """
    # Fast path: every entity starts with '&'.
    if '&' not in t_value:
        return t_value

    ampersand = '&amp;'
    for entity, char in entity_replacements.items():
        if entity != ampersand:
            t_value = t_value.replace(entity, char)

    # Safe to decode now: none of the characters produced above contains '&',
    # so no new entity can be formed by this final step.
    if ampersand in entity_replacements:
        t_value = t_value.replace(ampersand, entity_replacements[ampersand])
    return t_value

# A string containing ignored characters (spaces and tabs).
# Newlines are not listed here; they are consumed by the t_newline rule.
t_ignore = ' \t'

# Error handling rule
def t_error(t):
    # `t.lexer` is the attribute PLY sets on tokens; the previous
    # `t._ply_lexer` raised AttributeError, so an illegal character crashed
    # the lexer instead of being reported and skipped.
    print(f"Illegal character '{t.value[0]}' at line {t.lexer.lineno}")
    # Skip only the offending character and resume lexing right after it.
    t.lexer.skip(1)


# Build the lexer from the `tokens` list and the t_* rules defined above.
lexer = lex.lex()


# Parser

In [6]:
import os

# Folder that receives the parser tables generated by yacc.
output_folder = './parser'
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(output_folder, exist_ok=True)

In [7]:
# Maps grammar symbol names (terminals and non-terminals) to the key under
# which their value is stored in the output dictionary. The *List / *Opt
# variants share the key of the element they wrap.
token_to_key = {
    'AlsoCalled': ALSO_CALLED,
    'AlsoCalledList': ALSO_CALLED,
    'DateCreatedKey': DATE_CREATED,
    'DateGeneratedKey': DATE_GENERATED,
    'Descriptor': DESCRIPTOR,
    'DocTypeDeclExternalId': EXTERNAL_ID,
    'DocTypeDeclName': NAME,
    'EncodingKey': ENCODING,
    'FullSummary': FULL_SUMMARY,
    'Group': GROUP,
    'GroupList': GROUP,
    'HealthTopicList': HEALTH_TOPIC,
    'HealthTopics': HEALTH_TOPICS,
    'IdKey': ID,
    'InformationCategoryList': INFORMATION_CATEGORY,
    'LanguageKey': LANGUAGE,
    'LanguageMappedTopic': LANGUAGE_MAPPED_TOPIC,
    'LanguageMappedTopicOpt': LANGUAGE_MAPPED_TOPIC,
    'LanguageMappedUrlKey': LANGUAGE_MAPPED_URL,
    'MeshHeading': MESH_HEADING,
    'MeshHeadingList': MESH_HEADING,
    'MetaDescKey': META_DESC,
    'OrganizationList': ORGANIZATION,
    'OtherLanguage': OTHER_LANGUAGE,
    'OtherLanguageList': OTHER_LANGUAGE,
    'PrimaryInstitute': PRIMARY_INSTITUTE,
    'PrimaryInstituteOpt': PRIMARY_INSTITUTE,
    'QualifierList': QUALIFIER,
    'QualifierListOpt': QUALIFIER,
    'RelatedTopic': RELATED_TOPIC,
    'RelatedTopicList': RELATED_TOPIC,
    'SeeReference': SEE_REFERENCE,
    'SeeReferenceList': SEE_REFERENCE,
    'Site': SITE,
    'SiteList': SITE,
    'StandardDescriptionList': STANDARD_DESCRIPTION,
    'Text': TEXT,
    'TitleKey': TITLE,
    'TotalKey': TOTAL,
    'UrlKey': URL,
    'VernacularNameKey': VERNACULAR_NAME,
    'VersionKey': VERSION
}

In [8]:
def parse_child_tokens(rule):
    """Return the right-hand-side symbols of a single-production rule string.

    Args:
        rule: grammar text of the form ``'Name : Sym1 Sym2 ...'``.

    Returns:
        List of the whitespace-separated symbol names after the first colon.
    """
    pieces = rule.split(':')
    right_hand_side = pieces[1]
    return right_hand_side.strip().split()

def handle_element(rule, p):
    """Build the value of an element production.

    The positions of the attributes, content and text children are located by
    name in the rule's right-hand side (first symbol containing 'Attributes',
    first containing 'Content', and the symbol 'Text').

    Args:
        rule: the production's grammar string (its docstring).
        p: the production's values; ``p[1:]`` are the children.

    Returns:
        A dict merging attributes and content, with the right-stripped text
        stored under the 'Text' key when the dict is non-empty; the stripped
        text alone when there are no attributes/content; or the (possibly
        empty) dict when the rule has no Text child.
    """
    symbols = parse_child_tokens(rule)
    values = p[1:]

    def first_index_containing(fragment):
        # Position of the first symbol whose name contains `fragment`, or -1.
        for position, symbol in enumerate(symbols):
            if fragment in symbol:
                return position
        return -1

    attributes_at = first_index_containing('Attributes')
    content_at = first_index_containing('Content')
    text_at = symbols.index('Text') if 'Text' in symbols else -1

    element = {}
    if attributes_at >= 0:
        element.update(values[attributes_at])
    if content_at >= 0:
        element.update(values[content_at])

    if text_at < 0:
        return element

    stripped = values[text_at].rstrip()
    if not element:
        # Text-only element: represented by the bare string.
        return stripped

    element[token_to_key['Text']] = stripped
    return element

def handle_content(rule, p):
    """Collect the children of a content production into a dict.

    Args:
        rule: the production's grammar string (its docstring).
        p: the production's values; ``p[1:]`` are the children.

    Returns:
        Dict keyed by ``token_to_key`` of each child symbol; children whose
        value is ``None`` (absent optional parts) are left out.
    """
    symbols = parse_child_tokens(rule)

    content = {}
    for position, value in enumerate(p[1:]):
        if value is None:
            continue
        content[token_to_key[symbols[position]]] = value
    return content

def handle_attributes(rule, p):
    """Turn an attributes production (alternating key/value symbols) into a dict.

    Args:
        rule: the production's grammar string; its right-hand side is read as
            key symbol, value symbol, key symbol, value symbol, ...
        p: the production's values; ``p[1:]`` are the children.

    Returns:
        Dict mapping ``token_to_key`` of every key symbol to the value that
        follows it.
    """
    symbols = parse_child_tokens(rule)
    values = p[1:]

    attributes = {}
    # Even positions hold the key tokens, the value sits right after each one.
    for key_at in range(0, len(symbols), 2):
        attributes[token_to_key[symbols[key_at]]] = values[key_at + 1]
    return attributes

def handle_list(p):
    """Value of a non-empty list production (``X XList | X``).

    Returns the single element itself for the one-symbol alternative;
    otherwise a list made of the head followed by the tail (the tail is
    wrapped when it is a lone element rather than a list).
    """
    if len(p) != 3:
        return p[1]  # single element: the list cannot be empty

    head, tail = p[1], p[2]
    if isinstance(tail, list):
        return [head] + tail
    return [head, tail]

def handle_optional_list(p):
    """Value of a possibly-empty list production (``X XList | empty``).

    Returns ``None`` for the empty alternative, the lone element when the
    tail is empty, and otherwise a list of the head followed by the tail.
    """
    if len(p) != 3:
        return None

    head, tail = p[1], p[2]
    if tail is None:
        return head
    if isinstance(tail, list):
        return [head] + tail
    return [head, tail]

In [9]:
# Parsing rules
def p_Document(p):
    'Document : Prolog HealthTopics'
    # Start from a copy of the prolog entries and add the root element under
    # its own key.
    document = dict(p[1])
    document[token_to_key['HealthTopics']] = p[2]
    p[0] = document

def p_Prolog(p):
    'Prolog : XmlDeclStartTag XmlDeclAttributes XmlDeclEndTag DocTypeDeclStartTag DocTypeDeclName DocTypeDeclExternalId StartTagClose'
    # XML declaration attributes at the top level, DOCTYPE name and external
    # id nested under 'doctype'.
    prolog = dict(p[2])
    doctype = {}
    doctype[token_to_key['DocTypeDeclName']] = p[5]
    doctype[token_to_key['DocTypeDeclExternalId']] = p[6]
    prolog['doctype'] = doctype
    p[0] = prolog

# In every p_* rule the docstring is the grammar production read by yacc; the
# same string is passed to the handle_* helpers so they can locate children by
# symbol name.
def p_XmlDeclAttributes(p):
    'XmlDeclAttributes : VersionKey String EncodingKey String'
    p[0] = handle_attributes(p_XmlDeclAttributes.__doc__, p)

# Root element: attributes followed by the list of topics.
def p_HealthTopics(p):
    'HealthTopics : HealthTopicsStartTagOpen HealthTopicsAttributes StartTagClose HealthTopicsContent HealthTopicsEndTag'
    p[0] = handle_element(p_HealthTopics.__doc__, p)
    
def p_HealthTopicsContent(p):
    'HealthTopicsContent : HealthTopicList'
    p[0] = handle_content(p_HealthTopicsContent.__doc__, p)

def p_HealthTopicsAttributes(p):
    'HealthTopicsAttributes : TotalKey Integer DateGeneratedKey Timestamp'
    p[0] = handle_attributes(p_HealthTopicsAttributes.__doc__, p)

# At least one topic is required (no empty alternative).
def p_HealthTopicList(p):
    '''HealthTopicList : HealthTopic HealthTopicList
    | HealthTopic'''
    p[0] = handle_list(p)

def p_HealthTopic(p):
    'HealthTopic : HealthTopicStartTagOpen HealthTopicAttributes StartTagClose HealthTopicContent HealthTopicEndTag'
    p[0] = handle_element(p_HealthTopic.__doc__, p)

# The attributes are accepted only in exactly this order.
def p_HealthTopicAttributes(p):
    'HealthTopicAttributes : MetaDescKey String TitleKey String UrlKey Uri IdKey Integer LanguageKey Language DateCreatedKey Date'
    p[0] = handle_attributes(p_HealthTopicAttributes.__doc__, p)

# Child elements must appear in this fixed order; only FullSummary is mandatory.
def p_HealthTopicContent(p):
    'HealthTopicContent : AlsoCalledList FullSummary GroupList LanguageMappedTopicOpt MeshHeadingList OtherLanguageList PrimaryInstituteOpt RelatedTopicList SeeReferenceList SiteList'
    p[0] = handle_content(p_HealthTopicContent.__doc__, p)

# Optional children of a health topic.
# The *List rules use handle_optional_list: None when absent, the lone element
# when there is exactly one, otherwise a list. The *Opt rules pass through the
# single element or None.
def p_AlsoCalledList(p):
    '''AlsoCalledList : AlsoCalled AlsoCalledList
    | empty'''
    p[0] = handle_optional_list(p)

def p_GroupList(p):
    '''GroupList : Group GroupList
    | empty'''
    p[0] = handle_optional_list(p)

def p_LanguageMappedTopicOpt(p):
    '''LanguageMappedTopicOpt : LanguageMappedTopic
    | empty'''
    p[0] = p[1]

def p_MeshHeadingList(p):
    '''MeshHeadingList : MeshHeading MeshHeadingList
    | empty'''
    p[0] = handle_optional_list(p)

def p_OtherLanguageList(p):
    '''OtherLanguageList : OtherLanguage OtherLanguageList
    | empty'''
    p[0] = handle_optional_list(p)

def p_PrimaryInstituteOpt(p):
    '''PrimaryInstituteOpt : PrimaryInstitute
    | empty'''
    p[0] = p[1]

def p_RelatedTopicList(p):
    '''RelatedTopicList : RelatedTopic RelatedTopicList
    | empty'''
    p[0] = handle_optional_list(p)

def p_SeeReferenceList(p):
    '''SeeReferenceList : SeeReference SeeReferenceList
    | empty'''
    p[0] = handle_optional_list(p)

def p_SiteList(p):
    '''SiteList : Site SiteList
    | empty'''
    p[0] = handle_optional_list(p)

# Text-only element: handle_element yields the stripped text itself.
def p_FullSummary(p):
    'FullSummary : FullSummaryStartTagOpen StartTagClose Text FullSummaryEndTag'
    p[0] = handle_element(p_FullSummary.__doc__, p)

# Element with attributes and text: the text is stored under the 'Text' key
# next to the attributes.
def p_Group(p):
    'Group : GroupStartTagOpen GroupAttributes StartTagClose Text GroupEndTag'
    p[0] = handle_element(p_Group.__doc__, p)

def p_GroupAttributes(p):
    'GroupAttributes : UrlKey Uri IdKey Integer'
    p[0] = handle_attributes(p_GroupAttributes.__doc__, p)

def p_LanguageMappedTopic(p):
    'LanguageMappedTopic : LanguageMappedTopicStartTagOpen LanguageMappedTopicAttributes StartTagClose Text LanguageMappedTopicEndTag'
    p[0] = handle_element(p_LanguageMappedTopic.__doc__, p)

def p_LanguageMappedTopicAttributes(p):
    'LanguageMappedTopicAttributes : UrlKey Uri IdKey Integer LanguageKey Language'
    p[0] = handle_attributes(p_LanguageMappedTopicAttributes.__doc__, p)

# Container element without attributes: one descriptor plus optional qualifiers.
def p_MeshHeading(p):
    'MeshHeading : MeshHeadingStartTagOpen StartTagClose MeshHeadingContent MeshHeadingEndTag'
    p[0] = handle_element(p_MeshHeading.__doc__, p)

def p_MeshHeadingContent(p):
    'MeshHeadingContent : Descriptor QualifierListOpt'
    p[0] = handle_content(p_MeshHeadingContent.__doc__, p)
    
def p_QualifierListOpt(p):
    'QualifierListOpt : QualifierList'
    # QualifierList already derives `empty`, so a second `| empty` alternative
    # here gave the parser two ways to reduce "no qualifiers" (a reduce/reduce
    # conflict). The accepted input and the resulting value are unchanged:
    # p[1] is None when there are no qualifiers.
    p[0] = p[1]

# Zero or more qualifiers: None when there are none, the lone qualifier when
# there is one, otherwise a list (see handle_optional_list).
def p_QualifierList(p):
    '''QualifierList : Qualifier QualifierList
    | empty'''
    p[0] = handle_optional_list(p)

def p_OtherLanguage(p):
    'OtherLanguage : OtherLanguageStartTagOpen OtherLanguageAttributes StartTagClose Text OtherLanguageEndTag'
    p[0] = handle_element(p_OtherLanguage.__doc__, p)

def p_OtherLanguageAttributes(p):
    'OtherLanguageAttributes : VernacularNameKey String UrlKey Uri'
    p[0] = handle_attributes(p_OtherLanguageAttributes.__doc__, p)

def p_PrimaryInstitute(p):
    'PrimaryInstitute : PrimaryInstituteStartTagOpen PrimaryInstituteAttributes StartTagClose Text PrimaryInstituteEndTag'
    p[0] = handle_element(p_PrimaryInstitute.__doc__, p)

def p_PrimaryInstituteAttributes(p):
    'PrimaryInstituteAttributes : UrlKey Uri'
    p[0] = handle_attributes(p_PrimaryInstituteAttributes.__doc__, p)

def p_RelatedTopic(p):
    'RelatedTopic : RelatedTopicStartTagOpen RelatedTopicAttributes StartTagClose Text RelatedTopicEndTag'
    p[0] = handle_element(p_RelatedTopic.__doc__, p)

def p_RelatedTopicAttributes(p):
    'RelatedTopicAttributes : UrlKey Uri IdKey Integer'
    p[0] = handle_attributes(p_RelatedTopicAttributes.__doc__, p)

# Text-only element: the value is the stripped text itself.
def p_SeeReference(p):
    'SeeReference : SeeReferenceStartTagOpen StartTagClose Text SeeReferenceEndTag'
    p[0] = handle_element(p_SeeReference.__doc__, p)

# Element with attributes and nested child elements (no direct text).
def p_Site(p):
    'Site : SiteStartTagOpen SiteAttributes StartTagClose SiteContent SiteEndTag'
    p[0] = handle_element(p_Site.__doc__, p)

def p_SiteContent(p):
    'SiteContent : InformationCategoryList OrganizationList StandardDescriptionList'
    p[0] = handle_content(p_SiteContent.__doc__, p)

# A site needs at least one information category (no empty alternative).
def p_InformationCategoryList(p):
    '''InformationCategoryList : InformationCategory InformationCategoryList
    | InformationCategory'''
    p[0] = handle_list(p)

def p_OrganizationList(p):
    '''OrganizationList : Organization OrganizationList
    | empty'''
    p[0] = handle_optional_list(p)

def p_StandardDescriptionList(p):
    '''StandardDescriptionList : StandardDescription StandardDescriptionList
    | empty'''
    p[0] = handle_optional_list(p)

# SiteAttributes has two productions, with and without language-mapped-url.
# They are separate functions because handle_attributes reads the
# single-production docstring of the function it is called from.
def p_SiteAttributes_full(p):
    '''SiteAttributes : TitleKey String UrlKey Uri LanguageMappedUrlKey Uri'''
    p[0] = handle_attributes(p_SiteAttributes_full.__doc__, p)

def p_SiteAttributes_partial(p):
    '''SiteAttributes : TitleKey String UrlKey Uri'''
    p[0] = handle_attributes(p_SiteAttributes_partial.__doc__, p)

# Text-only element: the value is the stripped text itself.
def p_AlsoCalled(p):
    'AlsoCalled : AlsoCalledStartTagOpen StartTagClose Text AlsoCalledEndTag'
    p[0] = handle_element(p_AlsoCalled.__doc__, p)

def p_Descriptor(p):
    'Descriptor : DescriptorStartTagOpen DescriptorAttributes StartTagClose Text DescriptorEndTag'
    p[0] = handle_element(p_Descriptor.__doc__, p)

# The id here is a String token, unlike the Integer ids of the other elements.
def p_DescriptorAttributes(p):
    'DescriptorAttributes : IdKey String'
    p[0] = handle_attributes(p_DescriptorAttributes.__doc__, p)

def p_Qualifier(p):
    'Qualifier : QualifierStartTagOpen QualifierAttributes StartTagClose Text QualifierEndTag'
    p[0] = handle_element(p_Qualifier.__doc__, p)

def p_QualifierAttributes(p):
    'QualifierAttributes : IdKey String'
    p[0] = handle_attributes(p_QualifierAttributes.__doc__, p)

def p_InformationCategory(p):
    'InformationCategory : InformationCategoryStartTagOpen StartTagClose Text InformationCategoryEndTag'
    p[0] = handle_element(p_InformationCategory.__doc__, p)

def p_Organization(p):
    'Organization : OrganizationStartTagOpen StartTagClose Text OrganizationEndTag'
    p[0] = handle_element(p_Organization.__doc__, p)

def p_StandardDescription(p):
    'StandardDescription : StandardDescriptionStartTagOpen StartTagClose Text StandardDescriptionEndTag'
    p[0] = handle_element(p_StandardDescription.__doc__, p)

def p_Text(p):
    '''Text : Text Line
    | Text String
    | Line'''
    # Element text is accumulated left to right: either extend the text
    # gathered so far with the next piece, or start from the first Line.
    if len(p) == 3:
        p[0] = p[1] + p[2]
    else:
        p[0] = p[1]

# Empty production used by the optional rules; its value is None, which
# handle_content treats as "child absent".
def p_empty(p):
    'empty : '
    p[0] = None

# Error rule for syntax errors
# Module-level flag: set by p_error, checked (and reset) by the parsing cell.
syntax_error_occurred = False

def p_error(p):
    # Record that the input did not match the grammar, then report where.
    global syntax_error_occurred
    syntax_error_occurred = True

    if not p:
        # No offending token: the input ended too early.
        print("Syntax error at EOF")
        return

    print(f"Syntax error at '{p.value}' on line {p.lineno}")

# Build the parser
# The generated tables are written to output_folder under the table-module
# name 'parser.parsetab'.
parser = yacc.yacc(outputdir=output_folder, tabmodule='parser.parsetab')

In [None]:
# Load the complete XML document to parse into memory, decoded as UTF-8.
with open('../data/mplus_topics_full.xml', 'r', encoding='utf-8') as file:
    data = file.read()

In [None]:
# Test the parser
# (No `global` statement is needed: this cell already runs at module level.)
syntax_error_occurred = False  # Reset error flag before parsing

# The lexer keeps counting lines across inputs, so start again from line 1;
# otherwise re-running this cell reports wrong line numbers.
lexer.lineno = 1

# Pass the lexer explicitly rather than relying on the most recently built one.
document_dict = parser.parse(data, lexer=lexer)

if syntax_error_occurred or document_dict is None:
    print("Input is invalid.")
else:
    print("Input is valid according to the CFG.")

In [None]:
from datetime import date

def resolve_datetime(obj):
    """``json.dumps`` fallback that serializes date/datetime values.

    Args:
        obj: a value the JSON encoder could not serialize by itself.

    Returns:
        The ISO-8601 string of ``obj`` when it is a ``date`` or ``datetime``.

    Raises:
        TypeError: for any other type.
    """
    if not isinstance(obj, (datetime, date)):
        raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
    return obj.isoformat()

In [None]:
import json

# Convert the dictionary to a JSON-serializable format
# (date/datetime values are turned into ISO strings by resolve_datetime).
resolved_dict_str = json.dumps(document_dict, default=resolve_datetime)

# Round-trip through JSON text so that only plain JSON types remain.
resolved_dict = json.loads(resolved_dict_str)

# Last expression of the cell: shown through IPython's JSON display object.
JSON(resolved_dict)