In [93]:
from datetime import datetime
import ply.lex as lex
import ply.yacc as yacc

In [94]:
# NOTE(review): presumably a workaround so PLY can resolve the defining
# module's file inside a notebook, where `__file__` is not set — TODO confirm.
__file__ = "parser.ipynb"

# Lexer

In [95]:
# Token names exported to PLY, assembled group by group. The resulting order
# is the same as a flat listing: delimiters, element tag pairs, DOCTYPE
# payload, attribute keys, then typed values and free text.
tokens = [
    # Generic '>' plus the prolog delimiters.
    'StartTagClose',
    'XmlDeclStartTag',
    'XmlDeclEndTag',
    'DocTypeDeclStartTag',
    # One "<Element>StartTagOpen" / "<Element>EndTag" pair per XML element.
    *(
        element + suffix
        for element in (
            'HealthTopics',
            'HealthTopic',
            'AlsoCalled',
            'FullSummary',
            'Group',
            'LanguageMappedTopic',
            'MeshHeading',
            'Descriptor',
            'Qualifier',
            'OtherLanguage',
            'PrimaryInstitute',
            'SeeReference',
            'Site',
            'InformationCategory',
            'Organization',
            'StandardDescription',
            'RelatedTopic',
        )
        for suffix in ('StartTagOpen', 'EndTag')
    ),
    # DOCTYPE payload.
    'DocTypeDeclName',
    'DocTypeDeclExternalId',
    # Attribute keys (each token name is "<Attribute>Key").
    *(
        attribute + 'Key'
        for attribute in (
            'Version',
            'Encoding',
            'DateGenerated',
            'Total',
            'Id',
            'DateCreated',
            'Language',
            'Title',
            'Url',
            'MetaDesc',
            'VernacularName',
            'LanguageMappedUrl',
        )
    ),
    # Typed attribute values and free text.
    'Timestamp',
    'Integer',
    'Date',
    'Language',
    'Uri',
    'String',
    'Text',
]

# Regular expression rules for simple tokens.
# Each element has an opening-prefix rule (without the closing '>', which is
# lexed separately as StartTagClose so attributes can sit in between) and a
# complete end-tag rule.
# NOTE(review): PLY is documented to try string-defined rules after all
# function-defined rules, longest pattern first — which is what would let
# '<health-topics' win over '<health-topic'; confirm for the PLY version in use.
t_HealthTopicsStartTagOpen = r'<health-topics'
t_HealthTopicsEndTag = r'</health-topics>'
t_HealthTopicStartTagOpen = r'<health-topic'
t_HealthTopicEndTag = r'</health-topic>'
t_AlsoCalledStartTagOpen = r'<also-called'
t_AlsoCalledEndTag = r'</also-called>'
t_FullSummaryStartTagOpen = r'<full-summary'
t_FullSummaryEndTag = r'</full-summary>'
t_GroupStartTagOpen = r'<group'
t_GroupEndTag = r'</group>'
t_LanguageMappedTopicStartTagOpen = r'<language-mapped-topic'
t_LanguageMappedTopicEndTag = r'</language-mapped-topic>'
t_MeshHeadingStartTagOpen = r'<mesh-heading'
t_MeshHeadingEndTag = r'</mesh-heading>'
t_DescriptorStartTagOpen = r'<descriptor'
t_DescriptorEndTag = r'</descriptor>'
t_QualifierStartTagOpen = r'<qualifier'
t_QualifierEndTag = r'</qualifier>'
t_OtherLanguageStartTagOpen = r'<other-language'
t_OtherLanguageEndTag = r'</other-language>'
t_PrimaryInstituteStartTagOpen = r'<primary-institute'
t_PrimaryInstituteEndTag = r'</primary-institute>'
t_SeeReferenceStartTagOpen = r'<see-reference'
t_SeeReferenceEndTag = r'</see-reference>'
t_SiteStartTagOpen = r'<site'
t_SiteEndTag = r'</site>'
t_InformationCategoryStartTagOpen = r'<information-category'
t_InformationCategoryEndTag = r'</information-category>'
t_OrganizationStartTagOpen = r'<organization'
t_OrganizationEndTag = r'</organization>'
t_StandardDescriptionStartTagOpen = r'<standard-description'
t_StandardDescriptionEndTag = r'</standard-description>'
t_RelatedTopicStartTagOpen = r'<related-topic'
t_RelatedTopicEndTag = r'</related-topic>'

# Regular expression for Text: one or more characters from the class below
# (word characters, space, tab and the listed punctuation). '<', '>' and
# newline are not in the class, so a Text run stops at the next tag or line
# break.
t_Text = r'[\w ¿?¡!:;,#&=+°•\.\-\'\"%\*\{\}\[\]\(\)/\t]+'

# Prolog tags
def t_XmlDeclStartTag(t):
    r'<\?xml'
    # Opening of the XML declaration (literal "<?xml"); the token is returned unchanged.
    return t

def t_XmlDeclEndTag(t):
    r'\?>'
    # Closing "?>" of the XML declaration; the token is returned unchanged.
    return t

def t_StartTagClose(t):
    r'>'
    # A bare '>': the grammar uses it to end every start tag and the DOCTYPE declaration.
    return t

def t_DocTypeDeclStartTag(t):
    r'<!DOCTYPE'
    # Literal "<!DOCTYPE" that opens the document type declaration.
    return t

def t_DocTypeDeclName(t):
    r' health-topics'
    # Root element name inside the DOCTYPE declaration.
    # NOTE(review): the pattern starts with a space while t_ignore skips
    # spaces; PLY is documented to compile rules with re.VERBOSE by default,
    # which would make that space insignificant — confirm.
    return t

def t_DocTypeDeclExternalId(t):
    r'PUBLIC\s+"-//NLM//DTD\s+health-topics\s+//EN"\s+"https://medlineplus.gov/xml/mplus_topics.dtd"'
    # The whole external identifier (PUBLIC keyword, public id and DTD URL) is
    # matched as one fixed token; only the whitespace between parts may vary.
    return t

# Keys of attributes
def t_VersionKey(t):
    r'version='
    # Attribute key of the XML declaration; the '=' is part of the token, so the value token follows directly.
    return t

def t_EncodingKey(t):
    r'encoding='
    # Attribute key of the XML declaration (includes the '=').
    return t

def t_DateGeneratedKey(t):
    r'date-generated='
    # Attribute key used by HealthTopicsAttributes; the grammar expects a Timestamp value after it.
    return t

def t_TotalKey(t):
    r'total='
    # Attribute key used by HealthTopicsAttributes; the grammar expects an Integer value after it.
    return t

def t_IdKey(t):
    r'id='
    # "id=" attribute key; depending on the element the grammar pairs it with
    # an Integer (topic, group, ...) or a String (descriptor, qualifier).
    return t

def t_DateCreatedKey(t):
    r'date-created='
    # Attribute key of a health topic; the grammar expects a Date value after it.
    return t

def t_LanguageKey(t):
    r'language='
    # "language=" attribute key; the grammar expects a Language value after it.
    return t

def t_TitleKey(t):
    r'title='
    # "title=" attribute key (health topic and site); followed by a String value in the grammar.
    return t

def t_UrlKey(t):
    r'url='
    # "url=" attribute key; followed by a Uri value in every grammar rule that uses it.
    return t

def t_MetaDescKey(t):
    r'meta-desc='
    # First attribute key of a health topic; followed by a String value in the grammar.
    return t

def t_VernacularNameKey(t):
    r'vernacular-name='
    # Attribute key of other-language elements; followed by a String value in the grammar.
    return t

def t_LanguageMappedUrlKey(t):
    r'language-mapped-url='
    # Optional third attribute key of a site element; followed by a Uri value in the grammar.
    return t

# Value types of attributes
def t_Date(t):
    r'"\d{2}/\d{2}/\d{4}"'
    # Drop the quotes, then turn the MM/DD/YYYY text into a datetime.date so
    # the parser receives a typed value instead of the raw lexeme.
    unquoted = t.value.replace('"', '')
    parsed = datetime.strptime(unquoted, "%m/%d/%Y")
    t.value = parsed.date()
    return t

def t_Timestamp(t):
    r'"\d{2}/\d{2}/\d{4}\s\d{2}:\d{2}:\d{2}"'
    # Quoted "MM/DD/YYYY HH:MM:SS" lexeme -> datetime.datetime value.
    unquoted = t.value.replace('"', '')
    t.value = datetime.strptime(unquoted, "%m/%d/%Y %H:%M:%S")
    return t

def t_Integer(t):
    r'"\d+"'
    # Quoted run of digits -> int value.
    digits = t.value.replace('"', '')
    t.value = int(digits)
    return t

def t_Language(t):
    r'"(English|Spanish)"'
    # Only the two quoted language names are accepted; the value is stored
    # without its quotes.
    language_name = t.value.replace('"', '')
    t.value = language_name
    return t

def t_Uri(t):
    r'"(?:https?):\/\/[^\s/$.?#].[^\s"<>]*[^"<>]*"'
    # Quoted http/https URL; every double quote is removed from the stored value.
    t.value = "".join(t.value.split('"'))
    return t

def t_String(t):
    r'"[^"]+"'
    # Any non-empty quoted string; the stored value has its double quotes removed.
    quote_free = t.value.translate({ord('"'): None})
    t.value = quote_free
    return t

# Define a rule so we can track line numbers
def t_newline(t):
    r'\n+'
    # Nothing is returned, so no token is produced; the run of newlines only
    # advances the lexer's line counter by its length.
    newline_count = len(t.value)
    t.lexer.lineno = t.lexer.lineno + newline_count

# Characters skipped between tokens: space and tab. Newline is deliberately
# not listed here; it is consumed by t_newline, which also counts lines.
t_ignore = ' \t'

# Error handling rule
def t_error(t):
    # Report the first unmatched character together with the current line,
    # then step over that single character so lexing can continue.
    message = f"Illegal character '{t.value[0]}' at line {t.lexer.lineno}"
    print(message)
    t.lexer.skip(1)

# Build the lexer. No arguments are passed, so PLY's defaults apply.
# NOTE(review): PLY is documented to collect `tokens` and the `t_*` rules from
# the calling module's namespace — confirm.
lexer = lex.lex()

# Parser

In [96]:
import os
import pprint

# Folder that receives the files generated by yacc (see the yacc.yacc call).
output_folder = './parser'
# exist_ok=True makes the cell idempotent on re-run and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(output_folder, exist_ok=True)

In [97]:
# Parsing rules
def p_Document(p):
    'Document : Prolog HealthTopics'
    # Start rule: a prolog followed by the health-topics element.
    # No semantic value is produced here; p[0] is left untouched.
    return None

def p_Prolog(p):
    'Prolog : XmlDeclStartTag XmlDeclAttributes XmlDeclEndTag DocTypeDeclStartTag DocTypeDeclName DocTypeDeclExternalId StartTagClose'
    # XML declaration followed by the DOCTYPE declaration.
    # The prolog is only validated; it contributes no value (p[0] untouched).
    return None

def p_XmlDeclAttributes(p):
    'XmlDeclAttributes : VersionKey String EncodingKey String'
    # version="..." encoding="..." — checked for shape only, values are discarded.
    return None

def p_HealthTopics(p):
    'HealthTopics : HealthTopicsStartTagOpen HealthTopicsAttributes StartTagClose HealthTopicList HealthTopicsEndTag'
    global data_dict
    attributes, topics = p[2], p[4]
    # Root record: the two root attributes plus the list of parsed topics.
    root = {
        "total": attributes["total"],
        "date-generated": attributes["date-generated"],
        "health-topic": topics,
    }
    p[0] = root
    # The same object is also published through the module-level data_dict.
    data_dict = p[0]

def p_HealthTopicsAttributes(p):
    'HealthTopicsAttributes : TotalKey Integer DateGeneratedKey Timestamp'
    # Values follow their key tokens: total at p[2], generation timestamp at p[4].
    total, generated_at = p[2], p[4]
    p[0] = {"total": total, "date-generated": generated_at}

def p_HealthTopicList(p):
    '''HealthTopicList : HealthTopic HealthTopicList
    | HealthTopic'''
    # One or more topics: a single topic starts the list, otherwise the topic
    # is prepended to the list built for the rest of the input.
    head = [p[1]]
    if len(p) == 2:
        p[0] = head
    else:
        p[0] = head + p[2]

def p_HealthTopic(p):
    'HealthTopic : HealthTopicStartTagOpen HealthTopicAttributes StartTagClose HealthTopicContent HealthTopicEndTag'
    # A topic record is its attributes followed by its content; on a key
    # clash the content entry wins.
    topic = dict(p[2])
    topic.update(p[4])
    p[0] = topic


def p_HealthTopicAttributes(p):
    'HealthTopicAttributes : MetaDescKey String TitleKey String UrlKey Uri IdKey Integer LanguageKey Language DateCreatedKey Date'
    # Key tokens occupy the odd slots; the values sit at 2, 4, ..., 12 in the
    # same order as the field names below.
    field_names = ("meta-desc", "title", "url", "id", "language", "date-created")
    p[0] = {
        name: p[2 * position]
        for position, name in enumerate(field_names, start=1)
    }

def p_HealthTopicContent(p):
    'HealthTopicContent : AlsoCalledList FullSummary GroupList LanguageMappedTopicOpt MeshHeadingList OtherLanguageList PrimaryInstituteOpt RelatedTopicList SeeReferenceList SiteList'
    # (slot, output key, always stored?) in document order. Optional children
    # are stored only when truthy, so absent elements leave no key behind;
    # full-summary, group and site are always stored.
    layout = (
        (1, "also-called", False),
        (2, "full-summary", True),
        (3, "group", True),
        (4, "language-mapped-topic", False),
        (5, "mesh-heading", False),
        (6, "other-language", False),
        (7, "primary-institute", False),
        (8, "related-topic", False),
        (9, "see-reference", False),
        (10, "site", True),
    )
    content = {}
    for slot, key, always_stored in layout:
        value = p[slot]
        if always_stored or value:
            content[key] = value
    p[0] = content

def p_AlsoCalledList(p):
    '''AlsoCalledList : AlsoCalled AlsoCalledList
    | empty'''
    # Zero or more also-called names, collected in document order.
    if len(p) != 3:
        # Matched the empty alternative.
        p[0] = []
        return
    first, rest = p[1], p[2]
    # Prepend the current name; a missing tail counts as "no further names".
    p[0] = [first] if rest is None else [first] + rest

def p_GroupList(p):
    '''GroupList : Group GroupList
    | empty'''
    # Zero or more group records, collected in document order.
    if len(p) != 3:
        p[0] = []
        return
    first, rest = p[1], p[2]
    p[0] = [first] if rest is None else [first] + rest

def p_LanguageMappedTopicOpt(p):
    '''LanguageMappedTopicOpt : LanguageMappedTopic
    | empty'''
    # Zero or one language-mapped-topic: forward the record when there is
    # one, otherwise leave p[0] as it is.
    topic = p[1]
    if topic is None:
        return
    p[0] = topic

def p_MeshHeadingList(p):
    '''MeshHeadingList : MeshHeading MeshHeadingList
    | empty'''
    # Zero or more mesh-heading records, collected in document order.
    if len(p) != 3:
        p[0] = []
        return
    first, rest = p[1], p[2]
    p[0] = [first] if rest is None else [first] + rest

def p_OtherLanguageList(p):
    '''OtherLanguageList : OtherLanguage OtherLanguageList
    | empty'''
    # Zero or more other-language records, collected in document order.
    if len(p) != 3:
        p[0] = []
        return
    first, rest = p[1], p[2]
    p[0] = [first] if rest is None else [first] + rest

def p_PrimaryInstituteOpt(p):
    '''PrimaryInstituteOpt : PrimaryInstitute
    | empty'''
    # Zero or one primary-institute: forward the record when there is one,
    # otherwise leave p[0] as it is.
    institute = p[1]
    if institute is None:
        return
    p[0] = institute

def p_RelatedTopicList(p):
    '''RelatedTopicList : RelatedTopic RelatedTopicList
    | empty'''
    # Zero or more related-topic records, collected in document order.
    if len(p) != 3:
        p[0] = []
        return
    first, rest = p[1], p[2]
    p[0] = [first] if rest is None else [first] + rest

def p_SeeReferenceList(p):
    '''SeeReferenceList : SeeReference SeeReferenceList
    | empty'''
    # Zero or more see-reference texts, collected in document order.
    if len(p) != 3:
        p[0] = []
        return
    first, rest = p[1], p[2]
    p[0] = [first] if rest is None else [first] + rest

def p_SiteList(p):
    '''SiteList : Site SiteList
    | empty'''
    # Zero or more site records, collected in document order.
    if len(p) != 3:
        p[0] = []
        return
    first, rest = p[1], p[2]
    p[0] = [first] if rest is None else [first] + rest

def p_FullSummary(p):
    'FullSummary : FullSummaryStartTagOpen StartTagClose TextContent FullSummaryEndTag'
    # The element carries no attributes; its value is just the enclosed text.
    summary_text = p[3]
    p[0] = summary_text


def p_Group(p):
    'Group : GroupStartTagOpen GroupAttributes StartTagClose TextContent GroupEndTag'
    # Copy url and id out of GroupAttributes, then attach the element text.
    attributes = p[2]
    group = {key: attributes[key] for key in ("url", "id")}
    group["text"] = p[4]
    p[0] = group

def p_GroupAttributes(p):
    'GroupAttributes : UrlKey Uri IdKey Integer'
    # Values follow their key tokens: url at p[2], id at p[4].
    p[0] = dict(zip(("url", "id"), (p[2], p[4])))

def p_LanguageMappedTopic(p):
    'LanguageMappedTopic : LanguageMappedTopicStartTagOpen LanguageMappedTopicAttributes StartTagClose TextContent LanguageMappedTopicEndTag'
    # Copy the three attributes in order, then attach the element text.
    attributes = p[2]
    mapped = {key: attributes[key] for key in ("url", "id", "language")}
    mapped["text"] = p[4]
    p[0] = mapped

def p_LanguageMappedTopicAttributes(p):
    'LanguageMappedTopicAttributes : UrlKey Uri IdKey Integer LanguageKey Language'
    # Values follow their key tokens at the even slots 2, 4 and 6.
    url, topic_id, language = p[2], p[4], p[6]
    p[0] = {"url": url, "id": topic_id, "language": language}

def p_MeshHeading(p):
    'MeshHeading : MeshHeadingStartTagOpen StartTagClose Descriptor QualifierList MeshHeadingEndTag'
    # The descriptor is always present; qualifiers are stored only when the
    # list is non-empty.
    heading = {"descriptor": p[3]}
    qualifiers = p[4]
    if qualifiers:
        heading['qualifier'] = qualifiers
    p[0] = heading

def p_QualifierList(p):
    '''QualifierList : Qualifier QualifierList
    | empty'''
    # Zero or more qualifier records, collected in document order.
    if len(p) != 3:
        p[0] = []
        return
    first, rest = p[1], p[2]
    p[0] = [first] if rest is None else [first] + rest

def p_OtherLanguage(p):
    'OtherLanguage : OtherLanguageStartTagOpen OtherLanguageAttributes StartTagClose TextContent OtherLanguageEndTag'
    # Copy the two attributes in order, then attach the element text.
    attributes = p[2]
    entry = {key: attributes[key] for key in ('vernacular_name', 'url')}
    entry['text'] = p[4]
    p[0] = entry

def p_OtherLanguageAttributes(p):
    'OtherLanguageAttributes : VernacularNameKey String UrlKey Uri'
    # Values follow their key tokens: vernacular name at p[2], url at p[4].
    p[0] = dict(vernacular_name=p[2], url=p[4])

def p_PrimaryInstitute(p):
    'PrimaryInstitute : PrimaryInstituteStartTagOpen PrimaryInstituteAttributes StartTagClose TextContent PrimaryInstituteEndTag'
    # Institute record: the url attribute plus the element text.
    institute_url = p[2]['url']
    p[0] = dict(url=institute_url, text=p[4])

def p_PrimaryInstituteAttributes(p):
    'PrimaryInstituteAttributes : UrlKey Uri'
    # The only attribute is url; its value follows the key token at p[2].
    p[0] = dict(url=p[2])

def p_RelatedTopic(p):
    'RelatedTopic : RelatedTopicStartTagOpen RelatedTopicAttributes StartTagClose TextContent RelatedTopicEndTag'
    # Copy url and id out of RelatedTopicAttributes, then attach the element text.
    attributes = p[2]
    related = {key: attributes[key] for key in ('url', 'id')}
    related['text'] = p[4]
    p[0] = related

def p_RelatedTopicAttributes(p):
    'RelatedTopicAttributes : UrlKey Uri IdKey Integer'
    # Values follow their key tokens: url at p[2], id at p[4].
    url, topic_id = p[2], p[4]
    p[0] = {'url': url, 'id': topic_id}
def p_SeeReference(p):
    'SeeReference : SeeReferenceStartTagOpen StartTagClose TextContent SeeReferenceEndTag'
    # The element carries no attributes; its value is just the enclosed text.
    reference_text = p[3]
    p[0] = reference_text

def p_Site(p):
    'Site : SiteStartTagOpen SiteAttributes StartTagClose SiteContent SiteEndTag'
    # The attribute set of a site varies, so whatever keys SiteAttributes
    # produced are merged with the content keys; content wins on a clash.
    site = dict(p[2])
    site.update(p[4])
    p[0] = site

def p_SiteContent(p):
    'SiteContent : InformationCategoryList OrganizationList StandardDescriptionList'
    categories, organizations, descriptions = p[1], p[2], p[3]
    # Categories are always stored; the two optional lists only when non-empty.
    site_content = {'information_category': categories}
    optional_parts = (
        ('organization', organizations),
        ('standard_description', descriptions),
    )
    for key, values in optional_parts:
        if values:
            site_content[key] = values
    p[0] = site_content


def p_InformationCategoryList(p):
    '''InformationCategoryList : InformationCategory InformationCategoryList
    | InformationCategory'''
    # One or more categories: a lone category starts the list, otherwise it
    # is prepended to the list built for the remaining categories.
    head = [p[1]]
    if len(p) == 2:
        p[0] = head
    else:
        p[0] = head + p[2]

def p_OrganizationList(p):
    '''OrganizationList : Organization OrganizationList
    | empty'''
    # Zero or more organization names, collected in document order.
    if len(p) != 3:
        p[0] = []
        return
    first, rest = p[1], p[2]
    p[0] = [first] if rest is None else [first] + rest

def p_StandardDescriptionList(p):
    '''StandardDescriptionList : StandardDescription StandardDescriptionList
    | empty'''
    # Zero or more standard-description texts, collected in document order.
    if len(p) != 3:
        p[0] = []
        return
    first, rest = p[1], p[2]
    p[0] = [first] if rest is None else [first] + rest

def p_SiteAttributes(p):
    '''SiteAttributes : TitleKey String UrlKey Uri LanguageMappedUrlKey Uri
    | TitleKey String UrlKey Uri'''
    # Builds the attribute dict of a site element: always 'title' and 'url',
    # plus 'language_mapped_url' when the optional third attribute is present.
    p[0] = {
        'title': p[2],
        'url': p[4]
    }
    # len(p) counts p[0] plus one slot per right-hand-side symbol, so the
    # six-symbol alternative yields 7 and the four-symbol one yields 5.
    # (Comparing against 6 never matched, which dropped the mapped URL.)
    if len(p) == 7:
        p[0]['language_mapped_url'] = p[6]

def p_AlsoCalled(p):
    'AlsoCalled : AlsoCalledStartTagOpen StartTagClose TextContent AlsoCalledEndTag'
    # The element carries no attributes; its value is just the enclosed text.
    alias_text = p[3]
    p[0] = alias_text

def p_Descriptor(p):
    'Descriptor : DescriptorStartTagOpen DescriptorAttributes StartTagClose TextContent DescriptorEndTag'
    # Descriptor record: the id attribute plus the element text.
    descriptor_id = p[2]['id']
    p[0] = dict(id=descriptor_id, text=p[4])

def p_DescriptorAttributes(p):
    'DescriptorAttributes : IdKey String'
    # The only attribute is id; its String value follows the key token at p[2].
    p[0] = dict(id=p[2])

def p_Qualifier(p):
    'Qualifier : QualifierStartTagOpen QualifierAttributes StartTagClose TextContent QualifierEndTag'
    # Qualifier record: the id attribute plus the element text.
    qualifier_id = p[2]["id"]
    p[0] = dict(id=qualifier_id, text=p[4])

def p_QualifierAttributes(p):
    'QualifierAttributes : IdKey String'
    # The only attribute is id; its String value follows the key token at p[2].
    p[0] = dict(id=p[2])

def p_InformationCategory(p):
    'InformationCategory : InformationCategoryStartTagOpen StartTagClose TextContent InformationCategoryEndTag'
    # The element carries no attributes; its value is just the enclosed text.
    category_text = p[3]
    p[0] = category_text

def p_Organization(p):
    'Organization : OrganizationStartTagOpen StartTagClose TextContent OrganizationEndTag'
    # The element carries no attributes; its value is just the enclosed text.
    organization_name = p[3]
    p[0] = organization_name

def p_StandardDescription(p):
    'StandardDescription : StandardDescriptionStartTagOpen StartTagClose TextContent StandardDescriptionEndTag'
    # The element carries no attributes; its value is just the enclosed text.
    description_text = p[3]
    p[0] = description_text

def p_TextContent(p):
    '''TextContent : TextContent Text
    | TextContent String
    | Text'''
    # Element text is accumulated left to right: a lone Text starts the run,
    # and every further Text or String piece is appended to what came before.
    if len(p) == 3:
        p[0] = p[1] + p[2]
    else:
        p[0] = p[1]

def p_empty(p):
    'empty : '
    # Epsilon production used by the optional and list rules; it produces no
    # value and leaves p[0] untouched.
    return None

# Syntax-error handling: a module-level flag that p_error raises, so the
# code driving the parser can tell afterwards whether any error was reported.
syntax_error_occurred = False

def p_error(p):
    global syntax_error_occurred
    # Record the failure before reporting it.
    syntax_error_occurred = True
    if not p:
        # No offending token was supplied: the input ended too early.
        print("Syntax error at EOF")
        return
    print(f"Syntax error at '{p.value}' on line {p.lineno}")

# Build the parser from the p_* rules defined above.
# NOTE(review): `outputdir` / `tabmodule` presumably direct PLY's generated
# table and debug files into `output_folder` under the given module name —
# confirm against the PLY yacc documentation.
parser = yacc.yacc(outputdir=output_folder, tabmodule='parser.parsetab')

In [98]:
# Load the whole XML document into memory as text; the parser consumes it as
# a single string.
with open('../data/mplus_topics.xml', mode='r', encoding='utf-8') as file:
    data = file.read()

In [99]:
# Test the parser
# Reset the module-level state written by p_error and p_HealthTopics so that
# re-running this cell starts clean. (No `global` statement is needed: this
# code already runs at module level.)
syntax_error_occurred = False
data_dict = {}

# The return value is kept for inspection but does not decide validity: the
# start rule never assigns p[0], so it carries no information.
result = parser.parse(data)

# Validity is decided by the error flag alone, which p_error sets on any
# reported syntax error.
if syntax_error_occurred:
    print("Input is invalid due to syntax errors.")
else:
    pprint.pprint(data_dict, sort_dicts=False)
    print("Input is valid according to the CFG.")

{'total': 2044,
 'date-generated': datetime.datetime(2024, 8, 30, 2, 30, 25),
 'health-topic': [{'meta-desc': 'If you are being tested for Type 2 diabetes, '
                                'your doctor gives you an A1C test. The test '
                                'is also used to monitor your A1C levels.',
                   'title': 'A1C',
                   'url': 'https://medlineplus.gov/a1c.html',
                   'id': 6308,
                   'language': 'English',
                   'date-created': datetime.date(2015, 12, 22),
                   'also-called': ['Glycohemoglobin',
                                   'HbA1C',
                                   'Hemoglobin A1C test'],
                   'full-summary': '&lt;p&gt;A1C is a blood test for &lt;a '
                                   'href="https://medlineplus.gov/diabetestype2.html"&gt;type '
                                   '2 diabetes&lt;/a&gt; and &lt;a '
                                   'href="https://medl