In [None]:
from datetime import datetime
import ply.lex as lex
import ply.yacc as yacc

In [None]:
# Give the notebook namespace a __file__ value naming this notebook.
# NOTE(review): presumably required because PLY looks up the source file of
# the module that defines the rules, and a notebook kernel does not set
# __file__ on its own — confirm.
__file__ = "parser.ipynb"

# Lexer

In [None]:
# List of token names.
# Every XML element contributes a "<Name>StartTagOpen" / "<Name>EndTag" pair,
# so those pairs are generated from the element names below (kept in the same
# order in which the pairs were originally listed).
_ELEMENT_NAMES = (
    'HealthTopics',
    'HealthTopic',
    'AlsoCalled',
    'FullSummary',
    'Group',
    'LanguageMappedTopic',
    'MeshHeading',
    'Descriptor',
    'Qualifier',
    'OtherLanguage',
    'PrimaryInstitute',
    'SeeReference',
    'Site',
    'InformationCategory',
    'Organization',
    'StandardDescription',
    'RelatedTopic',
)

tokens = (
    # Generic tag closer and prolog tags
    ['StartTagClose', 'XmlDeclStartTag', 'XmlDeclEndTag', 'DocTypeDeclStartTag']
    # Start-tag opener / end tag for each element
    + [element + suffix
       for element in _ELEMENT_NAMES
       for suffix in ('StartTagOpen', 'EndTag')]
    # DOCTYPE pieces
    + ['DocTypeDeclName', 'DocTypeDeclExternalId']
    # Attribute keys
    + [
        'VersionKey',
        'EncodingKey',
        'DateGeneratedKey',
        'TotalKey',
        'IdKey',
        'DateCreatedKey',
        'LanguageKey',
        'TitleKey',
        'UrlKey',
        'MetaDescKey',
        'VernacularNameKey',
        'LanguageMappedUrlKey',
    ]
    # Attribute value types and free text
    + ['Timestamp', 'Integer', 'Date', 'Language', 'Uri', 'String', 'Text']
)

# Regular expression rules for simple tokens.
# Each element gets two patterns: the opening of its start tag (`<name`,
# without the closing `>`, which is lexed separately as StartTagClose so that
# attributes can sit in between) and its complete end tag (`</name>`).
# NOTE(review): `<health-topic` is a prefix of `<health-topics`. PLY's
# documentation says string-defined rules are tried in order of decreasing
# pattern length, which is what would make the longer one win — confirm
# against the installed PLY version.
t_HealthTopicsStartTagOpen = r'<health-topics'
t_HealthTopicsEndTag = r'</health-topics>'
t_HealthTopicStartTagOpen = r'<health-topic'
t_HealthTopicEndTag = r'</health-topic>'
t_AlsoCalledStartTagOpen = r'<also-called'
t_AlsoCalledEndTag = r'</also-called>'
t_FullSummaryStartTagOpen = r'<full-summary'
t_FullSummaryEndTag = r'</full-summary>'
t_GroupStartTagOpen = r'<group'
t_GroupEndTag = r'</group>'
t_LanguageMappedTopicStartTagOpen = r'<language-mapped-topic'
t_LanguageMappedTopicEndTag = r'</language-mapped-topic>'
t_MeshHeadingStartTagOpen = r'<mesh-heading'
t_MeshHeadingEndTag = r'</mesh-heading>'
t_DescriptorStartTagOpen = r'<descriptor'
t_DescriptorEndTag = r'</descriptor>'
t_QualifierStartTagOpen = r'<qualifier'
t_QualifierEndTag = r'</qualifier>'
t_OtherLanguageStartTagOpen = r'<other-language'
t_OtherLanguageEndTag = r'</other-language>'
t_PrimaryInstituteStartTagOpen = r'<primary-institute'
t_PrimaryInstituteEndTag = r'</primary-institute>'
t_SeeReferenceStartTagOpen = r'<see-reference'
t_SeeReferenceEndTag = r'</see-reference>'
t_SiteStartTagOpen = r'<site'
t_SiteEndTag = r'</site>'
t_InformationCategoryStartTagOpen = r'<information-category'
t_InformationCategoryEndTag = r'</information-category>'
t_OrganizationStartTagOpen = r'<organization'
t_OrganizationEndTag = r'</organization>'
t_StandardDescriptionStartTagOpen = r'<standard-description'
t_StandardDescriptionEndTag = r'</standard-description>'
t_RelatedTopicStartTagOpen = r'<related-topic'
t_RelatedTopicEndTag = r'</related-topic>'

# Regular expression for Text: one or more word characters, spaces, tabs or
# the listed punctuation (this includes ¿ ¡, both quote characters and `/`).
# Neither `<`, `>` nor a newline is in the class, so a Text token always
# stops at a tag boundary and at the end of a line.
t_Text = r'[\w ¿?¡!:;,#&=+°•\.\-\'\"%\*\{\}\[\]\(\)/\t]+'

# Prolog tags
# Opening of the XML declaration: the literal `<?xml` (the `?` is escaped
# because it is a regex metacharacter). The token is returned unchanged.
def t_XmlDeclStartTag(t):
    r'<\?xml'
    return t

# End of the XML declaration: the literal `?>`.
# NOTE(review): defined before t_StartTagClose; PLY is documented to try
# function rules in definition order — confirm if the order is ever changed.
def t_XmlDeclEndTag(t):
    r'\?>'
    return t

# A single `>`. The grammar uses it to close every start tag after the
# attributes, and to close the DOCTYPE declaration.
def t_StartTagClose(t):
    r'>'
    return t

# Opening of the document type declaration: the literal `<!DOCTYPE`.
def t_DocTypeDeclStartTag(t):
    r'<!DOCTYPE'
    return t

# Root element name inside the DOCTYPE declaration (`health-topics`).
# NOTE(review): the pattern starts with a blank although t_ignore also lists
# the space. PLY compiles rules with re.VERBOSE by default, in which case the
# leading blank in the pattern is not significant — confirm.
def t_DocTypeDeclName(t):
    r' health-topics'
    return t

# External identifier of the DOCTYPE: the keyword PUBLIC, the quoted public
# id "-//NLM//DTD health-topics //EN" and the quoted mplus_topics.dtd URL.
# Only this exact identifier is accepted; `\s+` allows any amount of
# whitespace between its parts.
def t_DocTypeDeclExternalId(t):
    r'PUBLIC\s+"-//NLM//DTD\s+health-topics\s+//EN"\s+"https://medlineplus.gov/xml/mplus_topics.dtd"'
    return t

# Keys of attributes
# Attribute key `version=` (XML declaration). The `=` is part of the token,
# so the grammar sees the key and its quoted value as two adjacent tokens.
def t_VersionKey(t):
    r'version='
    return t

# Attribute key `encoding=` (XML declaration), including the `=`.
def t_EncodingKey(t):
    r'encoding='
    return t

# Attribute key `date-generated=` (used on <health-topics>), including the `=`.
def t_DateGeneratedKey(t):
    r'date-generated='
    return t

# Attribute key `total=` (used on <health-topics>), including the `=`.
def t_TotalKey(t):
    r'total='
    return t

# Attribute key `id=`, including the `=`. The grammar pairs it with an
# Integer on topics and groups, and with a String on descriptors/qualifiers.
def t_IdKey(t):
    r'id='
    return t

# Attribute key `date-created=` (used on <health-topic>), including the `=`.
def t_DateCreatedKey(t):
    r'date-created='
    return t

# Attribute key `language=`, including the `=`; its value is lexed as Language.
def t_LanguageKey(t):
    r'language='
    return t

# Attribute key `title=` (used on <health-topic> and <site>), including the `=`.
def t_TitleKey(t):
    r'title='
    return t

# Attribute key `url=`, including the `=`; the grammar always follows it with a Uri.
def t_UrlKey(t):
    r'url='
    return t

# Attribute key `meta-desc=` (first attribute of <health-topic>), including the `=`.
def t_MetaDescKey(t):
    r'meta-desc='
    return t

# Attribute key `vernacular-name=` (used on <other-language>), including the `=`.
def t_VernacularNameKey(t):
    r'vernacular-name='
    return t

# Attribute key `language-mapped-url=` (optional on <site>), including the `=`.
def t_LanguageMappedUrlKey(t):
    r'language-mapped-url='
    return t

# Value types of attributes
def t_Date(t):
    r'"\d{2}/\d{2}/\d{4}"'
    # Quoted MM/DD/YYYY attribute value: drop the quotes, parse the text and
    # keep only the calendar date as the token value.
    unquoted = t.value.replace('"', '')
    parsed = datetime.strptime(unquoted, "%m/%d/%Y")
    t.value = parsed.date()
    return t

def t_Timestamp(t):
    r'"\d{2}/\d{2}/\d{4}\s\d{2}:\d{2}:\d{2}"'
    # Quoted "MM/DD/YYYY HH:MM:SS" attribute value: drop the quotes and turn
    # the text into a datetime, which becomes the token value.
    unquoted = t.value.replace('"', '')
    t.value = datetime.strptime(unquoted, "%m/%d/%Y %H:%M:%S")
    return t

def t_Integer(t):
    r'"\d+"'
    # Quoted run of digits: the token value becomes the corresponding int.
    digits = t.value.replace('"', '')
    t.value = int(digits)
    return t

def t_Language(t):
    r'"(English|Spanish)"'
    # One of the two quoted language names; the token value is the bare name.
    quoted_name = t.value
    t.value = quoted_name.replace('"', '')
    return t

def t_Uri(t):
    r'"(?:https?):\/\/[^\s/$.?#].[^\s"<>]*[^"<>]*"'
    # Quoted http/https URL; the token value is the URL without the quotes.
    quoted_url = t.value
    t.value = quoted_url.replace('"', '')
    return t

def t_String(t):
    r'"[^"]+"'
    # Any non-empty double-quoted text; the token value is the text between
    # the quotes.
    quoted_text = t.value
    t.value = quoted_text.replace('"', '')
    return t

# Define a rule so we can track line numbers
def t_newline(t):
    r'\n+'
    # The match consists only of newline characters, so its length is the
    # number of lines to advance. Nothing is returned, so no token is emitted.
    newline_count = len(t.value)
    t.lexer.lineno = t.lexer.lineno + newline_count

# A string containing ignored characters (spaces and tabs).
# NOTE(review): t_Text also lists the space and the tab in its character
# class, so blanks inside a text run stay part of the Text token; presumably
# only blanks at the start of a token are skipped through t_ignore — confirm.
t_ignore = ' \t'

# Error handling rule
def t_error(t):
    # Report the first character that no rule could match, together with the
    # current line, then step over that single character so lexing continues.
    bad_char = t.value[0]
    line = t.lexer.lineno
    print(f"Illegal character '{bad_char}' at line {line}")
    t.lexer.skip(1)

# Build the lexer. lex.lex() is called without arguments.
# NOTE(review): per PLY's documentation it then collects `tokens` and the
# t_* rules from the calling namespace — confirm.
lexer = lex.lex()

# Parser

In [None]:
import os

# Directory that receives the files generated when the parser is built.
output_folder = './parser'
# exist_ok=True makes the call idempotent and removes the check-then-create
# race of testing os.path.exists() before calling makedirs().
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Parsing rules
# Document: the prolog followed by the <health-topics> root element.
# The body is a bare `pass`: no semantic action, the rule only has to match.
def p_Document(p):
    'Document : Prolog HealthTopics'
    pass

# Prolog: the XML declaration (`<?xml` attributes `?>`) followed by the
# DOCTYPE declaration (`<!DOCTYPE`, root name, external id, closing `>`).
def p_Prolog(p):
    'Prolog : XmlDeclStartTag XmlDeclAttributes XmlDeclEndTag DocTypeDeclStartTag DocTypeDeclName DocTypeDeclExternalId StartTagClose'
    pass

# XML declaration attributes in fixed order: `version=` then `encoding=`,
# each followed by a String token.
def p_XmlDeclAttributes(p):
    'XmlDeclAttributes : VersionKey String EncodingKey String'
    pass

# Root element: `<health-topics` attributes `>`, a list of topics, then
# `</health-topics>`.
def p_HealthTopics(p):
    'HealthTopics : HealthTopicsStartTagOpen HealthTopicsAttributes StartTagClose HealthTopicList HealthTopicsEndTag'
    pass

# Root attributes in fixed order: `total=` (Integer) then `date-generated=`
# (Timestamp).
def p_HealthTopicsAttributes(p):
    'HealthTopicsAttributes : TotalKey Integer DateGeneratedKey Timestamp'
    pass

# One or more HealthTopic elements (right-recursive). The base case is a
# single topic, so the list cannot be empty.
def p_HealthTopicList(p):
    '''HealthTopicList : HealthTopic HealthTopicList 
    | HealthTopic'''
    pass

# One topic: `<health-topic` attributes `>`, the topic content, then
# `</health-topic>`.
def p_HealthTopic(p):
    'HealthTopic : HealthTopicStartTagOpen HealthTopicAttributes StartTagClose HealthTopicContent HealthTopicEndTag'
    pass

# Topic attributes, all required and in this exact order: meta-desc (String),
# title (String), url (Uri), id (Integer), language (Language),
# date-created (Date).
def p_HealthTopicAttributes(p):
    'HealthTopicAttributes : MetaDescKey String TitleKey String UrlKey Uri IdKey Integer LanguageKey Language DateCreatedKey Date'
    pass

# Children of a topic, in fixed order: also-called*, exactly one full-summary,
# group*, optional language-mapped-topic, mesh-heading*, other-language*,
# optional primary-institute, related-topic*, see-reference*, site*.
def p_HealthTopicContent(p):
    'HealthTopicContent : AlsoCalledList FullSummary GroupList LanguageMappedTopicOpt MeshHeadingList OtherLanguageList PrimaryInstituteOpt RelatedTopicList SeeReferenceList SiteList'
    pass

# Zero or more <also-called> elements (right-recursive, may be empty).
def p_AlsoCalledList(p):
    '''AlsoCalledList : AlsoCalled AlsoCalledList 
    | empty'''
    pass

# Zero or more <group> elements (right-recursive, may be empty).
def p_GroupList(p):
    '''GroupList : Group GroupList 
    | empty'''
    pass

# Optional <language-mapped-topic>: either one element or nothing.
def p_LanguageMappedTopicOpt(p):
    '''LanguageMappedTopicOpt : LanguageMappedTopic 
    | empty'''
    pass

# Zero or more <mesh-heading> elements (right-recursive, may be empty).
def p_MeshHeadingList(p):
    '''MeshHeadingList : MeshHeading MeshHeadingList 
    | empty'''
    pass

# Zero or more <other-language> elements (right-recursive, may be empty).
def p_OtherLanguageList(p):
    '''OtherLanguageList : OtherLanguage OtherLanguageList 
    | empty'''
    pass

# Optional <primary-institute>: either one element or nothing.
def p_PrimaryInstituteOpt(p):
    '''PrimaryInstituteOpt : PrimaryInstitute 
    | empty'''
    pass

# Zero or more <related-topic> elements (right-recursive, may be empty).
def p_RelatedTopicList(p):
    '''RelatedTopicList : RelatedTopic RelatedTopicList 
    | empty'''
    pass

# Zero or more <see-reference> elements (right-recursive, may be empty).
def p_SeeReferenceList(p):
    '''SeeReferenceList : SeeReference SeeReferenceList 
    | empty'''
    pass

# Zero or more <site> elements (right-recursive, may be empty).
def p_SiteList(p):
    '''SiteList : Site SiteList 
    | empty'''
    pass

# <full-summary>: no attributes (the `>` follows the tag name directly),
# text content, then `</full-summary>`.
def p_FullSummary(p):
    'FullSummary : FullSummaryStartTagOpen StartTagClose TextContent FullSummaryEndTag'
    pass

# <group>: attributes, `>`, text content, then `</group>`.
def p_Group(p):
    'Group : GroupStartTagOpen GroupAttributes StartTagClose TextContent GroupEndTag'
    pass

# <group> attributes in fixed order: url (Uri) then id (Integer).
def p_GroupAttributes(p):
    'GroupAttributes : UrlKey Uri IdKey Integer'
    pass

# <language-mapped-topic>: attributes, `>`, text content, then the end tag.
def p_LanguageMappedTopic(p):
    'LanguageMappedTopic : LanguageMappedTopicStartTagOpen LanguageMappedTopicAttributes StartTagClose TextContent LanguageMappedTopicEndTag'
    pass

# <language-mapped-topic> attributes in fixed order: url (Uri), id (Integer),
# language (Language).
def p_LanguageMappedTopicAttributes(p):
    'LanguageMappedTopicAttributes : UrlKey Uri IdKey Integer LanguageKey Language'
    pass

# <mesh-heading>: no attributes; contains exactly one <descriptor> followed
# by zero or more <qualifier> elements.
def p_MeshHeading(p):
    'MeshHeading : MeshHeadingStartTagOpen StartTagClose Descriptor QualifierList MeshHeadingEndTag'
    pass

# Zero or more <qualifier> elements (right-recursive, may be empty).
def p_QualifierList(p):
    '''QualifierList : Qualifier QualifierList 
    | empty'''
    pass

# <other-language>: attributes, `>`, text content, then the end tag.
def p_OtherLanguage(p):
    'OtherLanguage : OtherLanguageStartTagOpen OtherLanguageAttributes StartTagClose TextContent OtherLanguageEndTag'
    pass

# <other-language> attributes in fixed order: vernacular-name (String) then
# url (Uri).
def p_OtherLanguageAttributes(p):
    'OtherLanguageAttributes : VernacularNameKey String UrlKey Uri'
    pass

# <primary-institute>: attributes, `>`, text content, then the end tag.
def p_PrimaryInstitute(p):
    'PrimaryInstitute : PrimaryInstituteStartTagOpen PrimaryInstituteAttributes StartTagClose TextContent PrimaryInstituteEndTag'
    pass

# <primary-institute> has a single attribute: url (Uri).
def p_PrimaryInstituteAttributes(p):
    'PrimaryInstituteAttributes : UrlKey Uri'
    pass

# <related-topic>: attributes, `>`, text content, then the end tag.
def p_RelatedTopic(p):
    'RelatedTopic : RelatedTopicStartTagOpen RelatedTopicAttributes StartTagClose TextContent RelatedTopicEndTag'
    pass

# <related-topic> attributes in fixed order: url (Uri) then id (Integer).
def p_RelatedTopicAttributes(p):
    'RelatedTopicAttributes : UrlKey Uri IdKey Integer'
    pass

# <see-reference>: no attributes, text content, then the end tag.
def p_SeeReference(p):
    'SeeReference : SeeReferenceStartTagOpen StartTagClose TextContent SeeReferenceEndTag'
    pass

# <site>: attributes, `>`, nested child elements (SiteContent, not plain
# text), then `</site>`.
def p_Site(p):
    'Site : SiteStartTagOpen SiteAttributes StartTagClose SiteContent SiteEndTag'
    pass

# Children of <site>, in fixed order: information-category (one or more),
# organization (zero or more), standard-description (zero or more).
def p_SiteContent(p):
    'SiteContent : InformationCategoryList OrganizationList StandardDescriptionList'
    pass

# One or more <information-category> elements (right-recursive). The base
# case is a single element, so the list cannot be empty.
def p_InformationCategoryList(p):
    '''InformationCategoryList : InformationCategory InformationCategoryList 
    | InformationCategory'''
    pass

# Zero or more <organization> elements (right-recursive, may be empty).
def p_OrganizationList(p):
    '''OrganizationList : Organization OrganizationList 
    | empty'''
    pass

# Zero or more <standard-description> elements (right-recursive, may be empty).
def p_StandardDescriptionList(p):
    '''StandardDescriptionList : StandardDescription StandardDescriptionList 
    | empty'''
    pass

# <site> attributes: title (String) then url (Uri), optionally followed by
# language-mapped-url (Uri). The two alternatives differ only in that
# trailing optional attribute.
def p_SiteAttributes(p):
    '''SiteAttributes : TitleKey String UrlKey Uri LanguageMappedUrlKey Uri 
    | TitleKey String UrlKey Uri'''
    pass

# <also-called>: no attributes, text content, then the end tag.
def p_AlsoCalled(p):
    'AlsoCalled : AlsoCalledStartTagOpen StartTagClose TextContent AlsoCalledEndTag'
    pass

# <descriptor>: attributes, `>`, text content, then the end tag.
def p_Descriptor(p):
    'Descriptor : DescriptorStartTagOpen DescriptorAttributes StartTagClose TextContent DescriptorEndTag'
    pass

# <descriptor> has a single attribute: id, whose value must lex as a String
# here (not as an Integer, unlike the id of topics and groups).
def p_DescriptorAttributes(p):
    'DescriptorAttributes : IdKey String'
    pass

# <qualifier>: attributes, `>`, text content, then the end tag.
def p_Qualifier(p):
    'Qualifier : QualifierStartTagOpen QualifierAttributes StartTagClose TextContent QualifierEndTag'
    pass

# <qualifier> has a single attribute: id, whose value must lex as a String.
def p_QualifierAttributes(p):
    'QualifierAttributes : IdKey String'
    pass

# <information-category>: no attributes, text content, then the end tag.
def p_InformationCategory(p):
    'InformationCategory : InformationCategoryStartTagOpen StartTagClose TextContent InformationCategoryEndTag'
    pass

# <organization>: no attributes, text content, then the end tag.
def p_Organization(p):
    'Organization : OrganizationStartTagOpen StartTagClose TextContent OrganizationEndTag'
    pass

# <standard-description>: no attributes, text content, then the end tag.
def p_StandardDescription(p):
    'StandardDescription : StandardDescriptionStartTagOpen StartTagClose TextContent StandardDescriptionEndTag'
    pass

# Element text: a Text token followed by any mix of further Text and String
# tokens (left-recursive). String is accepted because quoted passages inside
# running text can be lexed as String tokens. The content must begin with a
# Text token and cannot be empty.
def p_TextContent(p):
    '''TextContent : TextContent Text
    | TextContent String
    | Text'''
    pass

# Epsilon production (matches nothing); used as the base case of the
# zero-or-more lists and of the optional elements.
def p_empty(p):
    'empty : '
    pass

# Error rule for syntax errors
# Module-level flag: set to True by p_error as soon as one syntax error has
# been reported.
syntax_error_occurred = False

def p_error(p):
    # Record that parsing failed, then report where. A falsy `p` means there
    # is no offending token to show, which is reported as an error at EOF.
    global syntax_error_occurred
    syntax_error_occurred = True
    if not p:
        print("Syntax error at EOF")
        return
    print(f"Syntax error at '{p.value}' on line {p.lineno}")

# Build the parser. The generated table module is named 'parser.parsetab'
# and `output_folder` is passed as the output directory.
# NOTE(review): presumably PLY writes parsetab.py (and its debug output) into
# output_folder and later re-imports it under the dotted name — confirm.
parser = yacc.yacc(outputdir=output_folder, tabmodule='parser.parsetab')

In [None]:
# Read the whole mplus_topics.xml document into memory as UTF-8 text.
# The path is relative to the current working directory.
with open('../data/mplus_topics.xml', 'r', encoding='utf-8') as file:
    data = file.read()

In [None]:
# Test the parser: run it over `data` and print whether the document is
# accepted by the grammar.
# (No `global` statement here: at module level it has no effect.)
syntax_error_occurred = False  # Reset error flag before parsing

# PLY never resets the lexer's line counter on its own, so rewind it here.
# Otherwise re-running this cell would report line numbers that keep growing
# from the previous run.
lexer.lineno = 1

# Pass the lexer explicitly rather than relying on the most recently built one.
result = parser.parse(data, lexer=lexer)

if syntax_error_occurred:
    print("Input is invalid due to syntax errors.")
elif result is None:
    # None of the grammar rules assigns a value, so a clean parse yields None.
    print("Input is valid according to the CFG.")
else:
    print("Input is invalid.")