In [None]:
from datetime import datetime
import ply.lex as lex

In [None]:
# Token names handed to PLY, grouped by role.  The groups are concatenated at
# the bottom in an order that reproduces the original flat list exactly.

# Shared tag punctuation and the prolog markers.
_PROLOG_TOKENS = [
    'StartTagClose',
    'XmlDeclStartTag',
    'XmlDeclEndTag',
    'DocTypeDeclStartTag',
]

# One (start-tag-open, end-tag) pair per XML element.
_ELEMENT_TAG_TOKENS = [
    'HealthTopicsStartTagOpen', 'HealthTopicsEndTag',
    'HealthTopicStartTagOpen', 'HealthTopicEndTag',
    'AlsoCalledStartTagOpen', 'AlsoCalledEndTag',
    'FullSummaryStartTagOpen', 'FullSummaryEndTag',
    'GroupStartTagOpen', 'GroupEndTag',
    'LanguageMappedTopicStartTagOpen', 'LanguageMappedTopicEndTag',
    'MeshHeadingStartTagOpen', 'MeshHeadingEndTag',
    'DescriptorStartTagOpen', 'DescriptorEndTag',
    'OtherLanguageStartTagOpen', 'OtherLanguageEndTag',
    'PrimaryInstituteStartTagOpen', 'PrimaryInstituteEndTag',
    'SeeReferenceStartTagOpen', 'SeeReferenceEndTag',
    'SiteStartTagOpen', 'SiteEndTag',
    'InformationCategoryStartTagOpen', 'InformationCategoryEndTag',
    'OrganizationStartTagOpen', 'OrganizationEndTag',
    'StandardDescriptionStartTagOpen', 'StandardDescriptionEndTag',
    'RelatedTopicStartTagOpen', 'RelatedTopicEndTag',
]

# Pieces of the <!DOCTYPE ...> declaration.
_DOCTYPE_TOKENS = [
    'DocTypeDeclName',
    'DocTypeDeclExternalId',
]

# Attribute names (each rule also consumes the trailing "=").
_ATTRIBUTE_KEY_TOKENS = [
    'VersionKey',
    'EncodingKey',
    'DateGeneratedKey',
    'TotalKey',
    'IdKey',
    'DateCreatedKey',
    'LanguageKey',
    'TitleKey',
    'UrlKey',
    'MetaDescKey',
    'VernacularNameKey',
    'LanguageMappedUrlKey',
]

# Quoted attribute values, followed by free element text.
_VALUE_AND_TEXT_TOKENS = [
    'Timestamp',
    'Integer',
    'Date',
    'Language',
    'Uri',
    'String',
    'Text',
]

tokens = (
    _PROLOG_TOKENS
    + _ELEMENT_TAG_TOKENS
    + _DOCTYPE_TOKENS
    + _ATTRIBUTE_KEY_TOKENS
    + _VALUE_AND_TEXT_TOKENS
)

# Regular expression rules for simple tokens.
# Each `t_<Name>` string is the pattern for the token `<Name>` listed in
# `tokens`.  Start-tag rules match only the opening "<element" text; the ">"
# that ends a start tag is matched separately by t_StartTagClose.  End-tag
# rules match the complete "</element>" text.
# NOTE(review): "<health-topic" is a prefix of "<health-topics".  PLY's
# documentation says string-defined rules are tried longest-pattern-first,
# which is what keeps these two apart — confirm before converting any of
# these into function rules.
t_HealthTopicsStartTagOpen = r'<health-topics'
t_HealthTopicsEndTag = r'</health-topics>'
t_HealthTopicStartTagOpen = r'<health-topic'
t_HealthTopicEndTag = r'</health-topic>'
t_AlsoCalledStartTagOpen = r'<also-called'
t_AlsoCalledEndTag = r'</also-called>'
t_FullSummaryStartTagOpen = r'<full-summary'
t_FullSummaryEndTag = r'</full-summary>'
t_GroupStartTagOpen = r'<group'
t_GroupEndTag = r'</group>'
t_LanguageMappedTopicStartTagOpen = r'<language-mapped-topic'
t_LanguageMappedTopicEndTag = r'</language-mapped-topic>'
t_MeshHeadingStartTagOpen = r'<mesh-heading'
t_MeshHeadingEndTag = r'</mesh-heading>'
t_DescriptorStartTagOpen = r'<descriptor'
t_DescriptorEndTag = r'</descriptor>'
t_OtherLanguageStartTagOpen = r'<other-language'
t_OtherLanguageEndTag = r'</other-language>'
t_PrimaryInstituteStartTagOpen = r'<primary-institute'
t_PrimaryInstituteEndTag = r'</primary-institute>'
t_SeeReferenceStartTagOpen = r'<see-reference'
t_SeeReferenceEndTag = r'</see-reference>'
t_SiteStartTagOpen = r'<site'
t_SiteEndTag = r'</site>'
t_InformationCategoryStartTagOpen = r'<information-category'
t_InformationCategoryEndTag = r'</information-category>'
t_OrganizationStartTagOpen = r'<organization'
t_OrganizationEndTag = r'</organization>'
t_StandardDescriptionStartTagOpen = r'<standard-description'
t_StandardDescriptionEndTag = r'</standard-description>'
t_RelatedTopicStartTagOpen = r'<related-topic'
t_RelatedTopicEndTag = r'</related-topic>'

# Regular expression for Text: one or more characters drawn from word
# characters, space, tab and the punctuation listed in the class
# (¿ ? ! : ; , & = . - ' " % * ( ) /).  "<" and ">" are not in the class, so a
# Text match always stops at the next tag delimiter.
# NOTE(review): the literal space inside the character class is significant
# even under re.VERBOSE (PLY's documented default) — confirm if reflags change.
t_Text = r'[\w ¿?!:;,&=\.\-\'\"%\*\(\)/\t]+'

# Prolog tags
# Lexer rule for the literal "<?xml" that opens the XML declaration.
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_XmlDeclStartTag(t):
    r'<\?xml'
    return t  # emit the token with its matched text unchanged

# Lexer rule for the literal "?>" that closes the XML declaration.
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_XmlDeclEndTag(t):
    r'\?>'
    return t  # emit the token with its matched text unchanged

# Lexer rule for a bare ">".  The element start-tag rules in this file match
# only "<element", so this token supplies the closing bracket of a start tag.
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_StartTagClose(t):
    r'>'
    return t  # emit the token with its matched text unchanged

# Lexer rule for the literal "<!DOCTYPE" that opens the document-type declaration.
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_DocTypeDeclStartTag(t):
    r'<!DOCTYPE'
    return t  # emit the token with its matched text unchanged

# Lexer rule for the document-type name "health-topics".
# The docstring is the pattern PLY matches, so it must stay the first statement.
# The pattern intentionally has no leading space: PLY compiles rules with
# re.VERBOSE by default, which silently drops unescaped whitespace, and the
# lexer skips the characters in t_ignore (space, tab) before trying any rule.
# A pattern that began with a literal space therefore either meant nothing
# (VERBOSE on) or could never match (VERBOSE off).
def t_DocTypeDeclName(t):
    r'health-topics'
    return t  # emit the token with its matched text unchanged

# Lexer rule for the external identifier of the DOCTYPE declaration: the
# keyword PUBLIC followed by two double-quoted strings, with \s+ wherever
# whitespace is expected.
# The docstring is the pattern PLY matches, so it must stay the first statement.
# NOTE(review): the "." characters in the quoted URL are unescaped, so each
# matches any character, not only a literal dot.
def t_DocTypeDeclExternalId(t):
    r'PUBLIC\s+"-//NLM//DTD\s+health-topics\s+//EN"\s+"https://medlineplus.gov/xml/mplus_topics.dtd"'
    return t  # emit the token with its matched text unchanged

# Keys of attributes
# Lexer rule for the attribute key "version=" (name plus the equals sign).
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_VersionKey(t):
    r'version='
    return t  # emit the token with its matched text unchanged

# Lexer rule for the attribute key "encoding=" (name plus the equals sign).
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_EncodingKey(t):
    r'encoding='
    return t  # emit the token with its matched text unchanged

# Lexer rule for the attribute key "date-generated=" (name plus the equals sign).
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_DateGeneratedKey(t):
    r'date-generated='
    return t  # emit the token with its matched text unchanged

# Lexer rule for the attribute key "total=" (name plus the equals sign).
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_TotalKey(t):
    r'total='
    return t  # emit the token with its matched text unchanged

# Lexer rule for the attribute key "id=" (name plus the equals sign).
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_IdKey(t):
    r'id='
    return t  # emit the token with its matched text unchanged

# Lexer rule for the attribute key "date-created=" (name plus the equals sign).
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_DateCreatedKey(t):
    r'date-created='
    return t  # emit the token with its matched text unchanged

# Lexer rule for the attribute key "language=" (name plus the equals sign).
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_LanguageKey(t):
    r'language='
    return t  # emit the token with its matched text unchanged

# Lexer rule for the attribute key "title=" (name plus the equals sign).
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_TitleKey(t):
    r'title='
    return t  # emit the token with its matched text unchanged

# Lexer rule for the attribute key "url=" (name plus the equals sign).
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_UrlKey(t):
    r'url='
    return t  # emit the token with its matched text unchanged

# Lexer rule for the attribute key "meta-desc=" (name plus the equals sign).
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_MetaDescKey(t):
    r'meta-desc='
    return t  # emit the token with its matched text unchanged

# Lexer rule for the attribute key "vernacular-name=" (name plus the equals sign).
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_VernacularNameKey(t):
    r'vernacular-name='
    return t  # emit the token with its matched text unchanged

# Lexer rule for the attribute key "language-mapped-url=" (name plus the equals sign).
# The docstring is the pattern PLY matches, so it must stay the first statement.
def t_LanguageMappedUrlKey(t):
    r'language-mapped-url='
    return t  # emit the token with its matched text unchanged

# Value types of attributes
def t_Date(t):
    r'"\d{2}/\d{2}/\d{4}"'
    # The docstring above is the token pattern: a double-quoted MM/DD/YYYY
    # value.  Drop the quotes, parse the text, and keep only the calendar date
    # so the token carries a datetime.date.
    unquoted = t.value.replace('"', '')
    parsed = datetime.strptime(unquoted, "%m/%d/%Y")
    t.value = parsed.date()
    return t

def t_Timestamp(t):
    r'"\d{2}/\d{2}/\d{4}\s\d{2}:\d{2}:\d{2}"'
    # The docstring above is the token pattern: a double-quoted
    # "MM/DD/YYYY HH:MM:SS" value.  Drop the quotes and replace the token's
    # text with the parsed datetime.
    unquoted = t.value.replace('"', '')
    t.value = datetime.strptime(unquoted, "%m/%d/%Y %H:%M:%S")
    return t

def t_Integer(t):
    r'"\d+"'
    # The docstring above is the token pattern: a double-quoted run of digits.
    # Remove the quotes and store the number as an int.
    digits = t.value.replace('"', '')
    t.value = int(digits)
    return t

def t_Language(t):
    r'"(English|Spanish)"'
    # The docstring above is the token pattern: one of the two quoted language
    # names.  Store the bare name without its quotes.
    quoted_name = t.value
    t.value = quoted_name.replace('"', '')
    return t

def t_Uri(t):
    r'"(?:https?):\/\/[^\s/$.?#].[^\s"<>]*[^"<>]*"'
    # The docstring above is the token pattern: a double-quoted http(s) URL.
    # Every double-quote character is removed from the matched text before it
    # is stored back on the token.
    quoted_uri = t.value
    t.value = quoted_uri.replace('"', '')
    return t

def t_String(t):
    r'"[^"]*"'
    # The docstring above is the token pattern: any double-quoted run that
    # contains no inner double quote.  Store the text without its quotes.
    quoted_text = t.value
    t.value = quoted_text.replace('"', '')
    return t

# Line tracking: consume each run of newlines and advance the lexer's line
# counter by the number of characters matched.  Nothing is returned, so no
# token is produced for the run.
def t_newline(t):
    r'\n+'
    newline_count = len(t.value)
    lexer = t.lexer
    lexer.lineno = lexer.lineno + newline_count

# Characters the lexer skips between tokens: a space and a tab.
# Newlines are not listed here; they are consumed by the t_newline rule so
# that line numbers can be tracked.
t_ignore = ' \t'

# Error handling rule: report the first character of the unmatched input and
# step the lexer past that single character so scanning can continue.
def t_error(t):
    message = f"Illegal character '{t.value[0]}'"
    print(message)
    t.lexer.skip(1)

# NOTE(review): __file__ is set by hand before building the lexer, presumably
# because a notebook kernel does not define it and lex.lex() inspects the
# calling module — confirm against the installed PLY version.
__file__ = "lexer.ipynb"
lexer = lex.lex()

# Read the whole XML document into memory as UTF-8 text.
with open('../data/mplus_topics.xml', 'r', encoding='utf-8') as xml_file:
    data = xml_file.read()

# Give the lexer some input
lexer.input(data)

# Tokenize: show at most PREVIEW_LIMIT tokens, stopping early if the input
# runs out.  Remove the limit check to display every token in the file.
PREVIEW_LIMIT = 248
shown = 0
while shown < PREVIEW_LIMIT:
    tok = lexer.token()
    if not tok:
        break  # No more input
    display(tok)
    shown += 1