In [12]:
from datetime import datetime
import ply.lex as lex

In [13]:
# Names of every token type this lexer can produce (PLY reads the `tokens` list).
# Each XML element listed here contributes two tokens: "<Name>StartTagOpen"
# followed by "<Name>EndTag".
_ELEMENT_TOKEN_PREFIXES = [
    'HealthTopics',
    'HealthTopic',
    'AlsoCalled',
    'FullSummary',
    'Group',
    'LanguageMappedTopic',
    'MeshHeading',
    'Descriptor',
    'OtherLanguage',
    'PrimaryInstitute',
    'SeeReference',
    'Site',
    'InformationCategory',
    'Organization',
    'StandardDescription',
    'RelatedTopic',
]

tokens = [
    # Shared tag terminator, XML declaration and DOCTYPE delimiters.
    'StartTagClose',
    'XmlDeclStartTag',
    'XmlDeclEndTag',
    'DocTypeDeclStartTag',
    # Opening/closing tag tokens, one pair per element, in element order.
    *(
        prefix + suffix
        for prefix in _ELEMENT_TOKEN_PREFIXES
        for suffix in ('StartTagOpen', 'EndTag')
    ),
    # DOCTYPE contents.
    'DocTypeDeclName',
    'DocTypeDeclExternalId',
    # Attribute keys (each rule includes the trailing "=").
    'VersionKey',
    'EncodingKey',
    'DateGeneratedKey',
    'TotalKey',
    'IdKey',
    'DateCreatedKey',
    'LanguageKey',
    'TitleKey',
    'UrlKey',
    'MetaDescKey',
    'VernacularNameKey',
    'LanguageMappedUrlKey',
    # Typed attribute values and element text.
    'Timestamp',
    'Integer',
    'Date',
    'Uri',
    'String',
    'Text',
]

# Patterns for the fixed tag delimiters (plain string rules, no action code).
# Several patterns are prefixes of others (e.g. "<health-topic" / "<health-topics");
# PLY orders string rules by decreasing pattern length, so the longer one wins.

# Shared ">" that terminates any opening tag.
t_StartTagClose = r">"

# XML declaration and DOCTYPE delimiters.
t_XmlDeclStartTag = r"<\?xml"
t_XmlDeclEndTag = r"\?>"
t_DocTypeDeclStartTag = r"<!DOCTYPE"

# Element tags: "<name" opens a start tag (attributes may follow), "</name>" is the end tag.
t_HealthTopicsStartTagOpen = r"<health-topics"
t_HealthTopicsEndTag = r"</health-topics>"
t_HealthTopicStartTagOpen = r"<health-topic"
t_HealthTopicEndTag = r"</health-topic>"
t_AlsoCalledStartTagOpen = r"<also-called"
t_AlsoCalledEndTag = r"</also-called>"
t_FullSummaryStartTagOpen = r"<full-summary"
t_FullSummaryEndTag = r"</full-summary>"
t_GroupStartTagOpen = r"<group"
t_GroupEndTag = r"</group>"
t_LanguageMappedTopicStartTagOpen = r"<language-mapped-topic"
t_LanguageMappedTopicEndTag = r"</language-mapped-topic>"
t_MeshHeadingStartTagOpen = r"<mesh-heading"
t_MeshHeadingEndTag = r"</mesh-heading>"
t_DescriptorStartTagOpen = r"<descriptor"
t_DescriptorEndTag = r"</descriptor>"
t_OtherLanguageStartTagOpen = r"<other-language"
t_OtherLanguageEndTag = r"</other-language>"
t_PrimaryInstituteStartTagOpen = r"<primary-institute"
t_PrimaryInstituteEndTag = r"</primary-institute>"
t_SeeReferenceStartTagOpen = r"<see-reference"
t_SeeReferenceEndTag = r"</see-reference>"
t_SiteStartTagOpen = r"<site"
t_SiteEndTag = r"</site>"
t_InformationCategoryStartTagOpen = r"<information-category"
t_InformationCategoryEndTag = r"</information-category>"
t_OrganizationStartTagOpen = r"<organization"
t_OrganizationEndTag = r"</organization>"
t_StandardDescriptionStartTagOpen = r"<standard-description"
t_StandardDescriptionEndTag = r"</standard-description>"
t_RelatedTopicStartTagOpen = r"<related-topic"
t_RelatedTopicEndTag = r"</related-topic>"

# DOCTYPE contents: the root element name and the PUBLIC external identifier.
# The name pattern carries no leading space: PLY compiles rules with re.VERBOSE by
# default (which silently drops literal spaces) and t_ignore already consumes the
# blank before the name, so a space in the pattern was at best a no-op and would
# make the rule unmatchable without the VERBOSE flag.
t_DocTypeDeclName = r'health-topics'
# Whitespace is spelled \s+ so it survives re.VERBOSE; the dots in the DTD URL are
# escaped so they match a literal "." only.
t_DocTypeDeclExternalId = r'PUBLIC\s+"-//NLM//DTD\s+health-topics\s+//EN"\s+"https://medlineplus\.gov/xml/mplus_topics\.dtd"'

# Attribute keys. Each pattern includes the trailing "=" so the quoted value
# that follows is lexed as a separate token.

# XML declaration attributes.
t_VersionKey = r"version="
t_EncodingKey = r"encoding="

# Element attributes.
t_DateGeneratedKey = r"date-generated="
t_TotalKey = r"total="
t_IdKey = r"id="
t_DateCreatedKey = r"date-created="
t_LanguageKey = r"language="
t_TitleKey = r"title="
t_UrlKey = r"url="
t_MetaDescKey = r"meta-desc="
t_VernacularNameKey = r"vernacular-name="
t_LanguageMappedUrlKey = r"language-mapped-url="

# Quoted attribute values and element text.
# PLY tries string rules longest-pattern-first, so the long t_Uri pattern is tested
# before t_Text and the short catch-all t_String.
t_Uri = r'"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"'
# Any double-quoted value, including the empty value "" (legal XML attribute content).
t_String = r'"[^"]*"'
# Character data between a ">" and the next "<" on the same line. "<" is excluded
# from the run so the match stops at the first tag; the previous greedy pattern could
# swallow whole tags (or start on one) when several elements shared a line.
t_Text = r'(?<=>)[^<\r\n]+(?=<)'

def t_Date(t):
    r'"\d{2}/\d{2}/\d{4}"'
    # The raw string above is the rule's regex (PLY takes it from the docstring),
    # so it must remain the first statement of the function.
    # Drop the double quotes, then parse the MM/DD/YYYY text into a date object.
    unquoted = t.value.replace('"', '')
    parsed = datetime.strptime(unquoted, "%m/%d/%Y")
    t.value = parsed.date()
    return t

def t_Timestamp(t):
    r'"\d{2}/\d{2}/\d{4}\s\d{2}:\d{2}:\d{2}"'
    # The raw string above is the rule's regex (PLY takes it from the docstring),
    # so it must remain the first statement of the function.
    # Drop the double quotes, then parse "MM/DD/YYYY HH:MM:SS" into a datetime.
    unquoted = t.value.replace('"', '')
    t.value = datetime.strptime(unquoted, "%m/%d/%Y %H:%M:%S")
    return t

def t_Integer(t):
    r'"\d+"'
    # The raw string above is the rule's regex (PLY takes it from the docstring),
    # so it must remain the first statement of the function.
    # Drop the double quotes and convert the remaining digits to an int.
    digits = t.value.replace('"', '')
    t.value = int(digits)
    return t

# Line bookkeeping: a run of newlines yields no token (nothing is returned);
# it only advances the lexer's line counter by the length of the run.
def t_newline(t):
    r'\n+'
    newline_run = t.value
    t.lexer.lineno += len(newline_run)

# Characters the lexer skips between tokens: the space and the tab.
t_ignore = " \t"

# Fallback invoked when no rule matches at the current position.
def t_error(t):
    """Print the first unmatched character, then skip it so lexing can go on."""
    print(f"Illegal character '{t.value[0]}'")
    # Move past exactly one character; the rest of the input is retried.
    t.lexer.skip(1)

# A notebook kernel's namespace has no __file__, and PLY inspects the defining
# module -- presumably that is why one is assigned here before building the lexer.
# NOTE(review): confirm this is still required with the installed PLY version.
__file__ = "lexer.ipynb"
lexer = lex.lex()

# The document's XML declaration states encoding="UTF-8", so decode it explicitly
# instead of relying on the platform's default text encoding.
with open('../data/mplus_topics.xml', 'r', encoding='utf-8') as file:
    data = file.read()

# Give the lexer some input
lexer.input(data)

# Tokenize. Only a short prefix is displayed as a sanity check; to walk the whole
# input, keep calling lexer.token() until it returns a falsy value.
PREVIEW_TOKEN_COUNT = 33
for _ in range(PREVIEW_TOKEN_COUNT):
    tok = lexer.token()
    if not tok:
        break  # No more input
    display(tok)

LexToken(XmlDeclStartTag,'<?xml',1,0)

LexToken(VersionKey,'version=',1,6)

LexToken(String,'"1.0"',1,14)

LexToken(EncodingKey,'encoding=',1,20)

LexToken(String,'"UTF-8"',1,29)

LexToken(XmlDeclEndTag,'?>',1,36)

LexToken(DocTypeDeclStartTag,'<!DOCTYPE',2,39)

LexToken(DocTypeDeclName,'health-topics',2,49)

LexToken(DocTypeDeclExternalId,'PUBLIC "-//NLM//DTD health-topics //EN" "https://medlineplus.gov/xml/mplus_topics.dtd"',2,63)

LexToken(StartTagClose,'>',2,149)

LexToken(HealthTopicsStartTagOpen,'<health-topics',3,151)

LexToken(TotalKey,'total=',3,166)

LexToken(Integer,2044,3,172)

LexToken(DateGeneratedKey,'date-generated=',3,179)

LexToken(Timestamp,datetime.datetime(2024, 8, 30, 2, 30, 25),3,194)

LexToken(StartTagClose,'>',3,215)

LexToken(HealthTopicStartTagOpen,'<health-topic',4,218)

LexToken(MetaDescKey,'meta-desc=',4,232)

LexToken(String,'"If you are being tested for Type 2 diabetes, your doctor gives you an A1C test. The test is also used to monitor your A1C levels."',4,242)

LexToken(TitleKey,'title=',4,374)

LexToken(String,'"A1C"',4,380)

LexToken(UrlKey,'url=',4,386)

LexToken(Uri,'"https://medlineplus.gov/a1c.html"',4,390)

LexToken(IdKey,'id=',4,425)

LexToken(Integer,6308,4,428)

LexToken(LanguageKey,'language=',4,435)

LexToken(String,'"English"',4,444)

LexToken(DateCreatedKey,'date-created=',4,454)

LexToken(Date,datetime.date(2015, 12, 22),4,467)

LexToken(StartTagClose,'>',4,479)

LexToken(AlsoCalledStartTagOpen,'<also-called',5,483)

LexToken(StartTagClose,'>',5,495)

LexToken(Text,'Glycohemoglobin',5,496)