In [1]:
from datetime import datetime
import ply.lex as lex

In [6]:
# Token names exported to PLY.
#
# Every XML element contributes a "<Element>StartTagOpen" / "<Element>EndTag"
# pair, so those pairs are generated from a single list of element names
# instead of being spelled out twice each.
_ELEMENT_NAMES = [
    'HealthTopics',
    'HealthTopic',
    'AlsoCalled',
    'FullSummary',
    'Group',
    'LanguageMappedTopic',
    'MeshHeading',
    'Descriptor',
    'OtherLanguage',
    'PrimaryInstitute',
    'SeeReference',
    'Site',
    'InformationCategory',
    'Organization',
    'StandardDescription',
    'RelatedTopic',
]

tokens = [
    # Tag delimiters, XML declaration and DOCTYPE opener.
    'StartTagClose',
    'XmlDeclStartTag',
    'XmlDeclEndTag',
    'DocTypeDeclStartTag',
    # Opening / closing tag of every element, in element order.
    *(
        f'{element}{suffix}'
        for element in _ELEMENT_NAMES
        for suffix in ('StartTagOpen', 'EndTag')
    ),
    # DOCTYPE declaration contents.
    'DocTypeDeclName',
    'DocTypeDeclExternalId',
    # Attribute keys (each pattern includes the trailing '=').
    'VersionKey',
    'EncodingKey',
    'DateGeneratedKey',
    'TotalKey',
    'IdKey',
    'DateCreatedKey',
    'LanguageKey',
    'TitleKey',
    'UrlKey',
    'MetaDescKey',
    'VernacularNameKey',
    'LanguageMappedUrlKey',
    # Attribute values.
    'Timestamp',
    'Integer',
    'Date',
    'Language',
    'String',
    'Uri',
]

# Regular expression rules for simple tokens.
#
# PLY compiles these patterns with re.VERBOSE by default, so unescaped
# whitespace inside a pattern is ignored; use \s (or an escaped space) when
# whitespace has to be matched.  String rules are tried after the function
# rules, longest pattern first, which is why '<health-topics' wins over
# '<health-topic'.

# Tag delimiters and declarations
t_StartTagClose = r'>'
t_XmlDeclStartTag = r'<\?xml'
t_XmlDeclEndTag = r'\?>'
t_DocTypeDeclStartTag = r'<!DOCTYPE'

# Element start-tag openers and complete end tags
t_HealthTopicsStartTagOpen = r'<health-topics'
t_HealthTopicsEndTag = r'</health-topics>'
t_HealthTopicStartTagOpen = r'<health-topic'
t_HealthTopicEndTag = r'</health-topic>'
t_AlsoCalledStartTagOpen = r'<also-called'
t_AlsoCalledEndTag = r'</also-called>'
t_FullSummaryStartTagOpen = r'<full-summary'
t_FullSummaryEndTag = r'</full-summary>'
t_GroupStartTagOpen = r'<group'
t_GroupEndTag = r'</group>'
t_LanguageMappedTopicStartTagOpen = r'<language-mapped-topic'
t_LanguageMappedTopicEndTag = r'</language-mapped-topic>'
t_MeshHeadingStartTagOpen = r'<mesh-heading'
t_MeshHeadingEndTag = r'</mesh-heading>'
t_DescriptorStartTagOpen = r'<descriptor'
t_DescriptorEndTag = r'</descriptor>'
t_OtherLanguageStartTagOpen = r'<other-language'
t_OtherLanguageEndTag = r'</other-language>'
t_PrimaryInstituteStartTagOpen = r'<primary-institute'
t_PrimaryInstituteEndTag = r'</primary-institute>'
t_SeeReferenceStartTagOpen = r'<see-reference'
t_SeeReferenceEndTag = r'</see-reference>'
t_SiteStartTagOpen = r'<site'
t_SiteEndTag = r'</site>'
t_InformationCategoryStartTagOpen = r'<information-category'
t_InformationCategoryEndTag = r'</information-category>'
t_OrganizationStartTagOpen = r'<organization'
t_OrganizationEndTag = r'</organization>'
t_StandardDescriptionStartTagOpen = r'<standard-description'
t_StandardDescriptionEndTag = r'</standard-description>'
t_RelatedTopicStartTagOpen = r'<related-topic'
t_RelatedTopicEndTag = r'</related-topic>'

# DOCTYPE declaration contents.
# No leading space in the name pattern: under re.VERBOSE it would be ignored,
# and without re.VERBOSE it could never match because t_ignore consumes the
# space before any rule is tried.
t_DocTypeDeclName = r'health-topics'
# Dots are escaped so that only the literal DTD URL is accepted.
t_DocTypeDeclExternalId = r'PUBLIC\s+"-//NLM//DTD\s+health-topics\s+//EN"\s+"https://medlineplus\.gov/xml/mplus_topics\.dtd"'

# Attribute keys (the '=' is part of the token)
t_VersionKey = r'version='
t_EncodingKey = r'encoding='
t_DateGeneratedKey = r'date-generated='
t_TotalKey = r'total='
t_IdKey = r'id='
t_DateCreatedKey = r'date-created='
t_LanguageKey = r'language='
t_TitleKey = r'title='
t_UrlKey = r'url='
t_MetaDescKey = r'meta-desc='
t_VernacularNameKey = r'vernacular-name='
t_LanguageMappedUrlKey = r'language-mapped-url='

# Generic attribute values.
# t_Uri only matches an unquoted http(s) URL; a quoted URL starts with '"'
# and is therefore lexed as String.
t_Uri = r'\b(?:https?):\/\/[^\s/$.?#].[^\s"<>]*'
t_String = r'"[^"]+"'

# Quoted calendar date such as "01/07/2003" (month/day/year).
# The raw string that opens the function is the rule's regular expression:
# PLY reads it from the docstring, so it must remain the first statement.
# On a match the token value is replaced by a datetime.date.
def t_Date(t):
    r'"\d{2}/\d{2}/\d{4}"'
    unquoted = t.value.replace('"', '')
    parsed = datetime.strptime(unquoted, "%m/%d/%Y")
    t.value = parsed.date()
    return t

# Quoted date and time such as "04/24/2012 04:35:44".
# The raw string that opens the function is the rule's regular expression:
# PLY reads it from the docstring, so it must remain the first statement.
# On a match the token value is replaced by a datetime.datetime.
def t_Timestamp(t):
    r'"\d{2}/\d{2}/\d{4}\s\d{2}:\d{2}:\d{2}"'
    unquoted = t.value.replace('"', '')
    t.value = datetime.strptime(unquoted, "%m/%d/%Y %H:%M:%S")
    return t

# Quoted run of digits such as "1833".
# The raw string that opens the function is the rule's regular expression:
# PLY reads it from the docstring, so it must remain the first statement.
# On a match the token value is replaced by the corresponding int.
def t_Integer(t):
    r'"\d+"'
    digits = t.value.replace('"', '')
    t.value = int(digits)
    return t

# The language value can only be "Spanish" or "English".
def t_Language(t):
    r'"(Spanish|English)"'
    # The raw string above is the rule's regular expression (PLY reads it
    # from the docstring).  The token value is the language name with the
    # surrounding double quotes removed.
    language_name = t.value.replace('"', '')
    t.value = language_name
    return t

# Line tracking: consume a run of newlines and advance the lexer's line
# counter by the number of characters matched.  Nothing is returned, so no
# token is produced for the newlines.
def t_newline(t):
    r'\n+'
    newline_count = len(t.value)
    t.lexer.lineno += newline_count

# Characters skipped between tokens: the space and the tab.
t_ignore = " \t"

# Error rule: called when no token rule matches at the current position.
# Reports the first offending character, then skips exactly one character
# so that lexing can continue.
def t_error(t):
    message = f"Illegal character '{t.value[0]}'"
    print(message)
    t.lexer.skip(1)

# NOTE(review): presumably a workaround for running inside a notebook, where
# `__file__` is not defined but PLY may look it up -- confirm against the PLY
# version in use before removing.
__file__ = "lexer.ipynb"
# Build the lexer from the `tokens` list and the `t_*` rules defined above.
lexer = lex.lex()

# Sample health-topics XML document used as lexer input.
# The literal starts with a newline right after the opening quotes, so the
# XML declaration sits on line 2 of the input.
data = """
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE health-topics PUBLIC "-//NLM//DTD health-topics //EN" "https://medlineplus.gov/xml/mplus_topics.dtd">
<health-topics total="1833" date-generated="04/24/2012 04:35:44">
<health-topic meta-desc="Stomach aches can be painful. Find out what might be the cause of your abdominal pain. " title="Abdominal Pain" url="https://www.nlm.nih.gov/medlineplus/abdominalpain.html" id="3061" language="English" date-created="01/07/2003">
<also-called>Bellyache</also-called>
<full-summary><p>Your abdomen extends from below your chest to your groin. Some people call it the stomach, but your abdomen contains many other important organs. Pain in the abdomen can come from any one of them. The pain may start somewhere else, such as your chest. Severe pain doesn't always mean a serious problem. Nor does mild pain mean a problem is not serious. </p>

<p>Call your healthcare provider if mild pain lasts a week or more or if you have pain with other symptoms. Get medical help immediately if</p>
<ul>
<li>      You have abdominal pain that is sudden and sharp</li>
<li>You also have pain in your chest, neck or shoulder </li>
<li>      You're vomiting blood or have blood in your stool </li>
<li>      Your abdomen is stiff, hard and tender to touch </li>
<li>You can't move your bowels, especially if you're also vomiting </li>
</ul>
</full-summary>
<group url="https://www.nlm.nih.gov/medlineplus/digestivesystem.html" id="2">Digestive System</group>
<group url="https://www.nlm.nih.gov/medlineplus/symptoms.html" id="31">Symptoms</group>
<language-mapped-topic url="https://www.nlm.nih.gov/medlineplus/spanish/abdominalpain.html" id="3062" language="Spanish">Dolor abdominal</language-mapped-topic>
<mesh-heading>
<descriptor id="D015746">Abdominal Pain</descriptor>
</mesh-heading>
<other-language vernacular-name="español" url="https://www.nlm.nih.gov/medlineplus/spanish/abdominalpain.html">Spanish</other-language>
<related-topic url="https://www.nlm.nih.gov/medlineplus/pain.html" id="351">Pain</related-topic>
<related-topic url="https://www.nlm.nih.gov/medlineplus/pelvicpain.html" id="4486">Pelvic Pain</related-topic>
<see-reference>Bellyache</see-reference>
<see-reference>Pain, Abdominal</see-reference>
<see-reference>Stomach Ache</see-reference>
<site title="Abdominal CT scan" url="https://www.nlm.nih.gov/medlineplus/ency/article/003789.htm" language-mapped-url="https://www.nlm.nih.gov/medlineplus/spanish/ency/article/003789.htm">
<information-category>Encyclopedia</information-category>
<information-category>Patient Handouts</information-category>
</site>
<site title="Abdominal mass" url="https://www.nlm.nih.gov/medlineplus/ency/article/003274.htm" language-mapped-url="https://www.nlm.nih.gov/medlineplus/spanish/ency/article/003274.htm">
<information-category>Encyclopedia</information-category>
</site>
<site title="Abdominal Migraine" url="http://www.headaches.org/education/Headache_Topic_Sheets/Abdominal_Migraine">
<information-category>Specific Conditions</information-category>
<organization>National Headache Foundation</organization>
</site>
<site title="Your Colicky Baby" url="http://kidshealth.org/parent/growth/growing/colic.html" language-mapped-url="http://kidshealth.org/parent/en_espanol/padres/colic_esp.html">
<information-category>Children</information-category>
<organization>Nemours Foundation</organization>
</site>
<site title="Hypnosis May Help Kids' Stomach Woes Long-Term" url="https://www.nlm.nih.gov/medlineplus/news/fullstory_122279.html">
<information-category>Latest News</information-category>
<organization>Reuters Health</organization>
</site>
</health-topic>
</health-topics>"""

# Feed the sample document to the lexer.
lexer.input(data)

# Preview only the first 30 tokens, stopping early if the input runs out.
# To dump every token instead, keep calling lexer.token() until it
# returns None.
for _ in range(30):
    tok = lexer.token()
    if tok is None:
        break
    display(tok)

LexToken(XmlDeclStartTag,'<?xml',2,1)

LexToken(VersionKey,'version=',2,7)

LexToken(String,'"1.0"',2,15)

LexToken(EncodingKey,'encoding=',2,21)

LexToken(String,'"UTF-8"',2,30)

LexToken(XmlDeclEndTag,'?>',2,37)

LexToken(DocTypeDeclStartTag,'<!DOCTYPE',3,40)

LexToken(DocTypeDeclName,'health-topics',3,50)

LexToken(DocTypeDeclExternalId,'PUBLIC "-//NLM//DTD health-topics //EN" "https://medlineplus.gov/xml/mplus_topics.dtd"',3,64)

LexToken(StartTagClose,'>',3,150)

LexToken(HealthTopicsStartTagOpen,'<health-topics',4,152)

LexToken(TotalKey,'total=',4,167)

LexToken(Integer,1833,4,173)

LexToken(DateGeneratedKey,'date-generated=',4,180)

LexToken(Timestamp,datetime.datetime(2012, 4, 24, 4, 35, 44),4,195)

LexToken(StartTagClose,'>',4,216)

LexToken(HealthTopicStartTagOpen,'<health-topic',5,218)

LexToken(MetaDescKey,'meta-desc=',5,232)

LexToken(String,'"Stomach aches can be painful. Find out what might be the cause of your abdominal pain. "',5,242)

LexToken(TitleKey,'title=',5,332)

LexToken(String,'"Abdominal Pain"',5,338)

LexToken(UrlKey,'url=',5,355)

LexToken(String,'"https://www.nlm.nih.gov/medlineplus/abdominalpain.html"',5,359)

LexToken(IdKey,'id=',5,416)

LexToken(Integer,3061,5,419)

LexToken(LanguageKey,'language=',5,426)

LexToken(Language,'English',5,435)

LexToken(DateCreatedKey,'date-created=',5,445)

LexToken(Date,datetime.date(2003, 1, 7),5,458)

LexToken(StartTagClose,'>',5,470)