# Get our files

In [2]:
# Google Drive share links for the three input files; the file IDs in these
# URLs are the same IDs passed to gdown below.
# Domain.dtd: https://drive.google.com/file/d/1hPvMD5AEv77L-0seoeNGRwDyMa2OduFu/view?usp=sharing

# Broken.xml: https://drive.google.com/file/d/1Az9hwTOq00nFtxJBpTWgyuKTHLW5pDUO/view?usp=sharing

# Sample.xml: https://drive.google.com/file/d/1VRVnjt_gOcc6PSLICHRHCFjzwlalhRBB/view?usp=sharing

# XML (intended to be valid against the DTD); saved as sample.xml
!gdown 1VRVnjt_gOcc6PSLICHRHCFjzwlalhRBB -O sample.xml

# DTD; saved as domain.dtd
# NOTE(review): the domain.dtd rendered later in this notebook declares
# <orders>/<order>/<restaurant>/..., while sample.xml and broken.xml both use
# <coursework> as their root element, and every validation run reports
# "No declaration for element ...". Confirm that this Drive ID points at the
# intended coursework DTD.
!gdown 1hPvMD5AEv77L-0seoeNGRwDyMa2OduFu -O domain.dtd

# XML (broken on purpose: its own comment says <submission> is omitted); saved as broken.xml
!gdown 1Az9hwTOq00nFtxJBpTWgyuKTHLW5pDUO -O broken.xml

Downloading...
From (original): https://drive.google.com/uc?id=1VRVnjt_gOcc6PSLICHRHCFjzwlalhRBB
From (redirected): https://drive.google.com/uc?id=1VRVnjt_gOcc6PSLICHRHCFjzwlalhRBB&confirm=t&uuid=a0e813e0-87b6-458f-a658-0667c44a5ca7
To: /content/sample.xml
100% 700/700 [00:00<00:00, 2.86MB/s]
Downloading...
From: https://drive.google.com/uc?id=1hPvMD5AEv77L-0seoeNGRwDyMa2OduFu
To: /content/domain.dtd
100% 570/570 [00:00<00:00, 2.10MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1Az9hwTOq00nFtxJBpTWgyuKTHLW5pDUO
From (redirected): https://drive.google.com/uc?id=1Az9hwTOq00nFtxJBpTWgyuKTHLW5pDUO&confirm=t&uuid=b57ebb89-3ad7-4111-bc0e-23947e3a26d4
To: /content/broken.xml
100% 527/527 [00:00<00:00, 2.53MB/s]


# Install helpers

In [3]:
!pip -q install lxml pygments

In [4]:
from lxml import etree

# Build the DTD validator from the downloaded grammar file (opened in binary mode).
with open("domain.dtd", "rb") as f:
    dtd = etree.DTD(f)

# Parse the document that is going to be checked.
tree = etree.parse("sample.xml")

# Report the verdict; on failure also print the error-level entries
# collected in the validator's log.
if not dtd.validate(tree):
    print("XML is NOT valid ❌")
    print(dtd.error_log.filter_from_errors())
else:
    print("XML is valid ✅")

XML is NOT valid ❌
sample.xml:3:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element coursework
sample.xml:4:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element assignment
sample.xml:4:0:ERROR:VALID:DTD_UNKNOWN_ATTRIBUTE: No declaration for attribute id of element assignment
sample.xml:5:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element title
sample.xml:6:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element type
sample.xml:7:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element dueDate
sample.xml:8:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element weight
sample.xml:9:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element submission
sample.xml:10:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element format
sample.xml:11:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element policy
sample.xml:14:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element assignment
sample.xml:14:0:ERROR:VALID:DTD_UNKNOWN_ATTRIBUTE: No declaration for 

In [5]:
from lxml import etree

# Rebuild the DTD validator and run it against the deliberately broken file.
with open("domain.dtd", "rb") as f:
    dtd = etree.DTD(f)

tree = etree.parse("broken.xml")

# Failure branch first: print the verdict followed by the error-level log entries.
if not dtd.validate(tree):
    print("XML is NOT valid ❌")
    print(dtd.error_log.filter_from_errors())
else:
    print("XML is valid ✅")

XML is NOT valid ❌
broken.xml:3:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element coursework
broken.xml:4:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element assignment
broken.xml:4:0:ERROR:VALID:DTD_UNKNOWN_ATTRIBUTE: No declaration for attribute id of element assignment
broken.xml:5:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element title
broken.xml:6:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element type
broken.xml:7:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element dueDate
broken.xml:8:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element weight
broken.xml:10:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element format
broken.xml:11:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element policy


## Let's look at our files

In [6]:
from lxml import etree
from IPython.display import HTML, display
from pygments import highlight
from pygments.lexers import XmlLexer, DtdLexer
from pygments.formatters import HtmlFormatter

def pretty_xml_text(path: str) -> str:
    """Parse the XML file at ``path`` and return it re-serialised with indentation.

    Args:
        path: Location of the XML document; handed straight to ``etree.parse``.

    Returns:
        The pretty-printed document as a ``str`` (serialised with
        ``encoding="unicode"``).
    """
    # The parser is configured with remove_blank_text=True so that existing
    # whitespace-only text does not interfere with pretty_print's indentation.
    document = etree.parse(path, etree.XMLParser(remove_blank_text=True))
    pretty = etree.tostring(document, pretty_print=True, encoding="unicode")
    return pretty

def show_code(text: str, lexer, title: str = None, max_height: str = "480px"):
    """Render syntax-highlighted code with line numbers in a scrollable box.

    Args:
        text: Source text to highlight.
        lexer: Pygments lexer instance passed to ``highlight`` for ``text``.
        title: Optional caption placed above the box; skipped when falsy.
            The caption is HTML-escaped, so characters such as ``<``, ``>``
            and ``&`` are shown literally instead of being parsed as markup.
        max_height: CSS length inserted as the box's ``max-height``; the box
            scrolls (``overflow:auto``) once the content is taller.

    Returns:
        None. The HTML is shown through ``IPython.display.display``.
    """
    # Imported locally so the function does not depend on another cell's imports.
    from html import escape

    formatter = HtmlFormatter(linenos="table", style="friendly")
    css = formatter.get_style_defs('.highlight')
    # Pygments' stylesheet plus the two helper classes used below, emitted inline.
    parts = [f"<style>{css}.codebox{{border:1px solid #e5e7eb;border-radius:10px;overflow:auto;max-height:{max_height}}}.title{{font-weight:600;margin:4px 0 8px}}</style>"]
    if title:
        # Escape the caption: it is plain text, not markup.
        parts.append(f'<div class="title">{escape(title)}</div>')
    parts.append(f'<div class="codebox">{highlight(text, lexer, formatter)}</div>')
    display(HTML("".join(parts)))

# Show the pretty XML and the DTD (two consecutive calls)
show_code(pretty_xml_text("sample.xml"), XmlLexer(), "sample.xml (pretty-printed)")
# Read the DTD inside a context manager so the file handle is closed right away
# instead of being left open until garbage collection.
with open("domain.dtd", encoding="utf-8") as dtd_file:
    dtd_source = dtd_file.read()
show_code(dtd_source, DtdLexer(), "domain.dtd")


0,1
1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21,"<!DOCTYPE coursework SYSTEM ""domain.dtd""> <coursework>  <assignment id=""A1"">  <title>Research Paper</title>  <type>Essay</type>  <dueDate>2025-09-15</dueDate>  <weight>30</weight>  <submission>Canvas</submission>  <format>PDF</format>  <policy>Late penalty: -5% per day</policy>  </assignment>  <assignment id=""A2"">  <title>Midterm Exam</title>  <type>Exam</type>  <dueDate>2025-10-10</dueDate>  <weight>25</weight>  <submission>In-person</submission>  <format>Paper</format>  <policy>No late submission allowed</policy>  </assignment> </coursework>"


0,1
1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17,"<!ELEMENT orders (order+)> <!ELEMENT order (restaurant, items, totalPrice, payment, delivery)> <!ATTLIST order id ID #REQUIRED> <!ELEMENT restaurant (#PCDATA)> <!ELEMENT items (item+)> <!ELEMENT item (name, quantity)> <!ELEMENT name (#PCDATA)> <!ELEMENT quantity (#PCDATA)> <!ELEMENT totalPrice (#PCDATA)> <!ELEMENT payment (#PCDATA)> <!-- e.g. Paid, Unpaid --> <!ELEMENT delivery (method, address, eta)> <!ELEMENT method (#PCDATA)> <!-- Pickup or Delivery --> <!ELEMENT address (#PCDATA)> <!ELEMENT eta (#PCDATA)> <!-- Estimated time in minutes -->"


In [7]:
# Render the broken document with the same highlighted, scrollable viewer.
show_code(
    text=pretty_xml_text("broken.xml"),
    lexer=XmlLexer(),
    title="broken.xml (pretty-printed)",
)

0,1
1  2  3  4  5  6  7  8  9 10 11 12 13,"<!DOCTYPE coursework SYSTEM ""domain.dtd""> <coursework>  <assignment id=""A1"">  <title>Final Project</title>  <type>Project</type>  <dueDate>2025-12-05</dueDate>  <weight>40</weight>  <!-- ERROR: Missing <submission> element, required by DTD -->  <format>DOCX</format>  <policy>Late penalty: -10% per day</policy>  </assignment> </coursework> <!-- ❌ This will fail because <submission> is missing, which is required by the DTD. -->"


# Display variant: collapsible sections

In [8]:
def show_collapsible(title: str, text: str, lexer, open_default=False):
    """Render syntax-highlighted code inside a collapsible ``<details>`` section.

    Args:
        title: Caption used as the ``<summary>`` label. It is HTML-escaped, so
            characters such as ``<``, ``>`` and ``&`` are shown literally.
        text: Source text to highlight.
        lexer: Pygments lexer instance passed to ``highlight`` for ``text``.
        open_default: When truthy, the ``open`` attribute is added so the
            section starts expanded.

    Returns:
        None. The HTML is shown through ``IPython.display.display``.
    """
    # Imported locally so the function does not depend on another cell's imports.
    from html import escape

    fmt = HtmlFormatter(linenos="table", style="friendly")
    css = fmt.get_style_defs('.highlight')
    details_attr = "open" if open_default else ""
    # Escape the caption: it is plain text, not markup.
    safe_title = escape(title)
    markup = f"""
    <style>{css}</style>
    <details {details_attr} style="margin:6px 0">
      <summary style="cursor:pointer;font-weight:600">{safe_title}</summary>
      <div style="border:1px solid #e5e7eb;border-radius:10px;overflow:auto;max-height:520px;margin-top:8px">
        {highlight(text, lexer, fmt)}
      </div>
    </details>
    """
    display(HTML(markup))

# Expanded by default: the sample document.
show_collapsible("sample.xml (pretty-printed)", pretty_xml_text("sample.xml"), XmlLexer(), open_default=True)
# Read the DTD inside a context manager so the file handle is closed right away
# instead of being left open until garbage collection.
with open("domain.dtd", encoding="utf-8") as dtd_file:
    dtd_source = dtd_file.read()
show_collapsible("domain.dtd", dtd_source, DtdLexer())

0,1
1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21,"<!DOCTYPE coursework SYSTEM ""domain.dtd""> <coursework>  <assignment id=""A1"">  <title>Research Paper</title>  <type>Essay</type>  <dueDate>2025-09-15</dueDate>  <weight>30</weight>  <submission>Canvas</submission>  <format>PDF</format>  <policy>Late penalty: -5% per day</policy>  </assignment>  <assignment id=""A2"">  <title>Midterm Exam</title>  <type>Exam</type>  <dueDate>2025-10-10</dueDate>  <weight>25</weight>  <submission>In-person</submission>  <format>Paper</format>  <policy>No late submission allowed</policy>  </assignment> </coursework>"


0,1
1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17,"<!ELEMENT orders (order+)> <!ELEMENT order (restaurant, items, totalPrice, payment, delivery)> <!ATTLIST order id ID #REQUIRED> <!ELEMENT restaurant (#PCDATA)> <!ELEMENT items (item+)> <!ELEMENT item (name, quantity)> <!ELEMENT name (#PCDATA)> <!ELEMENT quantity (#PCDATA)> <!ELEMENT totalPrice (#PCDATA)> <!ELEMENT payment (#PCDATA)> <!-- e.g. Paid, Unpaid --> <!ELEMENT delivery (method, address, eta)> <!ELEMENT method (#PCDATA)> <!-- Pickup or Delivery --> <!ELEMENT address (#PCDATA)> <!ELEMENT eta (#PCDATA)> <!-- Estimated time in minutes -->"


In [9]:
from lxml import etree

# Build the validator once; the same object checks both documents.
with open("domain.dtd","rb") as f:
    dtd = etree.DTD(f)

tree_ok = etree.parse("sample.xml")
tree_bad = etree.parse("broken.xml")

# Validate each document exactly once and keep the verdicts, instead of
# re-running the validation for every place the result is needed.
ok_is_valid = dtd.validate(tree_ok)
bad_is_valid = dtd.validate(tree_bad)

print("sample.xml →", "VALID ✅" if ok_is_valid else "NOT valid ❌")
print("broken.xml →", "VALID ✅" if bad_is_valid else "NOT valid ❌")
if not bad_is_valid:
    # Show the last few errors for teaching. The log is read right after the
    # broken.xml validation, which is the most recent validate() call.
    for e in list(dtd.error_log)[-5:]:
        print("•", e)


sample.xml → NOT valid ❌
broken.xml → NOT valid ❌
• broken.xml:6:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element type
• broken.xml:7:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element dueDate
• broken.xml:8:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element weight
• broken.xml:10:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element format
• broken.xml:11:0:ERROR:VALID:DTD_UNKNOWN_ELEM: No declaration for element policy
