Skip to content

Commit

Permalink
Take a more structured approach to HTML parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
i-ky committed Dec 29, 2022
1 parent dcd71c6 commit 5d7570d
Showing 1 changed file with 119 additions and 28 deletions.
147 changes: 119 additions & 28 deletions road-sign-extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,45 +6,136 @@

# Site root, and the address of the page that is downloaded with urlopen().
url = "https://likumi.lv"
csn = f"{url}/ta/id/274865"

# NOTE(review): this name is rebound by `def signs(...)` further down the
# file, so the list looks like a leftover -- confirm before removing it.
signs = []

# Stack of currently open elements, each a (tag, attributes, children) tuple.
# It starts with a placeholder root (tag None) that collects the top-level
# nodes; Parser pushes onto it and pops from it while parsing.
doc = [(None, (), [])]

class Parser(HTMLParser):
    """Build a tree of ``(tag, attrs, children)`` tuples on the ``doc`` stack.

    ``doc`` is the module-level stack of currently open elements.  Opening a
    tag pushes a new element; closing it pops the element and appends it to
    the children of the element below.  Text is appended to the children of
    the innermost open element as a plain ``str``.

    The handlers assert that the markup is well nested, so malformed input
    fails with an ``AssertionError`` instead of yielding a skewed tree.

    The pop helper is deliberately *not* called ``close``: that name belongs
    to ``HTMLParser.close()``, which must stay callable to finish parsing.
    """

    # https://www.w3.org/html/wg/spec/syntax.html#void-elements
    void = ("area", "base", "br", "col", "command", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr")

    def _close_element(self):
        """Pop the innermost open element and attach it to its parent."""
        top = doc.pop()
        doc[-1][-1].append(top)

    def handle_starttag(self, tag, attrs):
        """Open ``tag``; void elements are closed again immediately."""
        assert doc[-1][0] not in self.void, f"void element <{doc[-1][0]}> cannot contain <{tag}>"
        doc.append((tag, attrs, []))

        # Void elements never get an end tag, so they cannot stay open.
        if tag in self.void:
            self._close_element()

    def handle_endtag(self, tag):
        """Close ``tag``, which must be the innermost open element."""
        # End tags of void elements (e.g. from "<br/>") are ignored because
        # handle_starttag has already closed the element.
        if tag not in self.void:
            assert doc[-1][0] == tag, f"<{doc[-1][0]}> element cannot be closed by '{tag}' end tag"
            self._close_element()

    def handle_data(self, data):
        """Append a run of text to the innermost open element."""
        assert doc[-1][0] not in self.void, f"void element <{doc[-1][0]}> cannot contain {data!r}"
        doc[-1][-1].append(data)

# Download the page.  The response is used as a context manager so that the
# connection is closed even if reading or decoding fails.
with urlopen(csn) as response:
    # Honour the charset declared by the server; fall back to UTF-8, which
    # is what a bare bytes.decode() would use.
    charset = response.headers.get_content_charset() or "utf-8"
    page_source = response.read().decode(charset)

Parser().feed(page_source)

# Once every element has been closed only the placeholder root is left on the
# stack; the single-element unpacking fails loudly if anything is still open.
(root,) = doc

def body(tag, attributes, children):
    """Return the children of the first ``<div class="doc-body">`` element.

    The element described by the arguments is checked first, then its child
    elements are searched depth-first in document order.  Text children are
    skipped.  Returns ``None`` when no such element exists in the subtree.
    """
    is_doc_body = tag == "div" and dict(attributes).get("class") == "doc-body"
    if is_doc_body:
        return children

    for node in children:
        if isinstance(node, str):
            continue

        found = body(*node)
        if found is not None:
            return found

    return None

def extract(elements):
    """Yield the children of each ``<div>`` that directly follows a heading.

    ``elements`` must consist of ``<div>`` elements and whitespace-only
    strings; anything else trips an assertion.  A div counts as a heading
    when exactly one of its children is a string and that string is one of
    the two section titles.  Whitespace strings between divs are ignored.
    """
    headings = ("Ceļa zīmes", "Ceļa apzīmējumi")
    after_heading = False

    for node in elements:
        if isinstance(node, str):
            assert node.strip() == ""
            continue

        name, _, content = node
        assert name == "div"

        if after_heading:
            yield content

        # The current div decides whether the *next* div is a section body.
        texts = [item for item in content if isinstance(item, str)]
        after_heading = len(texts) == 1 and texts[0] in headings

# Locate the "doc-body" div and split it into the two sections of interest.
# extract() yields the children of the div following each heading, so this
# unpacking requires exactly two such headings to be present in the page.
sign_section, marking_section = extract(body(*root))

def _parse_number_cell(tag, attributes, children):
    """Return ``(text, rowspan)`` for the first cell of a three-cell row."""
    assert tag == "td"
    attributes = dict(attributes)
    assert "colspan" not in attributes
    (text,) = children
    assert isinstance(text, str), text
    return text, int(attributes.get("rowspan", 1))


def _parse_image_cell(tag, attributes, children):
    """Return the ``<img>`` attributes of an image cell, or ``None``.

    The cell must hold exactly one child: an ``<img>``, a ``<p>`` wrapping a
    single ``<img>``, or a plain string (in which case there is no image).
    """
    assert tag == "td"
    attributes = dict(attributes)
    assert "rowspan" not in attributes
    assert "colspan" not in attributes
    (content,) = children

    if isinstance(content, str):
        return None

    tag, attrs, nested = content

    if tag != "img":
        assert tag == "p"
        (content,) = nested
        assert not isinstance(content, str), content
        tag, attrs, nested = content

    assert tag == "img"
    assert len(nested) == 0
    return attrs


def table(elements):
    """Collect ``(number, images)`` pairs from the children of a ``<table>``.

    ``elements`` must contain a single ``<tbody>``.  Each of its rows is
    either

    * a three-cell row (number, image, third cell ignored) that starts a new
      entry, or
    * a one-cell continuation row that adds another image to the current
      entry; the number cell's ``rowspan`` says how many of these follow.

    Returns a list of ``(number_text, [img_attrs, ...])`` tuples in document
    order.

    Raises:
        ValueError: a row has neither one nor three cells.
        AssertionError: the markup deviates from the expected structure.
    """
    (tbody,) = elements
    tag, _, rows = tbody
    assert tag == "tbody"

    # Continuation rows still expected for the current entry.
    pending = 0
    result = []

    for tr in rows:
        tag, _, cells = tr
        assert tag == "tr"

        if len(cells) == 3:
            assert pending == 0, "new entry started before the previous rowspan was used up"
            number, image, _ = cells
            text, rowspan = _parse_number_cell(*number)
            result.append((text, []))
            pending = rowspan - 1
        elif len(cells) == 1:
            assert pending > 0, "continuation row without a preceding rowspan"
            pending -= 1
            (image,) = cells
        else:
            raise ValueError(f"expected a row with 1 or 3 cells, got {len(cells)}")

        attrs = _parse_image_cell(*image)

        # An <img> without attributes yields an empty (falsy) list, hence
        # the explicit comparison with None.
        if attrs is not None:
            result[-1][-1].append(attrs)

    return result

def signs(elements):
    """Yield every row produced by ``table()`` for the tables in a section.

    ``elements`` may contain only ``<p>`` and ``<table>`` elements: the
    paragraphs are skipped, each table's children are handed to ``table()``.
    Strings and any other element trip an assertion.
    """
    for node in elements:
        assert not isinstance(node, str)
        name, _, content = node

        if name != "p":
            assert name == "table"
            yield from table(content)

for number, images in signs:
for number, images in signs(sign_section):
if len(images) > 0:
with open(f"{number}.html", "w") as html:
for image in images:
Expand Down

0 comments on commit 5d7570d

Please sign in to comment.