From 4a2a7dcadabef8b6a92891f794c7e688e762b9a9 Mon Sep 17 00:00:00 2001 From: Eric Norige <127622562+eanorige@users.noreply.github.com> Date: Fri, 20 Oct 2023 09:10:35 -0700 Subject: [PATCH 1/5] Speedup line_offset property * Replace dynamic regex with string find operation * Add cache of where each line starts so we don't have quadratic behavior identifying line numbers when importing large chunks of html --- markdown/htmlparser.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index bf70b73d..0ad2631f 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -83,6 +83,9 @@ def __init__(self, md, *args, **kwargs): # Block tags that should contain no content (self closing) self.empty_tags = set(['hr']) + self.lineno_start_cache = [0] + + # This calls self.reset super().__init__(*args, **kwargs) self.md = md @@ -94,6 +97,8 @@ def reset(self): self.stack = [] # When `inraw==True`, stack contains a list of tags self._cache = [] self.cleandoc = [] + self.lineno_start_cache = [0] + super().reset() def close(self): @@ -114,6 +119,15 @@ def close(self): @property def line_offset(self) -> int: """Returns char index in `self.rawdata` for the start of the current line. """ + for ii in range(len(self.lineno_start_cache)-1, self.lineno-1): + last_line_start_pos = self.lineno_start_cache[ii] + lf_pos = self.rawdata.find('\n', last_line_start_pos) + if lf_pos == -1: + # No more newlines found. Use end of rawdata. + lf_pos = len(self.rawdata) + self.lineno_start_cache.append(lf_pos+1) + + return self.lineno_start_cache[self.lineno-1] if self.lineno > 1 and '\n' in self.rawdata: m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata) if m: From 70b30e75d576918365e7b8ead7f63643a9ac358f Mon Sep 17 00:00:00 2001 From: Eric Norige <127622562+eanorige@users.noreply.github.com> Date: Fri, 20 Oct 2023 12:00:50 -0700 Subject: [PATCH 2/5] Update htmlparser.py Forgot to remove old implementation --- markdown/htmlparser.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 0ad2631f..8425c88c 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -128,15 +128,6 @@ def line_offset(self) -> int: self.lineno_start_cache.append(lf_pos+1) return self.lineno_start_cache[self.lineno-1] - if self.lineno > 1 and '\n' in self.rawdata: - m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata) - if m: - return m.end() - else: # pragma: no cover - # Value of `self.lineno` must exceed total number of lines. - # Find index of beginning of last line. - return self.rawdata.rfind('\n') - return 0 def at_line_start(self) -> bool: """ From e300ce44fcb1cb0bb30d6f5da1859dc3573ea862 Mon Sep 17 00:00:00 2001 From: Eric Norige Date: Sun, 22 Oct 2023 22:04:59 -0700 Subject: [PATCH 3/5] Add changelog entry --- docs/changelog.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/changelog.md b/docs/changelog.md index 2f9e9250..614177c6 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -8,6 +8,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). See the [Contributing Guide](contributing.md) for details. +## [unreleased] + +### Fixed + +* Fix a performance problem with HTML extraction where large HTML input could trigger quadratic line counting behavior (PR#1392). + ## [3.5] -- 2023-10-06 ### Added From f32969affc2927ac4a5c8030a0224ae690dd3d29 Mon Sep 17 00:00:00 2001 From: Eric Norige Date: Sun, 22 Oct 2023 22:08:57 -0700 Subject: [PATCH 4/5] Remove extra blank line causing CI failure. --- markdown/htmlparser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 8425c88c..1a43efce 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -85,7 +85,6 @@ def __init__(self, md, *args, **kwargs): self.lineno_start_cache = [0] - # This calls self.reset super().__init__(*args, **kwargs) self.md = md From 7d403f2a681163342caa81b3a155502497ffa1ad Mon Sep 17 00:00:00 2001 From: Eric Norige Date: Sun, 22 Oct 2023 22:29:59 -0700 Subject: [PATCH 5/5] Improve comment to only use words --- markdown/htmlparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 1a43efce..4dbb1587 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -122,7 +122,7 @@ def line_offset(self) -> int: last_line_start_pos = self.lineno_start_cache[ii] lf_pos = self.rawdata.find('\n', last_line_start_pos) if lf_pos == -1: - # No more newlines found. Use end of rawdata. + # No more newlines found. Use end of raw data as start of line beyond end. lf_pos = len(self.rawdata) self.lineno_start_cache.append(lf_pos+1)