From 4a2a7dcadabef8b6a92891f794c7e688e762b9a9 Mon Sep 17 00:00:00 2001
From: Eric Norige <127622562+eanorige@users.noreply.github.com>
Date: Fri, 20 Oct 2023 09:10:35 -0700
Subject: [PATCH 1/5] Speed up line_offset property

* Replace the dynamically built regex with a plain string find operation.
* Add a cache of the offset at which each line starts, so that identifying line numbers no longer has quadratic behavior when importing large chunks of HTML.
---
 markdown/htmlparser.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index bf70b73d..0ad2631f 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -83,6 +83,9 @@ def __init__(self, md, *args, **kwargs):
         # Block tags that should contain no content (self closing)
         self.empty_tags = set(['hr'])
 
+        self.lineno_start_cache = [0]
+
+
         # This calls self.reset
         super().__init__(*args, **kwargs)
         self.md = md
@@ -94,6 +97,8 @@ def reset(self):
         self.stack = []  # When `inraw==True`, stack contains a list of tags
         self._cache = []
         self.cleandoc = []
+        self.lineno_start_cache = [0]
+
         super().reset()
 
     def close(self):
@@ -114,6 +119,15 @@ def close(self):
     @property
     def line_offset(self) -> int:
         """Returns char index in `self.rawdata` for the start of the current line. """
+        for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):
+            last_line_start_pos = self.lineno_start_cache[ii]
+            lf_pos = self.rawdata.find('\n', last_line_start_pos)
+            if lf_pos == -1:
+                # No more newlines found. Use end of rawdata.
+                lf_pos = len(self.rawdata)
+            self.lineno_start_cache.append(lf_pos+1)
+
+        return self.lineno_start_cache[self.lineno-1]
         if self.lineno > 1 and '\n' in self.rawdata:
             m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata)
             if m:

From 70b30e75d576918365e7b8ead7f63643a9ac358f Mon Sep 17 00:00:00 2001
From: Eric Norige <127622562+eanorige@users.noreply.github.com>
Date: Fri, 20 Oct 2023 12:00:50 -0700
Subject: [PATCH 2/5] Remove old line_offset implementation

The previous commit left the old regex-based implementation in place as unreachable code after the new return statement. Remove it.
---
 markdown/htmlparser.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index 0ad2631f..8425c88c 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -128,15 +128,6 @@ def line_offset(self) -> int:
             self.lineno_start_cache.append(lf_pos+1)
 
         return self.lineno_start_cache[self.lineno-1]
-        if self.lineno > 1 and '\n' in self.rawdata:
-            m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata)
-            if m:
-                return m.end()
-            else:  # pragma: no cover
-                # Value of `self.lineno` must exceed total number of lines.
-                # Find index of beginning of last line.
-                return self.rawdata.rfind('\n')
-        return 0
 
     def at_line_start(self) -> bool:
         """

From e300ce44fcb1cb0bb30d6f5da1859dc3573ea862 Mon Sep 17 00:00:00 2001
From: Eric Norige <eric@bayasystems.com>
Date: Sun, 22 Oct 2023 22:04:59 -0700
Subject: [PATCH 3/5] Add changelog entry

---
 docs/changelog.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/changelog.md b/docs/changelog.md
index 2f9e9250..614177c6 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -8,6 +8,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). See the [Contributing Guide](contributing.md) for details.
 
+## [unreleased]
+
+### Fixed
+
+* Fix a performance problem with HTML extraction where large HTML input could trigger quadratic line counting behavior (PR#1392).
+
 ## [3.5] -- 2023-10-06
 
 ### Added

From f32969affc2927ac4a5c8030a0224ae690dd3d29 Mon Sep 17 00:00:00 2001
From: Eric Norige <eric@bayasystems.com>
Date: Sun, 22 Oct 2023 22:08:57 -0700
Subject: [PATCH 4/5] Remove extra blank line causing CI failure.

---
 markdown/htmlparser.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index 8425c88c..1a43efce 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -85,7 +85,6 @@ def __init__(self, md, *args, **kwargs):
 
         self.lineno_start_cache = [0]
 
-
         # This calls self.reset
         super().__init__(*args, **kwargs)
         self.md = md

From 7d403f2a681163342caa81b3a155502497ffa1ad Mon Sep 17 00:00:00 2001
From: Eric Norige <eric@bayasystems.com>
Date: Sun, 22 Oct 2023 22:29:59 -0700
Subject: [PATCH 5/5] Improve comment to only use words

---
 markdown/htmlparser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index 1a43efce..4dbb1587 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -122,7 +122,7 @@ def line_offset(self) -> int:
             last_line_start_pos = self.lineno_start_cache[ii]
             lf_pos = self.rawdata.find('\n', last_line_start_pos)
             if lf_pos == -1:
-                # No more newlines found. Use end of rawdata.
+                # No more newlines found. Treat the end of the raw data as the final newline.
                 lf_pos = len(self.rawdata)
             self.lineno_start_cache.append(lf_pos+1)