Skip to content

Commit

Permalink
Merge pull request #432 from Witiko/fix/parsers-punctuation-memory-is…
Browse files Browse the repository at this point in the history
…sues

Define `parsers.punctuation` in a streaming fashion
  • Loading branch information
Witiko committed Apr 3, 2024
2 parents 828e25a + dad61ee commit e2c6be1
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 54 deletions.
2 changes: 1 addition & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Fixes:
intervals. (#408, #419)
- Do not misinterpret bracketed e-mails as citations. (#424, #426,
sponsored by @istqborg)
- Comply with CommonMark 0.31.2. (#416, 40b516ee, de8d137d,
- Comply with CommonMark 0.31.2. (#416, 40b516ee, de8d137d, #432,
contributed by @lostenderman)
- Do not end a paragraph before a `:::` in fenced divs.
(#407, lostenderman/markdown#157, #427, #428, lostenderman/markdown#158,
Expand Down
84 changes: 31 additions & 53 deletions markdown.dtx
Original file line number Diff line number Diff line change
Expand Up @@ -24510,38 +24510,6 @@ end
% \par
% \begin{markdown}
%
%### Unicode punctuation
% This section documents [the Unicode punctuation][unicode-punctuation]
% recognized by the markdown reader. The punctuation is organized in the
% \luamdef{punctuation} table according to the number of bytes occupied after
% conversion to \acro{utf}8.
%
% [unicode-punctuation]: https://spec.commonmark.org/0.31.2/#unicode-punctuation-character
% (CommonMark Spec, Version 0.31.2 (2024-01-28))
%
% \end{markdown}
% \begin{macrocode}
-- Unicode punctuation grouped by encoded length: punctuation[n] is a list of
-- the UTF-8 encoded characters that occupy n bytes.
local punctuation = {}
(function()
  -- Check the lookup result before handing it to io.open(): when no path is
  -- returned, io.open() would otherwise raise a bad-argument error and the
  -- readable message below would never be shown.
  local pathname = assert(kpse.lookup("UnicodeData.txt"),
    [[Could not locate file "UnicodeData.txt"]])
  local file = assert(io.open(pathname, "r"),
    [[Could not open file "UnicodeData.txt"]])
  for line in file:lines() do
    -- Capture the hexadecimal code point (first field) and the first letter
    -- of the third field; the second field is skipped. Lines that do not
    -- match leave both captures nil and fall through the test below.
    local codepoint, major_category = line:match("^(%x+);[^;]*;(%a)")
    if major_category == "P" or major_category == "S" then
      local code = unicode.utf8.char(tonumber(codepoint, 16))
      -- #code is the byte length of the encoded character.
      local bucket = punctuation[#code]
      if bucket == nil then
        bucket = {}
        punctuation[#code] = bucket
      end
      bucket[#bucket + 1] = code
    end
  end
  assert(file:close())
end)()
% \end{macrocode}
% \par
% \begin{markdown}
%
%### Plain \TeX{} Writer {#tex-writer}
%
% This section documents the \luamref{writer} object, which implements the
Expand Down Expand Up @@ -25809,36 +25777,46 @@ parsers.fail = P(false)

parsers.internal_punctuation = S(":;,.?")
parsers.ascii_punctuation = S("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")
% \end{macrocode}
% \par
% \begin{markdown}
%
%### Unicode punctuation
% This section documents [the Unicode punctuation][unicode-punctuation]
% recognized by the markdown reader. The punctuation is organized in the
% \luamdef{parsers.punctuation} table according to the number of bytes occupied
% after conversion to \acro{utf}8.
%
% [unicode-punctuation]: https://spec.commonmark.org/0.31.2/#unicode-punctuation-character
% (CommonMark Spec, Version 0.31.2 (2024-01-28))
%
% \end{markdown}
% \begin{macrocode}
-- NOTE(review): this span comes from a diff hunk; statements of the removed
-- and of the added implementation appear interleaved here without +/- marks,
-- so the comments below describe individual statements only.
-- Table of punctuation parsers indexed by UTF-8 byte length (filled in below).
parsers.punctuation = {}
(function()
for size = 1, 4 do
-- Start from parsers.fail so that alternatives can be appended with `+`.
local codepoint_parser = parsers.fail
if size == 1 then
codepoint_parser = codepoint_parser + parsers.ascii_punctuation
end
for _, code in ipairs(punctuation[size] or {}) do
-- NOTE(review): the result of kpse.lookup() goes to io.open() unchecked;
-- confirm what is reported when the file cannot be located.
local pathname = kpse.lookup("UnicodeData.txt")
local file = assert(io.open(pathname, "r"),
[[Could not open file "UnicodeData.txt"]])
for line in file:lines() do
-- Capture the hexadecimal code point (first field) and the first letter of
-- the third field; non-matching lines leave both captures nil.
local codepoint, major_category = line:match("^(%x+);[^;]*;(%a)")
if major_category == "P" or major_category == "S" then
local code = unicode.utf8.char(tonumber(codepoint, 16))
-- #code is the byte length of the encoded character; initialise its slot
-- with parsers.fail the first time that length is seen.
if parsers.punctuation[#code] == nil then
parsers.punctuation[#code] = parsers.fail
end
-- Build a parser that matches the bytes of `code` one after another.
-- NOTE(review): parsers.succeed and S are defined outside this view;
-- presumably the always-matching pattern and lpeg.S -- confirm.
local code_parser = parsers.succeed
assert(#code == size)
for i = 1, size do
for i = 1, #code do
local byte = code:sub(i, i)
local byte_parser = S(byte)
code_parser = code_parser * byte_parser
code_parser = code_parser
* byte_parser
end
-- Add the parser for this character as a further alternative.
codepoint_parser = codepoint_parser + code_parser
parsers.punctuation[#code] = parsers.punctuation[#code]
+ code_parser
end
parsers.punctuation[size] = codepoint_parser
end
assert(file:close())
end)()
% \end{macrocode}
% \par
% \begin{markdown}
%
% Here, we garbage-collect the \luamref{punctuation} table, since we won't need it anymore.
%
% \end{markdown}
% \begin{macrocode}
punctuation = nil
collectgarbage("collect")

parsers.escapable = parsers.ascii_punctuation
parsers.anyescaped = parsers.backslash / "" * parsers.escapable
Expand Down

0 comments on commit e2c6be1

Please sign in to comment.