Switch to a generated lexer (Automa.jl) and a hand-written parser

The PEG parser runs out of memory very quickly because it uses PCRE at runtime. Automa.jl generates a DFA that consumes essentially no memory, and the hand-written parser on top of it is very simple.
OpenModelica · Feb 22, 2019 · 0c04ac7 · 0c04ac7
1 parent 31885f9
commit 0c04ac7
Show file tree

Hide file tree

Showing 9 changed files with 450 additions and 22 deletions.
diff --git a/.CI/docker/Dockerfile b/.CI/docker/Dockerfile
@@ -25,7 +25,6 @@ RUN export HOME=/home/julia && \
     julia -O3 -e 'using Pkg;Pkg.REPLMode.pkgstr("add DataFrames;precompile");using DataFrames' && \
     julia -O3 -e 'using Pkg;Pkg.REPLMode.pkgstr("add DataStructures  ;precompile");using DataStructures' && \
     julia -O3 -e 'using Pkg;Pkg.REPLMode.pkgstr("add LightXML  ;precompile");using LightXML' && \
-    julia -O3 -e 'using Pkg;Pkg.REPLMode.pkgstr("add PEG  ;precompile");using PEG' && \
     julia -O3 -e 'using Pkg;Pkg.REPLMode.pkgstr("add Random  ;precompile");using Random' && \
     julia -O3 -e 'using Pkg;Pkg.REPLMode.pkgstr("add Test  ;precompile");using Test' && \
     (cd /home/julia && tar cf /home/julia.tar .julia) && rm -rf /home/julia/.julia && chmod ugo+rwx /home/julia
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
 Manifest.toml
+*.cov
diff --git a/Project.toml b/Project.toml
@@ -8,7 +8,6 @@ Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 LightXML = "9c8b4983-aa76-5018-a973-4c85ecc9e179"
-PEG = "12d937ae-5f68-53be-93c9-3a6f997a20a8"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ZMQ = "c2297ded-f4af-51ae-bb23-16f91089e4e1"
 

diff --git a/bin/generate_lexer.jl b/bin/generate_lexer.jl
@@ -0,0 +1,61 @@
+# Generate a Lexer for OpenModelica output (Values.Value)
+# =====================================================================
+
+import Automa
+import Automa.RegExp: @re_str
+import MacroTools
+const re = Automa.RegExp
+
+# Describe the token patterns as Automa regular expressions.
+t     = re"[tT][rR][uU][eE]"  # boolean literal `true`, in any letter case
+f     = re"[fF][aA][lL][sS][eE]"  # boolean literal `false`, in any letter case
+string   = re"\"([^\"\\x5c]|(\\x5c.))*\""  # double-quoted string literal; NOTE(review): x5c is the backslash code point -- confirm how Automa reads `\\x5c`
+ident    = re"[_A-Za-z][_A-Za-z0-9]*|'([^'\\x5c]|(\\x5c.))+'"  # bare identifier, or a non-empty single-quoted identifier
+int      = re"[-+]?[0-9]+"  # optionally signed run of decimal digits
+prefloat = re"[-+]?([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)"  # optionally signed decimal with digits on at least one side of the `.`
+float    = prefloat | re.cat(prefloat | re"[-+]?[0-9]+", re"[eE][-+]?[0-9]+")  # `prefloat`, or a `prefloat`/integer followed by an exponent
+operator = re"[={}(),;]|end"  # single-character punctuation, or the keyword `end`
+number   = int | float  # only used to build `omtoken` below
+ws       = re"[ ]+"  # one or more space characters
+omtoken  = number | string | ident | operator  # any single token
+omtokens = re.opt(ws) * re.rep(omtoken * re.opt(ws))  # NOTE(review): not referenced by the Automa.compile call in this script
+
+# Compile a finite-state machine; each pair maps a pattern to the action expression used when that pattern matches.
+tokenizer = Automa.compile(
+  t => :(emit(true)),
+  f => :(emit(false)),
+  operator => :(emit(Symbol(data[ts:te]))),  # token is the text data[ts:te] turned into a Symbol
+  re"record" => :(emit(Record())),  # keyword `record` becomes a Record marker token
+  string => :(emit(unescape_string(data[ts+1:te-1]))),  # drop the first and last character (the quotes), then resolve escapes
+  ident => :(emit(Identifier(unescape_string(data[ts:te])))), # Should this be a symbol instead?
+  int => :(emit(parse(Int, data[ts:te]))),
+  float => :(emit(parse(Float64, data[ts:te]))),
+  re"[\n\t ]" => :(),  # newline, tab and space emit no token
+  re"." => :(failed = true)  # anything else sets the failure flag
+)
+
+# Generate a tokenizing function from the machine.
+ctx = Automa.CodeGenContext()
+init_code = MacroTools.prettify(Automa.generate_init_code(ctx, tokenizer))
+exec_code = MacroTools.prettify(Automa.generate_exec_code(ctx, tokenizer))
+
+# Write the generated source to src/lexer.jl.  The do-block form of `open`
+# closes the file handle (flushing buffered output) even if `write` throws;
+# the previous `write(open(...), ...)` never closed the handle.
+# The template lines stay at column 0 so the triple-quoted string is emitted
+# without any extra indentation.
+open("src/lexer.jl", "w") do io
+  write(io, """# Generated Lexer for OpenModelica Values.Value output
+
+function tokenize(data::String)
+  $(init_code)
+  p_end = p_eof = sizeof(data)
+  failed = false
+  tokens = Any[]
+  emit(tok) = push!(tokens, tok)
+  while p ≤ p_eof && cs > 0
+    $(exec_code)
+  end
+  if cs < 0 || failed
+    throw(LexerError("Error while lexing"))
+  end
+  if p < p_eof
+    throw(LexerError("Did not scan until end of file. Remaining: \$(data[p:p_eof])"))
+  end
+  return tokens
+end
+""")
+end
diff --git a/src/OMJulia.jl b/src/OMJulia.jl
@@ -980,7 +980,7 @@ mutable struct OMCSession
 function sendExpression(omc, expr)
    ZMQ.send(omc.socket, expr)
    message=ZMQ.recv(omc.socket)
-   return Parser.parse_whole(Parser.exp, unsafe_string(message))
+   return Parser.parseOM(unsafe_string(message))  # tokenize and parse the reply text into a Julia value
 end
 
 end
diff --git a/src/lexer.jl b/src/lexer.jl
diff --git a/src/memory.jl b/src/memory.jl
@@ -0,0 +1,88 @@
+# SizedMemory
+# ===========
+# The Automa.jl package is licensed under the MIT "Expat" License:
+
+# Copyright (c) 2016: BioJulia.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+struct SizedMemory  # raw (pointer, length) view of a byte buffer; keeps no reference to the owning object
+    ptr::Ptr{UInt8}  # address of the first byte
+    len::UInt  # number of bytes reachable through `ptr`
+end
+
+"""
+    SizedMemory(data)
+
+Wrap the contiguous memory of `data` in a `SizedMemory`.
+
+`data` must support the `pointerstart` and `pointerend` functions defined in
+this file; they delimit the byte range that is wrapped.  Their generic methods
+rely on `Base.pointer` and `Base.sizeof`, which `String` and `Vector{UInt8}`
+provide, for example.
+
+The result does not hold a reference to `data`: the caller is responsible for
+keeping `data` alive for as long as the `SizedMemory` is in use.
+"""
+function SizedMemory(data, len::Integer=(pointerend(data) + 1) - pointerstart(data))
+    start = pointerstart(data)
+    return SizedMemory(start, len)
+end
+
+"""
+    pointerstart(data)::Ptr{UInt8}
+
+Return a pointer to the first byte of `data`.
+
+This generic method converts `pointer(data)` to a `Ptr{UInt8}`.
+"""
+pointerstart(data)::Ptr{UInt8} = convert(Ptr{UInt8}, pointer(data))
+
+"""
+    pointerend(data)::Ptr{UInt8}
+
+Return a pointer to the last byte of `data`.
+
+This generic method computes `pointerstart(data) + sizeof(data) - 1`.
+"""
+pointerend(data)::Ptr{UInt8} = pointerstart(data) + sizeof(data) - 1
+
+# Return normally when `i` is a valid 1-based index into `mem`; otherwise
+# throw a BoundsError.
+function Base.checkbounds(mem::SizedMemory, i::Integer)
+    if 1 ≤ i ≤ mem.len
+        return
+    end
+    # BoundsError takes the accessed object first and the index second; the
+    # one-argument form would report the index as the object being accessed.
+    throw(BoundsError(mem, i))
+end
+
+# Read the `i`-th byte (1-based) of the wrapped memory.  The bounds check sits
+# behind `@boundscheck`, so it is the only guard before the raw load.
+function Base.getindex(mem::SizedMemory, i::Integer)
+    @boundscheck checkbounds(mem, i)
+    byte = unsafe_load(mem.ptr, i)
+    return byte
+end
+
+# Last valid index: the byte count converted to `Int`.
+Base.lastindex(mem::SizedMemory) = Int(mem.len)
+
+# Number of bytes in the wrapped memory, as an `Int`.
+Base.length(mem::SizedMemory) = Int(mem.len)
diff --git a/src/parser.jl b/src/parser.jl
@@ -1,21 +1,125 @@
 module Parser
 
-using PEG
-
-@rule exp = bool, float , integer, string, array, tuple, none, some, record, ident
-@rule bool = r"true"ip |> x -> true, r"false"ip |> x -> false
-@rule string = r"\"([^\"\\]|\\.)*\""p |> x -> unescape_string(x[2:end-1])
-@rule number = r"\d+"w , "123."
-@rule integer = r"\d+"w |> x -> parse(Int64, x)
-@rule float = r"(\d+[.]\d*|\d*[.]\d+)([eE][+-]?\d+)?|\d+([eE][+-]?\d+)" |> x -> parse(Float64, x)
-@rule array = r"{"p & sequence & r"}"p > (x,y,z) -> collect(Base.tuple(y...)) # Fixed the type of the array
-@rule tuple = "(" & sequence & ")" > (x,y,z) -> Base.tuple(y...)
-@rule sequence = (exp & ( "," & exp > (x,y) -> y )[:*] > (x,y) -> vcat([x],y)) , "" |> x -> []
-@rule none = r"NONE"p & r"\("p & r"\)"p |> x -> nothing
-@rule some = r"SOME"p & r"\("p & exp & r"\)"p > (x,y,exp,z) -> exp
-@rule ident = r"[[:alnum:]_][[:alnum:]_0-9]*"p |> x -> convert(String, x) , r"'([^']|\\.)*'"p |> x -> convert(String, x)
-@rule member = ident & r"\s*=\s*" & exp > (x,y,z) -> (x,z)
-@rule members = member & (r"\s*,\s*" & member > (x,y) -> y)[:*] > (x,y) -> begin res = Dict(y) ; res[x[1]] = x[2] ; res end, ("" |> x -> Dict{String,Any}())
-@rule record = r"record"w & ident & members & r"end"w & ident & ";" > (x,i1,members,e,i2,sc) -> Dict(members)
+struct Identifier  # token wrapping identifier text; parseOM(::Identifier, ...) treats NONE and SOME specially
+  id::String  # the identifier text
+end
+
+struct Record  # field-less marker token; parseOM(::Record, ...) parses the record body that follows it
+end
+
+struct ParseError <: Exception  # thrown by the parser functions in this module on an unexpected token
+  errmsg::AbstractString  # human-readable description of the failure
+end
+
+struct LexerError <: Exception  # NOTE(review): presumably thrown by `tokenize` from the included lexer.jl -- confirm
+  errmsg::AbstractString  # human-readable description of the failure
+end
+
+include("memory.jl")
+include("lexer.jl")
+
+# Extend Base.show explicitly: an unqualified `show` definition inside this
+# module creates a new module-local function that display code never calls.
+Base.show(io::IO, exc::ParseError) = print(io, string("Parse error: ",exc.errmsg))
+
+# A literal token (integer, float, string or boolean) is already its own value.
+parseOM(t::Union{Int,Float64,String,Bool}, tokens) = t
+
+# Return `tok` when it equals the expected symbol `sym`; throw a ParseError
+# otherwise.
+function checkToken(sym::Symbol, tok)
+  tok == sym || throw(ParseError("Expected token of type $sym, got $(tok)"))
+  return tok
+end
+
+# Return `tok` when its concrete type is exactly `t`; throw a ParseError
+# otherwise.
+function checkToken(t, tok)
+  typeof(tok) == t || throw(ParseError("Expected token of type $t, got $(typeof(tok))"))
+  return tok
+end
+
+# Parse a comma-separated list of values terminated by the token `last`.
+# An empty list yields the untyped empty vector.  Otherwise the elements are
+# re-collected through a tuple (presumably to give the resulting vector a
+# narrower element type than `Any`).
+function parseSequence(tokens, last)
+  items = []
+  head = popfirst!(tokens)
+  head == last && return items
+  push!(items, parseOM(head, tokens))
+  sep = popfirst!(tokens)
+  while sep == Symbol(",")
+    push!(items, parseOM(popfirst!(tokens), tokens))
+    sep = popfirst!(tokens)
+  end
+  # Whatever ended the loop must be the closing token.
+  checkToken(last, sep)
+  return collect(tuple(items...))
+end
+
+# Parse a value introduced by a punctuation token: `(` starts a tuple and `{`
+# starts an array.  No other symbol can start a value, so a ParseError is
+# thrown for it instead of silently returning `nothing`.
+function parseOM(t::Symbol, tokens)
+  if t == Symbol("(")
+    return tuple(parseSequence(tokens, Symbol(")"))...)
+  elseif t == Symbol("{")
+    return parseSequence(tokens, Symbol("}"))
+  end
+  throw(ParseError("Unexpected token $t"))
+end
+
+# Parse a value introduced by an identifier token.  `NONE()` yields `nothing`,
+# `SOME(x)` yields the parsed `x`, and any other identifier becomes a Symbol.
+function parseOM(t::Identifier, tokens)
+  name = t.id
+  if name == "SOME"
+    checkToken(Symbol("("), popfirst!(tokens))
+    inner = parseOM(popfirst!(tokens), tokens)
+    checkToken(Symbol(")"), popfirst!(tokens))
+    return inner
+  elseif name == "NONE"
+    checkToken(Symbol("("), popfirst!(tokens))
+    checkToken(Symbol(")"), popfirst!(tokens))
+    return nothing
+  end
+  return Symbol(name)
+end
+
+# Parse `record NAME k1 = v1, k2 = v2, ... end NAME;` (the `record` token has
+# already been consumed) into a dictionary from member names to values.
+function parseOM(t::Record, tokens)
+  members = Tuple{String,Any}[]
+
+  checkToken(Identifier, popfirst!(tokens))  # record name after `record`
+  tok = popfirst!(tokens)
+  if tok != :end
+    while true
+      key = checkToken(Identifier, tok)
+      checkToken(Symbol("="), popfirst!(tokens))
+      value = parseOM(popfirst!(tokens), tokens)
+      push!(members, (key.id, value))
+      tok = popfirst!(tokens)
+      # A comma announces another member; anything else ends the member list.
+      tok == Symbol(",") || break
+      tok = popfirst!(tokens)
+    end
+  end
+  checkToken(:end, tok)
+  checkToken(Identifier, popfirst!(tokens))  # record name after `end`
+  checkToken(Symbol(";"), popfirst!(tokens))
+  # Re-collecting through a tuple fixes the type of the dictionary; the empty
+  # case is converted directly.
+  return isempty(members) ? Dict(members) : Dict(collect(Base.tuple(members...)))
+end
+
+# Parse a complete token stream into one value.  An empty stream yields
+# `nothing`; tokens left over after the first value raise a ParseError.
+function parseOM(tokens::AbstractArray{Any,1})
+  isempty(tokens) && return nothing
+  head = popfirst!(tokens)
+  value = parseOM(head, tokens)
+  isempty(tokens) || throw(ParseError("Expected EOF, got output $tokens"))
+  return value
+end
+
+# Tokenize `str` and parse the resulting token stream.
+parseOM(str::String) = parseOM(tokenize(str))
 
 end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -8,7 +8,7 @@ using Test
 @testset "Parser" begin
 
 function check(string, expected_value, expected_type)
-  value = OMJulia.Parser.parse_whole(OMJulia.Parser.exp, string)
+  value = OMJulia.Parser.parseOM(string)  # parse the input text; the next line compares both value and concrete type
   expected_value == value && expected_type == typeof(value)
 end
 
@@ -27,9 +27,10 @@ end
 @test check("(1,2,3)", (1,2,3), Tuple{Int,Int,Int})
 @test check("NONE()", nothing, Nothing)
 @test check("SOME(1)", 1, Int)
-@test check("abc_2", "abc_2", String)
+@test check("abc_2", :abc_2, Symbol)
 @test check("record ABC end ABC;", Dict(), Dict{String,Any})
 @test check("record ABC a = 1, 'b' = 2,\n  c = 3\nend ABC;", Dict("a" => 1, "'b'" => 2, "c" => 3), Dict{String,Int})
+@test check("", nothing, Nothing)
 
 end