Skip to content

Commit

Permalink
Change to a lexer (Automa.jl) and hand-written parser
Browse files Browse the repository at this point in the history
The PEG parser runs out of memory very quickly due to using PCRE at
runtime. Automa.jl generates a DFA consuming essentially no memory and
the parser is very simple.
  • Loading branch information
sjoelund committed Feb 22, 2019
1 parent 31885f9 commit 0c04ac7
Show file tree
Hide file tree
Showing 9 changed files with 450 additions and 22 deletions.
1 change: 0 additions & 1 deletion .CI/docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ RUN export HOME=/home/julia && \
julia -O3 -e 'using Pkg;Pkg.REPLMode.pkgstr("add DataFrames;precompile");using DataFrames' && \
julia -O3 -e 'using Pkg;Pkg.REPLMode.pkgstr("add DataStructures ;precompile");using DataStructures' && \
julia -O3 -e 'using Pkg;Pkg.REPLMode.pkgstr("add LightXML ;precompile");using LightXML' && \
julia -O3 -e 'using Pkg;Pkg.REPLMode.pkgstr("add PEG ;precompile");using PEG' && \
julia -O3 -e 'using Pkg;Pkg.REPLMode.pkgstr("add Random ;precompile");using Random' && \
julia -O3 -e 'using Pkg;Pkg.REPLMode.pkgstr("add Test ;precompile");using Test' && \
(cd /home/julia && tar cf /home/julia.tar .julia) && rm -rf /home/julia/.julia && chmod ugo+rwx /home/julia
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
Manifest.toml
*.cov
1 change: 0 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
LightXML = "9c8b4983-aa76-5018-a973-4c85ecc9e179"
PEG = "12d937ae-5f68-53be-93c9-3a6f997a20a8"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ZMQ = "c2297ded-f4af-51ae-bb23-16f91089e4e1"

Expand Down
61 changes: 61 additions & 0 deletions bin/generate_lexer.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Generate a Lexer for OpenModelica output (Values.Value)
# =====================================================================

import Automa
import Automa.RegExp: @re_str
import MacroTools
const re = Automa.RegExp

# Describe patterns in regular expression.
# Each binding is an Automa regular expression object; `|`, `*`, `re.cat`, `re.opt` and
# `re.rep` combine them into larger expressions.
# Boolean literals, matched case-insensitively via per-letter character classes.
t = re"[tT][rR][uU][eE]"
f = re"[fF][aA][lL][sS][eE]"
# Double-quoted string literal.
# NOTE(review): `\\x5c` presumably denotes the backslash character (0x5c), making the two
# alternatives "any char except quote/backslash" and "backslash + any char" — confirm
# against Automa's escape handling.
string = re"\"([^\"\\x5c]|(\\x5c.))*\""
# Either a plain identifier (letter or underscore, then letters/digits/underscores) or a
# non-empty single-quoted identifier.
ident = re"[_A-Za-z][_A-Za-z0-9]*|'([^'\\x5c]|(\\x5c.))+'"
# Optionally signed run of decimal digits.
int = re"[-+]?[0-9]+"
# Optionally signed number containing a decimal point with digits on at least one side.
prefloat = re"[-+]?([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)"
# A `prefloat`, or a `prefloat`/integer followed by an exponent part.
float = prefloat | re.cat(prefloat | re"[-+]?[0-9]+", re"[eE][-+]?[0-9]+")
# Single-character punctuation tokens, or the keyword `end`.
operator = re"[={}(),;]|end"
# The remaining bindings describe a whole token stream; they are not passed to the
# `Automa.compile` call in this script.
number = int | float
ws = re"[ ]+"
omtoken = number | string | ident | operator
omtokens = re.opt(ws) * re.rep(omtoken * re.opt(ws))

# Compile a finite-state machine.
# Each `pattern => action` pair attaches a quoted Julia expression to a pattern. The
# actions rely on names provided by the `tokenize` function template written at the end
# of this script: `data` (the input string), `emit` (pushes a token onto the result) and
# `failed` (error flag checked after the scan).
# NOTE(review): `ts`/`te` are presumably the start/end byte offsets of the current match
# supplied by Automa's generated code — confirm.
# NOTE(review): several patterns overlap (e.g. `true`, `record` and `end` also match
# `ident`); the listing order presumably decides which action fires — confirm against
# Automa's tokenizer documentation before reordering.
tokenizer = Automa.compile(
t => :(emit(true)), # boolean literal -> Bool
f => :(emit(false)),
operator => :(emit(Symbol(data[ts:te]))), # punctuation / `end` -> Symbol of the matched text
re"record" => :(emit(Record())), # `record` keyword -> marker token
string => :(emit(unescape_string(data[ts+1:te-1]))), # drop the surrounding quotes, then unescape
ident => :(emit(Identifier(unescape_string(data[ts:te])))), # Should this be a symbol instead?
int => :(emit(parse(Int, data[ts:te]))),
float => :(emit(parse(Float64, data[ts:te]))),
re"[\n\t ]" => :(), # whitespace: no token emitted
re"." => :(failed = true) # any other single character marks the scan as failed
)

# Generate a tokenizing function from the machine.
# `init_code` and `exec_code` are the expressions produced by Automa for the compiled
# tokenizer; they are interpolated into the function template below.
ctx = Automa.CodeGenContext()
init_code = MacroTools.prettify(Automa.generate_init_code(ctx, tokenizer))
exec_code = MacroTools.prettify(Automa.generate_exec_code(ctx, tokenizer))

# Write the template through the do-block form of `open` so the file handle is flushed
# and closed deterministically, even if `write` throws.
open("src/lexer.jl", "w") do io
    write(io, """# Generated Lexer for OpenModelica Values.Value output
function tokenize(data::String)
$(init_code)
p_end = p_eof = sizeof(data)
failed = false
tokens = Any[]
emit(tok) = push!(tokens, tok)
while p ≤ p_eof && cs > 0
$(exec_code)
end
if cs < 0 || failed
throw(LexerError("Error while lexing"))
end
if p < p_eof
throw(LexerError("Did not scan until end of file. Remaining: \$(data[p:p_eof])"))
end
return tokens
end
""")
end
2 changes: 1 addition & 1 deletion src/OMJulia.jl
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,7 @@ mutable struct OMCSession
# Send `expr` over the session's ZMQ socket, wait for the reply and return the reply
# parsed by `Parser.parseOM`.
function sendExpression(omc, expr)
    ZMQ.send(omc.socket, expr)
    message = ZMQ.recv(omc.socket)
    # Only the `parseOM` entry point is used: a preceding `return` calling
    # `Parser.parse_whole` made this line unreachable.
    return Parser.parseOM(unsafe_string(message))
end

end
175 changes: 175 additions & 0 deletions src/lexer.jl

Large diffs are not rendered by default.

88 changes: 88 additions & 0 deletions src/memory.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# SizedMemory
# ===========
# The Automa.jl package is licensed under the MIT "Expat" License:

# Copyright (c) 2016: BioJulia.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


"""
    SizedMemory(ptr, len)

Pointer/length pair describing a run of `len` bytes that starts at `ptr`.
The struct stores only the raw pointer, so it does not keep the pointed-to object alive.
"""
struct SizedMemory
    ptr::Ptr{UInt8}
    len::UInt
end

"""
SizedMemory(data)
Create a `SizedMemory` object from `data`.
`data` must implement `Automa.pointerstart` and `Automa.pointerend` methods.
These are used to get the range of the contiguous data memory of `data`. These
have default methods which uses `Base.pointer` and `Base.sizeof` methods. For
example, `String` and `Vector{UInt8}` support these `Base` methods.
Note that it is user's responsibility to keep the `data` object alive during
`SizedMemory`'s lifetime because it does not have a reference to the object.
"""
function SizedMemory(data, len::Integer=(pointerend(data) + 1) - pointerstart(data))
return SizedMemory(pointerstart(data), len)
end

"""
pointerstart(data)::Ptr{UInt8}
Return the start position of `data`.
The default implementation is `convert(Ptr{UInt8}, pointer(data))`.
"""
function pointerstart(data)::Ptr{UInt8}
return convert(Ptr{UInt8}, pointer(data))
end

"""
pointerend(data)::Ptr{UInt8}
Return the end position of `data`.
The default implementation is `Automa.pointerstart(data) + sizeof(data) - 1`.
"""
function pointerend(data)::Ptr{UInt8}
return pointerstart(data) + sizeof(data) - 1
end

# Bounds check for `SizedMemory`: valid indices are `1` through `mem.len`.
# Returns `nothing` for a valid index and throws `BoundsError` otherwise.
function Base.checkbounds(mem::SizedMemory, i::Integer)
    if 1 ≤ i ≤ mem.len
        return
    end
    # Report both the container and the offending index.
    throw(BoundsError(mem, i))
end

# Read the `i`-th byte (1-based) of `mem`. The bounds check is wrapped in `@boundscheck`,
# so callers may elide it with `@inbounds`.
function Base.getindex(mem::SizedMemory, i::Integer)
    @boundscheck checkbounds(mem, i)
    byte = unsafe_load(mem.ptr, i)
    return byte
end

# Last valid index of `mem`, i.e. its byte length as an `Int`.
Base.lastindex(mem::SizedMemory) = Int(mem.len)

# Number of bytes covered by `mem`, as an `Int`.
Base.length(mem::SizedMemory) = Int(mem.len)
138 changes: 121 additions & 17 deletions src/parser.jl
Original file line number Diff line number Diff line change
@@ -1,21 +1,125 @@
module Parser

using PEG

@rule exp = bool, float , integer, string, array, tuple, none, some, record, ident
@rule bool = r"true"ip |> x -> true, r"false"ip |> x -> false
@rule string = r"\"([^\"\\]|\\.)*\""p |> x -> unescape_string(x[2:end-1])
@rule number = r"\d+"w , "123."
@rule integer = r"\d+"w |> x -> parse(Int64, x)
@rule float = r"(\d+[.]\d*|\d*[.]\d+)([eE][+-]?\d+)?|\d+([eE][+-]?\d+)" |> x -> parse(Float64, x)
@rule array = r"{"p & sequence & r"}"p > (x,y,z) -> collect(Base.tuple(y...)) # Fixed the type of the array
@rule tuple = "(" & sequence & ")" > (x,y,z) -> Base.tuple(y...)
@rule sequence = (exp & ( "," & exp > (x,y) -> y )[:*] > (x,y) -> vcat([x],y)) , "" |> x -> []
@rule none = r"NONE"p & r"\("p & r"\)"p |> x -> nothing
@rule some = r"SOME"p & r"\("p & exp & r"\)"p > (x,y,exp,z) -> exp
@rule ident = r"[[:alnum:]_][[:alnum:]_0-9]*"p |> x -> convert(String, x) , r"'([^']|\\.)*'"p |> x -> convert(String, x)
@rule member = ident & r"\s*=\s*" & exp > (x,y,z) -> (x,z)
@rule members = member & (r"\s*,\s*" & member > (x,y) -> y)[:*] > (x,y) -> begin res = Dict(y) ; res[x[1]] = x[2] ; res end, ("" |> x -> Dict{String,Any}())
@rule record = r"record"w & ident & members & r"end"w & ident & ";" > (x,i1,members,e,i2,sc) -> Dict(members)
"""
    Identifier(id)

Token type wrapping the text `id` of an identifier.
"""
struct Identifier
    id::String
end

"""
    Record()

Field-less marker token; `parseOM(::Record, tokens)` dispatches on it to parse a record.
"""
struct Record end

"""
    ParseError(errmsg)

Exception carrying the message `errmsg`; thrown by the parsing functions of this module
when the token stream does not have the expected form.
"""
struct ParseError <: Exception
    errmsg::AbstractString
end

"""
    LexerError(errmsg)

Exception carrying the message `errmsg`, for failures while splitting input into tokens.
"""
struct LexerError <: Exception
    errmsg::AbstractString
end

include("memory.jl")
include("lexer.jl")

# Print a `ParseError` as "Parse error: <message>".
# The method is attached to `Base.show` explicitly: an unqualified `show` definition
# inside this module would create a separate module-local function that Julia's display
# and error-printing machinery never calls.
Base.show(io::IO, exc::ParseError) = print(io, "Parse error: ", exc.errmsg)

# Literal tokens (integers, floats, strings, booleans) are already final values:
# return the token itself and leave `tokens` untouched.
parseOM(t::Union{Int,Float64,String,Bool}, tokens) = t

# Require that `tok` equals the symbol `sym`.
# Returns `tok` on success; throws `ParseError` otherwise.
function checkToken(sym::Symbol, tok)
    tok == sym || throw(ParseError("Expected token of type $sym, got $(tok)"))
    return tok
end

# Require that the concrete type of `tok` is exactly `t`.
# Returns `tok` on success; throws `ParseError` otherwise.
function checkToken(t, tok)
    typeof(tok) == t || throw(ParseError("Expected token of type $t, got $(typeof(tok))"))
    return tok
end

# Parse a comma-separated list of values up to and including the closing token `last`.
# The opening delimiter must already have been removed from `tokens`.
# Returns an empty `Any[]` for an empty list, otherwise a vector whose element type is
# narrowed by collecting the parsed values from a tuple.
function parseSequence(tokens, last)
    items = []
    tok = popfirst!(tokens)
    tok == last && return items
    push!(items, parseOM(tok, tokens))
    # After each value, the next token is either a comma (another value follows) or the
    # closing delimiter.
    while (tok = popfirst!(tokens)) == Symbol(",")
        push!(items, parseOM(popfirst!(tokens), tokens))
    end
    checkToken(last, tok)
    return collect(tuple(items...))
end

# Parse a bracketed sequence whose opening delimiter `t` has already been taken from
# `tokens`: `(` yields a tuple, `{` yields a vector.
# Throws `ParseError` for any other symbol, since no other operator token can start a
# value; without this the function would silently return `nothing`.
function parseOM(t::Symbol, tokens)
    if t == Symbol("(")
        return Tuple(parseSequence(tokens, Symbol(")")))
    elseif t == Symbol("{")
        return parseSequence(tokens, Symbol("}"))
    end
    throw(ParseError("Expected token of type ( or {, got $(t)"))
end

# Parse a value that starts with an identifier token.
# `NONE()` yields `nothing`, `SOME(x)` yields the parsed `x`, and any other identifier is
# returned as a `Symbol` built from its text.
function parseOM(t::Identifier, tokens)
    name = t.id
    if name == "NONE"
        checkToken(Symbol("("), popfirst!(tokens))
        checkToken(Symbol(")"), popfirst!(tokens))
        return nothing
    end
    if name == "SOME"
        checkToken(Symbol("("), popfirst!(tokens))
        inner = parseOM(popfirst!(tokens), tokens)
        checkToken(Symbol(")"), popfirst!(tokens))
        return inner
    end
    return Symbol(name)
end

# Parse `record Name a = v1, b = v2, ... end Name;` after the `record` token itself has
# been consumed. Returns a `Dict` from member name to parsed value: `Dict{String,Any}` for
# a record without members, otherwise a `Dict` whose value type is narrowed by collecting
# the members from a tuple.
function parseOM(t::Record, tokens)
    members = Tuple{String,Any}[]

    # Record name directly after the `record` keyword.
    checkToken(Identifier, popfirst!(tokens))
    tok = popfirst!(tokens)
    if tok != :end
        while true
            key = checkToken(Identifier, tok)
            checkToken(Symbol("="), popfirst!(tokens))
            value = parseOM(popfirst!(tokens), tokens)
            push!(members, (key.id, value))
            tok = popfirst!(tokens)
            # A comma announces another `name = value` member; anything else ends the list.
            tok == Symbol(",") || break
            tok = popfirst!(tokens)
        end
    end
    checkToken(:end, tok)
    checkToken(Identifier, popfirst!(tokens))
    checkToken(Symbol(";"), popfirst!(tokens))
    # Fixes the type of the dictionary
    isempty(members) && return Dict(members)
    return Dict(collect(Base.tuple(members...)))
end

# Parse a complete token vector into a single value.
# Returns `nothing` for an empty vector; throws `ParseError` if tokens remain after one
# value has been parsed. `tokens` is consumed in the process.
function parseOM(tokens::AbstractArray{Any,1})
    isempty(tokens) && return nothing
    head = popfirst!(tokens)
    value = parseOM(head, tokens)
    isempty(tokens) || throw(ParseError("Expected EOF, got output $tokens"))
    return value
end

# Entry point: split `str` into tokens with `tokenize`, then parse the token vector.
parseOM(str::String) = parseOM(tokenize(str))

end
5 changes: 3 additions & 2 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ using Test
@testset "Parser" begin

# Return `true` when parsing `string` yields a value equal to `expected_value` whose
# concrete type is exactly `expected_type`.
function check(string, expected_value, expected_type)
    # Only `parseOM` is called: a preceding call to `Parser.parse_whole` would throw
    # before this line could run.
    value = OMJulia.Parser.parseOM(string)
    return expected_value == value && expected_type == typeof(value)
end

Expand All @@ -27,9 +27,10 @@ end
@test check("(1,2,3)", (1,2,3), Tuple{Int,Int,Int})
@test check("NONE()", nothing, Nothing)
@test check("SOME(1)", 1, Int)
# A bare identifier parses to a Symbol, so only the Symbol expectation is kept.
@test check("abc_2", :abc_2, Symbol)
@test check("record ABC end ABC;", Dict(), Dict{String,Any})
@test check("record ABC a = 1, 'b' = 2,\n c = 3\nend ABC;", Dict("a" => 1, "'b'" => 2, "c" => 3), Dict{String,Int})
# Empty input produces no tokens and parses to `nothing`.
@test check("", nothing, Nothing)

end

Expand Down

0 comments on commit 0c04ac7

Please sign in to comment.