/
Regexp2.go
312 lines (284 loc) · 9.73 KB
/
Regexp2.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
// Copyright 2023 Sneller, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package regexp2
import (
"fmt"
"log"
"os"
"reflect"
"regexp"
"regexp/syntax"
"strconv"
"strings"
"unicode/utf8"
)
// MaxNodesAutomaton is the maximum number of states when constructing and transforming NFAs and DFAs.
const MaxNodesAutomaton = 3000
// MaxCharInRegex is the maximum number of characters in a regex string
const MaxCharInRegex = 1000
// IsSupported determines whether expr is a supported regex; return nil if supported, error otherwise
func IsSupported(expr string) error {
nRunesExpr := utf8.RuneCountInString(expr)
if nRunesExpr > MaxCharInRegex {
return fmt.Errorf("provided regex expression contains %v code-points which is more than the max %v", nRunesExpr, MaxCharInRegex)
}
return nil
}
type RegexType int
const (
SimilarTo RegexType = iota
Regexp
RegexpCi
GolangSimilarTo
GolangRegexp
)
// Compile return a regex for the provided string and regexType.
func Compile(expr string, regexType RegexType) (regex *regexp.Regexp, err error) {
exprOrg := expr
if regexType == SimilarTo || regexType == GolangSimilarTo {
exprRunes := []rune(expr)
newRegexRunes := make([]rune, 0, len(exprRunes))
for index, r := range exprRunes {
escaped := (index > 0) && (exprRunes[index-1] == escapeChar)
switch r {
case '.', '^', '$': // characters '.', '^' and '$' are NOT meta-characters in "SIMILAR TO",
if escaped {
// found an escaped char, do not escape it again
newRegexRunes = append(newRegexRunes, r)
} else {
newRegexRunes = append(newRegexRunes, escapeChar, r)
}
case '%': // replace '%': it represents n character
if escaped {
// found an escaped char, do not change it
newRegexRunes = append(newRegexRunes, r)
} else {
newRegexRunes = append(newRegexRunes, '.', '*')
}
case '_': // replace '_': it represents a single character
if escaped {
// found an escaped char, do not change it
newRegexRunes = append(newRegexRunes, r)
} else {
newRegexRunes = append(newRegexRunes, '.')
}
default:
newRegexRunes = append(newRegexRunes, r)
}
}
expr = string(newRegexRunes)
}
switch regexType {
case SimilarTo:
if !strings.HasSuffix(exprOrg, "$") {
expr = "(" + expr + ")$" // NOTE brackets are necessary
}
case GolangSimilarTo:
if !strings.HasPrefix(exprOrg, "^") {
expr = "^(" + expr + ")" // NOTE brackets are necessary
}
if !strings.HasSuffix(exprOrg, "$") {
expr = "(" + expr + ")$" // NOTE brackets are necessary
}
case RegexpCi:
if !strings.HasPrefix(exprOrg, "(?i)") {
expr = "(?i)" + expr
}
case Regexp:
if !strings.HasPrefix(exprOrg, "^") {
expr = "(.|\n)*(" + expr + ")" // NOTE brackets are necessary
}
case GolangRegexp:
// do nothing
}
return regexp.Compile(expr)
}
// extractProg extracts the internal syntax.Prog from regexp.Regexp instance using reflection
func extractProg(regex *regexp.Regexp) *syntax.Prog {
return (*syntax.Prog)(reflect.ValueOf(regex).Elem().FieldByName("prog").UnsafePointer())
}
// extractNFA extracts the NFA from regexp.Regexp instance using Go
func extractNFA(regex *regexp.Regexp, maxNodes int) (*NFAStore, error) {
// extract the NFA data-structure that has been created by Go to handle the provided regex
p := extractProg(regex)
// create an empty store of nodes
store := newNFAStore(maxNodes)
// create translation map for nodeIDs from golangNFA to our NFA
translation := newMap[int, nodeIDT]()
{
idSet := newSet[int]()
for from := range p.Inst {
i := &p.Inst[from]
idSet.insert(from)
switch i.Op {
case syntax.InstAlt, syntax.InstAltMatch:
idSet.insert(int(i.Out))
idSet.insert(int(i.Arg))
case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstRune, syntax.InstNop, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
idSet.insert(int(i.Out))
case syntax.InstMatch, syntax.InstFail:
// do nothing
}
}
for id := range idSet {
nodeID, err := store.newNode()
if err != nil {
return nil, fmt.Errorf("%v::extractNFA", err)
}
translation.insert(id, nodeID)
}
}
// initialize the start node
store.startIDi = translation.at(p.Start)
if startNode, err := store.get(store.startIDi); err != nil {
return nil, err
} else {
startNode.start = true
}
// iterate over all instruction in the golangNFA and add nodes and edges to our NFA
for from := range p.Inst {
node, err := store.get(translation.at(from))
if err != nil {
return nil, err
}
i := &p.Inst[from]
switch i.Op {
case syntax.InstAlt:
node.addEdgeRune(edgeEpsilonRune, translation.at(int(i.Out)), false)
node.addEdgeRune(edgeEpsilonRune, translation.at(int(i.Arg)), false)
case syntax.InstAltMatch:
//NOTE dead code: InstAltMatch is nowhere issued, but when it will (in some future)...
node.addEdgeRune(edgeEpsilonRune, translation.at(int(i.Out)), false)
node.addEdgeRune(edgeEpsilonRune, translation.at(int(i.Arg)), false)
case syntax.InstCapture:
node.addEdgeRune(edgeEpsilonRune, translation.at(int(i.Out)), false)
case syntax.InstEmptyWidth:
nodeTo := translation.at(int(i.Out))
switch syntax.EmptyOp(i.Arg) {
//NOTE EmptyEndLine is issued for POSIX regex; EmptyEndText is issued for NON-POSIX regex
case syntax.EmptyEndText:
//NOTE non-posix $: $ matches then end-of-line AND end-of-text
node.addEdgeInternal(edgeT{edgeRLZARange, nodeTo})
case syntax.EmptyEndLine:
//NOTE posix $: $ matches the end-of-line
node.addEdgeInternal(edgeT{newSymbolRange(edgeEpsilonRune, edgeEpsilonRune), nodeTo})
case syntax.EmptyBeginLine, syntax.EmptyBeginText, syntax.EmptyNoWordBoundary, syntax.EmptyWordBoundary:
node.addEdgeInternal(edgeT{newSymbolRange(edgeEpsilonRune, edgeEpsilonRune), nodeTo})
default:
node.addEdgeRune(edgeEpsilonRune, nodeTo, false)
}
case syntax.InstNop:
node.addEdgeRune(edgeEpsilonRune, translation.at(int(i.Out)), false)
case syntax.InstMatch: // no i.Out
node.accept = true
case syntax.InstFail: // no i.Out
case syntax.InstRune: // i.Rune is a sequence of rune ranges
caseSensitive := (syntax.Flags(i.Arg) & syntax.FoldCase) == 0
nRunes := len(i.Rune)
if nRunes == 1 {
node.addEdgeRune(i.Rune[0], translation.at(int(i.Out)), caseSensitive)
} else {
if (nRunes & 1) == 1 {
return nil, fmt.Errorf("received invalid sequence of rune ranges from GOLANG: %#U", i.Rune)
}
seq := i.Rune
for nRunes > 0 {
node.addEdge(newSymbolRange(seq[0], seq[1]), translation.at(int(i.Out)))
nRunes -= 2
seq = seq[2:]
}
}
case syntax.InstRune1:
caseSensitive := (syntax.Flags(i.Arg) & syntax.FoldCase) == 0
for _, r := range i.Rune {
node.addEdgeRune(r, translation.at(int(i.Out)), caseSensitive)
}
case syntax.InstRuneAny:
node.addEdgeRune(edgeAnyRune, translation.at(int(i.Out)), false)
case syntax.InstRuneAnyNotNL:
node.addEdgeRune(edgeAnyNotLfRune, translation.at(int(i.Out)), false)
}
}
return &store, nil
}
func CompileDFA(regex *regexp.Regexp, maxNodes int) (*DFAStore, error) {
return CompileDFADebug(regex, false, maxNodes)
}
func CompileDFADebug(regex *regexp.Regexp, writeDot bool, maxNodes int) (*DFAStore, error) {
tmpPath := os.TempDir() + "\\sneller\\"
if writeDot {
os.MkdirAll(tmpPath, os.ModeDir)
log.Printf("6af9e7a9 going to write dot files to your temp dir %v for regex %q", tmpPath, regex.String())
}
name := "sneller"
nfaStore, err := extractNFA(regex, maxNodes)
if err != nil {
return nil, fmt.Errorf("%v::CompileDFA", err)
}
if writeDot {
name += "_nfa"
nfaStore.dot().WriteToFile(tmpPath+name+".dot", name, regex.String())
}
if err = nfaStore.pruneRLZ(); err != nil {
return nil, fmt.Errorf("%v::CompileDFA", err)
}
if writeDot {
name += "_prn"
nfaStore.dot().WriteToFile(tmpPath+name+".dot", name, regex.String())
}
if err = nfaStore.refactorEdges(); err != nil {
return nil, fmt.Errorf("%v::CompileDFA", err)
}
if writeDot {
name += "_ref"
nfaStore.dot().WriteToFile(tmpPath+name+".dot", name, regex.String())
}
dfaStore, err := nfaToDfa(nfaStore, maxNodes)
if err != nil {
return nil, fmt.Errorf("%v::CompileDFA", err)
}
if writeDot {
name += "_dfa"
dfaStore.Dot().WriteToFile(tmpPath+name+".dot", name, regex.String())
}
if err = dfaStore.pruneUnreachable(); err != nil {
return nil, fmt.Errorf("%v::CompileDFA", err)
}
if err = dfaStore.pruneNeverAccepting(); err != nil {
return nil, fmt.Errorf("%v::CompileDFA", err)
}
if writeDot {
name += "_prn"
dfaStore.Dot().WriteToFile(tmpPath+name+".dot", name, regex.String())
}
dfaStore, err = minDfa(dfaStore, maxNodes)
if err != nil {
return nil, fmt.Errorf("%v::CompileDFA", err)
}
dfaStore.removeEdgesFromAcceptNodes() // remove all outgoing edges from accepting nodes
// we can merge accept nodes since they do not have outgoing edges (anymore)
if err := dfaStore.mergeAcceptNodes(); err != nil {
return nil, fmt.Errorf("%v::CompileDFA", err)
}
if writeDot {
name += "_min"
dfaStore.Dot().WriteToFile(tmpPath+name+".dot", name, regex.String())
}
return dfaStore, nil
}
// PrettyStrForDot makes the string pretty and usable for printing in dot files
func PrettyStrForDot(str string) string {
str = strconv.Quote(strings.ReplaceAll(str, "\n", "0xA"))
return str[1 : len(str)-1] // trim leading and trailing '"'
}