-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.go
152 lines (136 loc) · 4.1 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information
package main
import (
"log"
"os"
"strconv"
"strings"
"text/template"
"unicode"
)
// path is the name of the file that writeUnicode creates and that main
// reports in its log output.
const path = "unicode.go"
// tmpl is the text template for the generated source file: a header comment,
// the xurls package clause, and two constant declarations whose values come
// from the withPunc and withoutPunc entries of the data passed to Execute.
// template.Must panics during package initialization if the text fails to parse.
var tmpl = template.Must(template.New("tlds").Parse(`// Generated by unicodegen
package xurls
const allowedUcsChar = {{.withPunc}}
const allowedUcsCharMinusPunc = {{.withoutPunc}}
`))
// visit calls fn once for every code point described by rt, walking the
// 16-bit ranges first and the 32-bit ranges second, each in table order and
// stepping by the range's stride.
func visit(rt *unicode.RangeTable, fn func(rune)) {
	// walk reports every code point from first through last (inclusive),
	// advancing by step each time.
	walk := func(first, last, step rune) {
		for cp := first; cp <= last; cp += step {
			fn(cp)
		}
	}

	for _, span := range rt.R16 {
		walk(rune(span.Lo), rune(span.Hi), rune(span.Stride))
	}
	for _, span := range rt.R32 {
		walk(rune(span.Lo), rune(span.Hi), rune(span.Stride))
	}
}
// writeUnicode generates the file named by path from tmpl.
//
// It starts from the code point ranges that RFC 3987 allows, removes every
// code point in unicode.Z (separators) to obtain the "withPunc" set, then
// additionally removes every code point in unicode.Po to obtain the
// "withoutPunc" set. Each set is rendered as the contents of a regular
// expression character class, quoted as a Go string literal, and handed to
// the template.
//
// It returns any error from creating, writing, or closing the file.
func writeUnicode() error {
	// rfc3987Ranges contains the ranges of valid code points specified by RFC 3987.
	rfc3987Ranges := [][2]rune{
		{0xA0, 0xD7FF},
		{0xF900, 0xFDCF},
		{0xFDF0, 0xFFEF},
		{0x10000, 0x1FFFD},
		{0x20000, 0x2FFFD},
		{0x30000, 0x3FFFD},
		{0x40000, 0x4FFFD},
		{0x50000, 0x5FFFD},
		{0x60000, 0x6FFFD},
		{0x70000, 0x7FFFD},
		{0x80000, 0x8FFFD},
		{0x90000, 0x9FFFD},
		{0xA0000, 0xAFFFD},
		{0xB0000, 0xBFFFD},
		{0xC0000, 0xCFFFD},
		{0xD0000, 0xDFFFD},
		{0xE1000, 0xEFFFD},
	}

	// removeRune accepts a slice of inclusive code point ranges (in ascending order)
	// and returns a slice that is equivalent except for excluding a specified rune,
	// by removing/replacing/splitting any range containing it.
	// The backing array of the input may be reused, so callers must continue
	// with the returned slice only.
	// Its linear searches over the ranges (including those added by previous invocations)
	// are inefficient, but acceptable because this code runs only at build time.
	removeRune := func(ranges [][2]rune, cp rune) [][2]rune {
		for i, r := range ranges {
			// Ranges are in ascending order. Skip any that precede `cp`,
			// and bail out upon reaching one that follows `cp`.
			if r[1] < cp {
				continue
			}
			if cp < r[0] {
				break
			}

			// `cp` is in this range; work out what replaces the range.
			var replacement [][2]rune
			switch {
			case cp == r[0] && cp == r[1]:
				// Single-element range: nothing replaces it.
			case cp == r[0]:
				// Drop the first element of this range.
				replacement = [][2]rune{{r[0] + 1, r[1]}}
			case cp == r[1]:
				// Drop the last element of this range.
				replacement = [][2]rune{{r[0], r[1] - 1}}
			default:
				// Split this range around `cp`.
				replacement = [][2]rune{{r[0], cp - 1}, {cp + 1, r[1]}}
			}

			// Build the tail in fresh storage before appending it, so the
			// append onto ranges[:i] cannot overwrite elements not yet copied.
			tail := append(replacement, ranges[i+1:]...)
			return append(ranges[:i], tail...)
		}
		return ranges
	}

	// sepFreeRanges excludes separators from rfc3987Ranges.
	sepFreeRanges := append([][2]rune{}, rfc3987Ranges...)
	visit(unicode.Z, func(cp rune) {
		sepFreeRanges = removeRune(sepFreeRanges, cp)
	})

	// puncFreeRanges additionally excludes unicode.Po code points from sepFreeRanges.
	puncFreeRanges := append([][2]rune{}, sepFreeRanges...)
	visit(unicode.Po, func(cp rune) {
		puncFreeRanges = removeRune(puncFreeRanges, cp)
	})

	// characterClassContents builds the corresponding regular expression
	// character class contents: "lo-hi" for each range, or just "lo" when the
	// range holds a single code point. It returns a string rather than the
	// strings.Builder itself, because a non-zero Builder must not be copied.
	characterClassContents := func(ranges [][2]rune) string {
		var b strings.Builder
		for _, r := range ranges {
			// regexp.QuoteMeta is not necessary because all metacharacters are ASCII.
			// cf. https://golang.org/s/re2syntax and
			// https://cs.opensource.google/go/go/+/refs/tags/go1.17.6:src/regexp/regexp.go;l=721
			b.WriteRune(r[0])
			if r[0] != r[1] {
				b.WriteRune('-')
				b.WriteRune(r[1])
			}
		}
		return b.String()
	}
	allowedUcsChar := characterClassContents(sepFreeRanges)
	allowedUcsCharMinusPunc := characterClassContents(puncFreeRanges)

	// Write to file.
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	data := map[string]string{
		"withPunc":    strconv.Quote(allowedUcsChar),
		"withoutPunc": strconv.Quote(allowedUcsCharMinusPunc),
	}
	if err := tmpl.Execute(f, data); err != nil {
		// The template error is the one worth reporting; closing is best effort.
		_ = f.Close()
		return err
	}
	// Report a failed close: it can mean the generated file was not fully written.
	return f.Close()
}
// main logs which file is being generated, runs writeUnicode, and terminates
// the program via log.Fatalf if generation fails.
func main() {
	log.Printf("Generating %s...", path)

	err := writeUnicode()
	if err == nil {
		return
	}
	log.Fatalf("Could not write path: %v", err)
}