From 72f07d16d344cbf5ea0df074e84f19736b1bcf68 Mon Sep 17 00:00:00 2001
From: Sebastien Lavoie
Date: Tue, 12 Jan 2021 13:28:29 -0500
Subject: [PATCH] Add strcase package

- `ToPascalCase`, `ToCamelCase`, and `ToSnakeCase` will transform any input to that form.
- Support for unicode runes
- Support for all-uppercase initialisms, as mandated by the Go convention.
- Expose `IsInitialism`
- Emphasis on reducing allocations for memory efficiency.
---
 strcase/id.go              | 146 +++++++++++++++++++++++++++
 strcase/id_test.go         | 198 +++++++++++++++++++++++++++++++++++++
 strcase/initialism.go      |  95 ++++++++++++++++++
 strcase/initialism_test.go |  40 ++++++++
 4 files changed, 479 insertions(+)
 create mode 100644 strcase/id.go
 create mode 100644 strcase/id_test.go
 create mode 100644 strcase/initialism.go
 create mode 100644 strcase/initialism_test.go

diff --git a/strcase/id.go b/strcase/id.go
new file mode 100644
index 0000000..c22e2ef
--- /dev/null
+++ b/strcase/id.go
@@ -0,0 +1,146 @@
+// Package strcase converts identifiers between PascalCase, camelCase and
+// snake_case, writing common initialisms (ID, URL, ...) in all caps as
+// mandated by the Go convention.
+package strcase
+
+import (
+	"strings"
+	"unicode"
+)
+
+// maxInt is the largest value of type int on the target platform. It is used
+// instead of math.MaxInt64, which overflows int on 32-bit platforms.
+const maxInt = int(^uint(0) >> 1)
+
+// ToPascalCase splits input into parts and joins them with every part
+// capitalized, e.g. "foo_url" becomes "FooURL".
+func ToPascalCase(input string) string {
+	return splitJoin(input, 0, 0)
+}
+
+// ToCamelCase splits input into parts and joins them with every part but the
+// first capitalized, e.g. "foo_url" becomes "fooURL".
+func ToCamelCase(input string) string {
+	return splitJoin(input, 1, 0)
+}
+
+// ToSnakeCase splits input into parts and joins them in lower case with an
+// underscore between parts, e.g. "fooURL" becomes "foo_url".
+func ToSnakeCase(input string) string {
+	return splitJoin(input, maxInt, '_')
+}
+
+// allocateBuilder returns a strings.Builder pre-sized for converting input.
+// A zero separator means that parts are joined without a separator.
+func allocateBuilder(input string, separator rune) *strings.Builder {
+	var b strings.Builder
+	length := len(input)
+	if separator != 0 {
+		// Heuristic to add about 25% buffer for separators
+		// Not having perfect match isn't terrible, it will only result in a few more memory allocations.
+		// Ex:
+		// foo_bar_baz: 9 original chars, 11 final. 9 * 5 / 4 = 11
+		// foo_id: 5 original chars, 6 final. 5 * 5 / 4 = 6
+		// a_b_c_d: 4 original chars, 7 final. 4 * 5 / 4 = 5, which will result in an extra allocation.
+		length = length * 5 / 4
+	}
+
+	b.Grow(length)
+	return &b
+}
+
+// splitJoin splits input into parts and joins them back together.
+//
+// A part ends at every rune that is not a lower case letter, an upper case
+// letter or a number (such runes are dropped), before every upper case letter
+// that does not follow another upper case letter, and at every switch between
+// letters and numbers. Parts are lower-cased, then every part whose index is
+// at least firstUpper goes through pascalPart. A non-zero separator is
+// written between parts.
+func splitJoin(input string, firstUpper int, separator rune) string {
+	b := allocateBuilder(input, separator)
+	var buf []rune
+	var currentPartIndex int
+	var lastCategory runeCategory
+
+	// Flush the buffer as a part
+	flush := func() {
+		if len(buf) == 0 {
+			// Nothing was added since last flush
+			return
+		}
+		if separator != 0 && currentPartIndex > 0 {
+			b.WriteRune(separator)
+		}
+		if currentPartIndex >= firstUpper {
+			pascalPart(buf)
+		}
+		for _, r := range buf {
+			b.WriteRune(r)
+		}
+		currentPartIndex++
+		lastCategory = unknown
+		buf = buf[0:0] // Clear buffer, but keep current allocation
+	}
+
+	for _, r := range input {
+		switch cat := category(r); cat {
+		case upper:
+			if lastCategory != upper {
+				flush()
+			}
+			lastCategory = cat
+			buf = append(buf, unicode.ToLower(r))
+		case lower, number:
+			if (lastCategory > number) != (cat > number) {
+				flush()
+			}
+			lastCategory = cat
+			buf = append(buf, r)
+		default:
+			// separator
+			flush()
+		}
+	}
+	flush()
+
+	return b.String()
+}
+
+// pascalPart converts part in place: to upper case when it is an
+// initialism, otherwise only its first rune. part must not be empty.
+func pascalPart(part []rune) {
+	if isInitialism(part) {
+		for ri, r := range part {
+			part[ri] = unicode.ToUpper(r)
+		}
+	} else {
+		part[0] = unicode.ToUpper(part[0])
+	}
+}
+
+// runeCategory classifies a rune for the purpose of splitting parts.
+type runeCategory int
+
+// The order matters: splitJoin relies on lower and upper being greater than
+// number to tell letters from everything else.
+const (
+	unknown runeCategory = iota
+	number
+	lower
+	upper
+)
+
+// category returns the runeCategory of r; unknown runes act as separators.
+func category(r rune) runeCategory {
+	switch {
+	case unicode.IsLower(r):
+		return lower
+	case unicode.IsUpper(r):
+		return upper
+	case unicode.IsNumber(r):
+		return number
+	default:
+		return unknown
+	}
+}
diff --git a/strcase/id_test.go b/strcase/id_test.go
new file mode 100644
index 0000000..c4f9a5b
--- /dev/null
+++ b/strcase/id_test.go
@@ -0,0 +1,198 @@
+package strcase
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+// splitjoin_l1_p1           38.1 ns/op     16 B/op  1 allocs/op
+// IDToCamelCase_l1_p1       88.6 ns/op     48 B/op  3 allocs/op
+// IDToSnakeCase_l1_p1       87.7 ns/op     48 B/op  3 allocs/op
+//
+// splitjoin_l1_p10           253 ns/op    176 B/op  2 allocs/op
+// IDToCamelCase_l1_p10       421 ns/op     72 B/op  3 allocs/op
+// IDToSnakeCase_l1_p10       269 ns/op     72 B/op  3 allocs/op
+//
+// splitjoin_l1_p100         2137 ns/op   1904 B/op  2 allocs/op
+// IDToCamelCase_l1_p100     3503 ns/op    248 B/op  3 allocs/op
+// IDToSnakeCase_l1_p100     1879 ns/op    296 B/op  3 allocs/op
+//
+// splitjoin_l10_p1          38.0 ns/op     16 B/op  1 allocs/op
+// IDToCamelCase_l10_p1       247 ns/op    168 B/op  6 allocs/op
+// IDToSnakeCase_l10_p1       248 ns/op    168 B/op  6 allocs/op
+//
+// splitjoin_l10_p10          278 ns/op    272 B/op  2 allocs/op
+// IDToCamelCase_l10_p10     1140 ns/op    264 B/op  6 allocs/op
+// IDToSnakeCase_l10_p10      979 ns/op    296 B/op  6 allocs/op
+//
+// splitjoin_l10_p100        2267 ns/op   2816 B/op  2 allocs/op
+// IDToCamelCase_l10_p100    9538 ns/op   1304 B/op  6 allocs/op
+// IDToSnakeCase_l10_p100    8147 ns/op   1560 B/op  6 allocs/op
+//
+// splitjoin_l100_p1         41.1 ns/op     16 B/op  1 allocs/op
+// IDToCamelCase_l100_p1     1114 ns/op   1160 B/op  9 allocs/op
+// IDToSnakeCase_l100_p1     1104 ns/op   1176 B/op  9 allocs/op
+//
+// splitjoin_l100_p10         446 ns/op   1184 B/op  2 allocs/op
+// IDToCamelCase_l100_p10    7692 ns/op   2072 B/op  9 allocs/op
+// IDToSnakeCase_l100_p10    7589 ns/op   2328 B/op  9 allocs/op
+//
+// splitjoin_l100_p100       3877 ns/op  12032 B/op  2 allocs/op
+// IDToCamelCase_l100_p100  72671 ns/op  11288 B/op  9 allocs/op
+// IDToSnakeCase_l100_p100  71673 ns/op  14616 B/op  9 allocs/op
+func Benchmark_splitJoin(b *testing.B) {
+	for _, length := range []int{1, 10, 100} {
+		part := strings.Repeat("a", length)
+
+		for _, count := range []int{1, 10, 100} {
+			input := part + strings.Repeat("_"+part, count-1)
+
+			// Baseline, split and join all parts
+			b.Run(fmt.Sprintf("splitjoin_l%d_p%d", length, count), func(b *testing.B) {
+				for i := 0; i < b.N; i++ {
+					strings.Join(strings.Split(input, "_"), "")
+				}
+			})
+
+			b.Run(fmt.Sprintf("IDToCamelCase_l%d_p%d", length, count), func(b *testing.B) {
+				for i := 0; i < b.N; i++ {
+					ToCamelCase(input)
+				}
+			})
+
+			b.Run(fmt.Sprintf("IDToSnakeCase_l%d_p%d", length, count), func(b *testing.B) {
+				for i := 0; i < b.N; i++ {
+					ToSnakeCase(input)
+				}
+			})
+		}
+	}
+}
+
+// lower    5.03 ns/op  0 B/op  0 allocs/op
+// upper    5.81 ns/op  0 B/op  0 allocs/op
+// number   6.59 ns/op  0 B/op  0 allocs/op
+// symbol   6.58 ns/op  0 B/op  0 allocs/op
+// 16_bits   153 ns/op  0 B/op  0 allocs/op
+// 32_bits   160 ns/op  0 B/op  0 allocs/op
+func Benchmark_category(b *testing.B) {
+	tests := map[string][]rune{
+		"lower":   {'a', 'b'},
+		"upper":   {'A', 'B'},
+		"number":  {'0', '1'},
+		"symbol":  {'_', ' '},
+		"16 bits": {'™', '∞', '•', 'Ω'},
+		"32 bits": {'𠁂', '𠁄', '𠁔', '𠁑'},
+	}
+	for name, runes := range tests {
+		b.Run(name, func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				for _, r := range runes {
+					category(r)
+				}
+			}
+		})
+	}
+}
+
+func Test_splitJoin(t *testing.T) {
+	tests := []struct {
+		input  string
+		camel  string
+		pascal string
+		snake  string
+	}{
+		{
+			// everything empty
+		},
+		{
+			input:  "a",
+			pascal: "A",
+			camel:  "a",
+			snake:  "a",
+		},
+		{
+			input:  "A",
+			pascal: "A",
+			camel:  "a",
+			snake:  "a",
+		},
+		{
+			input:  "a_a",
+			pascal: "AA",
+			camel:  "aA",
+			snake:  "a_a",
+		},
+		{
+			input:  "__a___a_",
+			pascal: "AA",
+			camel:  "aA",
+			snake:  "a_a",
+		},
+		{
+			input:  "aa_bbb",
+			pascal: "AaBbb",
+			camel:  "aaBbb",
+			snake:  "aa_bbb",
+		},
+		{
+			input:  "aa_id",
+			pascal: "AaID",
+			camel:  "aaID",
+			snake:  "aa_id",
+		},
+		{
+			input:  "fooBar",
+			pascal: "FooBar",
+			camel:  "fooBar",
+			snake:  "foo_bar",
+		},
+		{
+			input:  "FooBAR",
+			pascal: "FooBar",
+			camel:  "fooBar",
+			snake:  "foo_bar",
+		},
+		{
+			input:  "fooUrl",
+			pascal: "FooURL",
+			camel:  "fooURL",
+			snake:  "foo_url",
+		},
+		{
+			input:  "fooURL",
+			pascal: "FooURL",
+			camel:  "fooURL",
+			snake:  "foo_url",
+		},
+		{
+			input:  "url10",
+			pascal: "URL10",
+			camel:  "url10",
+			snake:  "url_10",
+		},
+		{
+			input:  "url_id",
+			pascal: "URLID",
+			camel:  "urlID",
+			snake:  "url_id",
+		},
+		{
+			// "ids" merely starts with the initialism "id"
+			input:  "foo_ids",
+			pascal: "FooIds",
+			camel:  "fooIds",
+			snake:  "foo_ids",
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.input, func(t *testing.T) {
+			require.Equal(t, tt.pascal, ToPascalCase(tt.input))
+			require.Equal(t, tt.camel, ToCamelCase(tt.input))
+			require.Equal(t, tt.snake, ToSnakeCase(tt.input))
+		})
+	}
+}
diff --git a/strcase/initialism.go b/strcase/initialism.go
new file mode 100644
index 0000000..8e51599
--- /dev/null
+++ b/strcase/initialism.go
@@ -0,0 +1,95 @@
+package strcase
+
+import "sort"
+
+// commonInitialisms holds the known initialisms in lower case, sorted so that
+// isInitialism can binary search them.
+var commonInitialisms [][]rune
+
+func init() {
+	// To follow Go's convention of having acronyms in all caps, hard code a few of the common ones
+	// Taken from https://github.com/golang/lint/blob/83fdc39ff7b56453e3793356bcff3070b9b96445/lint.go#L770-L809
+	var initialisms = []string{
+		"acl",
+		"api",
+		"ascii",
+		"cpu",
+		"css",
+		"dns",
+		"eof",
+		"guid",
+		"html",
+		"http",
+		"https",
+		"id",
+		"ip",
+		"json",
+		"lhs",
+		"qps",
+		"ram",
+		"rhs",
+		"rpc",
+		"sla",
+		"smtp",
+		"sql",
+		"ssh",
+		"tcp",
+		"tls",
+		"ttl",
+		"udp",
+		"ui",
+		"uid",
+		"uuid",
+		"uri",
+		"url",
+		"utf8",
+		"vm",
+		"xml",
+		"xmpp",
+		"xsrf",
+		"xss",
+	}
+	sort.Strings(initialisms)
+
+	for _, initialism := range initialisms {
+		commonInitialisms = append(commonInitialisms, []rune(initialism))
+	}
+}
+
+// IsInitialism reports whether part is one of the common initialisms.
+// The comparison is case-sensitive and the list is lower case, e.g. "url".
+func IsInitialism(part string) bool {
+	return isInitialism([]rune(part))
+}
+
+// isInitialism reports whether part is exactly one of commonInitialisms.
+func isInitialism(part []rune) bool {
+	// Adapted from sort.Search to benefit from the fact that we only deal with rune slices
+	i := 0
+	j := len(commonInitialisms)
+out:
+	for i < j {
+		h := int(uint(i+j) >> 1) // avoid overflow when computing h
+		// i ≤ h < j
+		candidate := commonInitialisms[h]
+
+		for k, r := range candidate {
+			switch {
+			case len(part) < k+1 || part[k] < r:
+				j = h
+				continue out
+			case part[k] > r:
+				i = h + 1
+				continue out
+			}
+		}
+		if len(part) > len(candidate) {
+			// candidate is only a prefix of part ("id" vs "ids"): part sorts
+			// after it, so keep searching to the right instead of matching.
+			i = h + 1
+			continue
+		}
+		return true
+	}
+	return false
+}
diff --git a/strcase/initialism_test.go b/strcase/initialism_test.go
new file mode 100644
index 0000000..7ea3fc8
--- /dev/null
+++ b/strcase/initialism_test.go
@@ -0,0 +1,40 @@
+package strcase
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func Test_IsInitialism(t *testing.T) {
+	tests := []struct {
+		input  string
+		output bool
+	}{
+		{"", false},
+		{"foo", false},
+		{"id", true},
+		{"url", true},
+		{"ids", false},        // "id" is only a prefix
+		{"identifier", false}, // "id" is only a prefix
+		{"htt", false},        // only a prefix of "http"
+	}
+	for _, tt := range tests {
+		t.Run(tt.input, func(t *testing.T) {
+			require.Equal(t, tt.output, IsInitialism(tt.input))
+		})
+	}
+}
+
+// foo  18.3 ns/op  0 B/op  0 allocs/op
+// url  22.2 ns/op  0 B/op  0 allocs/op
+// acl  22.4 ns/op  0 B/op  0 allocs/op
+func BenchmarkIsInitialism(b *testing.B) {
+	for _, input := range []string{"foo", "url", string(commonInitialisms[0])} {
+		b.Run(input, func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				IsInitialism(input)
+			}
+		})
+	}
+}