Skip to content

Commit 27f48bc

Browse files
committed
Foundation: Move StringView::compare to StringSpan
1 parent 3ef05ef commit 27f48bc

File tree

6 files changed

+354
-119
lines changed

6 files changed

+354
-119
lines changed

Libraries/Foundation/Internal/StringSpan.inl

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,79 @@ SC::Result SC::StringSpan::appendNullTerminatedTo(NativeWritable& string, bool r
6262
string.length = toSlice + numWritten;
6363
return Result(true);
6464
}
65+
66+
/// @brief Decode a single UTF8 code point starting at `it` and move `it` past the bytes consumed
/// @param it Read position (in/out). Left untouched only when it is already at (or past) `end`
/// @param end One past the last readable byte of the buffer
/// @return The decoded code point, or 0 if the sequence is invalid, truncated, or `it` is at `end`
/// @note 0 is also the value of a successfully decoded U+0000, so callers cannot tell the two apart
SC::uint32_t SC::StringSpan::advanceUTF8(const char*& it, const char* end)
{
    if (it >= end)
    {
        return 0; // Nothing left to decode, never dereference at or past the end of the buffer
    }
    const uint8_t lead = static_cast<uint8_t>(*(it++));
    if (lead < 0x80)
    {
        return lead; // Single byte sequence (ASCII range)
    }
    // Bytes still readable after the lead byte. Comparing a pointer difference (rather than `it + N < end`)
    // avoids forming a pointer beyond one-past-the-end, which is undefined behaviour.
    const auto remaining = end - it;
    if ((lead >> 5) == 0x06 and remaining >= 1) // 110xxxxx: 2-byte sequence
    {
        const uint8_t trail = static_cast<uint8_t>(*(it++));
        if ((trail >> 6) == 0x02)
            return ((lead & 0x1Fu) << 6) | (trail & 0x3Fu);
    }
    else if ((lead >> 4) == 0x0E and remaining >= 2) // 1110xxxx: 3-byte sequence
    {
        const uint8_t trail1 = static_cast<uint8_t>(*(it++));
        const uint8_t trail2 = static_cast<uint8_t>(*(it++));
        if ((trail1 >> 6) == 0x02 and (trail2 >> 6) == 0x02)
            return ((lead & 0x0Fu) << 12) | ((trail1 & 0x3Fu) << 6) | (trail2 & 0x3Fu);
    }
    else if ((lead >> 3) == 0x1E and remaining >= 3) // 11110xxx: 4-byte sequence
    {
        const uint8_t trail1 = static_cast<uint8_t>(*(it++));
        const uint8_t trail2 = static_cast<uint8_t>(*(it++));
        const uint8_t trail3 = static_cast<uint8_t>(*(it++));
        if ((trail1 >> 6) == 0x02 and (trail2 >> 6) == 0x02 and (trail3 >> 6) == 0x02)
            return ((lead & 0x07u) << 18) | ((trail1 & 0x3Fu) << 12) | ((trail2 & 0x3Fu) << 6) | (trail3 & 0x3Fu);
    }
    // Invalid lead byte, truncated sequence or malformed continuation byte.
    // The lead byte (and any continuation bytes already read) stay consumed so callers always make progress.
    return 0;
}
96+
97+
/// @brief Decode a single UTF16 code point (native byte order) starting at `it` and move `it` past it
/// @param it Read position (in/out). Left untouched only when it is already at (or past) `end`
/// @param end One past the last readable byte of the buffer
/// @return The decoded code point, or 0 if the sequence is invalid, truncated, or `it` is at `end`
/// @note 0 is also the value of a successfully decoded U+0000, so callers cannot tell the two apart
SC::uint32_t SC::StringSpan::advanceUTF16(const char*& it, const char* end)
{
    constexpr int unitBytes = static_cast<int>(sizeof(uint16_t));
    if (it >= end)
    {
        return 0; // Nothing left to decode
    }
    if (end - it < unitBytes)
    {
        // Dangling odd byte: it cannot hold a code unit. Consume it so that callers looping on
        // `it < end` terminate, without reading past the end of the buffer.
        it = end;
        return 0;
    }
    uint16_t lead;
    ::memcpy(&lead, it, sizeof(uint16_t)); // Avoid potential unaligned read
    it += sizeof(uint16_t);
    if (lead < 0xD800 or lead > 0xDFFF)
        return lead; // Code point of the Basic Multilingual Plane, stored in a single code unit
    if (lead >= 0xDC00)
        return 0; // trail surrogate without lead
    if (end - it < unitBytes)
        return 0; // incomplete surrogate pair (checked BEFORE reading, to never read out of bounds)
    uint16_t trail;
    ::memcpy(&trail, it, sizeof(uint16_t)); // Avoid potential unaligned read
    if (trail < 0xDC00 or trail > 0xDFFF)
        return 0; // invalid trail surrogate (left unconsumed, it will be decoded by the next call)
    it += sizeof(uint16_t);
    return 0x10000u + ((lead - 0xD800u) << 10) + (trail - 0xDC00u);
}
110+
111+
/// @brief Ordering comparison with another StringSpan, done on code points (not on graphemes)
/// @param other The string being compared to the current one
/// @return Comparison::Smaller, Comparison::Equals or Comparison::Bigger
///
/// Spans sharing the same encoding are first compared byte-wise. A byte-wise difference directly gives
/// the answer for every encoding except UTF16, where byte order does not match code point order
/// (for example little-endian U+0100 is `00 01` while U+00FF is `FF 00`). UTF16 spans are therefore
/// ordered by decoding code points, exactly like spans with different encodings, so that the result
/// of an ordering does not depend on the encoding the strings happen to be stored in.
SC::StringSpan::Comparison SC::StringSpan::compare(StringSpan other) const
{
    // Byte-wise ordering of two same-encoding spans, kept as a tie-breaker for malformed UTF16 text
    int byteOrder = 0;
    if (getEncoding() == other.getEncoding())
    {
        // A null span can only be empty: handle it before handing any pointer to memcmp
        if (text == nullptr)
            return other.textSizeInBytes == 0 ? Comparison::Equals : Comparison::Smaller;
        if (other.text == nullptr)
            return textSizeInBytes == 0 ? Comparison::Equals : Comparison::Bigger;
        const size_t minSize = sizeInBytes() < other.sizeInBytes() ? sizeInBytes() : other.sizeInBytes();

        byteOrder = ::memcmp(text, other.text, minSize);
        if (byteOrder == 0)
        {
            // One span is a prefix of the other (or they are identical): the shorter one is smaller
            return textSizeInBytes < other.textSizeInBytes
                       ? Comparison::Smaller
                       : (textSizeInBytes > other.textSizeInBytes ? Comparison::Bigger : Comparison::Equals);
        }
        if (getEncoding() != StringEncoding::Utf16)
        {
            // For single byte and UTF8 text, byte order is the same as code point order
            return byteOrder < 0 ? Comparison::Smaller : Comparison::Bigger;
        }
        // UTF16 with different bytes: fall through and order by decoded code points
    }

    const char *p1 = text, *end1 = p1 + textSizeInBytes;
    const char *p2 = other.text, *end2 = p2 + other.textSizeInBytes;
    while (p1 < end1 and p2 < end2)
    {
        const uint32_t cp1 = getEncoding() == StringEncoding::Utf16 ? advanceUTF16(p1, end1) : advanceUTF8(p1, end1);
        const uint32_t cp2 =
            other.getEncoding() == StringEncoding::Utf16 ? advanceUTF16(p2, end2) : advanceUTF8(p2, end2);
        if (cp1 < cp2)
            return Comparison::Smaller;
        if (cp1 > cp2)
            return Comparison::Bigger;
    }
    if (p1 < end1)
        return Comparison::Bigger; // other is a strict prefix of this string
    if (p2 < end2)
        return Comparison::Smaller; // this string is a strict prefix of other
    if (byteOrder != 0)
    {
        // Only reachable with malformed UTF16 (invalid sequences all decode to 0): spans holding
        // different bytes must never be reported as equal, so fall back to their byte order.
        return byteOrder < 0 ? Comparison::Smaller : Comparison::Bigger;
    }
    return Comparison::Equals;
}

Libraries/Foundation/StringSpan.h

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,24 @@ struct SC_COMPILER_EXPORT StringSpan
6060
constexpr StringSpan(const wchar_t (&str)[N]) : textWide(str), textSizeInBytes((N - 1)* sizeof(wchar_t)), encoding(static_cast<uint8_t>(StringEncoding::Native)), hasNullTerm(true) {}
6161
static constexpr StringSpan fromNullTerminated(const wchar_t* text, StringEncoding encoding) { return text == nullptr ? StringSpan(encoding) : StringSpan({text, ::wcslen(text)}, true); }
6262
#endif
63-
constexpr bool operator ==(const StringSpan other) const { return textSizeInBytes == other.textSizeInBytes and ::memcmp(text, other.text, textSizeInBytes) == 0; }
63+
6464
// clang-format on
6565

66+
/// @brief Result of ordering comparison done by StringSpan::compare
67+
enum class Comparison
68+
{
69+
Smaller = -1, ///< Current string is smaller than the other
70+
Equals = 0, ///< Current string is equal to the other
71+
Bigger = 1 ///< Current string is bigger than the other
72+
};
73+
74+
/// @brief Ordering comparison between non-normalized StringSpan (operates on code points, not on utf graphemes)
/// @param other The string being compared to current one
/// @return Result of the comparison (smaller, equals or bigger)
75+
[[nodiscard]] Comparison compare(StringSpan other) const;
76+
77+
/// @brief Equality operator, `true` when StringSpan::compare reports Comparison::Equals
/// @param other The string being compared to current one
[[nodiscard]] bool operator==(const StringSpan other) const
{
    const Comparison outcome = compare(other);
    return outcome == Comparison::Equals;
}
78+
/// @brief Inequality operator, `true` when StringSpan::compare reports anything but Comparison::Equals
/// @param other The string being compared to current one
[[nodiscard]] bool operator!=(const StringSpan other) const
{
    const Comparison outcome = compare(other);
    return not(outcome == Comparison::Equals);
}
79+
/// @brief Ordering operator, `true` when StringSpan::compare reports Comparison::Smaller
/// @param other The string being compared to current one
[[nodiscard]] bool operator<(const StringSpan other) const
{
    const Comparison outcome = compare(other);
    return outcome == Comparison::Smaller;
}
80+
6681
/// @brief Obtain a `const char` Span from this StringView
6782
[[nodiscard]] Span<const char> toCharSpan() const { return {text, textSizeInBytes}; }
6883

@@ -115,6 +130,14 @@ struct SC_COMPILER_EXPORT StringSpan
115130
/// @param removePreviousNullTerminator If true, the previous null terminator is removed
116131
Result appendNullTerminatedTo(NativeWritable& string, bool removePreviousNullTerminator = true) const;
117132

133+
/// @brief Decode a single UTF8 code point and advance the iterator
134+
/// @param it Read position, passed by reference and moved forward past the bytes that have been consumed
/// @param end End of the readable range, used to reject multi-byte sequences that would run past it
/// @return The decoded code point, or 0 if the sequence is invalid (or end is reached)
135+
static uint32_t advanceUTF8(const char*& it, const char* end);
136+
137+
/// @brief Decode a single UTF16 code point and advance the iterator
137+
/// @param it Read position, passed by reference and moved forward past the code units that have been consumed
/// @param end End of the readable range, used to reject a surrogate pair whose trail would lie past it
/// @return The decoded code point, or 0 if the sequence is invalid (or end is reached)
138+
static uint32_t advanceUTF16(const char*& it, const char* end);
140+
118141
protected:
119142
friend struct StringView;
120143
union

Libraries/Strings/Internal/StringView.inl

Lines changed: 0 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -118,52 +118,6 @@ bool SC::StringView::parseDouble(double& value) const
118118
return true;
119119
}
120120

121-
SC::StringView::Comparison SC::StringView::compare(StringView other) const
122-
{
123-
if (hasCompatibleEncoding(other))
124-
{
125-
const int res = memcmp(text, other.text, min(textSizeInBytes, other.textSizeInBytes));
126-
if (res < 0)
127-
return Comparison::Smaller;
128-
else if (res == 0)
129-
return Comparison::Equals;
130-
else
131-
return Comparison::Bigger;
132-
}
133-
else
134-
{
135-
return withIterator(
136-
[other](auto it1)
137-
{
138-
return other.withIterator(
139-
[&it1](auto it2)
140-
{
141-
StringCodePoint c1 = 0, c2 = 0;
142-
while (it1.advanceRead(c1) and it2.advanceRead(c2))
143-
{
144-
if (c1 < c2)
145-
{
146-
return Comparison::Smaller;
147-
}
148-
else if (c1 > c2)
149-
{
150-
return Comparison::Bigger;
151-
}
152-
}
153-
if (it1.isAtEnd() and it2.isAtEnd())
154-
{
155-
return Comparison::Equals;
156-
}
157-
if (it1.isAtEnd())
158-
{
159-
return Comparison::Bigger;
160-
}
161-
return Comparison::Smaller;
162-
});
163-
});
164-
}
165-
}
166-
167121
bool SC::StringView::startsWith(const StringView str) const
168122
{
169123
if (hasCompatibleEncoding(str))

Libraries/Strings/StringView.h

Lines changed: 13 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -78,53 +78,6 @@ struct SC::StringView : public StringSpan
7878
return Span<const uint8_t>::reinterpret_bytes(text, textSizeInBytes);
7979
}
8080

81-
/// @brief Result of ordering comparison done by StringView::compare
82-
enum class Comparison
83-
{
84-
Smaller = -1, ////< Current string is smaller than the other
85-
Equals = 0, ////< Current string is equal to the other
86-
Bigger = 1 ////< Current string is bigger than the other
87-
};
88-
89-
/// @brief Ordering comparison between non-normalized StringView (operates on code points, not on utf graphemes)
90-
/// @param other The string being compared to current one
91-
/// @return Result of the comparison (smaller, equals or bigger)
92-
///
93-
/// Example:
94-
/// @code{.cpp}
95-
/// // àèìòù (1 UTF16-LE sequence, 2 UTF8 sequence)
96-
/// SC_ASSERT_RELEASE("\xc3\xa0\xc3\xa8\xc3\xac\xc3\xb2\xc3\xb9"_u8.compare(
97-
/// "\xe0\x0\xe8\x0\xec\x0\xf2\x0\xf9\x0"_u16) == StringView::Comparison::Equals);
98-
///
99-
/// // 日本語語語 (1 UTF16-LE sequence, 3 UTF8 sequence)
100-
/// StringView stringUtf8 = StringView("\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe8\xaa\x9e\xe8\xaa\x9e"_u8);
101-
/// StringView stringUtf16 = StringView("\xE5\x65\x2C\x67\x9E\x8a\x9E\x8a\x9E\x8a\x00"_u16); // LE
102-
/// // Comparisons are on code points NOT grapheme clusters!!
103-
/// SC_ASSERT_RELEASE(stringUtf8.compare(stringUtf16) == StringView::Comparison::Equals);
104-
/// SC_ASSERT_RELEASE(stringUtf16.compare(stringUtf8) == StringView::Comparison::Equals);
105-
/// SC_ASSERT_RELEASE(stringUtf8 == stringUtf16);
106-
/// SC_ASSERT_RELEASE(stringUtf16 == stringUtf8);
107-
/// @endcode
108-
[[nodiscard]] Comparison compare(StringView other) const;
109-
110-
/// @brief Ordering operator for StringView using StringView::compare
111-
/// @param other The string being compared to current one
112-
/// @return `true` if current string is Comparison::Smaller than other
113-
///
114-
/// Example:
115-
/// @code{.cpp}
116-
/// StringView sv[3] = {
117-
/// StringView("3"),
118-
/// StringView("1"),
119-
/// StringView("2"),
120-
/// };
121-
/// Algorithms::bubbleSort(sv, sv + 3, [](StringView a, StringView b) { return a < b; });
122-
/// SC_TEST_EXPECT(sv[0] == "1");
123-
/// SC_TEST_EXPECT(sv[1] == "2");
124-
/// SC_TEST_EXPECT(sv[2] == "3");
125-
/// @endcode
126-
[[nodiscard]] bool operator<(StringView other) const { return compare(other) == Comparison::Smaller; }
127-
12881
/// @brief Call given lambda with one of StringIteratorASCII, StringIteratorUTF8, StringIteratorUTF16 depending on
12982
/// encoding.
13083
/// @tparam Func A function/lambda with `auto operator()({StringIteratorASCII | StringIteratorUTF8 |
@@ -677,34 +630,22 @@ constexpr bool SC::StringView::equalsIterator(StringView other, size_t& points)
677630
#pragma clang diagnostic push
678631
#pragma clang diagnostic ignored "-Wunreachable-code"
679632
#endif
680-
if (hasCompatibleEncoding(other))
633+
if (__builtin_is_constant_evaluated())
681634
{
682-
if (textSizeInBytes != other.textSizeInBytes)
635+
if (not hasCompatibleEncoding(other))
683636
return false;
684-
if (__builtin_is_constant_evaluated())
685-
{
686-
auto it1 = text;
687-
auto it2 = other.text;
688-
auto sz = textSizeInBytes;
689-
for (size_t idx = 0; idx < sz; ++idx)
690-
if (it1[idx] != it2[idx])
691-
return false;
692-
}
693-
else
694-
{
695-
if (text == nullptr)
696-
{
697-
return other.textSizeInBytes == 0;
698-
}
699-
if (other.text == nullptr)
700-
{
701-
return textSizeInBytes == 0;
702-
}
703-
return memcmp(text, other.text, textSizeInBytes) == 0;
704-
}
637+
auto it1 = text;
638+
auto it2 = other.text;
639+
auto sz = textSizeInBytes;
640+
for (size_t idx = 0; idx < sz; ++idx)
641+
if (it1[idx] != it2[idx])
642+
return false;
643+
return true;
644+
}
645+
else
646+
{
647+
return StringSpan::operator==(other);
705648
}
706-
size_t commonOverlappingPoints = 0;
707-
return fullyOverlaps(other, commonOverlappingPoints);
708649
#if defined(__clang__)
709650
#pragma clang diagnostic pop
710651
#endif

0 commit comments

Comments
 (0)