diff --git a/Pluto.vcxproj b/Pluto.vcxproj index 26a3ab54c..34f2550ac 100644 --- a/Pluto.vcxproj +++ b/Pluto.vcxproj @@ -662,6 +662,7 @@ + @@ -762,6 +763,8 @@ + + @@ -835,6 +838,7 @@ + @@ -920,11 +924,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Pluto.vcxproj.filters b/Pluto.vcxproj.filters index 6467541fe..0ee1edbcf 100644 --- a/Pluto.vcxproj.filters +++ b/Pluto.vcxproj.filters @@ -298,6 +298,13 @@ vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + @@ -846,6 +853,99 @@ vendor\Soup\soup + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + + + vendor\Soup\soup + @@ -861,4 +961,4 @@ {f0adda43-f311-40e5-b4ec-284f248bad46} - \ No newline at end of file + diff --git a/src/Makefile b/src/Makefile index 870a292b3..10462c07f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -40,7 +40,7 @@ PLATS= guess aix bsd freebsd generic linux linux-readline macosx posix solaris LUA_A= libplutostatic.a LUA_SO= libpluto.so CORE_O= lapi.o lcode.o lctype.o ldebug.o ldo.o ldump.o lfunc.o lgc.o llex.o lmem.o lobject.o lopcodes.o lparser.o lstate.o lstring.o ltable.o ltm.o lundump.o lvm.o lzio.o -LIB_O= lauxlib.o lbaselib.o lcorolib.o ldblib.o liolib.o lmathlib.o loadlib.o loslib.o lstrlib.o lcryptolib.o ltablib.o lutf8lib.o lassertlib.o lvector3lib.o lbase32.o lbase64.o ljson.o lurllib.o linit.o lstarlib.o lcatlib.o 
lhttplib.o lschedulerlib.o lsocketlib.o lbigint.o lxml.o +LIB_O= lauxlib.o lbaselib.o lcorolib.o ldblib.o liolib.o lmathlib.o loadlib.o loslib.o lstrlib.o lcryptolib.o ltablib.o lutf8lib.o lassertlib.o lvector3lib.o lbase32.o lbase64.o ljson.o lurllib.o linit.o lstarlib.o lcatlib.o lhttplib.o lschedulerlib.o lsocketlib.o lbigint.o lxml.o lregex.o BASE_O= $(CORE_O) $(LIB_O) $(MYOBJS) LUA_T= pluto diff --git a/src/lregex.cpp b/src/lregex.cpp new file mode 100644 index 000000000..07c931565 --- /dev/null +++ b/src/lregex.cpp @@ -0,0 +1,56 @@ +#define LUA_LIB +#include "lualib.h" + +#include "vendor/Soup/soup/Regex.hpp" + +static soup::Regex* checkregex (lua_State *L, int i) { + return (soup::Regex*)luaL_checkudata(L, i, "pluto:regex"); +} + +static int regex_new (lua_State *L) { + new (lua_newuserdata(L, sizeof(soup::Regex))) soup::Regex{ soup::Regex::fromFullString(pluto_checkstring(L, 1)) }; + if (luaL_newmetatable(L, "pluto:regex")) { + lua_pushliteral(L, "__index"); + luaL_loadbuffer(L, "return require\"pluto:regex\"", 27, 0); + lua_call(L, 0, 1); + lua_settable(L, -3); + lua_pushliteral(L, "__gc"); + lua_pushcfunction(L, [](lua_State *L) { + std::destroy_at<>(checkregex(L, 1)); + return 0; + }); + lua_settable(L, -3); + } + lua_setmetatable(L, -2); + return 1; +} + +static int regex_match (lua_State *L) { + size_t len; + const char *str = luaL_checklstring(L, 2, &len); + auto res = checkregex(L, 1)->match(str, str + len); + if (res.isSuccess()) { + lua_newtable(L); + for (size_t i = 0; i != res.groups.size(); ++i) { + if (res.groups[i].has_value()) { + if (res.groups[i]->name.empty()) + lua_pushinteger(L, i); + else + pluto_pushstring(L, res.groups[i]->name); + lua_pushlstring(L, res.groups[i]->begin, res.groups[i]->length()); + lua_settable(L, -3); + } + } + } + else { + luaL_pushfail(L); + } + return 1; +} + +static const luaL_Reg funcs_regex[] = { + {"new", regex_new}, + {"match", regex_match}, + {nullptr, nullptr} +}; +PLUTO_NEWLIB(regex); diff --git 
a/src/lualib.h b/src/lualib.h index b95c31a7b..60d28f988 100644 --- a/src/lualib.h +++ b/src/lualib.h @@ -59,6 +59,7 @@ namespace Pluto { #endif extern const PreloadedLibrary preloaded_bigint; extern const PreloadedLibrary preloaded_xml; + extern const PreloadedLibrary preloaded_regex; inline const PreloadedLibrary* const all_preloaded[] = { &preloaded_crypto, @@ -77,6 +78,7 @@ namespace Pluto { #endif &preloaded_bigint, &preloaded_xml, + &preloaded_regex, }; } @@ -96,6 +98,7 @@ LUAMOD_API int (luaopen_socket) (lua_State *L); #endif LUAMOD_API int (luaopen_bigint) (lua_State *L); LUAMOD_API int (luaopen_xml) (lua_State *L); +LUAMOD_API int (luaopen_regex) (lua_State *L); /* open all previous libraries */ LUALIB_API void (luaL_openlibs) (lua_State *L); diff --git a/src/vendor/Soup/soup/BigBitset.hpp b/src/vendor/Soup/soup/BigBitset.hpp new file mode 100644 index 000000000..08f48583c --- /dev/null +++ b/src/vendor/Soup/soup/BigBitset.hpp @@ -0,0 +1,69 @@ +#pragma once + +#include +#include // memcpy + +#include "base.hpp" + +NAMESPACE_SOUP +{ +#pragma pack(push, 1) + template + struct BigBitset + { + uint8_t data[Bytes]{}; + + BigBitset() = default; + + BigBitset(const BigBitset& b) + { + memcpy(data, b.data, sizeof(data)); + } + + [[nodiscard]] static BigBitset* at(void* dp) noexcept + { + return reinterpret_cast*>(dp); + } + + [[nodiscard]] static const BigBitset* at(const void* dp) noexcept + { + return reinterpret_cast*>(dp); + } + + [[nodiscard]] constexpr bool get(const size_t i) const noexcept + { + const auto j = (i / 8); + const auto k = (i % 8); + + return (data[j] >> k) & 1; + } + + constexpr void set(const size_t i, const bool v) noexcept + { + const auto j = (i / 8); + const auto k = (i % 8); + + const uint8_t mask = (1 << k); + + data[j] &= ~mask; + data[j] |= (mask * v); + } + + constexpr void enable(const size_t i) noexcept + { + const auto j = (i / 8); + const auto k = (i % 8); + + data[j] |= (1 << k); + } + + constexpr void disable(const size_t i) 
noexcept + { + const auto j = (i / 8); + const auto k = (i % 8); + + data[j] &= ~(1 << k); + } + }; +#pragma pack(pop) +} diff --git a/src/vendor/Soup/soup/Makefile b/src/vendor/Soup/soup/Makefile index 8b81d7c83..c946a52a3 100644 --- a/src/vendor/Soup/soup/Makefile +++ b/src/vendor/Soup/soup/Makefile @@ -3,7 +3,7 @@ CFLAGS=-c -Wall -DSOUP_USE_INTRIN LIBNAME=libsoup.a # echo $(ls *.cpp | sed 's/.cpp/.o/g') -OBJS=adler32.o aes.o alloc.o Asn1Identifier.o Asn1Sequence.o base32.o base64.o base.o Bigint.o Capture.o cat.o CpuInfo.o crc32.o Curve25519.o deflate.o DetachedScheduler.o dnsHttpResolver.o dnsName.o dnsRawResolver.o dnsSmartResolver.o dnsUdpResolver.o dns_records.o dnsResolver.o ecc.o filesystem.o HttpRequest.o HttpRequestTask.o IpAddr.o joaat.o JsonArray.o JsonBool.o json.o JsonFloat.o JsonInt.o JsonNode.o JsonNull.o JsonObject.o JsonString.o log.o MimeMessage.o netConfig.o netConnectTask.o netStatus.o Oid.o pem.o Promise.o rand.o rsa.o Scheduler.o SelfDeletingThread.o sha1.o sha256.o sha384.o sha512.o Socket.o SocketTlsEncrypter.o SocketTlsHandshaker.o spaceship.o string.o Task.o Thread.o time.o TrustStore.o unicode.o Uri.o urlenc.o version_compare.o Worker.o X509Certchain.o X509Certificate.o X509RelativeDistinguishedName.o xml.o Reader.o Writer.o DefaultRngInterface.o HardwareRng.o Server.o os.o +OBJS=adler32.o aes.o alloc.o Asn1Identifier.o Asn1Sequence.o base32.o base64.o base.o Bigint.o Capture.o cat.o CpuInfo.o crc32.o Curve25519.o deflate.o DetachedScheduler.o dnsHttpResolver.o dnsName.o dnsRawResolver.o dnsSmartResolver.o dnsUdpResolver.o dns_records.o dnsResolver.o ecc.o filesystem.o HttpRequest.o HttpRequestTask.o IpAddr.o joaat.o JsonArray.o JsonBool.o json.o JsonFloat.o JsonInt.o JsonNode.o JsonNull.o JsonObject.o JsonString.o log.o MimeMessage.o netConfig.o netConnectTask.o netStatus.o Oid.o pem.o Promise.o rand.o rsa.o Scheduler.o SelfDeletingThread.o sha1.o sha256.o sha384.o sha512.o Socket.o SocketTlsEncrypter.o SocketTlsHandshaker.o 
spaceship.o string.o Task.o Thread.o time.o TrustStore.o unicode.o Uri.o urlenc.o version_compare.o Worker.o X509Certchain.o X509Certificate.o X509RelativeDistinguishedName.o xml.o Reader.o Writer.o DefaultRngInterface.o HardwareRng.o Server.o os.o Regex.o RegexGroup.o all: $(LIBNAME) diff --git a/src/vendor/Soup/soup/PointerAndBool.hpp b/src/vendor/Soup/soup/PointerAndBool.hpp new file mode 100644 index 000000000..7a370455e --- /dev/null +++ b/src/vendor/Soup/soup/PointerAndBool.hpp @@ -0,0 +1,76 @@ +#pragma once + +#include "base.hpp" +#include "type_traits.hpp" + +NAMESPACE_SOUP +{ + template )> + class PointerAndBool + { + private: + uintptr_t data; + + public: + PointerAndBool(T ptr) + : data(reinterpret_cast(ptr)) + { + //SOUP_ASSERT((data & 1) == 0); + } + + PointerAndBool(T ptr, bool b) + : data(reinterpret_cast(ptr)) + { + //SOUP_ASSERT((data & 1) == 0); + data |= (uintptr_t)b; + } + + [[nodiscard]] T getPointer() const noexcept + { + return reinterpret_cast(data & ~(uintptr_t)1); + } + + [[nodiscard]] bool getBool() const noexcept + { + return data & 1; + } + + void setBool(bool b) noexcept + { + data &= ~static_cast(1); + data |= static_cast(b); + } + + void set(T ptr, bool b) + { + data = reinterpret_cast(ptr); + //SOUP_ASSERT((data & 1) == 0); + data |= static_cast(b); + } + + operator T() const noexcept + { + return getPointer(); + } + + [[nodiscard]] std::remove_pointer_t& operator*() const noexcept + { + return *getPointer(); + } + + [[nodiscard]] T operator->() const noexcept + { + return getPointer(); + } + + [[nodiscard]] bool operator==(T b) const noexcept + { + return getPointer() == b; + } + + [[nodiscard]] bool operator!=(T b) const noexcept + { + return !operator==(b); + } + }; +} diff --git a/src/vendor/Soup/soup/Regex.cpp b/src/vendor/Soup/soup/Regex.cpp new file mode 100644 index 000000000..05dd4aadc --- /dev/null +++ b/src/vendor/Soup/soup/Regex.cpp @@ -0,0 +1,387 @@ +#include "Regex.hpp" + +#include +#include + +#include "base.hpp" 
+#include "RegexConstraint.hpp" +#include "RegexMatcher.hpp" +#include "string.hpp" + +#define REGEX_DEBUG_MATCH false + +#if REGEX_DEBUG_MATCH +#include +#endif + +NAMESPACE_SOUP +{ + Regex Regex::fromFullString(const std::string& str) + { + if (str.length() >= 2) + { + const char c = str.at(0); + const auto i = str.find_last_of(c); + if (i > 0) + { + return Regex(str.c_str() + 1, str.c_str() + i, parseFlags(str.c_str() + i + 1)); + } + } + return {}; + } + + bool Regex::matches(const std::string& str) const noexcept + { + return matches(str.data(), &str.data()[str.size()]); + } + + bool Regex::matches(const char* it, const char* end) const noexcept + { + return match(it, end).isSuccess(); + } + + bool Regex::matchesFully(const std::string& str) const noexcept + { + return matchesFully(str.data(), &str.data()[str.size()]); + } + + bool Regex::matchesFully(const char* it, const char* end) const noexcept + { + auto res = match(it, end); + if (res.isSuccess()) + { + return res.groups.at(0)->end == end; + } + return false; + } + + RegexMatchResult Regex::match(const std::string& str) const noexcept + { + return match(str.data(), &str.data()[str.size()]); + } + + RegexMatchResult Regex::match(const char* it, const char* end) const noexcept + { + return match(it, it, end); + } + + RegexMatchResult Regex::match(const char* it, const char* begin, const char* end) const noexcept + { + RegexMatcher m(*this, begin, end); + return match(m, it); + } + + RegexMatchResult Regex::match(RegexMatcher& m, const char* it) const noexcept + { + const auto match_begin = it; + m.it = it; + SOUP_IF_UNLIKELY (m.shouldSaveCheckpoint()) + { + m.saveCheckpoint(); + } + SOUP_ASSERT(!m.shouldResetCapture()); + bool reset_capture = false; + while (m.c != nullptr) + { +#if REGEX_DEBUG_MATCH + std::cout << m.c->toString(); + if (m.c->group) + { + std::cout << " (g " << m.c->group->index << ")"; + } + std::cout << ": "; +#endif + + m.insertMissingCapturingGroups(m.c->group); + + if 
(m.c->rollback_transition) + { +#if REGEX_DEBUG_MATCH + std::cout << "saved rollback; "; +#endif + m.saveRollback(m.c->rollback_transition); + } + + if (reset_capture) + { + reset_capture = false; +#if REGEX_DEBUG_MATCH + std::cout << "reset capture for group " << m.c->getGroupCaturedWithin()->index << "; "; +#endif + m.result.groups.at(m.c->getGroupCaturedWithin()->index)->begin = m.it; + } + + // Matches? + if (m.c->matches(m)) + { + // Update 'end' of applicable capturing groups + for (auto g = m.c->group; g; g = g->parent) + { + if (g->lookahead_or_lookbehind) + { + break; + } + if (g->isNonCapturing()) + { + continue; + } + m.result.groups.at(g->index)->end = m.it; + } + + m.c = m.c->success_transition; + if (m.shouldSaveCheckpoint()) + { +#if REGEX_DEBUG_MATCH + std::cout << "saved checkpoint; "; +#endif + m.saveCheckpoint(); + } + reset_capture = m.shouldResetCapture(); + if (m.c != RegexConstraint::SUCCESS_TO_FAIL) + { +#if REGEX_DEBUG_MATCH + std::cout << "matched\n"; +#endif + continue; + } +#if REGEX_DEBUG_MATCH + std::cout << "matched into a snafu"; +#endif + if (!m.rollback_points.empty()) + { + m.rollback_points.pop_back(); + } + } +#if REGEX_DEBUG_MATCH + else + { + std::cout << "did not match"; + } +#endif + + // Rollback? 
+ if (!m.rollback_points.empty()) + { +#if REGEX_DEBUG_MATCH + std::cout << "; rolling back\n"; +#endif + m.restoreRollback(); + SOUP_ASSERT(!m.shouldSaveCheckpoint()); + reset_capture = m.shouldResetCapture(); + if (m.c == RegexConstraint::ROLLBACK_TO_SUCCESS) + { +#if REGEX_DEBUG_MATCH + std::cout << "rollback says we should succeed now\n"; +#endif + break; + } + continue; + } + + // Oh well +#if REGEX_DEBUG_MATCH + std::cout << "\n"; +#endif + return {}; + } + + // Handle match of regex without capturing groups + SOUP_IF_UNLIKELY (!m.result.isSuccess()) + { + m.result.groups.emplace_back(RegexMatchedGroup{ {}, match_begin, m.it }); + } + + SOUP_ASSERT(m.checkpoints.empty()); // if we made a checkpoint for a lookahead group, it should have been restored. + + SOUP_MOVE_RETURN(m.result); + } + + RegexMatchResult Regex::search(const std::string& str) const noexcept + { + return search(str.data(), &str.data()[str.size()]); + } + + RegexMatchResult Regex::search(const char* it, const char* end) const noexcept + { + RegexMatcher m(*this, it, end); + for (; it != end; ++it) + { +#if REGEX_DEBUG_MATCH + std::cout << "--- Attempting match with " << std::distance(m.begin, it) << " byte offset ---\r\n"; +#endif + auto res = match(m, it); + if (res.isSuccess()) + { + return res; + } + m.reset(*this); + } + return {}; + } + + void Regex::replaceAll(std::string& str, const std::string& replacement) const + { + RegexMatchResult match; + while (match = search(str), match.isSuccess()) + { + const size_t offset = (match.groups.at(0).value().begin - str.data()); + str.erase(offset, match.length()); + str.insert(offset, replacement); + } + } + + std::string Regex::unparseFlags(uint16_t flags) + { + std::string str{}; + if (flags & RE_MULTILINE) + { + str.push_back('m'); + } + if (flags & RE_DOTALL) + { + str.push_back('s'); + } + if (flags & RE_INSENSITIVE) + { + str.push_back('i'); + } + if (flags & RE_EXTENDED) + { + str.push_back('x'); + } + if (flags & RE_UNICODE) + { + 
str.push_back('u');
+        }
+        if (flags & RE_UNGREEDY)
+        {
+            str.push_back('U');
+        }
+        if (flags & RE_DOLLAR_ENDONLY)
+        {
+            str.push_back('D');
+        }
+        if (flags & RE_EXPLICIT_CAPTURE)
+        {
+            str.push_back('n');
+        }
+        return str;
+    }
+
+    [[nodiscard]] static std::string node_to_graphviz_dot_string(const RegexConstraint* node) // escaped node label: the constraint's text (or "dummy") plus its address
+    {
+        std::stringstream ss;
+        if (auto str = node->toString(); !str.empty())
+        {
+            ss << std::move(str);
+        }
+        else
+        {
+            ss << "dummy";
+        }
+        ss << " (";
+        ss << (void*)node; // the address keeps labels of textually equal constraints distinct
+        ss << ')';
+
+        return string::escape(ss.str());
+    }
+
+    // NOTE(review): the element type of the set and the reinterpret_cast targets below were lost in extraction; reconstructed from the `const RegexConstraint* node` that is stored in the set.
+    static void add_success_node(std::stringstream& ss, std::unordered_set<const RegexConstraint*>& mapped_nodes)
+    {
+        if (mapped_nodes.count(reinterpret_cast<const RegexConstraint*>(1)) == 0) // pseudo-address 1 records that "success" was already emitted
+        {
+            mapped_nodes.emplace(reinterpret_cast<const RegexConstraint*>(1));
+            ss << R"("success" [shape="diamond"];)" << '\n';
+        }
+    }
+
+    static void add_fail_node(std::stringstream& ss, std::unordered_set<const RegexConstraint*>& mapped_nodes)
+    {
+        if (mapped_nodes.count(reinterpret_cast<const RegexConstraint*>(2)) == 0) // pseudo-address 2 records that "fail" was already emitted
+        {
+            mapped_nodes.emplace(reinterpret_cast<const RegexConstraint*>(2));
+            ss << R"("fail" [shape="diamond"];)" << '\n';
+        }
+    }
+
+    static void node_to_graphviz_dot(std::stringstream& ss, std::unordered_set<const RegexConstraint*>& mapped_nodes, const RegexConstraint* node) // emits `node` and, recursively, everything reachable from it
+    {
+        if (mapped_nodes.count(node) != 0) // already emitted; also stops recursion on cycles
+        {
+            return;
+        }
+        mapped_nodes.emplace(node);
+
+        ss << node_to_graphviz_dot_string(node);
+        ss << R"( [shape="rect"];)";
+        ss << '\n';
+
+        if (node->getSuccessTransition() == nullptr)
+        {
+            add_success_node(ss, mapped_nodes);
+
+            ss << node_to_graphviz_dot_string(node);
+            ss << " -> ";
+            ss << R"("success")";
+            ss << R"( [label="success"];)";
+            ss << '\n';
+        }
+        else if (node->getSuccessTransition() == RegexConstraint::SUCCESS_TO_FAIL)
+        {
+            add_fail_node(ss, mapped_nodes);
+
+            ss << node_to_graphviz_dot_string(node);
+            ss << " -> ";
+            ss << R"("fail")";
+            ss << R"( [label="success"];)";
+            ss << '\n';
+        }
+        else
+        {
+            node_to_graphviz_dot(ss, mapped_nodes, node->getSuccessTransition());
+
+            ss << node_to_graphviz_dot_string(node);
+            ss << " -> ";
+            ss << 
node_to_graphviz_dot_string(node->getSuccessTransition()); + ss << R"( [label="success"];)"; + ss << '\n'; + } + + if (node->getRollbackTransition() != nullptr) + { + if (node->getRollbackTransition() == RegexConstraint::ROLLBACK_TO_SUCCESS) + { + add_success_node(ss, mapped_nodes); + + ss << node_to_graphviz_dot_string(node); + ss << " -> "; + ss << R"("success")"; + ss << R"( [label="rollback"];)"; + ss << '\n'; + } + else + { + node_to_graphviz_dot(ss, mapped_nodes, node->getRollbackTransition()); + + ss << node_to_graphviz_dot_string(node); + ss << " -> "; + ss << node_to_graphviz_dot_string(node->getRollbackTransition()); + ss << R"( [label="rollback"];)"; + ss << '\n'; + } + } + } + + std::string Regex::toGraphvizDot() const SOUP_EXCAL + { + std::stringstream ss; + std::unordered_set mapped_nodes{}; + + ss << "digraph {\n"; + ss << "label=" << string::escape(toFullString()) << ";\n"; + node_to_graphviz_dot(ss, mapped_nodes, reinterpret_cast(reinterpret_cast(group.initial) & ~1)); + ss << '}'; + + return ss.str(); + } +} diff --git a/src/vendor/Soup/soup/Regex.hpp b/src/vendor/Soup/soup/Regex.hpp new file mode 100644 index 000000000..137ad373f --- /dev/null +++ b/src/vendor/Soup/soup/Regex.hpp @@ -0,0 +1,132 @@ +#pragma once + +#include "RegexFlags.hpp" +#include "RegexGroup.hpp" +#include "RegexMatchResult.hpp" + +NAMESPACE_SOUP +{ + struct Regex + { + RegexGroup group; + + Regex(const std::string& pattern, const char* flags) + : Regex(pattern.data(), &pattern.data()[pattern.size()], parseFlags(flags)) + { + } + + Regex(const std::string& pattern, uint16_t flags = 0) + : Regex(pattern.data(), &pattern.data()[pattern.size()], flags) + { + } + + Regex(const char* it, const char* end, uint16_t flags) + : group(it, end, flags) + { + } + + Regex(const Regex& b) + : Regex(b.toString(), b.getFlags()) + { + } + + Regex() = default; + Regex(Regex&&) = default; + + [[nodiscard]] static Regex fromFullString(const std::string& str); + + [[nodiscard]] bool matches(const 
std::string& str) const noexcept; + [[nodiscard]] bool matches(const char* it, const char* end) const noexcept; + + [[nodiscard]] bool matchesFully(const std::string& str) const noexcept; + [[nodiscard]] bool matchesFully(const char* it, const char* end) const noexcept; + + [[nodiscard]] RegexMatchResult match(const std::string& str) const noexcept; + [[nodiscard]] RegexMatchResult match(const char* it, const char* end) const noexcept; + [[nodiscard]] RegexMatchResult match(const char* it, const char* begin, const char* end) const noexcept; + [[nodiscard]] RegexMatchResult match(RegexMatcher& m, const char* it) const noexcept; + + [[nodiscard]] RegexMatchResult search(const std::string& str) const noexcept; + [[nodiscard]] RegexMatchResult search(const char* it, const char* end) const noexcept; + + void replaceAll(std::string& str, const std::string& replacement) const; + + [[nodiscard]] std::string toString() const SOUP_EXCAL + { + return group.toString(); + } + + [[nodiscard]] std::string toFullString() const SOUP_EXCAL + { + std::string str(1, '/'); + str.append(toString()); + str.push_back('/'); + str.append(getFlagsString()); + return str; + } + + [[nodiscard]] uint16_t getFlags() const noexcept + { + return group.getFlags(); + } + + [[nodiscard]] std::string getFlagsString() const noexcept + { + return unparseFlags(group.getFlags()); + } + + [[nodiscard]] static constexpr uint16_t parseFlags(const char* flags) + { + uint16_t res = 0; + for (; *flags != '\0'; ++flags) + { + if (*flags == 'm') + { + res |= RE_MULTILINE; + } + else if (*flags == 's') + { + res |= RE_DOTALL; + } + else if (*flags == 'i') + { + res |= RE_INSENSITIVE; + } + else if (*flags == 'x') + { + res |= RE_EXTENDED; + } + else if (*flags == 'u') + { + res |= RE_UNICODE; + } + else if (*flags == 'U') + { + res |= RE_UNGREEDY; + } + else if (*flags == 'D') + { + res |= RE_DOLLAR_ENDONLY; + } + else if (*flags == 'n') + { + res |= RE_EXPLICIT_CAPTURE; + } + } + return res; + } + + [[nodiscard]] 
static std::string unparseFlags(uint16_t flags); + + // Result can be used with 'dot' via CLI to produce an image, or an online viewer such as https://dreampuf.github.io/GraphvizOnline/ + [[nodiscard]] std::string toGraphvizDot() const SOUP_EXCAL; + }; + + namespace literals + { + inline Regex operator ""_r(const char* str, size_t len) + { + return Regex(std::string(str, len)); + } + } +} diff --git a/src/vendor/Soup/soup/RegexAlternative.hpp b/src/vendor/Soup/soup/RegexAlternative.hpp new file mode 100644 index 000000000..c3d13d8a5 --- /dev/null +++ b/src/vendor/Soup/soup/RegexAlternative.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include + +#include "UniquePtr.hpp" +#include "RegexConstraint.hpp" + +NAMESPACE_SOUP +{ + struct RegexAlternative + { + std::vector> constraints{}; + }; +} diff --git a/src/vendor/Soup/soup/RegexAnyCharConstraint.hpp b/src/vendor/Soup/soup/RegexAnyCharConstraint.hpp new file mode 100644 index 000000000..67e32f79a --- /dev/null +++ b/src/vendor/Soup/soup/RegexAnyCharConstraint.hpp @@ -0,0 +1,74 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include "RegexMatcher.hpp" + +NAMESPACE_SOUP +{ + template + struct RegexAnyCharConstraint : public RegexConstraint + { + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + if (m.it == m.end) + { + return false; + } + if constexpr (!dotall) + { + if (*m.it == '\n') + { + return false; + } + } + if constexpr (unicode) + { + unicode::utf8_add(m.it, m.end); + } + else + { + ++m.it; + } + return true; + } + + [[nodiscard]] std::string toString() const noexcept final + { + return "."; + } + + void getFlags(uint16_t& set, uint16_t& unset) const noexcept final + { + if constexpr (dotall) + { + set |= RE_DOTALL; + } + else + { + unset |= RE_DOTALL; + } + if constexpr (unicode) + { + set |= RE_UNICODE; + } + else + { + unset |= RE_UNICODE; + } + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return 1; + } + + [[nodiscard]] UniquePtr clone(RegexTransitionsVector& 
success_transitions) const final + { + auto cc = soup::make_unique(); + success_transitions.setTransitionTo(cc->getEntrypoint()); + success_transitions.emplace(&cc->success_transition); + return cc; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexCharConstraint.hpp b/src/vendor/Soup/soup/RegexCharConstraint.hpp new file mode 100644 index 000000000..99ca57ec1 --- /dev/null +++ b/src/vendor/Soup/soup/RegexCharConstraint.hpp @@ -0,0 +1,66 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include "RegexMatcher.hpp" + +NAMESPACE_SOUP +{ + struct RegexCharConstraint : public RegexConstraint + { + char c; + + RegexCharConstraint(char c) + : c(c) + { + } + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + if (m.it == m.end) + { + return false; + } + if (*m.it != c) + { + return false; + } + ++m.it; + return true; + } + + [[nodiscard]] std::string toString() const noexcept final + { + std::string str(1, c); + switch (c) + { + case '\\': + case '|': + case '(': + case ')': + case '?': + case '+': + case '*': + case '.': + case '^': + case '$': + str.insert(0, 1, '\\'); + break; + } + return str; + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return 1; + } + + [[nodiscard]] UniquePtr clone(RegexTransitionsVector& success_transitions) const final + { + auto cc = soup::make_unique(c); + success_transitions.setTransitionTo(cc->getEntrypoint()); + success_transitions.emplace(&cc->success_transition); + return cc; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexCodepointConstraint.hpp b/src/vendor/Soup/soup/RegexCodepointConstraint.hpp new file mode 100644 index 000000000..f6fc469fd --- /dev/null +++ b/src/vendor/Soup/soup/RegexCodepointConstraint.hpp @@ -0,0 +1,58 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include "RegexMatcher.hpp" + +NAMESPACE_SOUP +{ + struct RegexCodepointConstraint : public RegexConstraint + { + std::string c; + + RegexCodepointConstraint(std::string c) + : c(std::move(c)) + { + } + + 
[[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + if (static_cast(std::distance(m.it, m.end)) < c.size()) + { + return false; + } + for (size_t i = 0; i != c.size(); ++i) + { + if (m.it[i] != c[i]) + { + return false; + } + } + m.it += c.size(); + return true; + } + + [[nodiscard]] std::string toString() const noexcept final + { + return c; + } + + void getFlags(uint16_t& set, uint16_t& unset) const noexcept final + { + set |= RE_UNICODE; + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return 1; + } + + [[nodiscard]] UniquePtr clone(RegexTransitionsVector& success_transitions) const final + { + auto cc = soup::make_unique(c); + success_transitions.setTransitionTo(cc->getEntrypoint()); + success_transitions.emplace(&cc->success_transition); + return cc; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexConstraint.hpp b/src/vendor/Soup/soup/RegexConstraint.hpp new file mode 100644 index 000000000..06246f0fd --- /dev/null +++ b/src/vendor/Soup/soup/RegexConstraint.hpp @@ -0,0 +1,72 @@ +#pragma once + +#include + +#include "fwd.hpp" + +#include "Exception.hpp" + +NAMESPACE_SOUP +{ + struct RegexConstraint + { + inline static RegexConstraint* SUCCESS_TO_FAIL = reinterpret_cast(0b100); + inline static RegexConstraint* ROLLBACK_TO_SUCCESS = reinterpret_cast(0b100); + inline static uintptr_t MASK = 0b11; + + RegexConstraint* success_transition = nullptr; + RegexConstraint* rollback_transition = nullptr; + const RegexGroup* group = nullptr; + + RegexConstraint() = default; + + RegexConstraint(const RegexConstraint& b) + { + // We want the pointers to be nullptr so transitions are not copied by `clone`. 
+ } + + virtual ~RegexConstraint() = default; + + [[nodiscard]] RegexConstraint* getSuccessTransition() const noexcept + { + return reinterpret_cast(reinterpret_cast(success_transition) & ~MASK); + } + + [[nodiscard]] RegexConstraint* getRollbackTransition() const noexcept + { + return reinterpret_cast(reinterpret_cast(rollback_transition) & ~MASK); + } + + [[nodiscard]] virtual bool shouldResetCapture() const noexcept + { + return false; + } + + // May only modify `m.it` and only if the constraint matches. + [[nodiscard]] virtual bool matches(RegexMatcher& m) const noexcept = 0; + + [[nodiscard]] virtual RegexConstraint* getEntrypoint() noexcept + { + return this; + } + + [[nodiscard]] virtual const RegexGroup* getGroupCaturedWithin() const noexcept + { + return group; + } + + [[nodiscard]] virtual size_t getCursorAdvancement() const + { + SOUP_THROW(Exception("Constraint is not fixed-width")); + } + + [[nodiscard]] virtual UniquePtr clone(RegexTransitionsVector& success_transitions) const + { + SOUP_THROW(Exception("Constraint is not clonable")); + } + + [[nodiscard]] virtual std::string toString() const noexcept = 0; + + virtual void getFlags(uint16_t& set, uint16_t& unset) const noexcept {} + }; +} diff --git a/src/vendor/Soup/soup/RegexConstraintLookbehind.hpp b/src/vendor/Soup/soup/RegexConstraintLookbehind.hpp new file mode 100644 index 000000000..88c8f49f7 --- /dev/null +++ b/src/vendor/Soup/soup/RegexConstraintLookbehind.hpp @@ -0,0 +1,70 @@ +#pragma once + +#include "RegexConstraint.hpp" + +NAMESPACE_SOUP +{ + struct RegexConstraintLookbehind : public RegexConstraint + { + RegexGroup group; + size_t window; + + RegexConstraintLookbehind(const RegexGroup::ConstructorState& s) + : group(s, true) + { + } + + [[nodiscard]] RegexConstraint* getEntrypoint() noexcept final + { + return group.initial; + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return 0; + } + }; + + template + struct RegexConstraintLookbehindImpl : public 
RegexConstraintLookbehind + { + using RegexConstraintLookbehind::RegexConstraintLookbehind; + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + if constexpr (unicode) + { + for (size_t i = 0; i != window; ++i) + { + if (m.begin == m.it) + { + return false; + } + if (UTF8_IS_CONTINUATION(*m.it)) + { + return false; + } + unicode::utf8_sub(m.it, m.begin); + } + } + else + { + if (static_cast(std::distance(m.begin, m.it)) < window) + { + return false; + } + m.it -= window; + } + return true; + } + + void getFlags(uint16_t& set, uint16_t& unset) const noexcept final + { + group.getFlags(set, unset); + if constexpr (unicode) + { + set |= RE_UNICODE; + } + } + }; +} diff --git a/src/vendor/Soup/soup/RegexDummyConstraint.hpp b/src/vendor/Soup/soup/RegexDummyConstraint.hpp new file mode 100644 index 000000000..216ff3d12 --- /dev/null +++ b/src/vendor/Soup/soup/RegexDummyConstraint.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "RegexConstraint.hpp" + +NAMESPACE_SOUP +{ + struct RegexDummyConstraint : public RegexConstraint + { + using RegexConstraint::RegexConstraint; + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return 0; + } + + [[nodiscard]] std::string toString() const noexcept final + { + return {}; + } + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + return true; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexEndConstraint.hpp b/src/vendor/Soup/soup/RegexEndConstraint.hpp new file mode 100644 index 000000000..26a4eef02 --- /dev/null +++ b/src/vendor/Soup/soup/RegexEndConstraint.hpp @@ -0,0 +1,73 @@ +#pragma once + +#include "RegexConstraint.hpp" + +NAMESPACE_SOUP +{ + template + struct RegexEndConstraint : public RegexConstraint + { + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + if (m.it == m.end) + { + return true; + } + if constexpr (multi_line) + { + if (*m.it == '\n') + { + return true; + } + } + else if constexpr (!end_only) + { + if ((m.it + 1) == m.end + && *m.it 
== '\n' + ) + { + return true; + } + } + return false; + } + + [[nodiscard]] std::string toString() const noexcept final + { + if constexpr (escape_sequence) + { + static_assert(multi_line == false); + return end_only ? "\\z" : "\\Z"; + } + return "$"; + } + + void getFlags(uint16_t& set, uint16_t& unset) const noexcept final + { + if constexpr (!escape_sequence) + { + if constexpr (multi_line) + { + set |= RE_MULTILINE; + } + else + { + unset |= RE_MULTILINE; + } + if constexpr (end_only) + { + set |= RE_DOLLAR_ENDONLY; + } + else + { + unset |= RE_DOLLAR_ENDONLY; + } + } + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return 0; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexExactQuantifierConstraint.hpp b/src/vendor/Soup/soup/RegexExactQuantifierConstraint.hpp new file mode 100644 index 000000000..7a65f30f6 --- /dev/null +++ b/src/vendor/Soup/soup/RegexExactQuantifierConstraint.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include + +#include "UniquePtr.hpp" + +NAMESPACE_SOUP +{ + struct RegexExactQuantifierConstraint : public RegexConstraint + { + std::vector> constraints; + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + return true; + } + + [[nodiscard]] RegexConstraint* getEntrypoint() noexcept final + { + return constraints.at(0)->getEntrypoint(); + } + + [[nodiscard]] std::string toString() const noexcept final + { + std::string str = constraints.at(0)->toString(); + str.push_back('{'); + str.append(std::to_string(constraints.size())); + str.push_back('}'); + return str; + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return constraints.at(0)->getCursorAdvancement() * constraints.size(); + } + }; +} diff --git a/src/vendor/Soup/soup/RegexFlags.hpp b/src/vendor/Soup/soup/RegexFlags.hpp new file mode 100644 index 000000000..2a2c283ea --- /dev/null +++ b/src/vendor/Soup/soup/RegexFlags.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include + +#include "base.hpp" + 
+NAMESPACE_SOUP +{ + enum RegexFlags : uint16_t + { + RE_MULTILINE = (1 << 0), // 'm' - '^' and '$' also match start and end of lines, respectively + RE_DOTALL = (1 << 1), // 's' - '.' also matches '\n' + RE_INSENSITIVE = (1 << 2), // 'i' - case insensitive match + RE_EXTENDED = (1 << 3), // 'x' - Ignore bare space characters in pattern. '#' signifies begin of line comment. + RE_UNICODE = (1 << 4), // 'u' - Treat pattern and strings-to-match as UTF-8 instead of binary data + RE_UNGREEDY = (1 << 5), // 'U' - Quantifiers become lazy by default and are instead made greedy by a trailing '?' + RE_DOLLAR_ENDONLY = (1 << 6), // 'D' - '$' only matches end of pattern, not '\n' - ignored if multi_line flag is set + RE_EXPLICIT_CAPTURE = (1 << 7), // 'n' - only capture named groups (non-standard flag from .NET/C#) + }; +} diff --git a/src/vendor/Soup/soup/RegexGroup.cpp b/src/vendor/Soup/soup/RegexGroup.cpp new file mode 100644 index 000000000..b77b970a1 --- /dev/null +++ b/src/vendor/Soup/soup/RegexGroup.cpp @@ -0,0 +1,956 @@ +#include "RegexGroup.hpp" + +#include "RegexFlags.hpp" +#include "RegexTransitionsVector.hpp" +#include "string.hpp" +#include "unicode.hpp" + +#include "RegexAnyCharConstraint.hpp" +#include "RegexCharConstraint.hpp" +#include "RegexCodepointConstraint.hpp" +#include "RegexDummyConstraint.hpp" +#include "RegexEndConstraint.hpp" +#include "RegexExactQuantifierConstraint.hpp" +#include "RegexGroupConstraint.hpp" +#include "RegexNegativeLookaheadConstraint.hpp" +#include "RegexNegativeLookbehindConstraint.hpp" +#include "RegexOpenEndedRangeQuantifierConstraint.hpp" +#include "RegexPositiveLookaheadConstraint.hpp" +#include "RegexPositiveLookbehindConstraint.hpp" +#include "RegexOptConstraint.hpp" +#include "RegexRangeQuantifierConstraint.hpp" +#include "RegexRangeConstraint.hpp" +#include "RegexRecallConstraint.hpp" +#include "RegexRepeatConstraint.hpp" +#include "RegexStartConstraint.hpp" +#include "RegexWordBoundaryConstraint.hpp" +#include 
"RegexWordCharConstraint.hpp" + +NAMESPACE_SOUP +{ + static void discharge_alternative(RegexGroup& g, RegexTransitionsVector& success_transitions, RegexAlternative& a) + { + // Ensure all alternatives have at least one constraint so we can set up transitions + if (a.constraints.empty()) + { + auto upC = soup::make_unique(); + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + } + + g.alternatives.emplace_back(std::move(a)); + a.constraints.clear(); + } + + RegexGroup::RegexGroup(const ConstructorState& s, bool non_capturing) + : index(non_capturing ? -1 : s.next_index++) + { + RegexTransitionsVector success_transitions; + success_transitions.data = { &initial }; + + RegexAlternative a{}; + + std::vector alternatives_transitions{}; + + bool escape = false; + for (; s.it != s.end; ++s.it) + { + if (escape) + { + escape = false; + if (*s.it == 'b') + { + auto upC = soup::make_unique>(); + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + continue; + } + else if (*s.it == 'B') + { + auto upC = soup::make_unique>(); + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + continue; + } + else if (*s.it == 'w') + { + auto upC = soup::make_unique>(); + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + continue; + } + else if (*s.it == 'W') + { + auto upC = soup::make_unique>(); + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + continue; + } + else if (*s.it == 'A') + { + auto upC = soup::make_unique>(); + success_transitions.setTransitionTo(upC.get()); + 
success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + continue; + } + else if (*s.it == 'Z') + { + auto upC = soup::make_unique>(); + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + continue; + } + else if (*s.it == 'z') + { + auto upC = soup::make_unique>(); + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + continue; + } + else if (*s.it == 'd') + { + auto upC = soup::make_unique(RegexRangeConstraint::digits); + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + continue; + } + else if (*s.it == 's') + { + auto upC = soup::make_unique(RegexRangeConstraint::whitespace); + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + continue; + } + else if (*s.it == 'k') + { + if (++s.it != s.end) + { + std::string name; + if (*s.it == '<') + { + while (++s.it != s.end && *s.it != '>') + { + name.push_back(*s.it); + } + } + else + { + while (++s.it != s.end && *s.it != '\'') + { + name.push_back(*s.it); + } + } + + auto upC = soup::make_unique(std::move(name)); + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + } + continue; + } + } + else + { + if (*s.it == '\\') + { + if (++s.it != s.end && string::isNumberChar(*s.it)) + { + size_t i = ((*s.it) - '0'); + while (++s.it != s.end && string::isNumberChar(*s.it)) + { + i *= 10; + i += ((*s.it) - '0'); + } + + auto upC = soup::make_unique(i); + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + 
a.constraints.emplace_back(std::move(upC)); + } + else + { + escape = true; + } + --s.it; + continue; + } + else if (*s.it == '|') + { + discharge_alternative(*this, success_transitions, a); + success_transitions.discharge(alternatives_transitions); + continue; + } + else if (*s.it == '(') + { + bool non_capturing = false; + bool positive_lookahead = false; + bool negative_lookahead = false; + bool positive_lookbehind = false; + bool negative_lookbehind = false; + std::string name{}; + std::string inline_modifiers{}; + if (++s.it != s.end && *s.it == '?') + { + while (++s.it != s.end && (*s.it == '-' || string::isLetter(*s.it))) + { + inline_modifiers.push_back(*s.it); + } + if (s.it != s.end) + { + if (*s.it == ':') + { + ++s.it; + non_capturing = true; + } + else if (*s.it == '\'') + { + while (++s.it != s.end && *s.it != '\'') + { + name.push_back(*s.it); + } + if (s.it != s.end) + { + ++s.it; + } + } + else if (*s.it == '=') + { + positive_lookahead = true; + ++s.it; + } + else if (*s.it == '!') + { + negative_lookahead = true; + ++s.it; + } + else if (*s.it == '<') + { + if (++s.it != s.end) + { + if (*s.it == '=') + { + positive_lookbehind = true; + ++s.it; + } + else if (*s.it == '!') + { + negative_lookbehind = true; + ++s.it; + } + else + { + do + { + name.push_back(*s.it); + } while (++s.it != s.end && *s.it != '>'); + if (s.it != s.end) + { + ++s.it; + } + } + } + } + } + } + uint16_t restore_flags = s.flags; + if (!inline_modifiers.empty()) + { + std::string negative_inline_modifiers{}; + auto sep = inline_modifiers.find('-'); + if (sep != std::string::npos) + { + negative_inline_modifiers = inline_modifiers.substr(sep + 1); + inline_modifiers.erase(0, sep + 1); + } + s.flags |= Regex::parseFlags(inline_modifiers.c_str()); + s.flags &= ~Regex::parseFlags(negative_inline_modifiers.c_str()); + + // If non_capturing is true, these are supposed to be localised inline modifers. 
+ if (!non_capturing) + { + // Otherwise, we want to keep them beyond the scope of this group. + restore_flags = s.flags; + + // However, this kind of group should always be non-capturing. + non_capturing = true; + } + } + if (positive_lookahead) + { + auto upGC = soup::make_unique(s); + upGC->group.parent = this; + upGC->group.lookahead_or_lookbehind = true; + + if (upGC->group.initial) + { + // last-constraint --[success]-> first-lookahead-constraint + save checkpoint + success_transitions.setTransitionTo(upGC->group.initial, true); + success_transitions.data = std::move(s.alternatives_transitions); + + // last-lookahead-constraint --[success]-> group (to restore checkpoint) + success_transitions.setTransitionTo(upGC.get()); + + // group --> next-constraint + success_transitions.emplace(&upGC->success_transition); + } + + a.constraints.emplace_back(std::move(upGC)); + } + else if (negative_lookahead) + { + auto upGC = soup::make_unique(s); + upGC->group.parent = this; + upGC->group.lookahead_or_lookbehind = true; + + if (upGC->group.initial) + { + // last-constraint --[success]-> first-lookahead-constraint + success_transitions.setTransitionTo(upGC->group.initial); + success_transitions.data = std::move(s.alternatives_transitions); + } + + // last-lookahead-constraint --[success]-> fail + success_transitions.setTransitionTo(RegexConstraint::SUCCESS_TO_FAIL); + + if (upGC->group.initial) + { + // first-lookahead-constraint --[rollback]-> next-constraint + success_transitions.emplaceRollback(&upGC->group.initial->rollback_transition); + } + + a.constraints.emplace_back(std::move(upGC)); + } + else if (positive_lookbehind) + { + UniquePtr upGC; + if (s.hasFlag(RE_UNICODE)) + { + upGC = soup::make_unique>(s); + } + else + { + upGC = soup::make_unique>(s); + } + upGC->group.parent = this; + upGC->group.lookahead_or_lookbehind = true; + upGC->window = upGC->group.getCursorAdvancement(); + + // last-constraint --[success]-> group (to move cursor) + 
success_transitions.setTransitionTo(upGC.get()); + + // group --> first-lookbehind-constraint + success_transitions.emplace(&upGC->success_transition); + success_transitions.setTransitionTo(upGC->group.initial); + + // last-lookbehind-constraint --[success]-> next-constraint + success_transitions.data = std::move(s.alternatives_transitions); + + a.constraints.emplace_back(std::move(upGC)); + } + else if (negative_lookbehind) + { + UniquePtr upGC; + if (s.hasFlag(RE_UNICODE)) + { + upGC = soup::make_unique>(s); + } + else + { + upGC = soup::make_unique>(s); + } + upGC->group.parent = this; + upGC->group.lookahead_or_lookbehind = true; + upGC->window = upGC->group.getCursorAdvancement(); + + // last-constraint --[success]-> group (to move cursor) + success_transitions.setTransitionTo(upGC.get()); + + // group --> first-lookbehind-constraint + success_transitions.emplace(&upGC->success_transition); + success_transitions.setTransitionTo(upGC->group.initial); + + // last-lookbehind-constraint --[success]-> fail + success_transitions.data = std::move(s.alternatives_transitions); + success_transitions.setTransitionTo(RegexConstraint::SUCCESS_TO_FAIL); + + // group --[rollback]--> next-constraint + success_transitions.emplaceRollback(&upGC->rollback_transition); + + a.constraints.emplace_back(std::move(upGC)); + } + else + { + if (s.hasFlag(RE_EXPLICIT_CAPTURE) && name.empty()) + { + non_capturing = true; + } + if (*s.it == ')' // No contents? + && non_capturing // Not a capturing group? + ) + { + // Don't have to generate anything for this group. 
+ } + else + { + auto upGC = soup::make_unique(s, non_capturing); + upGC->data.parent = this; + upGC->data.name = std::move(name); + success_transitions.setTransitionTo(upGC.get()); + success_transitions.emplace(&upGC->success_transition); + success_transitions.setTransitionTo(upGC->data.initial); + success_transitions.data = std::move(s.alternatives_transitions); + a.constraints.emplace_back(std::move(upGC)); + } + s.flags = restore_flags; + } + if (s.it == s.end) + { + break; + } + continue; + } + else if (*s.it == ')') + { + break; + } + else if (*s.it == '+') + { + bool greedy = true; + if (s.it + 1 != s.end + && *(s.it + 1) == '?' + ) + { + greedy = false; + ++s.it; + } + greedy ^= s.hasFlag(RE_UNGREEDY); + + SOUP_ASSERT(!a.constraints.empty(), "Invalid modifier"); + RegexConstraint* pModifiedConstraint; + UniquePtr upQuantifierConstraint; + if (greedy) + { + UniquePtr upModifiedConstraint = std::move(a.constraints.back()); + pModifiedConstraint = upModifiedConstraint.get(); + upQuantifierConstraint = soup::make_unique>(std::move(upModifiedConstraint)); + static_cast*>(upQuantifierConstraint.get())->setupTransitionsAtLeastOne(success_transitions); + } + else + { + UniquePtr upModifiedConstraint = std::move(a.constraints.back()); + pModifiedConstraint = upModifiedConstraint.get(); + upQuantifierConstraint = soup::make_unique>(std::move(upModifiedConstraint)); + static_cast*>(upQuantifierConstraint.get())->setupTransitionsAtLeastOne(success_transitions); + } + + pModifiedConstraint->group = this; + + a.constraints.back() = std::move(upQuantifierConstraint); + continue; + } + else if (*s.it == '*') + { + bool greedy = true; + if (s.it + 1 != s.end + && *(s.it + 1) == '?' 
+ ) + { + greedy = false; + ++s.it; + } + greedy ^= s.hasFlag(RE_UNGREEDY); + + SOUP_ASSERT(!a.constraints.empty(), "Invalid modifier"); + RegexConstraint* pModifiedConstraint; + UniquePtr upQuantifierConstraint; + if (greedy) + { + UniquePtr upModifiedConstraint = std::move(a.constraints.back()); + pModifiedConstraint = upModifiedConstraint.get(); + upQuantifierConstraint = soup::make_unique>(std::move(upModifiedConstraint)); + } + else + { + UniquePtr upModifiedConstraint = std::move(a.constraints.back()); + pModifiedConstraint = upModifiedConstraint.get(); + upQuantifierConstraint = soup::make_unique>(std::move(upModifiedConstraint)); + } + + pModifiedConstraint->group = this; + + if (greedy) + { + // constraint --[success]-> constraint + success_transitions.setTransitionTo(pModifiedConstraint->getEntrypoint()); + + // constraint --[rollback]-> next-constraint + success_transitions.emplaceRollback(&pModifiedConstraint->rollback_transition); + } + else + { + // prev-constraint --[success]-> quantifier + success_transitions.setPreviousTransitionTo(upQuantifierConstraint.get()); + + // constraint --[success]-> quantifier + success_transitions.setTransitionTo(upQuantifierConstraint.get()); + + // quantifier --[success]-> next-constraint + success_transitions.emplace(&upQuantifierConstraint->success_transition); + + // quantifier --[rollback]-> constraint + upQuantifierConstraint->rollback_transition = pModifiedConstraint->getEntrypoint(); + } + + a.constraints.back() = std::move(upQuantifierConstraint); + continue; + } + else if (*s.it == '?') + { + SOUP_ASSERT(!a.constraints.empty(), "Invalid modifier"); + UniquePtr upModifiedConstraint = std::move(a.constraints.back()); + auto pModifiedConstraint = upModifiedConstraint.get(); + auto upOptConstraint = soup::make_unique(std::move(upModifiedConstraint)); + + pModifiedConstraint->group = this; + + // constraint --[rollback]-> next-constraint + 
success_transitions.emplaceRollback(&pModifiedConstraint->getEntrypoint()->rollback_transition); + + a.constraints.back() = std::move(upOptConstraint); + continue; + } + else if (*s.it == '.') + { + UniquePtr upC; + if (s.hasFlag(RE_DOTALL)) + { + if (s.hasFlag(RE_UNICODE)) + { + upC = soup::make_unique>(); + } + else + { + upC = soup::make_unique>(); + } + } + else + { + if (s.hasFlag(RE_UNICODE)) + { + upC = soup::make_unique>(); + } + else + { + upC = soup::make_unique>(); + } + } + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + continue; + } + else if (*s.it == '[') + { + auto upC = soup::make_unique(s.it, s.end, s.hasFlag(RE_INSENSITIVE)); + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + if (s.it == s.end) + { + break; + } + continue; + } + else if (*s.it == '^') + { + UniquePtr upC; + if (s.flags & RE_MULTILINE) + { + upC = soup::make_unique>(); + } + else + { + upC = soup::make_unique>(); + } + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + continue; + } + else if (*s.it == '$') + { + UniquePtr upC; + if (s.flags & RE_MULTILINE) + { + upC = soup::make_unique>(); + } + else if (s.flags & RE_DOLLAR_ENDONLY) + { + upC = soup::make_unique>(); + } + else + { + upC = soup::make_unique>(); + } + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + continue; + } + else if (*s.it == '{') + { + size_t min_reps = 0; + while (++s.it != s.end && string::isNumberChar(*s.it)) + { + min_reps *= 10; + min_reps += ((*s.it) - '0'); + } + if (s.it == s.end) + { + break; + } + + bool exact = true; + size_t max_reps = 0; + if (*s.it == ',') + { + exact = false; + while 
(++s.it != s.end && string::isNumberChar(*s.it)) + { + max_reps *= 10; + max_reps += ((*s.it) - '0'); + } + if (s.it == s.end) + { + break; + } + } + + bool greedy = true; + if (s.it + 1 != s.end + && *(s.it + 1) == '?' + ) + { + greedy = false; + ++s.it; + } + greedy ^= s.hasFlag(RE_UNGREEDY); + + SOUP_ASSERT(!a.constraints.empty(), "Invalid modifier"); + UniquePtr upModifiedConstraint = std::move(a.constraints.back()); + auto pModifiedConstraint = upModifiedConstraint.get(); + if (min_reps == 0) + { + success_transitions.rollback(); + a.constraints.pop_back(); + } + else if (exact || min_reps == max_reps) // {X} or {X,X} + { + // greedy or not doesn't make a difference here + + auto upRepConstraint = soup::make_unique(); + upRepConstraint->constraints.emplace_back(std::move(upModifiedConstraint)); + + pModifiedConstraint->group = this; + + while (--min_reps != 0) + { + if (pModifiedConstraint->shouldResetCapture()) + { + success_transitions.setResetCapture(); + } + auto upClone = pModifiedConstraint->clone(success_transitions); + upClone->group = this; + + upRepConstraint->constraints.emplace_back(std::move(upClone)); + } + a.constraints.back() = std::move(upRepConstraint); + } + else if (max_reps == 0) // {X,} + { + UniquePtr upRepConstraint; + if (greedy) + { + upRepConstraint = soup::make_unique>(); + } + else + { + upRepConstraint = soup::make_unique>(); + } + upRepConstraint->constraints.emplace_back(std::move(upModifiedConstraint)); + + pModifiedConstraint->group = this; + + while (--min_reps != 0) + { + if (pModifiedConstraint->shouldResetCapture()) + { + success_transitions.setResetCapture(); + } + auto upClone = pModifiedConstraint->clone(success_transitions); + upClone->group = this; + + upRepConstraint->constraints.emplace_back(std::move(upClone)); + } + + // last-clone --[success]-> quantifier + success_transitions.setTransitionTo(upRepConstraint.get()); + + if (greedy) + { + // quantifier --[success]-> last-clone + 
success_transitions.emplace(&upRepConstraint->success_transition); + if (pModifiedConstraint->shouldResetCapture()) + { + success_transitions.setResetCapture(); + } + success_transitions.setTransitionTo(upRepConstraint->constraints.back()->getEntrypoint()); + + // quantifier --[rollback]-> next-constraint + success_transitions.emplaceRollback(&upRepConstraint->rollback_transition); + } + else + { + // quantifier --[rollback]-> last-clone + success_transitions.emplaceRollback(&upRepConstraint->rollback_transition); + if (pModifiedConstraint->shouldResetCapture()) + { + success_transitions.setResetCapture(); + } + success_transitions.setTransitionTo(upRepConstraint->constraints.back()->getEntrypoint()); + + // quantifier --[success]-> next-constraint + success_transitions.emplace(&upRepConstraint->success_transition); + } + + a.constraints.back() = std::move(upRepConstraint); + } + else if (min_reps < max_reps) // {X,Y} + { + if (greedy) + { + auto upRepConstraint = soup::make_unique(); + upRepConstraint->constraints.emplace_back(std::move(upModifiedConstraint)); + upRepConstraint->min_reps = min_reps; + + pModifiedConstraint->group = this; + + size_t required_reps = min_reps; + while (--required_reps != 0) + { + if (pModifiedConstraint->shouldResetCapture()) + { + success_transitions.setResetCapture(); + } + auto upClone = pModifiedConstraint->clone(success_transitions); + upClone->group = this; + + upRepConstraint->constraints.emplace_back(std::move(upClone)); + } + RegexTransitionsVector rep_transitions; + success_transitions.discharge(rep_transitions.data); + for (size_t optional_reps = (max_reps - min_reps); optional_reps != 0; --optional_reps) + { + if (pModifiedConstraint->shouldResetCapture()) + { + rep_transitions.setResetCapture(); + } + auto upClone = pModifiedConstraint->clone(rep_transitions); + upClone->group = this; + + // clone --[rollback]-> next-constraint + success_transitions.emplaceRollback(&upClone->getEntrypoint()->rollback_transition); + + 
upRepConstraint->constraints.emplace_back(std::move(upClone)); + } + + // last-clone --[success]-> next-constraint + rep_transitions.discharge(success_transitions.data); + + a.constraints.back() = std::move(upRepConstraint); + } + else + { + auto upRepConstraint = soup::make_unique(); + upRepConstraint->constraints.emplace_back(std::move(upModifiedConstraint)); + upRepConstraint->min_reps = min_reps; + + pModifiedConstraint->group = this; + + size_t required_reps = min_reps; + while (--required_reps != 0) + { + if (pModifiedConstraint->shouldResetCapture()) + { + success_transitions.setResetCapture(); + } + auto upClone = pModifiedConstraint->clone(success_transitions); + upClone->group = this; + + upRepConstraint->constraints.emplace_back(std::move(upClone)); + } + + RegexTransitionsVector rep_transitions; + success_transitions.discharge(rep_transitions.data); + for (size_t optional_reps = (max_reps - min_reps); optional_reps != 0; --optional_reps) + { + auto upDummy = soup::make_unique(); + + // last-constraint --[success]-> dummy + rep_transitions.setTransitionTo(upDummy->getEntrypoint()); + + // dummy --[success]-> next-constraint + success_transitions.emplace(&upDummy->success_transition); + + // clone --[success]-> next-dummy + auto upClone = pModifiedConstraint->clone(rep_transitions); + upClone->group = this; + + // dummy --[rollback]-> clone + upDummy->rollback_transition = upClone->getEntrypoint(); + + upRepConstraint->constraints.emplace_back(std::move(upClone)); + upRepConstraint->constraints.emplace_back(std::move(upDummy)); + } + + // last-clone --[success]-> next-constraint + rep_transitions.discharge(success_transitions.data); + + a.constraints.back() = std::move(upRepConstraint); + } + } + else + { + // We may be here if (!exact && min_reps > max_reps) + // Which is invalid, so we just yeet the constraint as if {0} was written. 
+ success_transitions.rollback(); + a.constraints.pop_back(); + } + continue; + } + + if (s.hasFlag(RE_EXTENDED)) + { + if (string::isSpace(*s.it)) + { + continue; + } + if (*s.it == '#') + { + do + { + ++s.it; + } while (s.it != s.end && *s.it != '\n'); + continue; + } + } + } + + UniquePtr upC; + if (UTF8_HAS_CONTINUATION(*s.it) && s.hasFlag(RE_UNICODE)) + { + std::string c; + do + { + c.push_back(*s.it); + } while (s.it + 1 != s.end && UTF8_IS_CONTINUATION(*++s.it)); + upC = soup::make_unique(std::move(c)); + } + else if (s.hasFlag(RE_INSENSITIVE) && string::lower_char(*s.it) != string::upper_char(*s.it)) + { + const char arr[] = { string::lower_char(*s.it), string::upper_char(*s.it) }; + upC = soup::make_unique(arr); + } + else + { + upC = soup::make_unique(*s.it); + } + success_transitions.setTransitionTo(upC.get()); + success_transitions.emplace(&upC->success_transition); + a.constraints.emplace_back(std::move(upC)); + } + discharge_alternative(*this, success_transitions, a); + success_transitions.discharge(alternatives_transitions); + + if (alternatives.size() > 1) + { + // Set up rollback transitions for the first constraint in each alternative to jump to next alternative + for (size_t i = 0; i + 1 != alternatives.size(); ++i) + { + alternatives.at(i).constraints.at(0)->rollback_transition = alternatives.at(i + 1).constraints.at(0)->getEntrypoint(); + } + } + + // Set up group pointers + for (const auto& a : alternatives) + { + for (const auto& c : a.constraints) + { + c->group = this; + } + } + + s.alternatives_transitions = std::move(alternatives_transitions); + } + + std::string RegexGroup::toString() const SOUP_EXCAL + { + std::string str{}; + for (const auto& a : alternatives) + { + for (const auto& c : a.constraints) + { + str.append(c->toString()); + } + str.push_back('|'); + } + if (!str.empty()) + { + str.pop_back(); + } + return str; + } + + uint16_t RegexGroup::getFlags() const + { + uint16_t set = 0; + uint16_t unset = 0; + getFlags(set, unset); 
+ SOUP_ASSERT((set & unset) == 0, "RegexGroup has contradicting flags"); + return set; + } + + void RegexGroup::getFlags(uint16_t& set, uint16_t& unset) const noexcept + { + for (const auto& a : alternatives) + { + for (const auto& c : a.constraints) + { + c->getFlags(set, unset); + } + } + } + + size_t RegexGroup::getCursorAdvancement() const + { + size_t accum = 0; + for (const auto& a : alternatives) + { + for (const auto& c : a.constraints) + { + accum += c->getCursorAdvancement(); + } + } + return accum; + } +} diff --git a/src/vendor/Soup/soup/RegexGroup.hpp b/src/vendor/Soup/soup/RegexGroup.hpp new file mode 100644 index 000000000..473fe6053 --- /dev/null +++ b/src/vendor/Soup/soup/RegexGroup.hpp @@ -0,0 +1,64 @@ +#pragma once + +#include +#include + +#include "RegexAlternative.hpp" + +NAMESPACE_SOUP +{ + struct RegexGroup + { + struct ConstructorState + { + mutable const char* it; + const char* end; + mutable uint16_t flags; + mutable size_t next_index = 0; + mutable std::vector alternatives_transitions{}; + + ConstructorState(const char* it, const char* end, uint16_t flags) + : it(it), end(end), flags(flags) + { + } + + [[nodiscard]] bool hasFlag(uint16_t flag) const noexcept + { + return (flags & flag) != 0; + } + }; + + const size_t index = 0; + const RegexGroup* parent = nullptr; + RegexConstraint* initial = nullptr; + std::vector alternatives{}; + std::string name{}; + bool lookahead_or_lookbehind = false; + + RegexGroup() = default; + + RegexGroup(size_t index) + : index(index) + { + } + + RegexGroup(const char* it, const char* end, uint16_t flags) + : RegexGroup(ConstructorState(it, end, flags)) + { + } + + RegexGroup(const ConstructorState& s, bool non_capturing = false); + + [[nodiscard]] bool isNonCapturing() const noexcept + { + return index == -1; + } + + [[nodiscard]] std::string toString() const SOUP_EXCAL; + + [[nodiscard]] uint16_t getFlags() const; + void getFlags(uint16_t& set, uint16_t& unset) const noexcept; + + [[nodiscard]] size_t 
getCursorAdvancement() const; + }; +} diff --git a/src/vendor/Soup/soup/RegexGroupConstraint.hpp b/src/vendor/Soup/soup/RegexGroupConstraint.hpp new file mode 100644 index 000000000..c79d074d4 --- /dev/null +++ b/src/vendor/Soup/soup/RegexGroupConstraint.hpp @@ -0,0 +1,113 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include "RegexGroup.hpp" + +#include + +NAMESPACE_SOUP +{ + struct RegexGroupConstraint : public RegexConstraint + { + RegexGroup data; + + RegexGroupConstraint(size_t index) + : data(index) + { + } + + RegexGroupConstraint(const RegexGroup::ConstructorState& s, bool non_capturing) + : data(s, non_capturing) + { + } + + [[nodiscard]] bool shouldResetCapture() const noexcept final + { + return !data.isNonCapturing(); + } + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + return true; + } + + [[nodiscard]] const RegexGroup* getGroupCaturedWithin() const noexcept final + { + return &data; + } + + [[nodiscard]] std::string toString() const noexcept final + { + auto str = data.toString(); + if (data.isNonCapturing()) + { + str.insert(0, "?:"); + } + else if (!data.name.empty()) + { + if (data.name.find('\'') != std::string::npos) + { + str.insert(0, 1, '>'); + str.insert(0, data.name); + str.insert(0, 1, '<'); + } + else + { + str.insert(0, 1, '\''); + str.insert(0, data.name); + str.insert(0, 1, '\''); + } + str.insert(0, 1, '?'); + } + str.insert(0, 1, '('); + str.push_back(')'); + return str; + } + + void getFlags(uint16_t& set, uint16_t& unset) const noexcept final + { + data.getFlags(set, unset); + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return data.getCursorAdvancement(); + } + + [[nodiscard]] UniquePtr clone(RegexTransitionsVector& success_transitions) const final + { + auto upClone = soup::make_unique(data.index); + success_transitions.setTransitionTo(upClone.get()); + success_transitions.emplace(&upClone->success_transition); + for (const auto& a : data.alternatives) + { + 
RegexAlternative& ac = upClone->data.alternatives.emplace_back(); + for (const auto& c : a.constraints) + { + auto pConstraintClone = ac.constraints.emplace_back(c->clone(success_transitions)).get(); + pConstraintClone->group = &upClone->data; + if (!upClone->data.initial) + { + if (data.initial == c.get()) + { + upClone->data.initial = pConstraintClone; + } + else if (data.initial == c->getEntrypoint()) + { + upClone->data.initial = pConstraintClone->getEntrypoint(); + } + } + } + } + + upClone->data.parent = data.parent; + upClone->data.name = data.name; + upClone->data.lookahead_or_lookbehind = data.lookahead_or_lookbehind; + + SOUP_ASSERT(upClone->data.initial, "Failed to find initial constraint for cloned group"); + + return upClone; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexMatchResult.hpp b/src/vendor/Soup/soup/RegexMatchResult.hpp new file mode 100644 index 000000000..1a2109c9a --- /dev/null +++ b/src/vendor/Soup/soup/RegexMatchResult.hpp @@ -0,0 +1,75 @@ +#pragma once + +#include +#include + +#include "RegexMatchedGroup.hpp" + +NAMESPACE_SOUP +{ + struct RegexMatchResult + { + std::vector> groups{}; + + [[nodiscard]] bool isSuccess() const noexcept + { + return !groups.empty(); + } + + [[nodiscard]] size_t length() const + { + return groups.at(0).value().length(); + } + + [[nodiscard]] const RegexMatchedGroup* findGroupByIndex(size_t i) const noexcept + { + if (i < groups.size() + && groups.at(i).has_value() + ) + { + return &groups.at(i).value(); + } + return nullptr; + } + + [[nodiscard]] const RegexMatchedGroup* findGroupByName(const std::string& name) const noexcept + { + for (size_t i = 0; i != groups.size(); ++i) + { + if (groups.at(i).has_value() + && groups.at(i)->name == name + ) + { + return &groups.at(i).value(); + } + } + return nullptr; + } + + [[nodiscard]] std::string toString() const noexcept + { + std::string str{}; + for (size_t i = 0; i != groups.size(); ++i) + { + if (groups.at(i).has_value()) + { + 
str.append(std::to_string(i)); + if (!groups.at(i)->name.empty()) + { + str.push_back('{'); + str.append(groups.at(i)->name); + str.push_back('}'); + } + str.append("=\""); + str.append(groups.at(i)->toString()); + str.append("\", "); + } + } + if (!str.empty()) + { + str.erase(str.size() - 2, 2); + } + return str; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexMatchedGroup.hpp b/src/vendor/Soup/soup/RegexMatchedGroup.hpp new file mode 100644 index 000000000..e8b1c2961 --- /dev/null +++ b/src/vendor/Soup/soup/RegexMatchedGroup.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include + +NAMESPACE_SOUP +{ + struct RegexMatchedGroup + { + std::string name; + const char* begin; + const char* end; + + [[nodiscard]] size_t length() const + { + return std::distance(begin, end); + } + + [[nodiscard]] std::string toString() const noexcept + { + return std::string(begin, end); + } + }; +} diff --git a/src/vendor/Soup/soup/RegexMatcher.hpp b/src/vendor/Soup/soup/RegexMatcher.hpp new file mode 100644 index 000000000..fc855ed3b --- /dev/null +++ b/src/vendor/Soup/soup/RegexMatcher.hpp @@ -0,0 +1,111 @@ +#pragma once + +#include +#include +#include + +#include "fwd.hpp" +#include "Regex.hpp" +#include "RegexMatchResult.hpp" + +NAMESPACE_SOUP +{ + struct RegexMatcher + { + struct RollbackPoint + { + const RegexConstraint* c; + const char* it; + RegexMatchResult result{}; + }; + + const RegexConstraint* c; + const char* it; + const char* const begin; + const char* const end; + std::vector rollback_points{}; + std::vector checkpoints{}; + RegexMatchResult result{}; + + RegexMatcher(const Regex& r, const char* begin, const char* end) + : c(r.group.initial), begin(begin), end(end) + { + } + + void reset(const Regex& r) noexcept + { + c = r.group.initial; + rollback_points.clear(); + checkpoints.clear(); + result.groups.clear(); + } + + void saveRollback(const RegexConstraint* rollback_transition) + { + rollback_points.emplace_back(RollbackPoint{ rollback_transition, it, result }); + } + + 
void restoreRollback() + { + c = rollback_points.back().c; + it = rollback_points.back().it; + result = std::move(rollback_points.back().result); + rollback_points.pop_back(); + } + + bool shouldSaveCheckpoint() noexcept + { + if (reinterpret_cast(c) & 0b1) + { + c = reinterpret_cast(reinterpret_cast(c) & ~0b1); + SOUP_ASSERT(c != nullptr); + return true; + } + return false; + } + + bool shouldResetCapture() noexcept + { + if (reinterpret_cast(c) & 0b10) + { + c = reinterpret_cast(reinterpret_cast(c) & ~0b10); + return true; + } + return false; + } + + void saveCheckpoint() + { + checkpoints.emplace_back(it); + } + + void restoreCheckpoint() + { + it = checkpoints.back(); + checkpoints.pop_back(); + } + + void insertMissingCapturingGroups(const RegexGroup* g) + { + for (; g; g = g->parent) + { + if (g->lookahead_or_lookbehind) + { + break; + } + if (g->isNonCapturing()) + { + continue; + } + while (g->index >= this->result.groups.size()) + { + this->result.groups.emplace_back(std::nullopt); + } + if (!this->result.groups.at(g->index).has_value()) + { + this->result.groups.at(g->index) = RegexMatchedGroup{ g->name, this->it, this->it }; + } + } + } + }; +} diff --git a/src/vendor/Soup/soup/RegexNegativeLookaheadConstraint.hpp b/src/vendor/Soup/soup/RegexNegativeLookaheadConstraint.hpp new file mode 100644 index 000000000..aa803eff3 --- /dev/null +++ b/src/vendor/Soup/soup/RegexNegativeLookaheadConstraint.hpp @@ -0,0 +1,48 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include "RegexGroup.hpp" +#include "RegexMatcher.hpp" + +NAMESPACE_SOUP +{ + struct RegexNegativeLookaheadConstraint : public RegexConstraint + { + RegexGroup group; + + RegexNegativeLookaheadConstraint(const RegexGroup::ConstructorState& s) + : group(s, true) + { + } + + [[nodiscard]] RegexConstraint* getEntrypoint() noexcept final + { + return group.initial; + } + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + m.restoreCheckpoint(); + return true; + } + + 
[[nodiscard]] std::string toString() const noexcept final + { + auto str = group.toString(); + str.insert(0, "(?!"); + str.push_back(')'); + return str; + } + + void getFlags(uint16_t& set, uint16_t& unset) const noexcept final + { + group.getFlags(set, unset); + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return 0; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexNegativeLookbehindConstraint.hpp b/src/vendor/Soup/soup/RegexNegativeLookbehindConstraint.hpp new file mode 100644 index 000000000..0d78e3e41 --- /dev/null +++ b/src/vendor/Soup/soup/RegexNegativeLookbehindConstraint.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include "RegexConstraintLookbehind.hpp" + +#include "RegexMatcher.hpp" + +NAMESPACE_SOUP +{ + template + struct RegexNegativeLookbehindConstraint : public RegexConstraintLookbehindImpl + { + using Base = RegexConstraintLookbehindImpl; + + using Base::Base; + + [[nodiscard]] std::string toString() const noexcept final + { + auto str = Base::group.toString(); + str.insert(0, "(? + +#include "UniquePtr.hpp" + +NAMESPACE_SOUP +{ + struct RegexOpenEndedRangeQuantifierConstraintBase : public RegexConstraint + { + std::vector> constraints; + + [[nodiscard]] RegexConstraint* getEntrypoint() noexcept final + { + return constraints.at(0)->getEntrypoint(); + } + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + // Meta-constraint. Transitions will be set up to correctly handle matching of this. 
+ return true; + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return constraints.at(0)->getCursorAdvancement() * constraints.size(); + } + }; + + template + struct RegexOpenEndedRangeQuantifierConstraint : public RegexOpenEndedRangeQuantifierConstraintBase + { + [[nodiscard]] std::string toString() const noexcept final + { + std::string str = constraints.at(0)->toString(); + str.push_back('{'); + str.append(std::to_string(constraints.size())); + str.append(",}"); + if (!greedy) + { + str.push_back('?'); + } + return str; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexOptConstraint.hpp b/src/vendor/Soup/soup/RegexOptConstraint.hpp new file mode 100644 index 000000000..5663925a6 --- /dev/null +++ b/src/vendor/Soup/soup/RegexOptConstraint.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include "UniquePtr.hpp" + +NAMESPACE_SOUP +{ + struct RegexOptConstraint : public RegexConstraint + { + UniquePtr constraint; + + RegexOptConstraint(UniquePtr&& constraint) + : constraint(std::move(constraint)) + { + } + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + // Meta-constraint. Transitions will be set up to correctly handle matching of this. 
+ return true; + } + + [[nodiscard]] std::string toString() const noexcept final + { + std::string str = constraint->toString(); + str.push_back('?'); + return str; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexPositiveLookaheadConstraint.hpp b/src/vendor/Soup/soup/RegexPositiveLookaheadConstraint.hpp new file mode 100644 index 000000000..efc46c43d --- /dev/null +++ b/src/vendor/Soup/soup/RegexPositiveLookaheadConstraint.hpp @@ -0,0 +1,48 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include "RegexGroup.hpp" +#include "RegexMatcher.hpp" + +NAMESPACE_SOUP +{ + struct RegexPositiveLookaheadConstraint : public RegexConstraint + { + RegexGroup group; + + RegexPositiveLookaheadConstraint(const RegexGroup::ConstructorState& s) + : group(s, true) + { + } + + [[nodiscard]] RegexConstraint* getEntrypoint() noexcept final + { + return group.initial; + } + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + m.restoreCheckpoint(); + return true; + } + + [[nodiscard]] std::string toString() const noexcept final + { + auto str = group.toString(); + str.insert(0, "(?="); + str.push_back(')'); + return str; + } + + void getFlags(uint16_t& set, uint16_t& unset) const noexcept final + { + group.getFlags(set, unset); + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return 0; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexPositiveLookbehindConstraint.hpp b/src/vendor/Soup/soup/RegexPositiveLookbehindConstraint.hpp new file mode 100644 index 000000000..84b0eb5e9 --- /dev/null +++ b/src/vendor/Soup/soup/RegexPositiveLookbehindConstraint.hpp @@ -0,0 +1,24 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include "RegexMatcher.hpp" + +NAMESPACE_SOUP +{ + template + struct RegexPositiveLookbehindConstraint : public RegexConstraintLookbehindImpl + { + using Base = RegexConstraintLookbehindImpl; + + using Base::Base; + + [[nodiscard]] std::string toString() const noexcept final + { + auto str = Base::group.toString(); + 
str.insert(0, "(?<="); + str.push_back(')'); + return str; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexRangeConstraint.hpp b/src/vendor/Soup/soup/RegexRangeConstraint.hpp new file mode 100644 index 000000000..3ac02ca6a --- /dev/null +++ b/src/vendor/Soup/soup/RegexRangeConstraint.hpp @@ -0,0 +1,389 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include "base.hpp" +#include "BigBitset.hpp" +#include "RegexMatcher.hpp" + +NAMESPACE_SOUP +{ + struct RegexRangeConstraint : public RegexConstraint + { + BigBitset<0x100 / 8> mask{}; + bool inverted = false; + + inline static const char digits[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' }; + inline static const char whitespace[] = { '\r', '\n', '\t', '\f', '\v', ' ' }; + + RegexRangeConstraint(const char*& it, const char* end, bool insensitive) + { + if (++it == end) + { + return; + } + if (*it == '^') + { + inverted = true; + } + else + { + --it; + } + char range_begin = 0; + while (++it != end && *it != ']') + { + if (*it == '-') + { + SOUP_IF_UNLIKELY (++it == end) + { + break; + } + if (range_begin <= *it) + { + for (char c = range_begin; c != *it; ++c) + { + mask.enable(c); + } + } + } + else if (*it == '\\') + { + SOUP_IF_UNLIKELY (++it == end) + { + break; + } + if (*it == 'd') + { + for (auto& c : digits) + { + mask.enable(c); + } + continue; + } + if (*it == 's') + { + for (auto& c : whitespace) + { + mask.enable(c); + } + continue; + } + } + else if (*it == '[' + && (it + 1) != end && *++it == ':' + ) + { + if ((it + 1) != end && *(it + 1) == 'a' + && (it + 2) != end && *(it + 2) == 'l' + && (it + 3) != end && *(it + 3) == 'n' + && (it + 4) != end && *(it + 4) == 'u' + && (it + 5) != end && *(it + 5) == 'm' + ) + { + it += 5; + for (uint8_t c = '0'; c != '9' + 1; ++c) + { + mask.enable(c); + } + for (uint8_t c = 'A'; c != 'Z' + 1; ++c) + { + mask.enable(c); + } + for (uint8_t c = 'a'; c != 'z' + 1; ++c) + { + mask.enable(c); + } + } + else if ((it + 1) != end && *(it + 1) == 'a' + && (it 
+ 2) != end && *(it + 2) == 'l' + && (it + 3) != end && *(it + 3) == 'p' + && (it + 4) != end && *(it + 4) == 'h' + && (it + 5) != end && *(it + 5) == 'a' + ) + { + it += 5; + for (uint8_t c = 'A'; c != 'Z' + 1; ++c) + { + mask.enable(c); + } + for (uint8_t c = 'a'; c != 'z' + 1; ++c) + { + mask.enable(c); + } + } + else if ((it + 1) != end && *(it + 1) == 'a' + && (it + 2) != end && *(it + 2) == 's' + && (it + 3) != end && *(it + 3) == 'c' + && (it + 4) != end && *(it + 4) == 'i' + && (it + 5) != end && *(it + 5) == 'i' + ) + { + it += 5; + for (uint8_t c = 0x00; c != 0x7F + 1; ++c) + { + mask.enable(c); + } + } + else if ((it + 1) != end && *(it + 1) == 'b' + && (it + 2) != end && *(it + 2) == 'l' + && (it + 3) != end && *(it + 3) == 'a' + && (it + 4) != end && *(it + 4) == 'n' + && (it + 5) != end && *(it + 5) == 'k' + ) + { + it += 5; + mask.enable(' '); + mask.enable('\t'); + } + else if ((it + 1) != end && *(it + 1) == 'c' + && (it + 2) != end && *(it + 2) == 'n' + && (it + 3) != end && *(it + 3) == 't' + && (it + 4) != end && *(it + 4) == 'r' + && (it + 5) != end && *(it + 5) == 'l' + ) + { + it += 5; + for (uint8_t c = 0x00; c != 0x1F + 1; ++c) + { + mask.enable(c); + } + mask.enable(0x7F); + } + else if ((it + 1) != end && *(it + 1) == 'd' + && (it + 2) != end && *(it + 2) == 'i' + && (it + 3) != end && *(it + 3) == 'g' + && (it + 4) != end && *(it + 4) == 'i' + && (it + 5) != end && *(it + 5) == 't' + ) + { + it += 5; + for (uint8_t c = '0'; c != '9' + 1; ++c) + { + mask.enable(c); + } + } + else if ((it + 1) != end && *(it + 1) == 'g' + && (it + 2) != end && *(it + 2) == 'r' + && (it + 3) != end && *(it + 3) == 'a' + && (it + 4) != end && *(it + 4) == 'p' + && (it + 5) != end && *(it + 5) == 'h' + ) + { + it += 5; + for (uint8_t c = 0x21; c != 0x7E + 1; ++c) + { + mask.enable(c); + } + } + else if ((it + 1) != end && *(it + 1) == 'l' + && (it + 2) != end && *(it + 2) == 'o' + && (it + 3) != end && *(it + 3) == 'w' + && (it + 4) != end && *(it + 4) == 'e' 
+ && (it + 5) != end && *(it + 5) == 'r' + ) + { + it += 5; + for (uint8_t c = 'a'; c != 'z' + 1; ++c) + { + mask.enable(c); + } + } + else if ((it + 1) != end && *(it + 1) == 'u' + && (it + 2) != end && *(it + 2) == 'p' + && (it + 3) != end && *(it + 3) == 'p' + && (it + 4) != end && *(it + 4) == 'e' + && (it + 5) != end && *(it + 5) == 'r' + ) + { + it += 5; + for (uint8_t c = 'A'; c != 'Z' + 1; ++c) + { + mask.enable(c); + } + } + else if ((it + 1) != end && *(it + 1) == 'w' + && (it + 2) != end && *(it + 2) == 'o' + && (it + 3) != end && *(it + 3) == 'r' + && (it + 4) != end && *(it + 4) == 'd' + ) + { + it += 4; + for (uint8_t c = '0'; c != '9' + 1; ++c) + { + mask.enable(c); + } + for (uint8_t c = 'A'; c != 'Z' + 1; ++c) + { + mask.enable(c); + } + for (uint8_t c = 'a'; c != 'z' + 1; ++c) + { + mask.enable(c); + } + mask.enable('_'); + } + else if ((it + 1) != end && *(it + 1) == 'x' + && (it + 2) != end && *(it + 2) == 'd' + && (it + 3) != end && *(it + 3) == 'i' + && (it + 4) != end && *(it + 4) == 'g' + && (it + 5) != end && *(it + 5) == 'i' + && (it + 6) != end && *(it + 6) == 't' + ) + { + it += 6; + for (uint8_t c = '0'; c != '9' + 1; ++c) + { + mask.enable(c); + } + for (uint8_t c = 'A'; c != 'F' + 1; ++c) + { + mask.enable(c); + } + for (uint8_t c = 'a'; c != 'f' + 1; ++c) + { + mask.enable(c); + } + } + else + { + SOUP_THROW(Exception("Unrecognised class in [[:class:]]")); + } + if ((it + 1) != end) { ++it; } // : + if ((it + 1) != end) { ++it; } // ] + continue; + } + mask.enable(*it); + range_begin = (*it) + 1; + } + if (insensitive) + { + for (uint8_t c = 'a'; c != 'z' + 1; ++c) + { + if (mask.get(c)) + { + mask.enable(c - 'a' + 'A'); + } + } + for (uint8_t c = 'A'; c != 'Z' + 1; ++c) + { + if (mask.get(c)) + { + mask.enable(c - 'A' + 'a'); + } + } + } + } + + template + RegexRangeConstraint(const char(&arr)[S]) + { + for (const auto& c : arr) + { + mask.enable(c); + } + } + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + 
if (m.it == m.end) + { + return false; + } + if (mask.get(static_cast(*m.it)) == inverted) + { + return false; + } + ++m.it; + return true; + } + + static void appendPresentably(std::string& str, char c) noexcept + { + switch (c) + { + case '\r': str.append("\\r"); return; + case '\n': str.append("\\n"); return; + case '\t': str.append("\\t"); return; + case '\f': str.append("\\f"); return; + case '\v': str.append("\\v"); return; + } + str.push_back(c); + } + + [[nodiscard]] std::string toString() const noexcept final + { + std::string str(1, '['); + if (inverted) + { + str.push_back('^'); + } + uint16_t range_begin = 0x100; + for (uint16_t i = 0; i != 0x100; ++i) + { + if (mask.get(i)) + { + if (range_begin == 0x100) + { + range_begin = i; + } + } + else + { + if (range_begin != 0x100) + { + const uint8_t range_end = static_cast(i); + const uint8_t range_len = (range_end - range_begin); + if (range_len > 3) + { + appendPresentably(str, static_cast(range_begin)); + str.push_back('-'); + appendPresentably(str, range_end - 1); + } + else + { + for (uint16_t j = range_begin; j != range_end; ++j) + { + appendPresentably(str, static_cast(j)); + } + } + range_begin = 0x100; + } + } + } + if (range_begin != 0x100) + { + constexpr uint16_t range_end = 0x100; + const uint8_t range_len = (range_end - range_begin); + if (range_len > 3) + { + appendPresentably(str, static_cast(range_begin)); + str.push_back('-'); + appendPresentably(str, (char)(range_end - 1)); + } + else + { + for (uint16_t j = range_begin; j != range_end; ++j) + { + appendPresentably(str, static_cast(j)); + } + } + } + str.push_back(']'); + return str; + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return 1; + } + + [[nodiscard]] UniquePtr clone(RegexTransitionsVector& success_transitions) const final + { + auto cc = soup::make_unique(*this); + success_transitions.setTransitionTo(cc->getEntrypoint()); + success_transitions.emplace(&cc->success_transition); + return cc; + } + }; +} diff 
--git a/src/vendor/Soup/soup/RegexRangeQuantifierConstraint.hpp b/src/vendor/Soup/soup/RegexRangeQuantifierConstraint.hpp new file mode 100644 index 000000000..174e73314 --- /dev/null +++ b/src/vendor/Soup/soup/RegexRangeQuantifierConstraint.hpp @@ -0,0 +1,62 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include + +#include "UniquePtr.hpp" + +NAMESPACE_SOUP +{ + struct RegexRangeQuantifierConstraintBase : public RegexConstraint + { + std::vector> constraints; + size_t min_reps; + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + return true; + } + + [[nodiscard]] RegexConstraint* getEntrypoint() noexcept final + { + return constraints.at(0)->getEntrypoint(); + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return constraints.at(0)->getCursorAdvancement() * constraints.size(); + } + }; + + struct RegexRangeQuantifierConstraintGreedy : public RegexRangeQuantifierConstraintBase + { + [[nodiscard]] std::string toString() const noexcept final + { + std::string str = constraints.at(0)->toString(); + str.push_back('{'); + str.append(std::to_string(min_reps)); + str.push_back(','); + str.append(std::to_string(constraints.size())); + str.push_back('}'); + return str; + } + }; + + struct RegexRangeQuantifierConstraintLazy : public RegexRangeQuantifierConstraintBase + { + [[nodiscard]] std::string toString() const noexcept final + { + const size_t optional_reps = (constraints.size() - min_reps) / 2; + + std::string str = constraints.at(0)->toString(); + str.push_back('{'); + str.append(std::to_string(min_reps)); + str.push_back(','); + str.append(std::to_string(min_reps + optional_reps)); + str.push_back('}'); + str.push_back('?'); + return str; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexRecallConstraint.hpp b/src/vendor/Soup/soup/RegexRecallConstraint.hpp new file mode 100644 index 000000000..8dfe124af --- /dev/null +++ b/src/vendor/Soup/soup/RegexRecallConstraint.hpp @@ -0,0 +1,85 @@ +#pragma once + +#include 
"RegexConstraint.hpp" + +NAMESPACE_SOUP +{ + struct RegexRecallConstraint : public RegexConstraint + { + [[nodiscard]] bool matchesImpl(RegexMatcher& m, const RegexMatchedGroup* group) const noexcept + { + if (group) + { + auto it = m.it; + for (auto group_it = group->begin; group_it != group->end; ++group_it) + { + if (it == m.end + || *it != *group_it + ) + { + return false; + } + ++it; + } + m.it = it; + return true; + } + return false; + } + }; + + struct RegexRecallIndexConstraint : public RegexRecallConstraint + { + const size_t i; + + RegexRecallIndexConstraint(size_t i) + : i(i) + { + } + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + return matchesImpl(m, m.result.findGroupByIndex(i)); + } + + [[nodiscard]] std::string toString() const noexcept final + { + std::string str(1, '\\'); + str.append(std::to_string(i)); + return str; + } + }; + + struct RegexRecallNameConstraint : public RegexRecallConstraint + { + const std::string name; + + RegexRecallNameConstraint(std::string&& name) + : name(std::move(name)) + { + } + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + return matchesImpl(m, m.result.findGroupByName(name)); + } + + [[nodiscard]] std::string toString() const noexcept final + { + std::string str = "\\k"; + if (name.find('\'') != std::string::npos) + { + str.push_back('<'); + str.append(name); + str.push_back('>'); + } + else + { + str.push_back('\''); + str.append(name); + str.push_back('\''); + } + return str; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexRepeatConstraint.hpp b/src/vendor/Soup/soup/RegexRepeatConstraint.hpp new file mode 100644 index 000000000..7f7f9a847 --- /dev/null +++ b/src/vendor/Soup/soup/RegexRepeatConstraint.hpp @@ -0,0 +1,90 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include "UniquePtr.hpp" + +NAMESPACE_SOUP +{ + template + struct RegexRepeatConstraint : public RegexConstraint + { + UniquePtr constraint; + + RegexRepeatConstraint(UniquePtr&& constraint) 
+ : constraint(std::move(constraint)) + { + } + + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + // Meta-constraint. Transitions will be set up to correctly handle matching of this. + return true; + } + + [[nodiscard]] virtual RegexConstraint* getEntrypoint() noexcept final + { + return constraint->getEntrypoint(); + } + + void setupTransitionsAtLeastOne(RegexTransitionsVector& success_transitions) + { + success_transitions.setTransitionTo(this); + if (greedy) + { + // quantifier --[success]-> constraint + success_transitions.emplace(&success_transition); + if (constraint->shouldResetCapture()) + { + success_transitions.setResetCapture(); + } + success_transitions.setTransitionTo(constraint->getEntrypoint()); + + // quantifier --[rollback]-> next-constraint + success_transitions.emplaceRollback(&rollback_transition); + } + else + { + // quantifier --[success]-> next-constraint + success_transitions.emplace(&success_transition); + if (constraint->shouldResetCapture()) + { + success_transitions.setResetCapture(); + } + + // quantifier --[rollback]-> constraint + rollback_transition = constraint->getEntrypoint(); + } + } + + [[nodiscard]] UniquePtr clone(RegexTransitionsVector& success_transitions) const final + { + if (at_least_one) + { + auto cc = soup::make_unique(constraint->clone(success_transitions)); + cc->constraint->group = constraint->group; + cc->setupTransitionsAtLeastOne(success_transitions); + return cc; + } + return RegexConstraint::clone(success_transitions); + } + + [[nodiscard]] std::string toString() const noexcept final + { + std::string str = constraint->toString(); + if (at_least_one) + { + str.push_back('+'); + } + else + { + str.push_back('*'); + } + if (!greedy) + { + str.push_back('?'); + } + return str; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexStartConstraint.hpp b/src/vendor/Soup/soup/RegexStartConstraint.hpp new file mode 100644 index 000000000..600a35cbd --- /dev/null +++ 
b/src/vendor/Soup/soup/RegexStartConstraint.hpp @@ -0,0 +1,53 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include "RegexMatcher.hpp" + +NAMESPACE_SOUP +{ + template + struct RegexStartConstraint : public RegexConstraint + { + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + if (m.it == m.begin) + { + return true; + } + if constexpr (multi_line) + { + if (*(m.it - 1) == '\n') + { + return true; + } + } + return false; + } + + [[nodiscard]] std::string toString() const noexcept final + { + return escape_sequence ? "\\A" : "^"; + } + + void getFlags(uint16_t& set, uint16_t& unset) const noexcept final + { + if constexpr (!escape_sequence) + { + if constexpr (multi_line) + { + set |= RE_MULTILINE; + } + else + { + unset |= RE_MULTILINE; + } + } + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return 0; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexTransitionsVector.hpp b/src/vendor/Soup/soup/RegexTransitionsVector.hpp new file mode 100644 index 000000000..d4f7e917f --- /dev/null +++ b/src/vendor/Soup/soup/RegexTransitionsVector.hpp @@ -0,0 +1,81 @@ +#pragma once + +#include + +NAMESPACE_SOUP +{ + struct RegexTransitionsVector + { + std::vector data; + std::vector prev_data; + + void emplace(RegexConstraint** p) + { + data.emplace_back(p); + } + + void emplaceRollback(RegexConstraint** p) + { + data.emplace_back(p); + + // If we don't have a next constraint, rollback is match success. 
+ *p = RegexConstraint::ROLLBACK_TO_SUCCESS; + } + + void setPreviousTransitionTo(RegexConstraint* c) noexcept + { + SOUP_ASSERT((reinterpret_cast(c) & RegexConstraint::MASK) == 0); + + for (const auto& p : prev_data) + { + *p = reinterpret_cast(reinterpret_cast(c) | (reinterpret_cast(*p) & 0b10)); + } + } + + void setResetCapture() noexcept + { + for (const auto& p : data) + { + *reinterpret_cast(p) = 0b10; + } + } + + void setTransitionTo(RegexConstraint* c, bool save_checkpoint = false) noexcept + { + SOUP_ASSERT((reinterpret_cast(c) & RegexConstraint::MASK) == 0); + + if (save_checkpoint) + { + reinterpret_cast(c) |= 0b1; + } + + for (const auto& p : data) + { + *p = reinterpret_cast(reinterpret_cast(c) | (reinterpret_cast(*p) & 0b10)); + } + + prev_data = std::move(data); + data.clear(); + } + + void discharge(std::vector& outTransitions) noexcept + { + for (const auto& p : data) + { + outTransitions.emplace_back(p); + } + data.clear(); + } + + void rollback() noexcept + { + data = std::move(prev_data); + prev_data.clear(); + + for (const auto& p : data) + { + *reinterpret_cast(p) &= 0b10; + } + } + }; +} diff --git a/src/vendor/Soup/soup/RegexWordBoundaryConstraint.hpp b/src/vendor/Soup/soup/RegexWordBoundaryConstraint.hpp new file mode 100644 index 000000000..d63337fe6 --- /dev/null +++ b/src/vendor/Soup/soup/RegexWordBoundaryConstraint.hpp @@ -0,0 +1,41 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include "RegexMatcher.hpp" +#include "string.hpp" + +NAMESPACE_SOUP +{ + template + struct RegexWordBoundaryConstraint : public RegexConstraint + { + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + if (m.it == m.begin + || m.it == m.end + ) + { + return true ^ inverted; + } + if (string::isWordChar(*(m.it - 1))) + { + return !string::isWordChar(*m.it) ^ inverted; + } + else + { + return string::isWordChar(*m.it) ^ inverted; + } + } + + [[nodiscard]] std::string toString() const noexcept final + { + return inverted ? 
"\\B" : "\\b"; + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return 0; + } + }; +} diff --git a/src/vendor/Soup/soup/RegexWordCharConstraint.hpp b/src/vendor/Soup/soup/RegexWordCharConstraint.hpp new file mode 100644 index 000000000..9cf4b172c --- /dev/null +++ b/src/vendor/Soup/soup/RegexWordCharConstraint.hpp @@ -0,0 +1,36 @@ +#pragma once + +#include "RegexConstraint.hpp" + +#include "RegexMatcher.hpp" +#include "string.hpp" + +NAMESPACE_SOUP +{ + template + struct RegexWordCharConstraint : public RegexConstraint + { + [[nodiscard]] bool matches(RegexMatcher& m) const noexcept final + { + return string::isWordChar(*m.it++) ^ inverted; + } + + [[nodiscard]] std::string toString() const noexcept final + { + return inverted ? "\\W" : "\\w"; + } + + [[nodiscard]] size_t getCursorAdvancement() const final + { + return 1; + } + + [[nodiscard]] UniquePtr clone(RegexTransitionsVector& success_transitions) const final + { + auto cc = soup::make_unique(); + success_transitions.setTransitionTo(cc->getEntrypoint()); + success_transitions.emplace(&cc->success_transition); + return cc; + } + }; +} diff --git a/testes/pluto/basic.pluto b/testes/pluto/basic.pluto index 55633e1d8..9256cb732 100644 --- a/testes/pluto/basic.pluto +++ b/testes/pluto/basic.pluto @@ -2076,6 +2076,20 @@ do end) sched:run() end +do + local regex = require "pluto:regex" + + local pattern = new regex [[/^the (only )?one$/i]] + assert(pattern:match("THE ONE")) + assert(not pattern:match("NOT THE ONE")) + + pattern = new regex [[/anywhere from (\d+) to (\d+)/]] + local match = pattern:match("anywhere from 3 to 5") + assert(match) + assert(match[0] == "anywhere from 3 to 5") + assert(match[1] == "3") + assert(match[2] == "5") +end print "Testing default table metatable." do