From 78c5908be58028948de1535ef73a7c8d902530f9 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Sat, 23 May 2026 18:58:24 +0000 Subject: [PATCH] fix(tokenizer): allow `$` as identifier continuation char in PostgreSQL Per the PostgreSQL lexical syntax docs, unquoted identifiers may contain `$` after the first character (e.g. `schema$1` is a single identifier, not `schema` followed by the placeholder `$1`). The tokenizer was stopping at `$` for both dialects, which broke ProxySQL's set_parser_algorithm=3 path for inputs like `SET search_path = schema$1` -- the walker only saw `schema` and the trailing `$1` fell through as a separate token. The first-character constraint is preserved: `$` at the start of a token still emits TK_ERROR (covers `$user`, `$bareword`, etc., which are not valid PG tokens at that position). Numeric placeholders (`$1`) and dollar-quoted strings (`$$...$$`) are unaffected -- their branches in next_token_impl() run before the identifier scanner. MySQL-dialect behaviour is unchanged: the tokenizer still terminates an unquoted MySQL identifier at `$`. (NOTE(review): the MySQL manual lists `$` among the characters permitted in unquoted identifiers, so this describes the tokenizer's existing behaviour rather than a MySQL grammar rule; this patch does not change it.) Tests: added 4 cases in test_set.cpp covering PG mid-ident `$`, multi-`$` idents, PG leading `$` still erroring, and MySQL unchanged. 
--- include/sql_parser/tokenizer.h | 13 +++++++-- tests/test_set.cpp | 48 ++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/include/sql_parser/tokenizer.h b/include/sql_parser/tokenizer.h index f1970be..5d593e7 100644 --- a/include/sql_parser/tokenizer.h +++ b/include/sql_parser/tokenizer.h @@ -153,8 +153,17 @@ class Tokenizer { const char* start = cursor_; while (cursor_ < end_) { char c = *cursor_; - if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9') || c == '_') { + bool is_cont = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || c == '_'; + // PostgreSQL allows `$` as an identifier continuation char (but + // not as the first char, which is enforced because $ at start + // is handled by the $$ / $N branches in next_token_impl()). + // e.g. `SET search_path = schema$1` — `schema$1` is a single + // identifier, not `schema` followed by the placeholder `$1`. + if (!is_cont && D == Dialect::PostgreSQL && c == '$' && cursor_ > start) { + is_cont = true; + } + if (is_cont) { ++cursor_; } else { break; diff --git a/tests/test_set.cpp b/tests/test_set.cpp index 34524c3..023d66b 100644 --- a/tests/test_set.cpp +++ b/tests/test_set.cpp @@ -1027,6 +1027,54 @@ TEST(PgSQLSetP2, NumericPlaceholderStillOk) { EXPECT_NE(r.status, ParseResult::ERROR); } +// PostgreSQL identifiers can contain $ after the first character (per +// https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS). +// Earlier versions of the tokenizer stopped at $, so `schema$1` parsed as +// the identifier `schema` followed by the placeholder `$1`, which was then +// truncated/rejected downstream. 
+TEST(PgSQLSetP2, DollarInUnquotedIdentIsContinuation) { + Parser parser; + const char* sql = "SET search_path = schema$1"; + auto r = parser.parse(sql, strlen(sql)); + EXPECT_EQ(r.status, ParseResult::OK); + const AstNode* v = first_value(r.ast); + ASSERT_NE(v, nullptr); + EXPECT_EQ(std::string(v->value_ptr, v->value_len), "schema$1"); +} + +TEST(PgSQLSetP2, DollarInMiddleOfIdent) { + Parser parser; + const char* sql = "SET search_path = my$schema$2_name"; + auto r = parser.parse(sql, strlen(sql)); + EXPECT_EQ(r.status, ParseResult::OK); + const AstNode* v = first_value(r.ast); + ASSERT_NE(v, nullptr); + EXPECT_EQ(std::string(v->value_ptr, v->value_len), "my$schema$2_name"); +} + +TEST(PgSQLSetP2, DollarAtStartStillEmitsError) { + // A leading `$` followed by a non-digit (`$word`) does not start an identifier, so the + // parse must still report ERROR. Only a `$` after the first character continues an identifier. + Parser parser; + const char* sql = "SET search_path = $bareword"; + auto r = parser.parse(sql, strlen(sql)); + EXPECT_EQ(r.status, ParseResult::ERROR); +} + +// For the MySQL dialect the tokenizer still ends an unquoted identifier at `$`: the +// PG-only continuation rule must not leak into MySQL parsing. +TEST(MySQLSet, DollarStillBreaksUnquotedIdent) { + Parser parser; + const char* sql = "SET schema$1 = 1"; + auto r = parser.parse(sql, strlen(sql)); + // MySQL dialect: `$` ends the identifier, so the parse fails or yields a result in which `schema$1` is not a single token. + // NOTE(review): r.status is never asserted, and `schema$1` is the SET target here while the PG tests above read first_value() as the assigned value -- confirm this check can actually fail. + const AstNode* v = first_value(r.ast); + if (v != nullptr) { + EXPECT_NE(std::string(v->value_ptr, v->value_len), "schema$1"); + } +} + // ============================================================================ // Post-1.0.4 audit follow-ups: PG non-GUC SET forms and value-preservation. // ============================================================================