From 78c5908be58028948de1535ef73a7c8d902530f9 Mon Sep 17 00:00:00 2001
From: Rene Cannao <rene@proxysql.com>
Date: Sat, 23 May 2026 18:58:24 +0000
Subject: [PATCH] fix(tokenizer): allow `$` as identifier continuation char in
 PostgreSQL

Per the PostgreSQL lexical syntax docs, unquoted identifiers may contain
`$` after the first character (e.g. `schema$1` is a single identifier,
not `schema` followed by the placeholder `$1`). The tokenizer was
stopping at `$` for both dialects, which broke ProxySQL's
set_parser_algorithm=3 path for inputs like `SET search_path = schema$1`
-- the walker only saw `schema` and the trailing `$1` fell through as a
separate token.

The first-character constraint is preserved: `$<letter>` at the start of
a token still emits TK_ERROR (covers $user, $bareword, etc., which are
not valid PG tokens at that position). Numeric placeholders (`$1`) and
dollar-quoted strings (`$$...$$`) are unaffected -- their branches in
next_token_impl() run before the identifier scanner.

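MySQL behaviour is unchanged: in the MySQL dialect `$` still terminates
an unquoted identifier, exactly as before this patch. (MySQL itself does
accept `$` in unquoted identifiers; bringing the MySQL dialect in line
with that is outside the scope of this fix.)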

Tests: added 4 cases in test_set.cpp covering PG mid-ident `$`,
multi-`$` idents, PG `$<word>` still erroring, and MySQL unchanged.
---
 include/sql_parser/tokenizer.h | 13 +++++++--
 tests/test_set.cpp             | 48 ++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 2 deletions(-)
diff --git a/include/sql_parser/tokenizer.h b/include/sql_parser/tokenizer.h
index f1970be..5d593e7 100644
--- a/include/sql_parser/tokenizer.h
+++ b/include/sql_parser/tokenizer.h
@@ -153,8 +153,17 @@ class Tokenizer {
         const char* start = cursor_;
         while (cursor_ < end_) {
             char c = *cursor_;
-            if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
-                (c >= '0' && c <= '9') || c == '_') {
+            bool is_cont = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
+                           (c >= '0' && c <= '9') || c == '_';
+            // PostgreSQL allows `$` as an identifier continuation char but not
+            // as the first char: `cursor_ > start` rejects it here, and the
+            // $$ / $N branches in next_token_impl() handle a leading `$`.
+            // e.g. `SET search_path = schema$1` — `schema$1` is a single
+            // identifier, not `schema` followed by the placeholder `$1`.
+            if (!is_cont && D == Dialect::PostgreSQL && c == '$' && cursor_ > start) {
+                is_cont = true;
+            }
+            if (is_cont) {
                 ++cursor_;
             } else {
                 break;
diff --git a/tests/test_set.cpp b/tests/test_set.cpp
index 34524c3..023d66b 100644
--- a/tests/test_set.cpp
+++ b/tests/test_set.cpp
@@ -1027,6 +1027,54 @@ TEST(PgSQLSetP2, NumericPlaceholderStillOk) {
     EXPECT_NE(r.status, ParseResult::ERROR);
 }
 
+// PostgreSQL identifiers can contain $ after the first character (per
+// https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS).
+// Regression: the tokenizer used to stop at $, so `schema$1` was read as
+// `schema` plus the placeholder `$1` and truncated/rejected downstream.
+// The node returned by first_value() must now hold the full `schema$1`.
+TEST(PgSQLSetP2, DollarInUnquotedIdentIsContinuation) {
+    Parser<Dialect::PostgreSQL> parser;
+    const char* sql = "SET search_path = schema$1";
+    auto r = parser.parse(sql, strlen(sql));
+    EXPECT_EQ(r.status, ParseResult::OK);
+    const AstNode* v = first_value(r.ast);
+    ASSERT_NE(v, nullptr);  // fatal assert: v is dereferenced on the next line
+    EXPECT_EQ(std::string(v->value_ptr, v->value_len), "schema$1");
+}
+
+TEST(PgSQLSetP2, DollarInMiddleOfIdent) {  // several `$` inside one identifier
+    Parser<Dialect::PostgreSQL> parser;
+    const char* sql = "SET search_path = my$schema$2_name";
+    auto r = parser.parse(sql, strlen(sql));
+    EXPECT_EQ(r.status, ParseResult::OK);
+    const AstNode* v = first_value(r.ast);
+    ASSERT_NE(v, nullptr);  // fatal assert: v is dereferenced on the next line
+    EXPECT_EQ(std::string(v->value_ptr, v->value_len), "my$schema$2_name");
+}
+
+TEST(PgSQLSetP2, DollarAtStartStillEmitsError) {
+    // A leading `$` followed by a non-digit cannot start an identifier and
+    // must still emit TK_ERROR; only a mid-identifier `$` is a continuation.
+    Parser<Dialect::PostgreSQL> parser;
+    const char* sql = "SET search_path = $bareword";
+    auto r = parser.parse(sql, strlen(sql));
+    EXPECT_EQ(r.status, ParseResult::ERROR);  // the statement as a whole is rejected
+}
+
+// The MySQL dialect of this tokenizer still ends an unquoted identifier at $;
+// the PostgreSQL-only continuation rule must not leak into MySQL parsing.
+TEST(MySQLSet, DollarStillBreaksUnquotedIdent) {
+    Parser<Dialect::MySQL> parser;
+    const char* sql = "SET schema$1 = 1";
+    auto r = parser.parse(sql, strlen(sql));
+    // Accepts either outcome: a failed parse, or a partial result in which
+    // `schema$1` is not one token. r.status itself is never checked.
+    const AstNode* v = first_value(r.ast);
+    if (v != nullptr) {  // NOTE(review): asserts nothing when v is null
+        EXPECT_NE(std::string(v->value_ptr, v->value_len), "schema$1");
+    }  // NOTE(review): confirm first_value() covers the left-hand `schema$1`
+}
+
 // ============================================================================
 // Post-1.0.4 audit follow-ups: PG non-GUC SET forms and value-preservation.
 // ============================================================================