Refactor allSourceColumns query to use pg json parser; fix embedding …

…through views with subqueries inside function calls
PostgREST · Nov 18, 2020 · 8e66b2a · 8e66b2a
1 parent 3f690ec
commit 8e66b2a
Show file tree

Hide file tree

Showing 5 changed files with 114 additions and 31 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -24,6 +24,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
  - #1162, Fix location header for POST request with select= without PK - @wolfgangwalther
  - #1585, Fix error messages on connection failure for localized postgres on Windows - @wolfgangwalther
  - #1636, Fix `application/octet-stream` appending `charset=utf-8` - @steve-chavez
+ - #1608, Fix embedding through views with subqueries in function calls - @wolfgangwalther
 
 ### Changed
 

diff --git a/src/PostgREST/DbStructure.hs b/src/PostgREST/DbStructure.hs
@@ -39,7 +39,7 @@ import GHC.Exts                      (groupWith)
 import Protolude                     hiding (toS)
 import Protolude.Conv                (toS)
 import Protolude.Unsafe              (unsafeHead)
-import Text.InterpolatedString.Perl6 (q, qc)
+import Text.InterpolatedString.Perl6 (q)
 
 import PostgREST.Private.Common
 import PostgREST.Types
@@ -49,7 +49,7 @@ getDbStructure schemas extraSearchPath pgVer = do
   HT.sql "set local schema ''" -- This voids the search path. The following queries need this for getting the fully qualified name(schema.name) of every db object
   tabs    <- HT.statement () allTables
   cols    <- HT.statement schemas $ allColumns tabs
-  srcCols <- HT.statement (schemas, extraSearchPath) $ pfkSourceColumns cols pgVer
+  srcCols <- HT.statement (schemas, extraSearchPath) $ pfkSourceColumns cols
   m2oRels <- HT.statement () $ allM2ORels tabs cols
   keys    <- HT.statement () $ allPrimaryKeys tabs
   procs   <- HT.statement schemas allProcs
@@ -674,18 +674,11 @@ pkFromRow tabs (s, t, n) = PrimaryKey <$> table <*> pure n
   where table = find (\tbl -> tableSchema tbl == s && tableName tbl == t) tabs
 
 -- returns all the primary and foreign key columns which are referenced in views
-pfkSourceColumns :: [Column] -> PgVersion -> H.Statement ([Schema], [Schema]) [SourceColumn]
-pfkSourceColumns cols pgVer =
+pfkSourceColumns :: [Column] -> H.Statement ([Schema], [Schema]) [SourceColumn]
+pfkSourceColumns cols =
   H.Statement sql (contrazip2 (arrayParam HE.text) (arrayParam HE.text)) (decodeSourceColumns cols) True
-  -- query explanation at https://gist.github.com/steve-chavez/7ee0e6590cddafb532e5f00c46275569
   where
-    subselectRegex :: Text
-    -- "result" appears when the subselect is used inside "case when", see `authors_have_book_in_decade` fixture
-    -- "resno"  appears in every other case
-    -- when copying the query into pg make sure you omit one backslash from \\d+, it should be like `\d+` for the regex
-    subselectRegex | pgVer < pgVersion100 = ":subselect {.*?:constraintDeps <>} :location \\d+} :res(no|ult)"
-                   | otherwise = ":subselect {.*?:stmt_len 0} :location \\d+} :res(no|ult)"
-    sql = [qc|
+    sql = [q|
       with recursive
       pks_fks as (
         -- pk + fk referencing col
@@ -713,36 +706,100 @@ pfkSourceColumns cols pgVer =
         join pg_rewrite r on r.ev_class = c.oid
         where c.relkind in ('v', 'm') and n.nspname = ANY($1 || $2)
       ),
-      removed_subselects as(
+      transform_json as (
         select
           view_id, view_schema, view_name,
-          regexp_replace(view_definition, '{subselectRegex}', '', 'g') as x
+          -- the following formatting is without indentation on purpose
+          -- to allow simple diffs, with less whitespace noise
+          replace(
+            replace(
+            replace(
+            replace(
+            replace(
+            replace(
+            replace(
+            replace(
+            replace(
+            regexp_replace(
+            replace(
+            replace(
+            replace(
+            replace(
+            replace(
+            replace(
+            replace(
+            replace(
+            replace(
+            replace(
+            replace(
+              view_definition::text,
+            -- This conversion to json is heavily optimized for performance.
+            -- The general idea is to use as few regexp_replace() calls as possible.
+            -- Simple replace() is a lot faster, so we jump through some hoops
+            -- to be able to use regexp_replace() only once.
+            -- This has been tested against a huge schema with 250+ different views.
+            -- The unit tests do NOT reflect all possible inputs. Be careful when changing this!
+            -- -----------------------------------------------
+            -- pattern           | replacement         | flags
+            -- -----------------------------------------------
+            -- `,` is not part of the pg_node_tree format, but used in the regex.
+            -- This removes all `,` that might be part of column names.
+               ','               , ''
+            -- The same applies for `(`, `{` and `}`, although those are used a lot in pg_node_tree.
+            -- We remove the escaped ones, which might be part of column names again.
+            ), '\('              , ''
+            ), '\{'              , ''
+            ), '\}'              , ''
+            -- The fields we need are formatted as json manually to protect them from the regex.
+            ), ' :targetList '   , ',"targetList":'
+            ), ' :resno '        , ',"resno":'
+            ), ' :resorigtbl '   , ',"resorigtbl":'
+            ), ' :resorigcol '   , ',"resorigcol":'
+            -- Make the regex also match the node type, e.g. `{QUERY ...`, to remove it in one pass.
+            ), '{'               , '{ :'
+            -- Protect node lists, which start with `({` or `((` from the greedy regex.
+            -- The extra `{` is removed again later.
+            ), '(('              , '{(('
+            ), '({'              , '{({'
+            -- This regex removes all unused fields to avoid the need to format all of them correctly.
+            -- This leads to a smaller json result as well.
+            -- Removal stops at `,` for used fields (see above) and `}` for the end of the current node.
+            -- Nesting can't be parsed correctly with a regex, so we stop at `{` as well and
+            -- add an empty key for the followig node.
+            ), ' :[^{,}]+'       , ',"":'              , 'g'
+            -- For performance, the regex also added those empty keys when hitting a `,` or `}`.
+            -- Those are removed next.
+            ), ',"":}'           , '}'
+            ), ',"":,'           , ','
+            -- This reverses the "node list protection" from above.
+            ), '{('              , '('
+            -- Every key above has been added with a `,` so far. The first key in an object doesn't need it.
+            ), '{,'              , '{'
+            -- pg_node_tree has `()` around lists, but JSON uses `[]`
+            ), '('               , '['
+            ), ')'               , ']'
+            -- pg_node_tree has ` ` between list items, but JSON uses `,`
+            ), '} {'             , '},{'
+            ), '] ['             , '],['
+            -- `<>` in pg_node_tree is the same as `null` in JSON, but due to very poor performance of json_typeof
+            -- we need to make this an empty array here to prevent json_array_elements from throwing an error
+            -- when the targetList is null.
+            ), '<>'              , '[]'
+          )::json as view_definition
         from views
       ),
-      target_lists as(
-        select
-          view_id, view_schema, view_name,
-          string_to_array(x, 'targetList') as x
-        from removed_subselects
-      ),
-      last_target_list_wo_tail as(
-        select
-          view_id, view_schema, view_name,
-          (string_to_array(x[array_upper(x, 1)], ':onConflict'))[1] as x
-        from target_lists
-      ),
       target_entries as(
         select
           view_id, view_schema, view_name,
-          unnest(string_to_array(x, 'TARGETENTRY')) as entry
-        from last_target_list_wo_tail
+          json_array_elements(view_definition->0->'targetList') as entry
+        from transform_json
       ),
       results as(
         select
           view_id, view_schema, view_name,
-          substring(entry from ':resno (\d+)')::int as view_column,
-          substring(entry from ':resorigtbl (\d+)')::oid as resorigtbl,
-          substring(entry from ':resorigcol (\d+)')::int as resorigcol
+          (entry->>'resno')::int as view_column,
+          (entry->>'resorigtbl')::oid as resorigtbl,
+          (entry->>'resorigcol')::int as resorigcol
         from target_entries
       ),
       recursion as(

diff --git a/test/Feature/QuerySpec.hs b/test/Feature/QuerySpec.hs
@@ -456,6 +456,12 @@ spec actualPgVersion = do
           [json|[{"title":"1984","first_publisher":"Secker & Warburg","author":{"name":"George Orwell"}}]|]
           { matchHeaders = [matchContentTypeJson] }
 
+      it "works with views that have subselects in a function call" $
+        get "/authors_have_book_in_decade2?select=*,books(title)&id=eq.3"
+          `shouldRespondWith`
+            [json|[ {"id":3,"name":"Antoine de Saint-Exupéry","has_book_in_forties":true,"has_book_in_fifties":false,
+                     "has_book_in_sixties":false,"books":[{"title":"The Little Prince"}]} ]|]
+
       it "works with views that have CTE" $
         get "/odd_years_publications?select=title,publication_year,first_publisher,author:authors(name)&id=in.(1,2,3)" `shouldRespondWith`
           [json|[

diff --git a/test/fixtures/privileges.sql b/test/fixtures/privileges.sql
@@ -96,6 +96,7 @@ GRANT ALL ON TABLE
     , jsonb_test
     , authors_books_number
     , authors_have_book_in_decade
+    , authors_have_book_in_decade2
     , forties_and_fifties_books
     , odd_years_publications
     , foos

diff --git a/test/fixtures/schema.sql b/test/fixtures/schema.sql
@@ -1385,6 +1385,24 @@ select
   end as has_book_in_sixties
 from private.authors x;
 
+create view test.authors_have_book_in_decade2 as
+select
+  id,
+  name,
+  coalesce(
+    (select true from test.forties_books where author_id=x.id limit 1),
+    false
+  ) as has_book_in_forties,
+  coalesce(
+    (select true from test.fifties_books where author_id=x.id limit 1),
+    false
+  ) as has_book_in_fifties,
+  coalesce(
+    (select true from test.sixties_books where author_id=x.id limit 1),
+    false
+  ) as has_book_in_sixties
+from private.authors x;
+
 create view test.forties_and_fifties_books as
 select x.id, x.title, x.publication_year, y.name as first_publisher, x.author_id
 from (