In [None]:
#hide
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False  # workaround for buggy jedi

In [None]:
# default_exp core

In [None]:
#export
import re
from sql_formatter.utils import *

# core

> Core functions for SQL formatting

In [None]:
#hide
from nbdev.showdoc import *

## General formatting

Basic formatting for SQL queries. Let's use an example throughout the core module.

This is what an input could look like:

In [None]:
example_sql = """
create or replace table mytable as -- Mytable example
/* multi line
   comment */
seLecT a.asdf, b.qwer, -- some comment here
/* and here is a line comment inside select */
substr(c.asdf, 1, 2) as substr_asdf, 
/* some commenT 
there */
case when a.asdf= 1 then 'b' /* here a case comment */
when b.qwer =2 then 'c' else 'd' end as new_field, -- Some comment
b.asdf2 frOm table1 as a leFt join 
table2 as b -- and here a comment
    on a.asdf = b.asdf  /* joiN this way */
    inner join table3 as c
on a.asdf=c.asdf and a.qwer= b.qwer
whEre a.asdf= 1 -- comment this
anD b.qwer =2 and a.asdf<=1 --comment that
or b.qwer>=5
groUp by a.asdf
"""

And this is how we would like it to be formatted:

In [None]:
expected_sql = """CREATE OR REPLACE TABLE mytable AS -- Mytable example
/* multi line
   comment */
SELECT a.asdf,
       b.qwer, -- some comment here
       /* and here is a line comment inside select */
       substr(c.asdf, 1, 2) as substr_asdf,
       /* some commenT
          there */
       case when a.asdf = 1 then 'b' /* here a case comment */
            when b.qwer = 2 then 'c'
            else 'd' end as new_field, -- Some comment
       b.asdf2
FROM   table1 as a
    LEFT JOIN table2 as b -- and here a comment
        ON a.asdf = b.asdf /* joiN this way */
    INNER JOIN table3 as c
        ON a.asdf = c.asdf and
           a.qwer = b.qwer
WHERE  a.asdf = 1 -- comment this
   and b.qwer = 2
   and a.asdf <= 1 --comment that
    or b.qwer >= 5
GROUP BY a.asdf"""

Let's start by defining the main statements. The main statements all require a new line and should be in upper case

In [None]:
#export
# Main statements of a query, written in lowercase.
# `preformat_statements` interpolates every entry into a case-insensitive regex,
# so an entry may be a regex fragment instead of a literal keyword.
# Entries starting with "create" are handled by a dedicated branch there that
# uppercases the matched text and the trailing AS; all other entries are
# replaced by their uppercase form.
MAIN_STATEMENTS = [
    "create.*?table",  # regex for all variants, e.g. CREATE OR REPLACE TABLE
    "create.*?view",  # regex for all variants, e.g. CREATE OR REPLACE VIEW
    "select distinct",
    "select",
    "from",
    "left join",
    "inner join",
    "outer join",
    "right join",
    "union",
    "on",
    "where",
    "group by",
    "order by",
    "over",  # special case: no newline, only capitalized
    "partition by",  # special case: no newline, only capitalized
]

> Remark: For OVER and PARTITION BY we only capitalize without adding a newline

We need to remove newlines and multiple spaces because they may be arbitrary. 

Before removing newlines we also need to mark the end of comments with the special token [C], because otherwise we would not know where they end

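We also mark the beginning of `/* */` comments with the special token [CS] (comment start) if they start on a new line. Newlines inside a multi-line comment are marked with the token [CI], as the examples below show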

In [None]:
#export
def clean_query(s):
    "Clean query `s`: squeeze whitespaces, mark the comment boundaries and drop the newlines"
    # the steps are applied strictly in this order
    pipeline = (
        remove_redundant_whitespaces,  # squeeze repeated whitespaces, newlines are kept
        mark_comments,  # tag comments with the special tokens [C], [CS] and [CI]
        replace_newline_chars,  # drop newlines, except the ones inside comments
        remove_whitespaces_newline,  # trim whitespaces around the remaining newlines
        remove_whitespaces_comments,  # trim whitespaces around [C], [CS] and [CI]
        remove_redundant_whitespaces,  # squeeze whitespaces once more after the steps above
    )
    for step in pipeline:
        s = step(s)
    return s

In [None]:
assert_and_print(
    clean_query(
"""
SELECT asdf, qwer,
 qwer1,    qwer2
FROM table1
"""
    ), "SELECT asdf, qwer, qwer1, qwer2 FROM table1"
)

SELECT asdf, qwer, qwer1, qwer2 FROM table1


With usual comments

In [None]:
assert_and_print(
    clean_query("""
SELECT asdf, qwer, -- some comment
 qwer1,    qwer2
FROM table1
WHERE asdf=1
"""), "SELECT asdf, qwer, -- some comment[C]qwer1, qwer2 FROM table1 WHERE asdf=1"
)

SELECT asdf, qwer, -- some comment[C]qwer1, qwer2 FROM table1 WHERE asdf=1


With other comment form

In [None]:
assert_and_print(
    clean_query("""
SELECT asdf, qwer, /*  another comment */
qwer1,    qwer2
FROM table1
WHERE asdf=1
"""), "SELECT asdf, qwer, /* another comment */[C]qwer1, qwer2 FROM table1 WHERE asdf=1"
)

SELECT asdf, qwer, /* another comment */[C]qwer1, qwer2 FROM table1 WHERE asdf=1


In [None]:
assert_and_print(
    clean_query(
"""
SELECT asdf, qwer, /*  another comment */
qwer1,    
/* inline comment */
qwer2
FROM table1
WHERE asdf=1
"""
    ), 
    "SELECT asdf, qwer, /* another comment */[C]qwer1,[CS]/* inline comment */[C]qwer2 FROM table1 WHERE asdf=1"
)

SELECT asdf, qwer, /* another comment */[C]qwer1,[CS]/* inline comment */[C]qwer2 FROM table1 WHERE asdf=1


In [None]:
assert_and_print(
    clean_query(
"""
create or replace my_table as
/* some comment
   some new comment line */
select asdf,
qwer,   qwer2

from table1
"""
    ),
"create or replace my_table as[CS]/* some comment[CI]some new comment line */[C]select asdf, qwer, qwer2 from table1"
)

create or replace my_table as[CS]/* some comment[CI]some new comment line */[C]select asdf, qwer, qwer2 from table1


In [None]:
print(clean_query(example_sql))

create or replace table mytable as -- Mytable example[C][CS]/* multi line[CI]comment */[C]seLecT a.asdf, b.qwer, -- some comment here[C][CS]/* and here is a line comment inside select */[C]substr(c.asdf, 1, 2) as substr_asdf,[CS]/* some commenT[CI]there */[C]case when a.asdf= 1 then 'b' /* here a case comment */[C]when b.qwer =2 then 'c' else 'd' end as new_field, -- Some comment[C]b.asdf2 frOm table1 as a leFt join table2 as b -- and here a comment[C]on a.asdf = b.asdf /* joiN this way */[C]inner join table3 as c on a.asdf=c.asdf and a.qwer= b.qwer whEre a.asdf= 1 -- comment this[C]anD b.qwer =2 and a.asdf<=1 --comment that[C]or b.qwer>=5 groUp by a.asdf


### Preformatting queries

We would like to have each main statement (SELECT, FROM, ...) on a separate line and in uppercase, ignoring text in comments. This way we can later format each query statement separately. For the special case of PARTITION BY inside the SELECT statement we will not add a newline, because the main statement in that case is SELECT and not PARTITION BY

In [None]:
#export
def preformat_statements(s):
    """Put every main statement of `s` on its own line and uppercase it,
    leaving everything that is inside a comment untouched"""
    cleaned = clean_query(s)  # clean query and mark comments
    # fragments flagged as comment / non-comment and select region / other region
    parts = compress_dicts(split_query(cleaned), ["comment", "select"])
    # compile the statement classifiers once, before the loop
    create_re = re.compile("create", flags=re.I)
    select_re = re.compile("select", flags=re.I)

    def uppercase_create(pat):
        # uppercase the CREATE ... part and the closing AS, keep the text in between
        return "\n" + pat.group(1).upper() + pat.group(2) + " AS"

    for statement in MAIN_STATEMENTS:
        special_create = bool(create_re.match(statement))
        if special_create:
            create_pattern = re.compile(rf"\s*({statement} )(.*) as\b", flags=re.I)
        else:
            newline_pattern = re.compile(rf"\s*\b({statement})\b", flags=re.I)
            inline_pattern = re.compile(rf"\b({statement})\b", flags=re.I)
            uppercased = statement.upper()
            # SELECT (DISTINCT) gets its newline even inside a select region
            always_newline = bool(select_re.match(statement))
        rebuilt = []
        for sdict in parts:
            text = sdict["string"]
            if sdict["comment"]:
                pass  # comments are never modified
            elif special_create:
                text = create_pattern.sub(uppercase_create, text)
            elif always_newline or not sdict["select"]:
                text = newline_pattern.sub("\n" + uppercased, text)
            else:  # other statement inside a select region: uppercase only, no newline
                text = inline_pattern.sub(uppercased, text)
            rebuilt.append({
                "string": text,
                "comment": sdict["comment"],
                "select": sdict["select"],
            })
        parts = rebuilt
    joined = "".join(sdict["string"] for sdict in parts).strip()
    # remove whitespaces before and after newline
    return remove_whitespaces_newline(joined)

In [None]:
assert_and_print(
    preformat_statements("select asdf, qwer as new_var from table1 where asdf = 1"),
    "SELECT asdf, qwer as new_var\nFROM table1\nWHERE asdf = 1"
)

SELECT asdf, qwer as new_var
FROM table1
WHERE asdf = 1


In [None]:
assert_and_print(preformat_statements("""
seLect asdf,
       /* some comment inside select */
       qwer
From   table1 where  asdf = 1
"""),
    "SELECT asdf,[CS]/* some comment inside select */[C]qwer\nFROM table1\nWHERE asdf = 1"
)

SELECT asdf,[CS]/* some comment inside select */[C]qwer
FROM table1
WHERE asdf = 1


In [None]:
assert_and_print(
    preformat_statements("""
seLect asdf, /* some comment inside select */
       qwer
From   table1 where  asdf = 1
"""),
    "SELECT asdf, /* some comment inside select */[C]qwer\nFROM table1\nWHERE asdf = 1"
)

SELECT asdf, /* some comment inside select */[C]qwer
FROM table1
WHERE asdf = 1


In [None]:
assert_and_print(
    preformat_statements("""
create or replace view my_view as
seLect asdf,
       /* some comment inside select */
       qwer
From   table1 where  asdf = 1
"""),
    "CREATE OR REPLACE VIEW my_view AS\nSELECT asdf,[CS]/* some comment inside select */[C]qwer\nFROM table1\nWHERE asdf = 1"
)

CREATE OR REPLACE VIEW my_view AS
SELECT asdf,[CS]/* some comment inside select */[C]qwer
FROM table1
WHERE asdf = 1


In [None]:
assert_and_print(
    preformat_statements("""
create or replace view my_view as
seLect asdf,
       qwer_function,
       qwer
From   table1 where  asdf = 1
"""),
    "CREATE OR REPLACE VIEW my_view AS\nSELECT asdf, qwer_function, qwer\nFROM table1\nWHERE asdf = 1"
)

CREATE OR REPLACE VIEW my_view AS
SELECT asdf, qwer_function, qwer
FROM table1
WHERE asdf = 1


In [None]:
assert_and_print(
    preformat_statements("""
create or replace view my_view as
seLect asdf, qwer_function,
       lead(asdf) over (Partition By asdf order BY qwer),
    qwer2
From   table1 where  asdf = 1 order by asdf
"""),
    """
CREATE OR REPLACE VIEW my_view AS
SELECT asdf, qwer_function, lead(asdf) OVER (PARTITION BY asdf ORDER BY qwer), qwer2
FROM table1
WHERE asdf = 1
ORDER BY asdf
""".strip()
)

CREATE OR REPLACE VIEW my_view AS
SELECT asdf, qwer_function, lead(asdf) OVER (PARTITION BY asdf ORDER BY qwer), qwer2
FROM table1
WHERE asdf = 1
ORDER BY asdf


In [None]:
assert_and_print(
    preformat_statements(
"""
create or replace table mytable as -- Mytable example
seLecT a.asdf, b.qwer, -- some comment here
/* and here is a line comment inside select */
substr(c.asdf, 1, 2) as substr_asdf
"""
    ),
"""
CREATE OR REPLACE TABLE mytable AS -- Mytable example[C]
SELECT a.asdf, b.qwer, -- some comment here[C][CS]/* and here is a line comment inside select */[C]substr(c.asdf, 1, 2) as substr_asdf
""".strip()
)

CREATE OR REPLACE TABLE mytable AS -- Mytable example[C]
SELECT a.asdf, b.qwer, -- some comment here[C][CS]/* and here is a line comment inside select */[C]substr(c.asdf, 1, 2) as substr_asdf


In [None]:
assert_and_print(
    preformat_statements(
"""
create table mytable as -- Mytable example
seLecT a.asdf, b.qwer, -- some comment here
/* and here is a line comment inside select */
substr(c.asdf, 1, 2) as substr_asdf
"""
    ),
"""
CREATE TABLE mytable AS -- Mytable example[C]
SELECT a.asdf, b.qwer, -- some comment here[C][CS]/* and here is a line comment inside select */[C]substr(c.asdf, 1, 2) as substr_asdf
""".strip()
)

CREATE TABLE mytable AS -- Mytable example[C]
SELECT a.asdf, b.qwer, -- some comment here[C][CS]/* and here is a line comment inside select */[C]substr(c.asdf, 1, 2) as substr_asdf


In [None]:
assert_and_print(
    preformat_statements(
"""
create or replace table  my_table as -- mytable
select distinct asdf, qwer, -- some comment
from table1
"""
    ),
"""
CREATE OR REPLACE TABLE my_table AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1
""".strip()
)

CREATE OR REPLACE TABLE my_table AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1


With additional keyword `COMMENT`

In [None]:
assert_and_print(
    preformat_statements(
"""
create or replace table  my_table comment='blabla' as -- mytable
select distinct asdf, qwer, -- some comment
from table1
"""
    ),
"""
CREATE OR REPLACE TABLE my_table comment='blabla' AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1
""".strip()
)

CREATE OR REPLACE TABLE my_table comment='blabla' AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1


In [None]:
assert_and_print(
    preformat_statements(
"""
create or replace transient table  my_table as -- mytable
select distinct asdf, qwer, -- some comment
from table1
"""
    ),
"""
CREATE OR REPLACE TRANSIENT TABLE my_table AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1
""".strip()
)

CREATE OR REPLACE TRANSIENT TABLE my_table AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1


In [None]:
assert_and_print(
    preformat_statements(
"""
Create view  my_table as -- mytable
select distinct asdf, qwer, -- some comment
from table1
"""
    ),
"""
CREATE VIEW my_table AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1
""".strip()
)

CREATE VIEW my_table AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1


In [None]:
print(preformat_statements(example_sql))

CREATE OR REPLACE TABLE mytable AS -- Mytable example[C][CS]/* multi line[CI]comment */[C]
SELECT a.asdf, b.qwer, -- some comment here[C][CS]/* and here is a line comment inside select */[C]substr(c.asdf, 1, 2) as substr_asdf,[CS]/* some commenT[CI]there */[C]case when a.asdf= 1 then 'b' /* here a case comment */[C]when b.qwer =2 then 'c' else 'd' end as new_field, -- Some comment[C]b.asdf2
FROM table1 as a
LEFT JOIN table2 as b -- and here a comment[C]
ON a.asdf = b.asdf /* joiN this way */[C]
INNER JOIN table3 as c
ON a.asdf=c.asdf and a.qwer= b.qwer
WHERE a.asdf= 1 -- comment this[C]anD b.qwer =2 and a.asdf<=1 --comment that[C]or b.qwer>=5
GROUP BY a.asdf


### Lowercasing query

In [None]:
#export
def lowercase_query(s):
    "Return query `s` in lowercase, keeping comments and text in quotes exactly as written"
    pieces = []
    for part in split_query(s):
        keep_case = part["comment"] or part["quote"]
        pieces.append(part["string"] if keep_case else part["string"].lower())
    return "".join(pieces)

In [None]:
assert_and_print(
    lowercase_query("""
--- My nice view 1 --
Create or Replace VieW view_1 as
seLect asdf, -- Some Comment
qwER,
qwerTy, -- Some other comment
FROM table1
"""),
    """
--- My nice view 1 --
create or replace view view_1 as
select asdf, -- Some Comment
qwer,
qwerty, -- Some other comment
from table1
"""
)


--- My nice view 1 --
create or replace view view_1 as
select asdf, -- Some Comment
qwer,
qwerty, -- Some other comment
from table1



In [None]:
assert_and_print(
    lowercase_query("""
-- Some comment --
Create Or rePlace tablE aS
sElEct asdf,
/* sOme CommEnt */
qwer
FroM table1
"""),
"""
-- Some comment --
create or replace table as
select asdf,
/* sOme CommEnt */
qwer
from table1
"""
)


-- Some comment --
create or replace table as
select asdf,
/* sOme CommEnt */
qwer
from table1



In [None]:
assert_and_print(
    lowercase_query("""
-- Some comment --
Create Or rePlace tablE aS
sElEct asdf, replace('J', 'N', Asdf2) as Asdf3
/* sOme CommEnt */
qwer
FroM table1
"""),
"""
-- Some comment --
create or replace table as
select asdf, replace('J', 'N', asdf2) as asdf3
/* sOme CommEnt */
qwer
from table1
"""
)


-- Some comment --
create or replace table as
select asdf, replace('J', 'N', asdf2) as asdf3
/* sOme CommEnt */
qwer
from table1



In [None]:
assert_and_print(
    lowercase_query("""
-- Some comment --
Create Or rePlace tablE aS
sElEct asdf, replace('J', 'N', Asdf2) as Asdf3
/* sOme CommEnt */
qwer
FroM table1 -- Some comment
"""),
"""
-- Some comment --
create or replace table as
select asdf, replace('J', 'N', asdf2) as asdf3
/* sOme CommEnt */
qwer
from table1 -- Some comment
"""
)


-- Some comment --
create or replace table as
select asdf, replace('J', 'N', asdf2) as asdf3
/* sOme CommEnt */
qwer
from table1 -- Some comment



## Add whitespaces between symbols

In [None]:
#export
def add_whitespaces_query(s):
    "Add whitespaces around symbols (=!<>) in query `s`, skipping comments and text in quotes"
    spaced = []
    # fragments flagged as comment / non-comment and quote / non-quote
    for part in split_comment_quote(s):
        text = part["string"]
        if not (part["comment"] or part["quote"]):
            text = add_whitespaces_between_symbols(text)
        spaced.append(text)
    return "".join(spaced)

In [None]:
assert_and_print(
    add_whitespaces_query(
"""
create or replace table my_table as /* some comment 1=1, 1 =1 */
select asdf, case when asdf= 1 then '=' else 0 end as qwerty
from table1 as a
left join table2 as b on a.asdf= b.asdf
where asdf=1 and qwer =2
"""
    ),
"""
create or replace table my_table as /* some comment 1=1, 1 =1 */
select asdf, case when asdf = 1 then '=' else 0 end as qwerty
from table1 as a
left join table2 as b on a.asdf = b.asdf
where asdf = 1 and qwer = 2
"""    
)


create or replace table my_table as /* some comment 1=1, 1 =1 */
select asdf, case when asdf = 1 then '=' else 0 end as qwerty
from table1 as a
left join table2 as b on a.asdf = b.asdf
where asdf = 1 and qwer = 2



## Specific formatting and validation

Now we will format each statement individually

#### PARTITION BY

Helper function to format PARTITION BY within SELECT

In [None]:
#export
def format_partition_by(s, base_indentation):
    "Format the line `s` of a SELECT (DISTINCT) that contains PARTITION BY, `base_indentation` being the column where the line starts"
    has_order_by = re.search("order by", s, flags=re.I) is not None
    # split into: text before PARTITION BY, the PARTITION BY part and (if present) the ORDER BY part
    pattern = "(partition by.*)(order by.*)" if has_order_by else "(partition by.*)"
    pieces = [piece for piece in re.split(pattern, s, flags=re.I) if piece != ""]
    head, partition_clause = pieces[0], pieces[1]
    head_width = base_indentation + len(head)  # column where PARTITION BY starts
    # newline after each comma (no comments); continuation lines are indented 13 columns further
    partition_clause = add_newline_indentation(partition_clause, indentation=head_width + 13)
    if has_order_by:
        # put the ORDER BY part (and anything after it) back behind the partition columns
        partition_clause = partition_clause + " " + "".join(pieces[2:])
    # move ORDER BY to its own line, aligned with PARTITION BY
    partition_clause = re.sub(
        r"\s(order by.*)",
        "\n" + " " * head_width + r"\1",
        partition_clause,
        flags=re.I,
    )
    # combine begin of string with formatted partition by
    return (head + partition_clause).strip()

#### Remove mistaken comma at end of SELECT

In [None]:
#export
def remove_wrong_end_comma(split_s):
    """Remove mistakenly placed commas at the end of SELECT statement using `split_s` with keys
    "string", "comment" and "quote"

    `split_s` is a list of dictionaries, one per fragment of the statement. The fragments
    are scanned from the end: comments and whitespace-only fragments are skipped and the
    first fragment found decides the outcome

    * code fragment: commas at its end are removed, wherever they follow (a word,
      a closing parenthesis, ...). Whitespaces after the commas are removed too if the
      fragment is the very last one, and kept if something (e.g. a comment) follows
    * quote fragment: the statement ends with a quoted literal, so there is no trailing
      comma and nothing is removed (a comma before the literal is a field separator)

    In addition whitespaces around newlines are removed in every fragment. The "string"
    values are updated in place and a new list with the same dictionaries is returned
    """
    # one or more commas at the end of the fragment, optionally followed by whitespaces
    trailing_comma_re = re.compile(r",+(\s*)$")
    searching = True  # still looking for the last fragment that is not a comment
    for i, d in enumerate(reversed(split_s)):
        s_aux = d["string"]
        if searching and not d["comment"] and s_aux.strip() != "":
            if not d["quote"]:
                # i == 0: end of select (nothing afterwards), drop the whitespaces as well
                s_aux = trailing_comma_re.sub("" if i == 0 else r"\1", s_aux)
            searching = False
        # remove whitespaces between newline symbols
        d["string"] = remove_whitespaces_newline(s_aux)
    return list(split_s)

In [None]:
assert_and_print(
    split_apply_concat("select asdf, qwer, ", remove_wrong_end_comma),
    "select asdf, qwer"
)

select asdf, qwer


In [None]:
assert_and_print(
    split_apply_concat("select asdf, qwer, -- some comment", remove_wrong_end_comma),
    "select asdf, qwer -- some comment"
)

select asdf, qwer -- some comment


In [None]:
assert_and_print(
    split_apply_concat("select asdf, qwer, /* another comment */", remove_wrong_end_comma),
    "select asdf, qwer /* another comment */"
)

select asdf, qwer /* another comment */


In [None]:
assert_and_print(
    split_apply_concat("select asdf, qwer,,,, /* more than 1 comma */", remove_wrong_end_comma),    
    "select asdf, qwer /* more than 1 comma */"
)

select asdf, qwer /* more than 1 comma */


### SELECT

In [None]:
#export
def format_select(s):
    """Format SELECT statement line `s`

    Steps, in order

    1. remove a [C] token at the very end and a mistakenly placed comma at the end of SELECT
    2. add a newline and indentation after each comma (comments are taken out first)
    3. break `case when ... end` into one line per WHEN / ELSE and per AND / OR condition
    4. format lines containing PARTITION BY
    5. assign the comments back to the formatted text

    Returns the formatted statement as a string
    """
    # remove [C] at end of SELECT
    s = re.sub(r"\[C\]$", "", s)
    split_s = split_comment_quote(s)  # split by comment / non-comment, quote / non-quote
    # if comma is found at the end of select statement then remove comma
    split_s = remove_wrong_end_comma(split_s)
    # check whether there is a SELECT DISTINCT in the code (not comments, not text in quotes)
    s_code = "".join([d["string"] for d in split_s if not d["comment"] and not d["quote"]])
    # save the correct indentation: 16 = len("select distinct "), 7 = len("select ")
    indentation = 16 if re.search("^select distinct", s_code, flags=re.I) else 7
    # get only comment / non-comment
    split_comment = compress_dicts(split_s, ["comment"])
    # add newline after each comma and indentation (this is robust against quotes by construction)
    s = add_newline_indentation("".join([d["string"] for d in split_s if not d["comment"]]),
                                indentation=indentation)
    # split again
    split_s = split_comment_quote(s)
    # get only quote / non-quote
    split_quote = compress_dicts([d for d in split_s if not d["comment"]], ["quote"])
    # WHEN / ELSE as whole words only, so that identifiers that merely contain them
    # (e.g. something_else, whenever_flag) are not broken; a WHEN directly after "case " is skipped
    when_else_re = re.compile(r"(?<!case )\b(when|else)\b", flags=re.I)
    case_when_re = re.compile("case when", flags=re.I)
    case_and_or = re.compile(r"\b((?:and|or))\b", flags=re.I)
    case_end_re = re.compile(r"\bend\b", flags=re.I)
    # initialize additional indentation for case statements
    case_extra_indentation = 0
    # initialize indicator for case when ... end
    # (kept across fragments because a case expression may span quoted text)
    case_ind = False
    # process case when
    for d in split_quote:
        if d["quote"]:
            continue
        # get string out of dictionary
        s_aux = d["string"]
        # add newline before when or else (but not if when is preceded by case)
        s_aux = when_else_re.sub(r"\n\1", s_aux)
        # split by newline characters
        split_aux = s_aux.split("\n")
        # initialize auxiliary output
        split_aux_out = []
        for line in split_aux:
            strip_line = line.strip()
            case_when_search = case_when_re.search(strip_line)
            if case_when_search:
                case_ind = True  # turn indicator on if case ... end begins
                # additional indentation if case is inside a function, e.g. cast(case when ...
                case_extra_indentation = case_when_search.start()
            if case_ind:
                # indent each when / else (without case before): 5 = len("case ")
                line = when_else_re.sub(
                    " " * (indentation + 5 + case_extra_indentation) + r"\1",
                    line
                )
                # add newline after each and / or within case when + indentation;
                # 9 spaces plus the whitespace that already follows and / or align
                # the next condition behind "case when "
                line = case_and_or.sub(
                    r"\1\n" + " " * (indentation + 9 + case_extra_indentation),
                    line
                )
            if case_end_re.search(line):  # turn indicator off if case ... end ends
                case_ind = False
            split_aux_out.append(line)
        d["string"] = "\n".join(split_aux_out)  # add formatted string back to dictionary
    s = "".join([d["string"] for d in split_quote])
    s = "\n".join([sp.rstrip() for sp in s.split("\n")])  # strip each line from the right
    # format PARTITION BY: split off the leading "select (distinct) " and handle each line
    begin_s = s[0:indentation]
    split_s = s[indentation:].split("\n" + (" " * indentation))
    partition_by_re = re.compile("partition by", flags=re.I)
    split_s = [
        format_partition_by(line, base_indentation=indentation)
        if partition_by_re.search(line) else line
        for line in split_s
    ]
    s = begin_s + ("\n" + (" " * indentation)).join(split_s)
    # get comments and preceding string (non-comment)
    # NOTE(review): for a comment in first position (i == 0) the index i-1 wraps around
    # to the last fragment - confirm whether the statement can start with a comment
    comment_dicts = []
    for i, d in enumerate(split_comment):
        if d["comment"]:
            comment_dicts.append({"comment": d["string"], "preceding": split_comment[i-1]["string"]})
    # assign comments to text
    s = assign_comment(s, comment_dicts)
    return s

Simple usage without comments

In [None]:
assert_and_print(
    format_select("select aSdf, cast(qweR as numeric),  Asdf,qwer1"),
    "select aSdf,\n       cast(qweR as numeric),\n       Asdf,\n       qwer1"
)

select aSdf,
       cast(qweR as numeric),
       Asdf,
       qwer1


More advanced usage with comments in SELECT

In [None]:
assert_and_print(
    format_select("select asdf, cast(qwer as numeric), -- some comment[C]ASDF, qwer1"),
    "select asdf,\n       cast(qwer as numeric), -- some comment\n       ASDF,\n       qwer1"
)

select asdf,
       cast(qwer as numeric), -- some comment
       ASDF,
       qwer1


Correcting a common mistake on the fly: comma at end of SELECT

In [None]:
assert_and_print(
    format_select("select qwer1,   asdf,"),
    "select qwer1,\n       asdf"
)

select qwer1,
       asdf


In [None]:
assert_and_print(
    format_select("SELECT a.asdf, b.qwer, -- some comment here[C][CS]/* and here is a line comment inside select */[C]qwer2"),
"""
SELECT a.asdf,
       b.qwer, -- some comment here
       /* and here is a line comment inside select */
       qwer2
""".strip()
)

SELECT a.asdf,
       b.qwer, -- some comment here
       /* and here is a line comment inside select */
       qwer2


In [None]:
assert_and_print(
    format_select("SELECT a.asdf,[CS]/* and here is a line comment inside select */"),
"""
SELECT a.asdf
       /* and here is a line comment inside select */
""".strip()
)

SELECT a.asdf
       /* and here is a line comment inside select */


Correcting comma at end of SELECT but having a comment in the last field

In [None]:
assert_and_print(
    format_select("select qwer1 as qwer2,   asdf as asdf3, -- this field"),
    "select qwer1 as qwer2,\n       asdf as asdf3 -- this field"
)

select qwer1 as qwer2,
       asdf as asdf3 -- this field


In [None]:
assert_and_print(
    format_select("select qwer1,   asdf, /* this field */"),
    "select qwer1,\n       asdf /* this field */"
)

select qwer1,
       asdf /* this field */


With `case when` conditions

In [None]:
assert_and_print(
    format_select("select qwer1, case when abc = 1 then 'a' when abc = 2 then 'b' else 'c' end as qwer2"),
    """
select qwer1,
       case when abc = 1 then 'a'
            when abc = 2 then 'b'
            else 'c' end as qwer2
    """.strip()    
)

select qwer1,
       case when abc = 1 then 'a'
            when abc = 2 then 'b'
            else 'c' end as qwer2


In [None]:
assert_and_print(
    format_select("select qwer1, case when abc = 1 then 'a' -- first condition[C]" +
                  "when abc = 2 then 'b' -- second condition[C]" +
                  "else 'c' end as qwer2, /* else condition */[C]"
                  "asdf3"
),
    """
select qwer1,
       case when abc = 1 then 'a' -- first condition
            when abc = 2 then 'b' -- second condition
            else 'c' end as qwer2, /* else condition */
       asdf3
    """.strip()    
)

select qwer1,
       case when abc = 1 then 'a' -- first condition
            when abc = 2 then 'b' -- second condition
            else 'c' end as qwer2, /* else condition */
       asdf3


Badly formatted `case when` condition

In [None]:
assert_and_print(
    format_select("select qwer1, case when abc <= 1 then 'a' -- first condition[C]" +
                  "when abc = 2 then 'b' -- second condition[C]" +
                  "else 'c' end as qwer2, -- else condition[C]"
                  "asdf3"
),
    """
select qwer1,
       case when abc <= 1 then 'a' -- first condition
            when abc = 2 then 'b' -- second condition
            else 'c' end as qwer2, -- else condition
       asdf3
    """.strip()    
)

select qwer1,
       case when abc <= 1 then 'a' -- first condition
            when abc = 2 then 'b' -- second condition
            else 'c' end as qwer2, -- else condition
       asdf3


`case when` and comment after condition

In [None]:
assert_and_print(
    format_select("select qwer1, case when abc <= 1 and -- first condition[C]" +
                  "abc >= -1 then 'a' -- second condition[C]" +
                  "else 'c' end as qwer2, -- else condition[C]"
                  "asdf3"
),
    """
select qwer1,
       case when abc <= 1 and -- first condition
                 abc >= -1 then 'a' -- second condition
            else 'c' end as qwer2, -- else condition
       asdf3
    """.strip()    
)

select qwer1,
       case when abc <= 1 and -- first condition
                 abc >= -1 then 'a' -- second condition
            else 'c' end as qwer2, -- else condition
       asdf3


`case when` in comments

In [None]:
assert_and_print(
    format_select("select qwer1, case when abc <= 1 and -- first condition case when[C]" +
                  "abc >= -1 then 'a' -- second condition case when[C]" +
                  "else 'c' end as qwer2, -- else condition[C]"
                  "asdf3"
),
"""
select qwer1,
       case when abc <= 1 and -- first condition case when
                 abc >= -1 then 'a' -- second condition case when
            else 'c' end as qwer2, -- else condition
       asdf3
""".strip()    
)

select qwer1,
       case when abc <= 1 and -- first condition case when
                 abc >= -1 then 'a' -- second condition case when
            else 'c' end as qwer2, -- else condition
       asdf3


In [None]:
assert_and_print(
    format_select("select asdf, case when asdf >= 1 and asdf <= 10 and" +
                  " substr(qwer, 1, 2) = 'abc' and substr(qwer, 3, 2) = 'qwerty'" +
                  " then 1 else 0 end as case_field, asdf2"
    ),
"""
select asdf,
       case when asdf >= 1 and
                 asdf <= 10 and
                 substr(qwer, 1, 2) = 'abc' and
                 substr(qwer, 3, 2) = 'qwerty' then 1
            else 0 end as case_field,
       asdf2
""".strip()
)

select asdf,
       case when asdf >= 1 and
                 asdf <= 10 and
                 substr(qwer, 1, 2) = 'abc' and
                 substr(qwer, 3, 2) = 'qwerty' then 1
            else 0 end as case_field,
       asdf2


Two `case when ... end` expressions

In [None]:
assert_and_print(
    format_select(
        "select asdf, cast(case when asdf = 1 then 0 else 1 end as int) as qwer, " +
        "case when asdf = 0 then 1 else 0 end as qwer2"
    ),
"""
select asdf,
       cast(case when asdf = 1 then 0
                 else 1 end as int) as qwer,
       case when asdf = 0 then 1
            else 0 end as qwer2
""".strip()
)

select asdf,
       cast(case when asdf = 1 then 0
                 else 1 end as int) as qwer,
       case when asdf = 0 then 1
            else 0 end as qwer2


With functions in SELECT

In [None]:
assert_and_print(
    format_select("select aSdf, substr(qweR, 2) as qwer,  Asdf,qwer1"),
    "select aSdf,\n       substr(qweR, 2) as qwer,\n       Asdf,\n       qwer1"
)

select aSdf,
       substr(qweR, 2) as qwer,
       Asdf,
       qwer1


In [None]:
assert_and_print(
    format_select(
"""
select car_id,
       avg(price) as avg_price,
"""
    ),
"""
select car_id,
       avg(price) as avg_price
""".strip()
)

select car_id,
       avg(price) as avg_price


With function in SELECT and case when

In [None]:
assert_and_print(
    format_select("select qwer1, cast(case when asdf = 'J' then 1 else 0 end) as qwer2, qwer3"),
"""
select qwer1,
       cast(case when asdf = 'J' then 1
                 else 0 end) as qwer2,
       qwer3
""".strip()    
)

select qwer1,
       cast(case when asdf = 'J' then 1
                 else 0 end) as qwer2,
       qwer3


In [None]:
assert_and_print(
    format_select("select qwer1, cast(substr(case when asdf = 'CASE WHEN' then 1 else 0 end, 2, 1)) as qwer2, qwer3"),
"""
select qwer1,
       cast(substr(case when asdf = 'CASE WHEN' then 1
                        else 0 end, 2, 1)) as qwer2,
       qwer3
""".strip()    
)

select qwer1,
       cast(substr(case when asdf = 'CASE WHEN' then 1
                        else 0 end, 2, 1)) as qwer2,
       qwer3


With `SELECT DISTINCT`

In [None]:
assert_and_print(
    format_select("select distinct asdf, qwer, qwer2,"),
"""
select distinct asdf,
                qwer,
                qwer2
""".strip()
)

select distinct asdf,
                qwer,
                qwer2


In [None]:
assert_and_print(
    format_select("select distinct asdf, case when asdf = 1 then 1 else 2 end as qwerty, qwer2,"),
"""
select distinct asdf,
                case when asdf = 1 then 1
                     else 2 end as qwerty,
                qwer2
""".strip()
)

select distinct asdf,
                case when asdf = 1 then 1
                     else 2 end as qwerty,
                qwer2


With `PARTITION BY`

In [None]:
assert_and_print(
    format_select("select asdf, lead(asdf) over (partition by qwer, asdf2 order by qwer2) as qwer3, qwerty,"),
"""
select asdf,
       lead(asdf) over (partition by qwer,
                                     asdf2
                        order by qwer2) as qwer3,
       qwerty
""".strip()
)

select asdf,
       lead(asdf) over (partition by qwer,
                                     asdf2
                        order by qwer2) as qwer3,
       qwerty


In [None]:
assert_and_print(
    format_select("select asdf, lead(asdf) over (partition by asdf, qwer order by qwer), cast(qwer as numeric), -- some comment[C]ASDF, "),
"""
select asdf,
       lead(asdf) over (partition by asdf,
                                     qwer
                        order by qwer),
       cast(qwer as numeric), -- some comment
       ASDF
""".strip()
)

select asdf,
       lead(asdf) over (partition by asdf,
                                     qwer
                        order by qwer),
       cast(qwer as numeric), -- some comment
       ASDF


In [None]:
assert_and_print(
    format_select("select asdf, lead(asdf) over (partition by asdf, qwer order by qwer, qwer2), cast(qwer as numeric), -- some comment[C]ASDF, "),
"""
select asdf,
       lead(asdf) over (partition by asdf,
                                     qwer
                        order by qwer, qwer2),
       cast(qwer as numeric), -- some comment
       ASDF
""".strip()
)

select asdf,
       lead(asdf) over (partition by asdf,
                                     qwer
                        order by qwer, qwer2),
       cast(qwer as numeric), -- some comment
       ASDF


In [None]:
assert_and_print(
    format_select("select asdf, lead(asdf, 1, 2) OVER (PARTITION BY snr, qwer ORDER BY asdf, qwer)"),
"""
select asdf,
       lead(asdf, 1, 2) OVER (PARTITION BY snr,
                                           qwer
                              ORDER BY asdf, qwer)
""".strip()
)

select asdf,
       lead(asdf, 1, 2) OVER (PARTITION BY snr,
                                           qwer
                              ORDER BY asdf, qwer)


In [None]:
assert_and_print(
    format_select("select DISTINCT asdf, lead(asdf) over (partition by asdf, qwer order by qwer), cast(qwer as numeric), -- some comment[C]ASDF, "),
"""
select DISTINCT asdf,
                lead(asdf) over (partition by asdf,
                                              qwer
                                 order by qwer),
                cast(qwer as numeric), -- some comment
                ASDF
""".strip()
)

select DISTINCT asdf,
                lead(asdf) over (partition by asdf,
                                              qwer
                                 order by qwer),
                cast(qwer as numeric), -- some comment
                ASDF


With comments within `PARTITION BY`

In [None]:
assert_and_print(
    format_select("select DISTINCT asdf, lead(asdf) over (partition by asdf, -- some comment[C]qwer order by qwer), cast(qwer as numeric), -- some comment[C]ASDF, "),
"""
select DISTINCT asdf,
                lead(asdf) over (partition by asdf, -- some comment
                                              qwer
                                 order by qwer),
                cast(qwer as numeric), -- some comment
                ASDF
""".strip()
)

select DISTINCT asdf,
                lead(asdf) over (partition by asdf, -- some comment
                                              qwer
                                 order by qwer),
                cast(qwer as numeric), -- some comment
                ASDF


### FROM

In [None]:
#export
def format_from(s):
    "Format FROM statement line `s` by padding the FROM keyword with two extra spaces"
    # keyword (with its trailing space) and the rest of the line, matched case-insensitively
    from_clause = re.compile(r"(from )(.*)", flags=re.I)
    # rebuild each match as: keyword + two padding spaces + remainder
    return from_clause.sub(lambda m: m.group(1) + "  " + m.group(2), s)

In [None]:
assert_and_print(format_from("from table1"), "from   table1")

from   table1


### (LEFT / RIGHT / INNER / OUTER) JOIN

In [None]:
#export
def format_join(s):
    "Format JOIN statement line `s` by prefixing it with four spaces"
    join_indent = " " * 4  # indentation put in front of every JOIN line
    return join_indent + s

In [None]:
assert_and_print(format_join("inner join table1"), "    inner join table1")

    inner join table1


### ON

In [None]:
#export
def format_on(s):
    """Format ON statement line `s`

    The line is indented by 8 spaces, a newline plus indentation is inserted
    after every `and` / `or` found outside comments and quotes, trailing
    whitespace is removed from each code line, and the comments are put back
    with `assign_comment`.
    """
    indented = " " * 8 + s  # add indentation
    pieces = split_comment_quote(indented)
    # compile once, reuse for every code piece
    and_or = re.compile(r"\b((?:and|or))\b", flags=re.I)
    break_after = r"\1\n" + " " * 10  # keep the keyword, continue on an indented new line
    for piece in pieces:
        if piece["comment"] or piece["quote"]:
            continue  # never touch the text of comments or quoted literals
        piece["string"] = and_or.sub(break_after, piece["string"])
    # regroup the pieces into comment / non-comment runs
    by_comment = compress_dicts(pieces, ["comment"])
    # code only (comments left out), right-stripped line by line
    code = "".join([piece["string"] for piece in pieces if not piece["comment"]])
    code = "\n".join([line.rstrip() for line in code.split("\n")])
    # pair every comment with the text of the run right before it
    comment_dicts = [
        {"comment": run["string"], "preceding": by_comment[i - 1]["string"]}
        for i, run in enumerate(by_comment)
        if run["comment"]
    ]
    # assign comments to text
    return assign_comment(code, comment_dicts)

In [None]:
assert_and_print(
    format_on("on a.asdf = b.asdf /* some comment */[C]"), 
"""
        on a.asdf = b.asdf /* some comment */
""".strip("\n")
)

        on a.asdf = b.asdf /* some comment */


In [None]:
assert_and_print(
    format_on("on a.asdf = b.asdf and a.qwer = b.qwer"), 
"""
        on a.asdf = b.asdf and
           a.qwer = b.qwer
""".strip("\n")
)

        on a.asdf = b.asdf and
           a.qwer = b.qwer


In [None]:
assert_and_print(
    format_on("on a.asdf = b.asdf and a.qwer = b.qwer or a.qwer2 = b.qwer2"), 
"""
        on a.asdf = b.asdf and
           a.qwer = b.qwer or
           a.qwer2 = b.qwer2
""".strip("\n")
)

        on a.asdf = b.asdf and
           a.qwer = b.qwer or
           a.qwer2 = b.qwer2


With comments and badly formatted input

In [None]:
assert_and_print(
    format_on("on a.asdf = b.asdf -- some comment[C]and a.qwer = b.qwer or /* another comment */[C]a.qwer2 = b.qwer2"), 
"""
        on a.asdf = b.asdf and -- some comment
           a.qwer = b.qwer or /* another comment */
           a.qwer2 = b.qwer2
""".strip("\n")
)

        on a.asdf = b.asdf and -- some comment
           a.qwer = b.qwer or /* another comment */
           a.qwer2 = b.qwer2


### WHERE

In [None]:
#export
def format_where(s):
    """Format WHERE statement line `s`

    Adds one extra space after the (first) WHERE keyword and moves every
    `and` / `or` found outside comments and quotes to its own line, indented by
    3 and 4 spaces respectively. Trailing whitespace is removed from each code
    line and the comments are put back with `assign_comment`.
    """
    # add indentation after WHERE; only the first keyword is padded so that a
    # later "where " (e.g. inside a quoted literal) is left untouched
    s = re.sub(r"\b(where )", r"\1 ", s, count=1, flags=re.I)
    # split by comment / non comment, quote / non-quote
    split_s = split_comment_quote(s)
    # define regex before loop; the leading \b keeps identifiers that merely end
    # in "and" / "or" (brand, color, vendor, ...) from being split in the middle
    indent_and = re.compile(r"\s*\b(and)\b", flags=re.I)
    indent_or = re.compile(r"\s*\b(or)\b", flags=re.I)
    for d in split_s:
        if not d["comment"] and not d["quote"]:
            s_aux = d["string"]
            s_aux = indent_and.sub("\n" + " " * 3 + r"\1", s_aux)  # add newline and indentation for and
            s_aux = indent_or.sub("\n" + " " * 4 + r"\1", s_aux)  # add newline and indentation for or
            d["string"] = s_aux
    # get split comment / non comment
    split_comment = compress_dicts(split_s, ["comment"])
    s_code = "".join([d["string"] for d in split_s if not d["comment"]])
    # strip from the right each code line
    s_code = "\n".join([sp.rstrip() for sp in s_code.split("\n")])
    # get comments and preceding string (non-comment)
    comment_dicts = []
    for i, d in enumerate(split_comment):
        if d["comment"]:
            # NOTE(review): for i == 0 the index i-1 wraps around to the last element;
            # presumably a WHERE line never starts with a comment - confirm
            comment_dicts.append({"comment": d["string"], "preceding": split_comment[i-1]["string"]})
    # assign comments to text
    s = assign_comment(s_code, comment_dicts)
    return s

In [None]:
assert_and_print(
    format_where(
        "WHERE asdf = 1 and qwer = 1 or blabla = 'asdf'"
    ), "WHERE  asdf = 1\n   and qwer = 1\n    or blabla = 'asdf'"
)

WHERE  asdf = 1
   and qwer = 1
    or blabla = 'asdf'


In [None]:
assert_and_print(
    format_where(
        "WHERE asdf = 1 -- and some comment[C]and qwer = 1 or blabla = 'asdf'"
    ), "WHERE  asdf = 1 -- and some comment\n   and qwer = 1\n    or blabla = 'asdf'"
)

WHERE  asdf = 1 -- and some comment
   and qwer = 1
    or blabla = 'asdf'


In [None]:
assert_and_print(
    format_where(
        "WHERE asdf = 1 and -- and some comment[C]qwer = 1 or blabla = 'asdf'"
    ), "WHERE  asdf = 1 -- and some comment\n   and qwer = 1\n    or blabla = 'asdf'"
)

WHERE  asdf = 1 -- and some comment
   and qwer = 1
    or blabla = 'asdf'


## Format all statements

In [None]:
#export
def format_statement_line(s):
    """Format statement line `s`

    Dispatches `s` to the formatter registered for the statement keyword the
    line starts with (case-insensitive) and returns its result. Lines that do
    not start with one of the registered keywords are returned unchanged.
    """
    statement_funcs = {
        "select": format_select,
        "from": format_from,
        "left join": format_join,
        "right join": format_join,
        "inner join": format_join,
        "outer join": format_join,
        "on": format_on,
        "where": format_where,
    }
    for key, format_func in statement_funcs.items():
        # require a word boundary after the keyword so that identifiers such as
        # `online_flag` or `selected` are not mistaken for ON / SELECT
        if re.match(key + r"\b", s, flags=re.I):
            # stop at the first match: the formatted output must not be
            # re-examined against the remaining keywords
            return format_func(s)
    return s

In [None]:
assert_and_print(
    format_statement_line("select asdf, qwer"),
    """
select asdf,
       qwer
""".strip())

select asdf,
       qwer


In [None]:
assert_and_print(
    format_statement_line("left join table1 as abc"),
    "    left join table1 as abc"
)

    left join table1 as abc


In [None]:
assert_and_print(
    format_statement_line("where asdf = 1 and qwer = 'things' and blabla = 0 or stuff = -1"),
    """
where  asdf = 1
   and qwer = 'things'
   and blabla = 0
    or stuff = -1
    """.strip())

where  asdf = 1
   and qwer = 'things'
   and blabla = 0
    or stuff = -1


In [None]:
#export
def format_statements(s):
    "Format every line of `s` with `format_statement_line` and join the results with newlines"
    return "\n".join([format_statement_line(raw_line) for raw_line in s.split("\n")])

In [None]:
assert_and_print(
    format_statements("select asdf, qwer\nfrom table1"),
"""
select asdf,
       qwer
from   table1
""".strip()
)

select asdf,
       qwer
from   table1


### Format multiline comments

In [None]:
#export
def format_multiline_comments(s):
    """Format multiline comments by replacing multiline comment [CI] by newline and adding indentation

    Every `[CI]` marker becomes a newline. The continuation line is indented
    three columns past the `/*` that opens the comment it belongs to: the first
    `/*` following the last closed comment on the line, falling back to the
    first `/*` of the line. If the line holds no `/*` before the marker, the
    continuation line is not indented instead of raising an error.
    """
    marker = "[CI]"
    formatted_lines = []
    for line in s.split("\n"):  # loop on query lines
        if marker not in line:
            formatted_lines.append(line)
            continue
        indentation = 0  # used when no comment opener precedes the marker
        pieces = []
        for k, segment in enumerate(line.split(marker)):
            # continuation pieces carry the indentation of the comment they continue
            piece = segment if k == 0 else " " * indentation + segment
            pieces.append(piece)
            last_close = piece.rfind("*/")
            if k > 0 and last_close < 0:
                continue  # still inside the same comment: keep its indentation
            # a comment opened after the last closed one is the one that continues
            opener = piece.find("/*", last_close + 2) if last_close >= 0 else -1
            if opener < 0 and k == 0:
                opener = piece.find("/*")  # fall back to the first opener of the line
            if opener >= 0:
                indentation = opener + 3
        formatted_lines.append("\n".join(pieces))
    return "\n".join(formatted_lines)

##  Putting everything together

to format a simple query without subqueries

In [None]:
#export
def format_simple_sql(s):
    "Format a simple SQL query without subqueries `s`"
    pipeline = (
        lowercase_query,  # everything lowercased but not the comments
        preformat_statements,  # add breaklines for the main statements
        add_whitespaces_query,  # add whitespaces between symbols in query
        format_statements,  # format statements
    )
    for step in pipeline:
        s = step(s)
    # resolve the comment markers that are still left in the query
    s = s.replace("[C]", "")  # replace remaining [C]
    s = s.replace("[CS]", "\n")  # replace remaining [CS]
    s = re.sub(r"\s+\n", "\n", s)  # replace redundant whitespaces before newline
    # format multiline comments, then strip the query
    return format_multiline_comments(s).strip()

In [None]:
assert_and_print(
    format_simple_sql(example_sql),
    expected_sql
)

CREATE OR REPLACE TABLE mytable AS -- Mytable example
/* multi line
   comment */
SELECT a.asdf,
       b.qwer, -- some comment here
       /* and here is a line comment inside select */
       substr(c.asdf, 1, 2) as substr_asdf,
       /* some commenT
          there */
       case when a.asdf = 1 then 'b' /* here a case comment */
            when b.qwer = 2 then 'c'
            else 'd' end as new_field, -- Some comment
       b.asdf2
FROM   table1 as a
    LEFT JOIN table2 as b -- and here a comment
        ON a.asdf = b.asdf /* joiN this way */
    INNER JOIN table3 as c
        ON a.asdf = c.asdf and
           a.qwer = b.qwer
WHERE  a.asdf = 1 -- comment this
   and b.qwer = 2
   and a.asdf <= 1 --comment that
    or b.qwer >= 5
GROUP BY a.asdf


In [None]:
assert_and_print(
    format_simple_sql(
"""
create or replace table first_table as -- my first table
select car_id,
       avg(price) as avg_price,
from first_view
group by car_id
"""
    ),
"""
CREATE OR REPLACE TABLE first_table AS -- my first table
SELECT car_id,
       avg(price) as avg_price
FROM   first_view
GROUP BY car_id
""".strip()
)

CREATE OR REPLACE TABLE first_table AS -- my first table
SELECT car_id,
       avg(price) as avg_price
FROM   first_view
GROUP BY car_id


## Queries with subqueries

This is how we could (badly) write a query with subqueries

In [None]:
example_with_subqueries = """
select asdf, cast(qwer as numeric), -- some comment
substr(qwer1, 3, 2) as substr_qwer /* some field */
from 
(select asdf, qwer, /* some nice field */ from table1 where asdf = 1) as a
left 
join (select asdf, qwer2 from table2 where qwer2 = 1) as b
on a.asdf = b.asdf
where qwer1 >= 0
"""

and this is the way we would like to have it nicely formatted

In [None]:
expected_with_subqueries = """
SELECT asdf,
       cast(qwer as numeric), -- some comment
       substr(qwer1, 3, 2) as substr_qwer /* some field */
FROM   (SELECT asdf,
               qwer /* some nice field */
        FROM   table1
        WHERE  asdf = 1) as a
    LEFT JOIN (SELECT asdf,
                      qwer2
               FROM   table2
               WHERE  qwer2 = 1) as b
        ON a.asdf = b.asdf
WHERE  qwer1 >= 0
""".strip()

### Main function handling queries with subqueries

In [None]:
#export
def format_sql(s):
    """Format SQL query with subqueries `s`

    The query is first formatted with `format_simple_sql`. Then, for as long as
    `extract_outer_subquery` reports a position, the subquery found there is
    replaced by the result of `format_subquery`, which also receives the text
    preceding the subquery.
    """
    query = format_simple_sql(s)  # basic query formatting
    position = extract_outer_subquery(query)  # first outer subquery, if any
    while position is not None:
        first = position[0]
        stop = position[1] + 1  # the reported end index is inclusive
        before, subquery, after = query[:first], query[first:stop], query[stop:]
        # format the subquery and put the three parts together again
        query = "".join([before, format_subquery(subquery, before), after])
        # look for the next outer subquery
        position = extract_outer_subquery(query)
    # remove whitespace in front of closing parentheses
    return re.sub(r"\s*\)", ")", query)

In [None]:
assert_and_print(
    format_sql(example_with_subqueries),
    expected_with_subqueries
)

SELECT asdf,
       cast(qwer as numeric), -- some comment
       substr(qwer1, 3, 2) as substr_qwer /* some field */
FROM   (SELECT asdf,
               qwer /* some nice field */
        FROM   table1
        WHERE  asdf = 1) as a
    LEFT JOIN (SELECT asdf,
                      qwer2
               FROM   table2
               WHERE  qwer2 = 1) as b
        ON a.asdf = b.asdf
WHERE  qwer1 >= 0


It even works with simple queries without subqueries, therefore generalizing the `format_simple_sql()` function

In [None]:
assert_and_print(
    format_sql(example_sql),
    expected_sql
)

CREATE OR REPLACE TABLE mytable AS -- Mytable example
/* multi line
   comment */
SELECT a.asdf,
       b.qwer, -- some comment here
       /* and here is a line comment inside select */
       substr(c.asdf, 1, 2) as substr_asdf,
       /* some commenT
          there */
       case when a.asdf = 1 then 'b' /* here a case comment */
            when b.qwer = 2 then 'c'
            else 'd' end as new_field, -- Some comment
       b.asdf2
FROM   table1 as a
    LEFT JOIN table2 as b -- and here a comment
        ON a.asdf = b.asdf /* joiN this way */
    INNER JOIN table3 as c
        ON a.asdf = c.asdf and
           a.qwer = b.qwer
WHERE  a.asdf = 1 -- comment this
   and b.qwer = 2
   and a.asdf <= 1 --comment that
    or b.qwer >= 5
GROUP BY a.asdf


## Nested subqueries

The function is also robust against nested subqueries

In [None]:
example_nested_subqueries = """
select asdf, qwer
from (select a.asdf,  lead(a.substr_qwer) over (partition by a.asdf, asdf2 order by qwer) as lead_qwerty
    from (select asdf, substr(qwer, 3, 2) as substr_qwer from table2) as a
        inner join (select asdf, qwer from table3) as b
            on a.qwer = b.qwer
)
"""

In [None]:
expected_nested = """SELECT asdf,
       qwer
FROM   (SELECT a.asdf,
               lead(a.substr_qwer) OVER (PARTITION BY a.asdf,
                                                      asdf2
                                         ORDER BY qwer) as lead_qwerty
        FROM   (SELECT asdf,
                       substr(qwer, 3, 2) as substr_qwer
                FROM   table2) as a
            INNER JOIN (SELECT asdf,
                               qwer
                        FROM   table3) as b
                ON a.qwer = b.qwer)"""

In [None]:
assert_and_print(
    format_sql(example_nested_subqueries),
    expected_nested
)

SELECT asdf,
       qwer
FROM   (SELECT a.asdf,
               lead(a.substr_qwer) OVER (PARTITION BY a.asdf,
                                                      asdf2
                                         ORDER BY qwer) as lead_qwerty
        FROM   (SELECT asdf,
                       substr(qwer, 3, 2) as substr_qwer
                FROM   table2) as a
            INNER JOIN (SELECT asdf,
                               qwer
                        FROM   table3) as b
                ON a.qwer = b.qwer)


### With SELECT DISTINCT

In [None]:
assert_and_print(
    format_sql(
"""
select asdf, qwer from (select distinct asdf, qwer from table1)
"""
    ),
"""
SELECT asdf,
       qwer
FROM   (SELECT DISTINCT asdf,
                        qwer
        FROM   table1)
""".strip()
)

SELECT asdf,
       qwer
FROM   (SELECT DISTINCT asdf,
                        qwer
        FROM   table1)


### More convoluted nested subquery

In [None]:
example_convoluted = """
select asdf
from (
    select asdf, qwer, /* some comment */
    from (select a.asdf, b.qwer, --some comment
          from (select asdf 
                from table1) as a 
            right join (select qwer 
                        from table2) as b
                on a.asdf = b.asdf)
)
"""

In [None]:
expected_convoluted = """SELECT asdf
FROM   (SELECT asdf,
               qwer /* some comment */
        FROM   (SELECT a.asdf,
                       b.qwer --some comment
                FROM   (SELECT asdf
                        FROM   table1) as a
                    RIGHT JOIN (SELECT qwer
                                FROM   table2) as b
                        ON a.asdf = b.asdf))"""

In [None]:
assert_and_print(
    format_sql(example_convoluted),
    expected_convoluted
)

SELECT asdf
FROM   (SELECT asdf,
               qwer /* some comment */
        FROM   (SELECT a.asdf,
                       b.qwer --some comment
                FROM   (SELECT asdf
                        FROM   table1) as a
                    RIGHT JOIN (SELECT qwer
                                FROM   table2) as b
                        ON a.asdf = b.asdf))


In [None]:
assert_and_print(
    format_sql(
"""
select asdf,
qwer
from table1 union select qwer,
asdf, asdf2 from table3
where asdf2 >=2
"""
    ),
"""
SELECT asdf,
       qwer
FROM   table1
UNION
SELECT qwer,
       asdf,
       asdf2
FROM   table3
WHERE  asdf2 >= 2
""".strip()
)

SELECT asdf,
       qwer
FROM   table1
UNION
SELECT qwer,
       asdf,
       asdf2
FROM   table3
WHERE  asdf2 >= 2


In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_format_file.ipynb.
Converted 02_utils.ipynb.
Converted 03_validation.ipynb.
Converted 04_release.ipynb.
Converted index.ipynb.
