In [None]:
#hide
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False  # workaround for buggy jedi

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# default_exp core

In [None]:
#export
import re
from sql_formatter.utils import *

# core

> Core functions for SQL formatting

In [None]:
#hide
from nbdev.showdoc import *

## General formatting

Basic formatting for SQL queries. Let's use an example throughout the core module.

This is what an input could look like:

In [None]:
example_sql = """
create or replace table mytable as -- Mytable example
/* multi line
   comment */
seLecT a.asdf, b.qwer, -- some comment here
/* and here is a line comment inside select */
substr(c.asdf, 1, 2) as substr_asdf, 
/* some commenT there */
case when a.asdf= 1 then 'b' /* here a case comment */
when b.qwer =2 then 'c' else 'd' end as new_field, -- Some comment
b.asdf2 frOm table1 as a leFt join 
table2 as b -- and here a comment
    on a.asdf = b.asdf  /* joiN this way */
    inner join table3 as c
on a.asdf=c.asdf
whEre a.asdf= 1 -- comment this
anD b.qwer =2 and a.asdf<=1 --comment that
or b.qwer>=5
groUp by a.asdf
"""

and this is how we would like to format it

In [None]:
expected_sql = """CREATE OR REPLACE TABLE mytable AS -- Mytable example
/* multi line
comment */
SELECT a.asdf,
       b.qwer, -- some comment here
       /* and here is a line comment inside select */
       substr(c.asdf, 1, 2) as substr_asdf,
       /* some commenT there */
       case when a.asdf = 1 then 'b' /* here a case comment */
            when b.qwer = 2 then 'c'
            else 'd' end as new_field, -- Some comment
       b.asdf2
FROM   table1 as a
    LEFT JOIN table2 as b -- and here a comment
        ON a.asdf = b.asdf /* joiN this way */
    INNER JOIN table3 as c
        ON a.asdf = c.asdf
WHERE  a.asdf = 1 -- comment this
   and b.qwer = 2
   and a.asdf <= 1 --comment that
    or b.qwer >= 5
GROUP BY a.asdf;"""

Let's start by defining the main statements. The main statements all require a new line and should be in upper case

In [None]:
#export
# Keywords that open a main statement, written as regex fragments
# (e.g. "create.*table"). `preformat_statements` interpolates each entry
# into a case-insensitive pattern and upper-cases the matched keyword.
MAIN_STATEMENTS = [
    "create.*table",
    "create.*view",
    "select distinct",
    "select",
    "from",
    "left join",
    "inner join",
    "outer join",
    "right join",
    "union",
    "on",
    "where",
    "group by",
    "order by",
    "over",  # special case: no newline, only capitalized
    "partition by",  # special case: no newline, only capitalized
]

> Remark: For OVER we only capitalize without adding a newline

We need to remove newlines and multiple spaces because they may be arbitrary.

Before removing newlines we also need to mark the end of comments, because otherwise we would not know where they end.

We also mark the beginning of `/* */` comments if they start on a new line.

In [None]:
#export
def clean_query(s):
    """Collapse query `s` to a single line with marked comment boundaries

    Redundant whitespace is removed, comments are tagged with the special
    tokens [C] and [CS], and newlines outside comments are dropped.
    """
    cleaning_steps = (
        remove_redundant_whitespaces,  # squeeze repeated whitespace, keep newlines
        mark_comments,  # tag comments with [C] / [CS]
        replace_newline_chars,  # drop newlines, except inside comments
        remove_whitespaces_newline,  # trim whitespace around newlines
        remove_whitespaces_comments,  # trim whitespace around [C] / [CS]
        remove_redundant_whitespaces,  # squeeze again after the removals above
    )
    for apply_step in cleaning_steps:
        s = apply_step(s)
    return s

In [None]:
assert_and_print(
    clean_query(
"""
SELECT asdf, qwer,
 qwer1,    qwer2
FROM table1
"""
    ), "SELECT asdf, qwer, qwer1, qwer2 FROM table1"
)

SELECT asdf, qwer, qwer1, qwer2 FROM table1


With usual comments

In [None]:
assert_and_print(
    clean_query("""
SELECT asdf, qwer, -- some comment
 qwer1,    qwer2
FROM table1
WHERE asdf=1
"""), "SELECT asdf, qwer, -- some comment[C]qwer1, qwer2 FROM table1 WHERE asdf=1"
)

SELECT asdf, qwer, -- some comment[C]qwer1, qwer2 FROM table1 WHERE asdf=1


With other comment form

In [None]:
assert_and_print(
    clean_query("""
SELECT asdf, qwer, /*  another comment */
qwer1,    qwer2
FROM table1
WHERE asdf=1
"""), "SELECT asdf, qwer, /* another comment */[C]qwer1, qwer2 FROM table1 WHERE asdf=1"
)

SELECT asdf, qwer, /* another comment */[C]qwer1, qwer2 FROM table1 WHERE asdf=1


In [None]:
assert_and_print(
    clean_query(
"""
SELECT asdf, qwer, /*  another comment */
qwer1,    
/* inline comment */
qwer2
FROM table1
WHERE asdf=1
"""
    ), 
    "SELECT asdf, qwer, /* another comment */[C]qwer1,[CS]/* inline comment */[C]qwer2 FROM table1 WHERE asdf=1"
)

SELECT asdf, qwer, /* another comment */[C]qwer1,[CS]/* inline comment */[C]qwer2 FROM table1 WHERE asdf=1


In [None]:
assert_and_print(
    clean_query(
"""
create or replace my_table as
/* some comment
   some new comment line */
select asdf,
qwer,   qwer2

from table1
"""
    ),
"""create or replace my_table as[CS]/* some comment
some new comment line */[C]select asdf, qwer, qwer2 from table1"""
)

create or replace my_table as[CS]/* some comment
some new comment line */[C]select asdf, qwer, qwer2 from table1


Next we would like to have each main statement on a separate line and in uppercase. The function below gets somewhat convoluted because we have to take care not to format main statements inside comments like /* comment inside **select** */

In [None]:
#export
def preformat_statements(s):
    """Write a newline in `s` for all main statements and
    uppercase them but not if they are inside a comment

    The query is first cleaned with `clean_query`, then cut into segments by
    `split_by_select_from`. Each segment is split at the [C] / [CS] markers and
    again into comment / non-comment fragments; only non-comment fragments are
    rewritten. Returns the query with one main statement per line.
    """
    # raw string: `\/` and `\*` are regex escapes, not Python string escapes
    comment_regex = r"(?:--.*|\/\*.*\*\/)"
    s = clean_query(s)  # clean query and mark comments
    formatted_pieces = []
    for line in split_by_select_from(s):  # isolate SELECT statement
        in_select = re.match("^select", line, flags=re.I) is not None
        # split by comment marker
        marker_pieces = [piece for piece in re.split(r"((?:\[C\]|\[CS\]))", line) if piece != ""]
        for piece in marker_pieces:
            # split by comment / no comment
            fragments = [frag for frag in re.split(f"({comment_regex})", piece) if frag != ""]
            for statement in MAIN_STATEMENTS:
                if in_select:
                    if statement == "select":
                        # SELECT itself starts a new line
                        pattern, repl = rf"\s*\b({statement})\b", "\n" + statement.upper()
                    else:
                        # inside SELECT the other keywords are only capitalized
                        # NOTE(review): the replacement is the upper-cased pattern text,
                        # so a regex entry like "create.*table" would be written out
                        # literally if it ever matched here
                        pattern, repl = rf"\b({statement})\b", statement.upper()
                elif re.match("^create", statement, flags=re.I):
                    # special case CREATE ... AS: capitalize AS as well, keep the name as is
                    pattern = rf"\s*({statement} )(.*) as\b"
                    repl = lambda pat: "\n" + pat.group(1).upper() + pat.group(2) + " AS"
                else:
                    # normal main statements: newline + capitalized keyword
                    pattern, repl = rf"\s*\b({statement})\b", "\n" + statement.upper()
                fragments = [
                    frag if re.search(comment_regex, frag)  # leave comments untouched
                    else re.sub(pattern, repl, frag, flags=re.I)
                    for frag in fragments
                ]
            formatted_pieces.append("".join(fragments))
    s = "".join(formatted_pieces)
    s = remove_whitespaces_newline(s)  # remove whitespaces before and after newline
    return s

In [None]:
assert_and_print(
    preformat_statements("select asdf, qwer as new_var from table1 where asdf = 1"),
    "\nSELECT asdf, qwer as new_var\nFROM table1\nWHERE asdf = 1"
)


SELECT asdf, qwer as new_var
FROM table1
WHERE asdf = 1


In [None]:
assert_and_print(preformat_statements("""
seLect asdf,
       /* some comment inside select */
       qwer
From   table1 where  asdf = 1
"""),
    "\nSELECT asdf,[CS]/* some comment inside select */[C]qwer\nFROM table1\nWHERE asdf = 1"
)


SELECT asdf,[CS]/* some comment inside select */[C]qwer
FROM table1
WHERE asdf = 1


In [None]:
assert_and_print(
    preformat_statements("""
seLect asdf, /* some comment inside select */
       qwer
From   table1 where  asdf = 1
"""),
    "\nSELECT asdf, /* some comment inside select */[C]qwer\nFROM table1\nWHERE asdf = 1"
)


SELECT asdf, /* some comment inside select */[C]qwer
FROM table1
WHERE asdf = 1


In [None]:
assert_and_print(
    preformat_statements("""
create or replace view my_view as
seLect asdf,
       /* some comment inside select */
       qwer
From   table1 where  asdf = 1
"""),
    "\nCREATE OR REPLACE VIEW my_view AS\nSELECT asdf,[CS]/* some comment inside select */[C]qwer\nFROM table1\nWHERE asdf = 1"
)


CREATE OR REPLACE VIEW my_view AS
SELECT asdf,[CS]/* some comment inside select */[C]qwer
FROM table1
WHERE asdf = 1


In [None]:
assert_and_print(
    preformat_statements("""
create or replace view my_view as
seLect asdf,
       qwer_function,
       qwer
From   table1 where  asdf = 1
"""),
    "\nCREATE OR REPLACE VIEW my_view AS\nSELECT asdf, qwer_function, qwer\nFROM table1\nWHERE asdf = 1"
)


CREATE OR REPLACE VIEW my_view AS
SELECT asdf, qwer_function, qwer
FROM table1
WHERE asdf = 1


In [None]:
assert_and_print(
    preformat_statements("""
create or replace view my_view as
seLect asdf, qwer_function,
       lead(asdf) over (Partition By asdf order BY qwer),
    qwer2
From   table1 where  asdf = 1 order by asdf
"""),
    """
CREATE OR REPLACE VIEW my_view AS
SELECT asdf, qwer_function, lead(asdf) OVER (PARTITION BY asdf ORDER BY qwer), qwer2
FROM table1
WHERE asdf = 1
ORDER BY asdf"""
)


CREATE OR REPLACE VIEW my_view AS
SELECT asdf, qwer_function, lead(asdf) OVER (PARTITION BY asdf ORDER BY qwer), qwer2
FROM table1
WHERE asdf = 1
ORDER BY asdf


In [None]:
assert_and_print(
    preformat_statements(
"""
create or replace table mytable as -- Mytable example
seLecT a.asdf, b.qwer, -- some comment here
/* and here is a line comment inside select */
substr(c.asdf, 1, 2) as substr_asdf
"""
    ),
"""
CREATE OR REPLACE TABLE mytable AS -- Mytable example[C]
SELECT a.asdf, b.qwer, -- some comment here[C][CS]/* and here is a line comment inside select */[C]substr(c.asdf, 1, 2) as substr_asdf
""".rstrip()
)


CREATE OR REPLACE TABLE mytable AS -- Mytable example[C]
SELECT a.asdf, b.qwer, -- some comment here[C][CS]/* and here is a line comment inside select */[C]substr(c.asdf, 1, 2) as substr_asdf


In [None]:
assert_and_print(
    preformat_statements(
"""
create table mytable as -- Mytable example
seLecT a.asdf, b.qwer, -- some comment here
/* and here is a line comment inside select */
substr(c.asdf, 1, 2) as substr_asdf
"""
    ),
"""
CREATE TABLE mytable AS -- Mytable example[C]
SELECT a.asdf, b.qwer, -- some comment here[C][CS]/* and here is a line comment inside select */[C]substr(c.asdf, 1, 2) as substr_asdf
""".rstrip()
)


CREATE TABLE mytable AS -- Mytable example[C]
SELECT a.asdf, b.qwer, -- some comment here[C][CS]/* and here is a line comment inside select */[C]substr(c.asdf, 1, 2) as substr_asdf


In [None]:
assert_and_print(
    preformat_statements(
"""
create or replace table  my_table as -- mytable
select distinct asdf, qwer, -- some comment
from table1
"""
    ),
"""
CREATE OR REPLACE TABLE my_table AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1
""".rstrip()
)


CREATE OR REPLACE TABLE my_table AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1


In [None]:
assert_and_print(
    preformat_statements(
"""
create or replace transient table  my_table as -- mytable
select distinct asdf, qwer, -- some comment
from table1
"""
    ),
"""
CREATE OR REPLACE TRANSIENT TABLE my_table AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1
""".rstrip()
)


CREATE OR REPLACE TRANSIENT TABLE my_table AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1


In [None]:
assert_and_print(
    preformat_statements(
"""
Create view  my_table as -- mytable
select distinct asdf, qwer, -- some comment
from table1
"""
    ),
"""
CREATE VIEW my_table AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1
""".rstrip()
)


CREATE VIEW my_table AS -- mytable[C]
SELECT DISTINCT asdf, qwer, -- some comment[C]
FROM table1


In [None]:
print(preformat_statements(example_sql))


CREATE OR REPLACE TABLE mytable AS -- Mytable example[C][CS]/* multi line
comment */[C]
SELECT a.asdf, b.qwer, -- some comment here[C][CS]/* and here is a line comment inside select */[C]substr(c.asdf, 1, 2) as substr_asdf,[CS]/* some commenT there */[C]case when a.asdf= 1 then 'b' /* here a case comment */[C]when b.qwer =2 then 'c' else 'd' end as new_field, -- Some comment[C]b.asdf2
FROM table1 as a
LEFT JOIN table2 as b -- and here a comment[C]
ON a.asdf = b.asdf /* joiN this way */[C]
INNER JOIN table3 as c
ON a.asdf=c.asdf
WHERE a.asdf= 1 -- comment this[C]anD b.qwer =2 and a.asdf<=1 --comment that[C]or b.qwer>=5
GROUP BY a.asdf


We would like to lowercase everything that is not a capitalized main statement, while leaving comments untouched

In [None]:
#export
def lowercase_query(s):
    """Lowercase query `s` but leave comments untouched

    Both `-- line comments` (up to the end of the line) and `/* block comments */`
    (possibly spanning several lines) keep their original case, also when a line
    holds more than one comment. Everything else is lowercased.

    NOTE(review): string literals and quoted identifiers are lowercased as well,
    and a `--` inside a string literal is treated as a comment start.
    """
    # The capture group makes re.split return the comments at the odd positions.
    # Alternatives are tried at the leftmost position first, so a `--` inside a
    # block comment (or a `/*` inside a line comment) does not start a new comment.
    comment_pattern = re.compile(r"(--[^\n]*|/\*.*?\*/)", flags=re.DOTALL)
    pieces = comment_pattern.split(s)
    return "".join(
        piece if position % 2 else piece.lower()
        for position, piece in enumerate(pieces)
    )

In [None]:
assert_and_print(
    lowercase_query("""
--- My nice view 1 --
Create or Replace VieW view_1 as
seLect asdf, -- Some Comment
qwER,
qwerTy, -- Some other comment
FROM table1
"""),
    """
--- My nice view 1 --
create or replace view view_1 as
select asdf, -- Some Comment
qwer,
qwerty, -- Some other comment
from table1
"""
)


--- My nice view 1 --
create or replace view view_1 as
select asdf, -- Some Comment
qwer,
qwerty, -- Some other comment
from table1



In [None]:
assert_and_print(
    lowercase_query("""
-- Some comment --
Create Or rePlace tablE aS
sElEct asdf,
/* sOme CommEnt */
qwer
FroM table1
"""),
"""
-- Some comment --
create or replace table as
select asdf,
/* sOme CommEnt */
qwer
from table1
"""
)


-- Some comment --
create or replace table as
select asdf,
/* sOme CommEnt */
qwer
from table1



## Specific formatting and validation

Now we will format each statement individually

#### PARTITION BY

Within SELECT

In [None]:
#export
def format_partition_by(s, base_indentation):
    """Format PARTITION BY line `s` in SELECT (DISTINCT)

    `base_indentation` is the column at which the SELECT field starts.
    The ORDER BY part, if any, goes on its own line under PARTITION BY.
    """
    # split PARTITION BY off from whatever precedes it
    pieces = [piece for piece in re.split("(partition by.*)", s, flags=re.I) if piece != ""]
    head, window = pieces[0], pieces[1]
    head_width = base_indentation + len(head)  # column where "partition by" starts
    # add newline after each comma (no comments); 13 extra columns on top of head_width
    window = add_newline_indentation(window, indentation=head_width + 13)
    # add new line and indentation before order by
    window = re.sub(r"\s(order by.*)", "\n" + " " * head_width + r"\1", window, flags=re.I)
    # combine begin of string with formatted partition by
    return (head + window).strip()

### SELECT

In [None]:
#export
def format_select(s):
    """Format SELECT statement line `s`

    Puts every field on its own line aligned under the first field, drops a
    trailing comma after the last field, resolves the [C] / [CS] comment
    markers into newlines, breaks `case when` conditions onto separate lines
    and formats `partition by` clauses with `format_partition_by`.
    """
    # remove [C] at end of SELECT
    s = re.sub(r"\[C\]$", "", s)
    # if comma is found at the end of select statement then remove comma
    if re.search(r"[\w\d]+,\s*$", s, flags=re.I):
        s = re.sub(r"([\w\d]+)(,+)(\s*)$", r"\1", s, flags=re.I)
    elif re.search(r"[\w\d]+,\s*--[\w\d\s]*$", s, flags=re.I):
        s = re.sub(r"([\w\d]+)(,+)(\s*)(--.*)$", r"\1 \4", s, flags=re.I)
    elif re.search(r"[\w\d]+,\s*\/\*.*\*\/$", s, flags=re.I):
        s = re.sub(r"([\w\d]+)(,+)(\s*)(\/\*.*\*\/)$", r"\1 \4", s, flags=re.I)
    s = add_whitespaces_between_symbols(s)  # add whitespaces between symbols
    # check whether it is a SELECT DISTINCT; case-insensitive because
    # preformat_statements hands the keyword over in upper case
    if re.search("^select distinct", s, flags=re.I):
        indentation = 16
    else:
        indentation = 7
    # when / else lines go five columns further right than the fields,
    # which is 12 for a plain SELECT
    case_indentation = indentation + 5
    s = add_newline_indentation(s, indentation=indentation)  # add newline after each comma (no comments) and indentation
    s = re.sub(r"\[C\]\[CS\]", "[C]", s)  # replace [C][CS] by [C]
    s = re.sub(r"\[C\]", "\n" + " " * indentation, s)  # replace [C] by newline
    s = re.sub(r"\[CS\]", "\n" + " " * indentation, s)  # replace [CS] by newline
    s = re.sub(r"(then.*?) ((?:when|else).*?)", r"\1\n\2", s)  # add newline before when or else
    split_s = s.split("\n")
    split_s = [
        line if not re.match("^(?:when|else)", line.strip())
        else " " * case_indentation + line.strip()
        for line in split_s
    ]
    s = "\n".join(split_s)
    s = s.strip()
    # format partition by, field by field
    begin_s = s[0:indentation]
    split_s = s[indentation:].split("\n" + (" " * indentation))
    split_s = [
        format_partition_by(line, base_indentation=indentation)
        if re.search("partition by", line, flags=re.I)
        else line
        for line in split_s
    ]
    s = begin_s + ("\n" + (" " * indentation)).join(split_s)
    return s

Simple usage without comments

In [None]:
assert_and_print(
    format_select("select aSdf, cast(qweR as numeric),  Asdf,qwer1"),
    "select aSdf,\n       cast(qweR as numeric),\n       Asdf,\n       qwer1"
)

select aSdf,
       cast(qweR as numeric),
       Asdf,
       qwer1


More advanced usage with comments in SELECT

In [None]:
assert_and_print(
    format_select("select asdf, cast(qwer as numeric), -- some comment[C]ASDF, qwer1"),
    "select asdf,\n       cast(qwer as numeric), -- some comment\n       ASDF,\n       qwer1"
)

select asdf,
       cast(qwer as numeric), -- some comment
       ASDF,
       qwer1


In [None]:
assert_and_print(
    format_select("select asdf, cast(qwer as numeric), -- some comment[C]ASDF, qwer1"),
    "select asdf,\n       cast(qwer as numeric), -- some comment\n       ASDF,\n       qwer1"
)

select asdf,
       cast(qwer as numeric), -- some comment
       ASDF,
       qwer1


Correcting a common mistake on the fly: comma at end of SELECT

In [None]:
assert_and_print(
    format_select("select qwer1,   asdf,"),
    "select qwer1,\n       asdf"
)

select qwer1,
       asdf


In [None]:
assert_and_print(
    format_select("SELECT a.asdf, b.qwer, -- some comment here[C][CS]/* and here is a line comment inside select */[C]qwer2"),
"""
SELECT a.asdf,
       b.qwer, -- some comment here
       /* and here is a line comment inside select */
       qwer2
""".strip()
)

SELECT a.asdf,
       b.qwer, -- some comment here
       /* and here is a line comment inside select */
       qwer2


In [None]:
assert_and_print(
    format_select("SELECT a.asdf, [CS]/* and here is a line comment inside select */"),
"""
SELECT a.asdf, 
       /* and here is a line comment inside select */
""".strip()
)

SELECT a.asdf, 
       /* and here is a line comment inside select */


Correcting comma at end of SELECT but having a comment in the last field

In [None]:
assert_and_print(
    format_select("select qwer1 as qwer2,   asdf as asdf3, -- this field"),
    "select qwer1 as qwer2,\n       asdf as asdf3 -- this field"
)

select qwer1 as qwer2,
       asdf as asdf3 -- this field


In [None]:
assert_and_print(
    format_select("select qwer1,   asdf, /* this field */"),
    "select qwer1,\n       asdf /* this field */"
)

select qwer1,
       asdf /* this field */


With `case when` conditions

In [None]:
assert_and_print(
    format_select("select qwer1, case when abc = 1 then 'a' when abc = 2 then 'b' else 'c' end as qwer2"),
    """
select qwer1,
       case when abc = 1 then 'a'
            when abc = 2 then 'b'
            else 'c' end as qwer2
    """.strip()    
)

select qwer1,
       case when abc = 1 then 'a'
            when abc = 2 then 'b'
            else 'c' end as qwer2


In [None]:
assert_and_print(
    format_select("select qwer1, case when abc = 1 then 'a' -- first condition[C]" +
                  "when abc = 2 then 'b' -- second condition[C]" +
                  "else 'c' end as qwer2, /* else condition */[C]"
                  "asdf3"
),
    """
select qwer1,
       case when abc = 1 then 'a' -- first condition
            when abc = 2 then 'b' -- second condition
            else 'c' end as qwer2, /* else condition */
       asdf3
    """.strip()    
)

select qwer1,
       case when abc = 1 then 'a' -- first condition
            when abc = 2 then 'b' -- second condition
            else 'c' end as qwer2, /* else condition */
       asdf3


Badly formatted `case when` condition

In [None]:
assert_and_print(
    format_select("select qwer1, case when abc<= 1 then 'a' -- first condition[C]" +
                  "when abc=2 then 'b' -- second condition[C]" +
                  "else 'c' end as qwer2, -- else condition[C]"
                  "asdf3"
),
    """
select qwer1,
       case when abc <= 1 then 'a' -- first condition
            when abc = 2 then 'b' -- second condition
            else 'c' end as qwer2, -- else condition
       asdf3
    """.strip()    
)

select qwer1,
       case when abc <= 1 then 'a' -- first condition
            when abc = 2 then 'b' -- second condition
            else 'c' end as qwer2, -- else condition
       asdf3


With functions in SELECT

In [None]:
assert_and_print(
    format_select("select aSdf, substr(qweR, 2) as qwer,  Asdf,qwer1"),
    "select aSdf,\n       substr(qweR, 2) as qwer,\n       Asdf,\n       qwer1"
)

select aSdf,
       substr(qweR, 2) as qwer,
       Asdf,
       qwer1


In [None]:
assert_and_print(
    format_select(
"""
select car_id,
       avg(price) as avg_price,
"""
    ),
"""
select car_id,
       avg(price) as avg_price
""".strip()
)

select car_id,
       avg(price) as avg_price


With `SELECT DISTINCT`

In [None]:
assert_and_print(
    format_select("select distinct asdf, qwer, qwer2,"),
"""
select distinct asdf,
                qwer,
                qwer2
""".strip()
)

select distinct asdf,
                qwer,
                qwer2


With `PARTITION BY`

In [None]:
assert_and_print(
    format_select("select asdf, lead(asdf) over (partition by qwer, asdf2 order by qwer2) as qwer3, qwerty,"),
"""
select asdf,
       lead(asdf) over (partition by qwer,
                                     asdf2
                        order by qwer2) as qwer3,
       qwerty
""".strip()
)

select asdf,
       lead(asdf) over (partition by qwer,
                                     asdf2
                        order by qwer2) as qwer3,
       qwerty


In [None]:
assert_and_print(
    format_select("select asdf, lead(asdf) over (partition by asdf, qwer order by qwer), cast(qwer as numeric), -- some comment[C]ASDF, "),
"""
select asdf,
       lead(asdf) over (partition by asdf,
                                     qwer
                        order by qwer),
       cast(qwer as numeric), -- some comment
       ASDF
""".strip()
)

select asdf,
       lead(asdf) over (partition by asdf,
                                     qwer
                        order by qwer),
       cast(qwer as numeric), -- some comment
       ASDF


In [None]:
assert_and_print(
    format_select("select distinct asdf, lead(asdf) over (partition by asdf, qwer order by qwer), cast(qwer as numeric), -- some comment[C]ASDF, "),
"""
select distinct asdf,
                lead(asdf) over (partition by asdf,
                                              qwer
                                 order by qwer),
                cast(qwer as numeric), -- some comment
                ASDF
""".strip()
)

select distinct asdf,
                lead(asdf) over (partition by asdf,
                                              qwer
                                 order by qwer),
                cast(qwer as numeric), -- some comment
                ASDF


### FROM

In [None]:
#export
def format_from(s):
    """Format FROM statement line `s`

    Inserts two extra spaces after the keyword so the table name lines up
    with the fields of a SELECT.
    """
    from_pattern = re.compile(r"(from )(.*)", flags=re.I)
    return from_pattern.sub(r"\1  \2", s)  # add indentation

In [None]:
assert_and_print(format_from("from table1"), "from   table1")

from   table1


### (LEFT / RIGHT / INNER / OUTER) JOIN

In [None]:
#export
def format_join(s):
    "Format JOIN statement line `s` by indenting it four spaces"
    join_indent = "    "
    return join_indent + s

In [None]:
assert_and_print(format_join("inner join table1"), "    inner join table1")

    inner join table1


### ON

In [None]:
#export
def format_on(s):
    "Format ON statement line `s`: space out the symbols and indent eight spaces"
    spaced = add_whitespaces_between_symbols(s)  # add whitespaces between symbols in join
    return "        " + spaced  # add indentation

In [None]:
assert_and_print(format_on("on a.asdf =b.asdf"), "        on a.asdf = b.asdf")

        on a.asdf = b.asdf


### WHERE

In [None]:
#export
def format_where(s):
    """Format WHERE statement line `s`

    Adds whitespace around symbols, pads WHERE so the first condition lines up
    with the following ones, and starts a new indented line before every
    `and` / `or`.
    """
    s = add_whitespaces_between_symbols(s)  # add whitespaces between symbols
    # NOTE(review): a [C] marker becomes a plain space, so text that follows a
    # `--` comment stays on the comment's line unless it starts with and / or
    s = s.replace("[C]", " ")
    s = re.sub(r"(where )", r"\1 ", s, flags=re.I)  # add indentation after WHERE
    # \b so that only the standalone keywords match, not names such as
    # `android`, `order_id` or `origin`
    s = re.sub(r"\sand\b", r"\n   and", s, flags=re.I)  # add new line before every 'and' and indentation
    s = re.sub(r"\sor\b", r"\n    or", s, flags=re.I)  # add new line before every 'or' and indentation
    return s

In [None]:
assert_and_print(
    format_where(
        "WHERE asdf= 1 and qwer=1 or blabla ='asdf'"
    ), "WHERE  asdf = 1\n   and qwer = 1\n    or blabla = 'asdf'"
)

WHERE  asdf = 1
   and qwer = 1
    or blabla = 'asdf'


## Format all statements

In [None]:
#export
def format_statement_line(s):
    """Format statement line `s`

    Dispatches on the keyword the line starts with (case-insensitive) and
    applies the matching formatter; lines that start with no known keyword
    are returned unchanged.
    """
    statement_funcs = {
        "select": format_select,
        "from": format_from,
        "left join": format_join,
        "right join": format_join,
        "inner join": format_join,
        "outer join": format_join,
        "on": format_on,
        "where": format_where,
    }
    for key, format_func in statement_funcs.items():
        # \b: the keyword must be a whole word, so a line starting with
        # e.g. "only" or "from_x" is not mistaken for ON / FROM
        if re.match(rf"{key}\b", s, flags=re.I):
            s = format_func(s)
            break  # a line is one statement; do not re-dispatch the formatted result
    return s

In [None]:
assert_and_print(
    format_statement_line("select asdf, qwer"),
    """
select asdf,
       qwer
""".strip())

select asdf,
       qwer


In [None]:
assert_and_print(
    format_statement_line("left join table1 as abc"),
    "    left join table1 as abc"
)

    left join table1 as abc


In [None]:
assert_and_print(
    format_statement_line("where asdf=1 and qwer='things' and blabla=0 or stuff=-1"),
    """
where  asdf = 1
   and qwer = 'things'
   and blabla = 0
    or stuff = -1
    """.strip())

where  asdf = 1
   and qwer = 'things'
   and blabla = 0
    or stuff = -1


In [None]:
#export
def format_statements(s):
    "Format statements lines `s`: apply `format_statement_line` to every line"
    return "\n".join(map(format_statement_line, s.split("\n")))

In [None]:
assert_and_print(
    format_statements("select asdf, qwer\nfrom table1"),
"""
select asdf,
       qwer
from   table1
""".strip()
)

select asdf,
       qwer
from   table1


### Write a ; at the end of query

In [None]:
#export
def add_ending_semicolon(s):
    """Add ending semicolon for SQL query `s`

    Surrounding whitespace is stripped first. Nothing is appended when the
    query already ends with a semicolon or is empty after stripping.
    """
    stripped = s.strip()
    if not stripped or stripped.endswith(";"):
        return stripped
    return stripped + ";"

Basic usage

In [None]:
assert_and_print(
    add_ending_semicolon("select asdf from table1"), 
    "select asdf from table1;"
)

select asdf from table1;


Also works with multiple lines

In [None]:
assert_and_print(
    add_ending_semicolon("select asdf\nfrom table1\nwhere asdf = 1"), 
    "select asdf\nfrom table1\nwhere asdf = 1;"
)

select asdf
from table1
where asdf = 1;


It does not add another semicolon if there is already one there

In [None]:
assert_and_print(
    add_ending_semicolon("select asdf\nfrom table1\nwhere asdf = 1;"), 
    "select asdf\nfrom table1\nwhere asdf = 1;"
)

select asdf
from table1
where asdf = 1;


##  Putting everything together

to format a simple query without subqueries or `PARTITION BY`

In [None]:
#export
def format_simple_sql(s, add_semicolon=True):
    """Format a simple SQL query without subqueries `s`.
    If `add_semicolon` is True, then add a semicolon at the end
    """
    formatting_steps = (
        lowercase_query,  # everything lowercased but not the comments
        preformat_statements,  # add breaklines for the main statements
        format_statements,  # format each statement line
    )
    for apply_step in formatting_steps:
        s = apply_step(s)
    s = re.sub(r"\[C\]", "", s)  # replace remaining [C]
    s = re.sub(r"\[CS\]", "\n", s)  # replace remaining [CS]
    # add ending semicolon if requested and not there yet
    return add_ending_semicolon(s) if add_semicolon else s

In [None]:
assert_and_print(
    format_simple_sql(example_sql),
    expected_sql
)

CREATE OR REPLACE TABLE mytable AS -- Mytable example
/* multi line
comment */
SELECT a.asdf,
       b.qwer, -- some comment here
       /* and here is a line comment inside select */
       substr(c.asdf, 1, 2) as substr_asdf,
       /* some commenT there */
       case when a.asdf = 1 then 'b' /* here a case comment */
            when b.qwer = 2 then 'c'
            else 'd' end as new_field, -- Some comment
       b.asdf2
FROM   table1 as a
    LEFT JOIN table2 as b -- and here a comment
        ON a.asdf = b.asdf /* joiN this way */
    INNER JOIN table3 as c
        ON a.asdf = c.asdf
WHERE  a.asdf = 1 -- comment this
   and b.qwer = 2
   and a.asdf <= 1 --comment that
    or b.qwer >= 5
GROUP BY a.asdf;


In [None]:
assert_and_print(
    format_simple_sql(
"""
create or replace table first_table as -- my first table
select car_id,
       avg(price) as avg_price,
from first_view
group by car_id
"""
    ),
"""
CREATE OR REPLACE TABLE first_table AS -- my first table
SELECT car_id,
       avg(price) as avg_price
FROM   first_view
GROUP BY car_id;
""".strip()
)

CREATE OR REPLACE TABLE first_table AS -- my first table
SELECT car_id,
       avg(price) as avg_price
FROM   first_view
GROUP BY car_id;


## Queries with subqueries

This is how we could (badly) write a query with subqueries

In [None]:
example_with_subqueries = """
select asdf, cast(qwer as numeric), -- some comment
substr(qwer1, 3, 2) as substr_qwer /* some field */
from 
(select asdf, qwer, /* some nice field */ from table1 where asdf = 1) as a
left 
join (select asdf, qwer2 from table2 where qwer2 = 1) as b
on a.asdf = b.asdf
where qwer1 >= 0
"""

and this is the way we would like to have it nicely formatted

In [None]:
expected_with_subqueries = """
SELECT asdf,
       cast(qwer as numeric), -- some comment
       substr(qwer1, 3, 2) as substr_qwer /* some field */
FROM   (SELECT asdf,
               qwer /* some nice field */
        FROM   table1
        WHERE  asdf = 1) as a
    LEFT JOIN (SELECT asdf,
                      qwer2
               FROM   table2
               WHERE  qwer2 = 1) as b
        ON a.asdf = b.asdf
WHERE  qwer1 >= 0;
""".strip()

### Main function handling queries with subqueries

In [None]:
#export
def format_sql(s, add_semicolon=True):
    """Format the SQL query `s`, including every subquery found in it.

    When `add_semicolon` is True a semicolon is added at the end of the query."""
    # format the query as a whole first
    formatted = format_simple_sql(s, add_semicolon)
    while True:
        # positions of the next outer subquery; None means nothing is left to do
        bounds = extract_outer_subquery(formatted)
        if bounds is None:
            break
        start, stop = bounds[0], bounds[1] + 1
        # cut the query into: text before the subquery, the subquery, the rest
        head = formatted[0:start]
        subquery = formatted[start:stop]
        tail = formatted[stop:]
        # the subquery is formatted together with the text that precedes it
        formatted = head + format_subquery(subquery, head) + tail
    # drop a whitespace character that sits directly before a closing parenthesis
    return re.sub(r"\s\)", ")", formatted)

In [None]:
# Compare the `format_sql` result for the messy subquery example with the
# expected layout (`assert_and_print` is defined outside this cell; presumably
# it asserts equality and prints the formatted query shown in the output).
assert_and_print(
    format_sql(example_with_subqueries),
    expected_with_subqueries
)

SELECT asdf,
       cast(qwer as numeric), -- some comment
       substr(qwer1, 3, 2) as substr_qwer /* some field */
FROM   (SELECT asdf,
               qwer /* some nice field */
        FROM   table1
        WHERE  asdf = 1) as a
    LEFT JOIN (SELECT asdf,
                      qwer2
               FROM   table2
               WHERE  qwer2 = 1) as b
        ON a.asdf = b.asdf
WHERE  qwer1 >= 0;


`format_sql()` also works with simple queries without subqueries, and therefore generalizes the `format_simple_sql()` function

In [None]:
# Same check on the subquery-free example defined at the top of the notebook.
assert_and_print(
    format_sql(example_sql),
    expected_sql
)

CREATE OR REPLACE TABLE mytable AS -- Mytable example
/* multi line
comment */
SELECT a.asdf,
       b.qwer, -- some comment here
       /* and here is a line comment inside select */
       substr(c.asdf, 1, 2) as substr_asdf,
       /* some commenT there */
       case when a.asdf = 1 then 'b' /* here a case comment */
            when b.qwer = 2 then 'c'
            else 'd' end as new_field, -- Some comment
       b.asdf2
FROM   table1 as a
    LEFT JOIN table2 as b -- and here a comment
        ON a.asdf = b.asdf /* joiN this way */
    INNER JOIN table3 as c
        ON a.asdf = c.asdf
WHERE  a.asdf = 1 -- comment this
   and b.qwer = 2
   and a.asdf <= 1 --comment that
    or b.qwer >= 5
GROUP BY a.asdf;


## Nested subqueries

The function is also robust against nested subqueries

In [None]:
# Input with subqueries inside a subquery and a window function
# (`over (partition by ... order by ...)`) among the selected fields.
example_nested_subqueries = """
select asdf, qwer
from (select a.asdf,  lead(a.substr_qwer) over (partition by a.asdf, asdf2 order by qwer) as lead_qwerty
    from (select asdf, substr(qwer, 3, 2) as substr_qwer from table2) as a
        inner join (select asdf, qwer from table3) as b
            on a.qwer = b.qwer
)
"""

In [None]:
# Expected layout for `example_nested_subqueries`: deeper nesting levels are
# indented further, and `OVER`, `PARTITION BY` and `ORDER BY` are upper-cased.
expected_nested = """SELECT asdf,
       qwer
FROM   (SELECT a.asdf,
               lead(a.substr_qwer) OVER (PARTITION BY a.asdf,
                                                      asdf2
                                         ORDER BY qwer) as lead_qwerty
        FROM   (SELECT asdf,
                       substr(qwer, 3, 2) as substr_qwer
                FROM   table2) as a
            INNER JOIN (SELECT asdf,
                               qwer
                        FROM   table3) as b
                ON a.qwer = b.qwer);"""

In [None]:
# Compare the `format_sql` result for the nested example with the expected layout.
assert_and_print(
    format_sql(example_nested_subqueries),
    expected_nested
)

SELECT asdf,
       qwer
FROM   (SELECT a.asdf,
               lead(a.substr_qwer) OVER (PARTITION BY a.asdf,
                                                      asdf2
                                         ORDER BY qwer) as lead_qwerty
        FROM   (SELECT asdf,
                       substr(qwer, 3, 2) as substr_qwer
                FROM   table2) as a
            INNER JOIN (SELECT asdf,
                               qwer
                        FROM   table3) as b
                ON a.qwer = b.qwer);


### More convoluted nested subquery

In [None]:
# Three levels of nesting; two of the SELECT lists end with a trailing comma
# followed by a comment (`/* */` and `--`) right before `from`.
example_convoluted = """
select asdf
from (
    select asdf, qwer, /* some comment */
    from (select a.asdf, b.qwer, --some comment
          from (select asdf 
                from table1) as a 
            right join (select qwer 
                        from table2) as b
                on a.asdf = b.asdf)
)
"""

In [None]:
# Expected layout for `example_convoluted`: the trailing commas are gone and
# each comment stays on the line of the field it followed.
expected_convoluted = """SELECT asdf
FROM   (SELECT asdf,
               qwer /* some comment */
        FROM   (SELECT a.asdf,
                       b.qwer --some comment
                FROM   (SELECT asdf
                        FROM   table1) as a
                    RIGHT JOIN (SELECT qwer
                                FROM   table2) as b
                        ON a.asdf = b.asdf));"""

In [None]:
# Compare the `format_sql` result for the convoluted example with the expected layout.
assert_and_print(
    format_sql(example_convoluted),
    expected_convoluted
)

SELECT asdf
FROM   (SELECT asdf,
               qwer /* some comment */
        FROM   (SELECT a.asdf,
                       b.qwer --some comment
                FROM   (SELECT asdf
                        FROM   table1) as a
                    RIGHT JOIN (SELECT qwer
                                FROM   table2) as b
                        ON a.asdf = b.asdf));


In [None]:
# Two queries combined with `union`: the expected output puts `UNION` on its
# own line and lays out the query on each side of it.
assert_and_print(
    format_sql(
"""
select asdf,
qwer
from table1 union select qwer,
asdf, asdf2 from table3
where asdf2 >=2
"""
    ),
"""
SELECT asdf,
       qwer
FROM   table1
UNION
SELECT qwer,
       asdf,
       asdf2
FROM   table3
WHERE  asdf2 >= 2;
""".strip()
)

SELECT asdf,
       qwer
FROM   table1
UNION
SELECT qwer,
       asdf,
       asdf2
FROM   table3
WHERE  asdf2 >= 2;


In [None]:
#hide
# Run the nbdev notebook conversion; the cell output lists the converted notebooks.
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_format_file.ipynb.
Converted 02_utils.ipynb.
Converted index.ipynb.
