In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp core

In [None]:
#export
import re

# core

> Core functions for SQL formatting

In [None]:
#hide
from nbdev.showdoc import *

## General helper functions

In [None]:
#export
def assert_and_print(s_in, s_expected):
    """Assert equality of `s_in` and `s_expected` and print `s_in` if the assertion worked.

    On a mismatch both values are printed for inspection and an
    `AssertionError` is raised. Returns `None`.
    """
    if not s_in == s_expected:
        print("Assertion failed\n")
        print("Input:\n")
        print(s_in)
        print("\n")
        print("Expected:\n")
        print(s_expected)
        # Raise explicitly: a bare `except:` around `assert` would also trap
        # unrelated exceptions, and `assert` statements vanish under `python -O`.
        raise AssertionError("`s_in` does not equal `s_expected`")
    print(s_in)
    return None

In [None]:
assert_and_print("select asdf\nfrom table1", "select asdf\nfrom table1")

select asdf
from table1


## General formatting

Basic formatting for SQL queries. Let's use an example throughout the core module.

This is what an input could look like

In [None]:
example_sql = """
create or replace table mytable as -- mytable example
seLecT a.asdf, b.qwer, -- some comment here
c.asdf, -- some comment there
b.asdf2 frOm table1 as a leFt join 
table2 as b -- and here a comment
    on a.asdf = b.asdf  -- join this way
    inner join table3 as c
on a.asdf=c.asdf
whEre a.asdf= 1 -- comment this
anD b.qwer =2 and a.asdf<=1 --comment that
or b.qwer>=5
groUp by a.asdf
"""
print(example_sql)


create or replace table mytable as -- mytable example
seLecT a.asdf, b.qwer, -- some comment here
c.asdf, -- some comment there
b.asdf2 frOm table1 as a leFt join 
table2 as b -- and here a comment
    on a.asdf = b.asdf  -- join this way
    inner join table3 as c
on a.asdf=c.asdf
whEre a.asdf= 1 -- comment this
anD b.qwer =2 and a.asdf<=1 --comment that
or b.qwer>=5
groUp by a.asdf



and this is how we would like to format it

In [None]:
expected_sql = """CREATE OR REPLACE TABLE mytable AS -- mytable example
SELECT a.asdf,
       b.qwer, -- some comment here
       c.asdf, -- some comment there
       b.asdf2
FROM   table1 AS a
    LEFT JOIN table2 AS b -- and here a comment
        ON a.asdf = b.asdf -- join this way
    INNER JOIN table3 AS c
        ON a.asdf = c.asdf
WHERE  a.asdf = 1 -- comment this
   and b.qwer = 2
   and a.asdf <= 1 --comment that
    or b.qwer >= 5
GROUP BY a.asdf;"""
print(expected_sql)

CREATE OR REPLACE TABLE mytable AS -- mytable example
SELECT a.asdf,
       b.qwer, -- some comment here
       c.asdf, -- some comment there
       b.asdf2
FROM   table1 AS a
    LEFT JOIN table2 AS b -- and here a comment
        ON a.asdf = b.asdf -- join this way
    INNER JOIN table3 AS c
        ON a.asdf = c.asdf
WHERE  a.asdf = 1 -- comment this
   and b.qwer = 2
   and a.asdf <= 1 --comment that
    or b.qwer >= 5
GROUP BY a.asdf;


Let's start by defining the main statements. Each main statement also starts on a new line in the formatted output

In [None]:
#export
# Statements that get their own line in the formatted query.
# Functions iterating over this list process the entries in this order.
MAIN_STATEMENTS = (
    # table / view creation
    ["create table", "create or replace table", "create view", "create or replace view"]
    # projection and source
    + ["select", "from"]
    # joins and their conditions
    + ["left join", "inner join", "outer join", "right join", "on"]
    # filtering and aggregation
    + ["where", "group by"]
)

We also would like to capitalize the AS operator

In [None]:
#export
# Everything that is upper-cased: all main statements plus the AS operator
CAPITAL_STATEMENTS = [*MAIN_STATEMENTS, "as"]

We need first to capitalize the main statements

In [None]:
#export
def capitalize_statements(s, statements):
    "Return `s` with every whole-word occurrence of each entry in `statements` upper-cased"
    result = s
    for keyword in statements:
        # Case-insensitive, whole-word match; the keyword text is used as the regex itself
        keyword_pattern = re.compile(rf"\b({keyword})\b", flags=re.I)
        result = keyword_pattern.sub(keyword.upper(), result)
    return result

In [None]:
print(capitalize_statements(example_sql, CAPITAL_STATEMENTS))


CREATE OR REPLACE TABLE mytable AS -- mytable example
SELECT a.asdf, b.qwer, -- some comment here
c.asdf, -- some comment there
b.asdf2 FROM table1 AS a LEFT JOIN 
table2 AS b -- and here a comment
    ON a.asdf = b.asdf  -- join this way
    INNER JOIN table3 AS c
ON a.asdf=c.asdf
WHERE a.asdf= 1 -- comment this
anD b.qwer =2 and a.asdf<=1 --comment that
or b.qwer>=5
GROUP BY a.asdf



In [None]:
assert_and_print(
    capitalize_statements(
        "seLecT asdf, qwer FrOM table1", 
        CAPITAL_STATEMENTS
    ), "SELECT asdf, qwer FROM table1"
)

SELECT asdf, qwer FROM table1


We also need to remove newlines and multiple spaces because they may be arbitrary

In [None]:
#export
def remove_newlines_mspaces(s):
    """Remove newline and too many spaces characters.
    Newline characters in SELECT statement with comments
    are replaced by special token [EOC] (end of comment)
    because otherwise we cannot format SELECT statements
    with comments properly

    Returns the query as a single line (apart from the [EOC] tokens).
    """
    s = s.strip()  # strip sentence
    # Normalize every line break (with its surrounding whitespace and blank lines)
    # to exactly one "\n". Collapsing it straight to a space would lose the end of
    # a `--` comment whenever the next line is indented or the comment has
    # trailing whitespace.
    s = re.sub(r"[^\S\n]*\n\s*", "\n", s)
    s = re.sub(r"[^\S\n]{2,}", " ", s)  # remove too many (non-newline) whitespaces
    # split to get individual select lines; `select` may be followed by any
    # whitespace (including a newline) and both keywords must be whole words
    split_s = re.split(r"(\bselect\s.*?)(\bfrom\b)", s, flags=re.I | re.DOTALL)
    split_s = [
        re.sub(r"(--.*?)(\n)", r"\1[EOC]", line)  # add special token for select lines
        if re.match("select", line, flags=re.I)  # for select statements
        else line  # else no special token
        for line in split_s
    ]
    s = "".join(split_s)  # join all the lines
    s = re.sub("\n", " ", s)  # remove the remaining newlines
    return s

In [None]:
print(remove_newlines_mspaces(example_sql))

create or replace table mytable as -- mytable example seLecT a.asdf, b.qwer, -- some comment here[EOC]c.asdf, -- some comment there[EOC]b.asdf2 frOm table1 as a leFt join table2 as b -- and here a comment on a.asdf = b.asdf -- join this way inner join table3 as c on a.asdf=c.asdf whEre a.asdf= 1 -- comment this anD b.qwer =2 and a.asdf<=1 --comment that or b.qwer>=5 groUp by a.asdf


In [None]:
assert_and_print(
    remove_newlines_mspaces(
"""
SELECT asdf, qwer,
qwer1,    qwer2
FROM table1
"""
    ), "SELECT asdf, qwer, qwer1, qwer2 FROM table1"
)

SELECT asdf, qwer, qwer1, qwer2 FROM table1


In [None]:
assert_and_print(
    remove_newlines_mspaces("""
SELECT asdf, qwer, -- some comment
qwer1,    qwer2
FROM table1
WHERE asdf=1
"""), "SELECT asdf, qwer, -- some comment[EOC]qwer1, qwer2 FROM table1 WHERE asdf=1"
)

SELECT asdf, qwer, -- some comment[EOC]qwer1, qwer2 FROM table1 WHERE asdf=1


Next we would like to have each main statement in a separate line

In [None]:
#export
def breakline_statement(s, statements):
    """Write a newline in `s` for all `statements`

    Every whole-word, case-insensitive occurrence of a statement, together with
    the whitespace in front of it, is replaced by a newline followed by the
    statement exactly as spelled in `statements`.
    """
    for statement in statements:
        # The leading `\b` keeps identifiers that merely end in a keyword
        # (e.g. `version` ends in `on`) from being split in the middle.
        s = re.sub(rf"\s*\b({statement})\b", rf"\n{statement}", s, flags=re.I)
    return s

In [None]:
print(breakline_statement(example_sql, MAIN_STATEMENTS))


create or replace table mytable as -- mytable example
select a.asdf, b.qwer, -- some comment here
c.asdf, -- some comment there
b.asdf2
from table1 as a
left join 
table2 as b -- and here a comment
on a.asdf = b.asdf  -- join this way
inner join table3 as c
on a.asdf=c.asdf
where a.asdf= 1 -- comment this
anD b.qwer =2 and a.asdf<=1 --comment that
or b.qwer>=5
group by a.asdf



In combination with removing newlines and multiple spaces

In [None]:
print(breakline_statement(remove_newlines_mspaces(example_sql), MAIN_STATEMENTS))


create or replace table mytable as -- mytable example
select a.asdf, b.qwer, -- some comment here[EOC]c.asdf, -- some comment there[EOC]b.asdf2
from table1 as a
left join table2 as b -- and here a comment
on a.asdf = b.asdf -- join this way
inner join table3 as c
on a.asdf=c.asdf
where a.asdf= 1 -- comment this anD b.qwer =2 and a.asdf<=1 --comment that or b.qwer>=5
group by a.asdf


## Specific formatting and validation

Now we will format each statement individually

### SELECT

In [None]:
#export
def format_select(s):
    """Format SELECT statement line `s`

    Puts every selected field on its own line, aligned under the first field,
    and turns each [EOC] token back into a line break. Trailing commas at the
    end of the statement are removed (a notice is printed when that happens).
    """
    if re.search(r",\s*$", s):
        print("Correcting mistake: Comma at the end of SELECT statement")
        # Drop *all* trailing commas and whitespace, so `a,,` or `a, ,` is
        # cleaned up as well as the single-comma case.
        s = re.sub(r"[,\s]+$", "", s)
    s = re.sub(r"(,)(\s*)([\w\d]+)", r"\1\n       \3", s)  # add newline after each comma (no comments) and indentation
    s = re.sub(r"\[EOC\]", "\n       ", s)  # replace [EOC] by newline
    return s

Simple usage without comments

In [None]:
assert_and_print(
    format_select("select aSdf, cast(qweR as numeric),  Asdf,qwer1"),
    "select aSdf,\n       cast(qweR as numeric),\n       Asdf,\n       qwer1"
)

select aSdf,
       cast(qweR as numeric),
       Asdf,
       qwer1


More advanced usage with comments in SELECT

In [None]:
assert_and_print(
    format_select("select asdf, cast(qwer as numeric), -- some comment[EOC]ASDF, qwer1"),
    "select asdf,\n       cast(qwer as numeric), -- some comment\n       ASDF,\n       qwer1"
)

select asdf,
       cast(qwer as numeric), -- some comment
       ASDF,
       qwer1


Correcting a common mistake on the fly: comma at the end of SELECT

In [None]:
assert_and_print(
    format_select("select qwer1,   asdf,"),
    "select qwer1,\n       asdf"
)

Correcting mistake: Comma at the end of SELECT statement
select qwer1,
       asdf


### FROM

In [None]:
#export
def format_from(s):
    "Format FROM statement line `s` by adding two extra spaces after the FROM keyword"
    from_clause = re.compile(r"(from )(.*)", flags=re.I)

    def _pad(match):
        # keyword (with its trailing space) + two more spaces + rest of the line
        return match.group(1) + "  " + match.group(2)

    return from_clause.sub(_pad, s)

In [None]:
assert_and_print(format_from("from table1"), "from   table1")

from   table1


### (LEFT / RIGHT / INNER / OUTER) JOIN

In [None]:
#export
def format_join(s):
    "Format JOIN statement line `s` by indenting it four spaces"
    join_indent = "    "
    return join_indent + s

In [None]:
assert_and_print(format_join("inner join table1"), "    inner join table1")

    inner join table1


### ON

#### Helper function

In [None]:
#export
def add_whitespaces_between_symbols(s):
    """Add whitespaces between symbols in line `s`

    Runs of the comparison characters `=`, `!`, `<`, `>` get a space on each
    side where one is missing. Text inside single-quoted string literals is
    left untouched so that values such as 'a=b' are not altered.
    """
    literals = []

    def _stash(match):
        # Swap the literal for a NUL-delimited index placeholder; NUL is assumed
        # not to occur in SQL text.
        literals.append(match.group(0))
        return f"\x00{len(literals) - 1}\x00"

    s = re.sub(r"'[^']*'", _stash, s)  # protect string literals
    s = re.sub(r"([^\s=!<>])([=!<>]+)", r"\1 \2", s)  # no space left
    s = re.sub(r"([=!<>]+)([^\s=!<>])", r"\1 \2", s)  # no space right
    # After the two passes above no operator is left without surrounding
    # spaces, so a separate "no space left and right" pass is not needed.
    s = re.sub(r"\x00(\d+)\x00", lambda m: literals[int(m.group(1))], s)  # restore literals
    return s

In [None]:
assert_and_print(
    add_whitespaces_between_symbols(
        "WHERE asdf= 1 and qwer=>1 or blabla ='asdf'"
    ), "WHERE asdf = 1 and qwer => 1 or blabla = 'asdf'"
)

WHERE asdf = 1 and qwer => 1 or blabla = 'asdf'


In [None]:
#export
def format_on(s):
    "Format ON statement line `s`: space out comparison symbols and indent by eight spaces"
    spaced_condition = add_whitespaces_between_symbols(s)
    return "        " + spaced_condition

In [None]:
assert_and_print(format_on("on a.asdf =b.asdf"), "        on a.asdf = b.asdf")

        on a.asdf = b.asdf


### WHERE

In [None]:
#export
def format_where(s):
    """Format WHERE statement line `s`

    Spaces out comparison symbols, pads the WHERE keyword and moves every
    `and` / `or` onto its own indented line (written in lowercase).
    """
    s = add_whitespaces_between_symbols(s)  # add whitespaces between symbols
    s = re.sub(r"\b(where )", r"\1 ", s, flags=re.I)  # add indentation after WHERE
    # The trailing `\b` restricts the match to the keywords themselves, so
    # identifiers that merely start with them (`origin`, `order_id`, `android`)
    # are not broken onto a new line.
    s = re.sub(r"\sand\b", r"\n   and", s, flags=re.I)  # add new line before every 'and' and indentation
    s = re.sub(r"\sor\b", r"\n    or", s, flags=re.I)  # add new line before every 'or' and indentation
    return s

In [None]:
assert_and_print(
    format_where(
        "WHERE asdf= 1 and qwer=1 or blabla ='asdf'"
    ), "WHERE  asdf = 1\n   and qwer = 1\n    or blabla = 'asdf'"
)

WHERE  asdf = 1
   and qwer = 1
    or blabla = 'asdf'


## Format all statements

In [None]:
#export
def format_statement_line(s):
    "Format statement line `s` with the formatter of the statement it starts with"
    # keyword prefix -> formatter; checked in insertion order
    formatters = {"select": format_select, "from": format_from}
    formatters.update(
        dict.fromkeys(("left join", "right join", "inner join", "outer join"), format_join)
    )
    formatters.update({"on": format_on, "where": format_where})
    for keyword in formatters:
        starts_with_keyword = re.compile(keyword, flags=re.I).match(s) is not None
        if starts_with_keyword:
            s = formatters[keyword](s)
    return s

In [None]:
print(format_statement_line("select asdf, qwer"))

select asdf,
       qwer


In [None]:
print(format_statement_line("left join table1 as abc"))

    left join table1 as abc


In [None]:
print(format_statement_line("where asdf=1 and qwer='things' and blabla=0 or stuff=-1"))

where  asdf = 1
   and qwer = 'things'
   and blabla = 0
    or stuff = -1


In [None]:
#export
def format_statements(s):
    "Format each newline-separated statement line of `s` and join the results with newlines"
    return "\n".join(
        format_statement_line(raw_line) for raw_line in s.split("\n")
    )

In [None]:
print(format_statements("select asdf, qwer\nfrom table1"))

select asdf,
       qwer
from   table1


### Write a ; at the end of query

In [None]:
#export
def add_ending_semicolon(s):
    "Strip SQL query `s` and append a semicolon unless it is empty or already ends with one"
    query = s.strip()
    if query and not query.endswith(";"):
        return query + ";"
    return query

Basic usage

In [None]:
assert_and_print(
    add_ending_semicolon("select asdf from table1"), 
    "select asdf from table1;"
)

select asdf from table1;


Also works with multiple lines

In [None]:
assert_and_print(
    add_ending_semicolon("select asdf\nfrom table1\nwhere asdf = 1"), 
    "select asdf\nfrom table1\nwhere asdf = 1;"
)

select asdf
from table1
where asdf = 1;


It does not add another semicolon if there is already one there

In [None]:
assert_and_print(
    add_ending_semicolon("select asdf\nfrom table1\nwhere asdf = 1;"), 
    "select asdf\nfrom table1\nwhere asdf = 1;"
)

select asdf
from table1
where asdf = 1;


##  Putting everything together

to format a simple query without subqueries

In [None]:
#export
def format_simple_sql(s):
    "Format a simple SQL query without subqueries `s`"
    # Each step takes the query text and returns the transformed text
    pipeline = (
        remove_newlines_mspaces,  # remove newlines and multiple spaces
        lambda query: breakline_statement(query, MAIN_STATEMENTS),  # one line per main statement
        lambda query: capitalize_statements(query, CAPITAL_STATEMENTS),  # upper-case the keywords
        format_statements,  # statement-specific formatting
        add_ending_semicolon,  # add ending semicolon if not there yet
    )
    formatted = s.lower()  # everything lowercased (this includes comments and literals)
    for step in pipeline:
        formatted = step(formatted)
    return formatted

In [None]:
assert_and_print(
    format_simple_sql(example_sql),
    expected_sql
)

CREATE OR REPLACE TABLE mytable AS -- mytable example
SELECT a.asdf,
       b.qwer, -- some comment here
       c.asdf, -- some comment there
       b.asdf2
FROM   table1 AS a
    LEFT JOIN table2 AS b -- and here a comment
        ON a.asdf = b.asdf -- join this way
    INNER JOIN table3 AS c
        ON a.asdf = c.asdf
WHERE  a.asdf = 1 -- comment this
   and b.qwer = 2
   and a.asdf <= 1 --comment that
    or b.qwer >= 5
GROUP BY a.asdf;


## Queries with subqueries

This is how we could (badly) write a query with subqueries

In [None]:
example_with_subqueries = """
select asdf, cast(qwer as numeric), -- some comment
qwer1
from 
(select asdf, qwer, from table1 where asdf = 1) as a
left 
join (select asdf, qwer2 from table2 where qwer2 = 1) as b
on a.asdf = b.asdf
where qwer1 >= 0
"""
print(example_with_subqueries)


select asdf, cast(qwer as numeric), -- some comment
qwer1
from 
(select asdf, qwer, from table1 where asdf = 1) as a
left 
join (select asdf, qwer2 from table2 where qwer2 = 1) as b
on a.asdf = b.asdf
where qwer1 >= 0



and this is the way we would like to have it nicely formatted

In [None]:
expected_with_subqueries = """
SELECT asdf,
       cast(qwer AS numeric), -- some comment
       qwer1
FROM   (SELECT asdf,
               qwer
        FROM   table1
        WHERE  asdf = 1) AS a
    LEFT JOIN (SELECT asdf,
                      qwer2
               FROM   table2
               WHERE  qwer2 = 1) AS b
        ON a.asdf = b.asdf
WHERE  qwer1 >= 0;
""".strip()
print(expected_with_subqueries)

SELECT asdf,
       cast(qwer AS numeric), -- some comment
       qwer1
FROM   (SELECT asdf,
               qwer
        FROM   table1
        WHERE  asdf = 1) AS a
    LEFT JOIN (SELECT asdf,
                      qwer2
               FROM   table2
               WHERE  qwer2 = 1) AS b
        ON a.asdf = b.asdf
WHERE  qwer1 >= 0;


### Helper function to handle subqueries

In [None]:
#export
def format_subquery(s, previous_s):
    "Format subquery in line `s`, indenting its continuation lines based on the last line of `previous_s`"
    joined = s.replace("(\nSELECT", "(SELECT")  # pull SELECT up next to its opening parenthesis
    last_previous_line = previous_s.rsplit("\n", 1)[-1]
    pad = " " * (len(last_previous_line) + 1)  # one column past the text preceding the subquery
    head, *tail = joined.split("\n")
    # the first line stays where it is; every following line is shifted right
    return "\n".join([head] + [pad + line for line in tail])

### Main function handling queries with subqueries

In [None]:
#export
def format_sql(s):
    "Format SQL query `s`, indenting each `(SELECT ...)` subquery under the text that precedes it"
    formatted = format_simple_sql(s)  # format query
    # split on (SELECT ...); the lazy match ends at the first closing parenthesis
    pieces = re.split(r"(\(.SELECT.*?\))", formatted, flags=re.DOTALL)
    rebuilt = []
    for idx, piece in enumerate(pieces):
        if re.match(r"\(.SELECT.*\)", piece, flags=re.DOTALL):
            # indentation is derived from the (unmodified) piece right before the subquery
            piece = format_subquery(piece, pieces[idx - 1])
        rebuilt.append(piece)
    return "".join(rebuilt)

In [None]:
assert_and_print(
    format_sql(example_with_subqueries),
    expected_with_subqueries
)

Correcting mistake: Comma at the end of SELECT statement
SELECT asdf,
       cast(qwer AS numeric), -- some comment
       qwer1
FROM   (SELECT asdf,
               qwer
        FROM   table1
        WHERE  asdf = 1) AS a
    LEFT JOIN (SELECT asdf,
                      qwer2
               FROM   table2
               WHERE  qwer2 = 1) AS b
        ON a.asdf = b.asdf
WHERE  qwer1 >= 0;


It even works with simple queries without subqueries, therefore generalizing the `format_simple_sql()` function

In [None]:
assert_and_print(
    format_sql(example_sql),
    expected_sql
)

CREATE OR REPLACE TABLE mytable AS -- mytable example
SELECT a.asdf,
       b.qwer, -- some comment here
       c.asdf, -- some comment there
       b.asdf2
FROM   table1 AS a
    LEFT JOIN table2 AS b -- and here a comment
        ON a.asdf = b.asdf -- join this way
    INNER JOIN table3 AS c
        ON a.asdf = c.asdf
WHERE  a.asdf = 1 -- comment this
   and b.qwer = 2
   and a.asdf <= 1 --comment that
    or b.qwer >= 5
GROUP BY a.asdf;


In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.
