In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp core

In [None]:
#export
import re

# sql_formatter

> A SQL formatter to automatically format your SQL files

In [None]:
#hide
from nbdev.showdoc import *

## General formatting

Basic formatting for SQL queries. Let's use an example throughout the basics module.

This is what an input could look like:

In [None]:
# Deliberately messy example query: mixed-case keywords and arbitrary line
# breaks. It is reused by the demonstrations further down.
ex_sql = """
seLecT a.asdf, b.qwer,
b.asdf2 frOm table1 as a
leFt join table2 as b
    on a.asdf = b.asdf
whEre a.asdf = 1
anD b.qwer = 2
groUp by a.asdf
"""
print(ex_sql)


seLecT a.asdf, b.qwer,
b.asdf2 frOm table1 as a
leFt join table2 as b
    on a.asdf = b.asdf
whEre a.asdf = 1
anD b.qwer = 2
groUp by a.asdf



And this is how we would like to format it:

In [None]:
# Hand-written target layout (not produced by any function in this notebook)
print("""
SELECT a.asdf, 
       b.qwer,
       b.asdf2
FROM   table1 AS a
    LEFT JOIN table2 AS b
        ON a.asdf = b.asdf
WHERE  a.asdf = 1
   and b.qwer = 2
GROUP BY a.asdf
""")


SELECT a.asdf, 
       b.qwer,
       b.asdf2
FROM   table1 AS a
    LEFT JOIN table2 AS b
        ON a.asdf = b.asdf
WHERE  a.asdf = 1
   and b.qwer = 2
GROUP BY a.asdf



Let's start by defining the main statements. Each main statement should also start on a new line.

In [None]:
#export
# Main SQL statements, written in lowercase. The order of this list is the
# order in which the formatting helpers process them.
MAIN_STATEMENTS = [
    # table / view creation
    "create table", "create or replace table",
    "create view", "create or replace view",
    # core query clauses
    "select", "from",
    # joins and their condition
    "left join", "inner join", "outer join", "right join", "on",
    # filtering and aggregation
    "where", "group by",
]

We also would like to capitalize the AS operator

In [None]:
#export
# Statements to upper-case: every main statement plus the `as` operator.
# Flagged for export like the other library constants so that it is available
# from the generated module and not only inside this notebook.
CAPITAL_STATEMENTS = MAIN_STATEMENTS + ["as"]

First, we need to capitalize the main statements

In [None]:
#export
def capitalize_statements(s, statements):
    """Upper-case every occurrence of the SQL `statements` found in string `s`.

    Matching is case-insensitive and restricted to whole words, so a statement
    that is merely part of a longer identifier is left alone. Each statement is
    interpolated into the regular expression as-is (it is not escaped).

    Returns the resulting string.
    """
    result = s
    for keyword in statements:
        whole_word = re.compile(rf"\b({keyword})\b", flags=re.I)
        result = whole_word.sub(keyword.upper(), result)
    return result

In [None]:
# `anD` keeps its casing: operators are not part of CAPITAL_STATEMENTS
print(capitalize_statements(ex_sql, CAPITAL_STATEMENTS))


SELECT a.asdf, b.qwer,
b.asdf2 FROM table1 AS a
LEFT JOIN table2 AS b
    ON a.asdf = b.asdf
WHERE a.asdf = 1
anD b.qwer = 2
GROUP BY a.asdf



In [None]:
# Mixed-case keywords are upper-cased; the identifier `asdf` is untouched even
# though it starts with `as`, because only whole words are matched.
assert (
    capitalize_statements(
        "seLecT asdf, qwer FrOM table1", 
        CAPITAL_STATEMENTS
    ) == "SELECT asdf, qwer FROM table1"
)

We also need to remove newlines because they may be arbitrary

In [None]:
#export
def remove_newlines(s):
    """Replace every line break in `s` with a single space.

    Unix (LF), Windows (CRLF) and bare CR line endings are all recognised; a
    CRLF pair counts as one line break and therefore yields one space, so no
    stray carriage returns are left behind.

    Returns the resulting single-line string.
    """
    # `\r\n` must come first in the alternation so that a CRLF pair is
    # consumed as a unit instead of producing two spaces.
    return re.sub(r"\r\n|\r|\n", " ", s)

In [None]:
# The whole query collapses onto a single line
print(remove_newlines(ex_sql))

 seLecT a.asdf, b.qwer, b.asdf2 frOm table1 as a leFt join table2 as b     on a.asdf = b.asdf whEre a.asdf = 1 anD b.qwer = 2 groUp by a.asdf 


Next, we would like to have each main statement on a separate line

In [None]:
#export
def breakline_statement(s, statements):
    """Start a new line in `s` in front of every occurrence of `statements`.

    Each whole-word, case-insensitive occurrence of a statement is replaced,
    together with any whitespace directly in front of it, by a newline followed
    by the statement exactly as spelled in `statements`. Each statement is
    interpolated into the regular expression as-is (it is not escaped).

    Returns the resulting string.
    """
    for statement in statements:
        # The leading `\b` keeps a statement from matching the tail of a longer
        # word: without it "on" would split identifiers such as "version".
        s = re.sub(rf"\s*\b({statement})\b", rf"\n{statement}", s, flags=re.I)
    return s

In [None]:
# Applied to the raw input: the original line breaks stay in place next to the new ones
print(breakline_statement(ex_sql, MAIN_STATEMENTS))


select a.asdf, b.qwer,
b.asdf2
from table1 as a
left join table2 as b
on a.asdf = b.asdf
where a.asdf = 1
anD b.qwer = 2
group by a.asdf



In combination with removing newlines

In [None]:
# Flatten the query first, then break it up again: one line per main statement
print(breakline_statement(remove_newlines(ex_sql), MAIN_STATEMENTS))


select a.asdf, b.qwer, b.asdf2
from table1 as a
left join table2 as b
on a.asdf = b.asdf
where a.asdf = 1 anD b.qwer = 2
group by a.asdf 


We would also like to lowercase simple operators like and, or

In [None]:
#export
# Simple boolean operators that are written in lowercase
OPERATORS = ["and", "or"]

In [None]:
#export
def lower_operators(s, operators):
    """Lowercase every whole-word occurrence of `operators` in `s`.

    Matching is case-insensitive. The matched text itself is lowercased, so
    the result is lowercase no matter how the entries of `operators` are
    spelled (`"AND"` behaves like `"and"`). Each operator is interpolated into
    the regular expression as-is (it is not escaped).

    Returns the resulting string.
    """
    for operator in operators:
        s = re.sub(
            rf"\b{operator}\b",
            # lowercase what was actually matched rather than echoing `operator`
            lambda match: match.group(0).lower(),
            s,
            flags=re.I,
        )
    return s

In [None]:
# Only `anD` changes here; the statements keep their original casing
print(lower_operators(ex_sql, OPERATORS))


seLecT a.asdf, b.qwer,
b.asdf2 frOm table1 as a
leFt join table2 as b
    on a.asdf = b.asdf
whEre a.asdf = 1
and b.qwer = 2
groUp by a.asdf



In [None]:
# `aNd` is lowercased while the mixed-case `frOm` and `Where` are left as they are
assert (
    lower_operators(
        "select asdf frOm table1 Where asdf >= 1 aNd asdf < 5", 
        OPERATORS
    ) == "select asdf frOm table1 Where asdf >= 1 and asdf < 5"
)

## Specific formatting and validation

Now we will format (and validate) each statement individually

### SELECT

In [None]:
#export
def validate_select(s):
    """Validate SELECT statement line `s`.

    Returns an integer exit code:

    * 0 - no error found
    * 1 - the statement ends with a comma (optionally followed by whitespace)
    """
    # `re.search` on the tail of the string instead of `re.match(".*,...")`:
    # `.` does not cross newlines, so the anchored form missed a trailing comma
    # whenever the select statement spanned several lines.
    has_trailing_comma = re.search(r",\s*$", s) is not None
    return 1 if has_trailing_comma else 0

In [None]:
# No trailing comma: exit code 0
assert validate_select("select asdf, qwer") == 0

In [None]:
# Extra and trailing whitespace alone is not an error
assert validate_select("select   asdf, qwer,  asdf   ") == 0

In [None]:
# Trailing comma: exit code 1
assert validate_select("select asdf, qwer,") == 1

In [None]:
# A trailing comma followed by whitespace is still detected
assert validate_select("select asdf, qwer,    ") == 1

In [None]:
#export
def format_select(s):
    """Format SELECT statement line `s`.

    Steps, in order:

    1. collapse runs of two or more whitespace characters into one space
    2. replace the remaining newlines with spaces
    3. lowercase everything that follows the `select ` keyword
    4. put every field on its own line, aligned below the first field

    Returns the formatted string.
    """
    def _lower_fields(match):
        # keep the keyword's casing, lowercase only the field list
        keyword, fields = match.groups()
        return keyword + fields.lower()

    collapsed = re.sub(r"\s{2,}", " ", s)
    single_line = remove_newlines(collapsed)
    lowered = re.sub(r"(select )(.*)", _lower_fields, single_line, flags=re.I)
    # a comma plus any whitespace after it becomes a line break and a 7-space indent
    return re.sub(r",\s*", ",\n       ", lowered)

In [None]:
# Extra spaces and the embedded newline are normalized, fields are lowercased
# and each one ends up on its own line
print(format_select("select aSdf, qweR,  Asdf,\nqwer1,   asdf"))

select asdf,
       qwer,
       asdf,
       qwer1,
       asdf


### FROM

In [None]:
#export
def format_from(s):
    """Format FROM statement line `s`.

    Steps, in order:

    1. collapse runs of two or more whitespace characters into one space
    2. replace the remaining newlines with spaces
    3. lowercase everything that follows the `from ` keyword and pad the
       keyword so that the table name starts in the same column as the
       fields of a formatted SELECT statement (`from` + 3 spaces has the
       same width as `select` + 1 space)

    Returns the formatted string.
    """
    s = re.sub(r"\s{2,}", " ", s)  # replace too many spaces
    s = re.sub(
        r"(from )(.*)",
        # group 1 already ends with one space; two more give `from` + 3 spaces
        lambda match: match.group(1) + "  " + match.group(2).lower(),
        remove_newlines(s),
        flags=re.I
    )
    return s

In [None]:
# The table name is pushed to the right by extra spaces after `from`
print(format_from("from table1"))

from    table1


In [None]:
#hide
# Run nbdev's notebook-to-script conversion (see the "Converted ..." output below)
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.
