In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp core

In [None]:
#export
import re

# sql_formatter

> A SQL formatter to automatically format your SQL files

In [None]:
#hide
from nbdev.showdoc import *

## General formatting

Basic formatting for SQL queries. Let's use an example throughout the core module.

This is what an input could look like:

In [None]:
# Deliberately messy input: mixed keyword casing, irregular line breaks and uneven spacing around operators
ex_sql = """
seLecT a.asdf, b.qwer,
b.asdf2 frOm table1 as a leFt join 
table2 as b
    on a.asdf = b.asdf  inner join table3 as c
on a.asdf=c.asdf
whEre a.asdf= 1
anD b.qwer =2
and a.asdf<=1
or b.qwer>=5
groUp by a.asdf
"""
print(ex_sql)


seLecT a.asdf, b.qwer,
b.asdf2 frOm table1 as a leFt join 
table2 as b
    on a.asdf = b.asdf  inner join table3 as c
on a.asdf=c.asdf
whEre a.asdf= 1
anD b.qwer =2
and a.asdf<=1
or b.qwer>=5
groUp by a.asdf



And this is how we would like it to be formatted:

In [None]:
print("""
SELECT a.asdf, 
       b.qwer,
       b.asdf2
FROM   table1 AS a
    LEFT JOIN table2 AS b
        ON a.asdf = b.asdf
    INNER JOIN table3 AS c
        ON a.asdf = c.asdf
WHERE  a.asdf = 1
   and b.qwer = 2
   and a.asdf <=1
    or b.qwer >= 5
GROUP BY a.asdf
""")


SELECT a.asdf, 
       b.qwer,
       b.asdf2
FROM   table1 AS a
    LEFT JOIN table2 AS b
        ON a.asdf = b.asdf
    INNER JOIN table3 AS c
        ON a.asdf = c.asdf
WHERE  a.asdf = 1
   and b.qwer = 2
   and a.asdf <=1
    or b.qwer >= 5
GROUP BY a.asdf



Let's start by defining the main statements. Each main statement should also start on a new line.

In [None]:
#export
# Keywords that open a clause. The order is significant to callers that apply
# them one after another, so it is kept exactly as originally listed.
MAIN_STATEMENTS = [
    # "create table", "create or replace table", "create view", "create or replace view"
    *(
        f"create {replace}{obj}"
        for obj in ("table", "view")
        for replace in ("", "or replace ")
    ),
    "select",
    "from",
    # "left join", "inner join", "outer join", "right join"
    *(f"{kind} join" for kind in ("left", "inner", "outer", "right")),
    "on",
    "where",
    "group by",
]

We also would like to capitalize the AS operator

In [None]:
#export
# Keywords to upper-case: every main statement plus the AS operator
CAPITAL_STATEMENTS = [*MAIN_STATEMENTS, "as"]

First we need to capitalize the main statements

In [None]:
#export
def capitalize_statements(s, statements):
    """Capitalize SQL statements `statements` in string `s`

    Matching is case-insensitive and limited to whole words, so "as" does not
    touch "asdf". Each statement is taken as literal text (regex metacharacters
    are escaped). Every space inside a multi-word statement such as "group by"
    matches any run of whitespace, so a keyword that is split over two lines or
    separated by several spaces is still capitalized. The whitespace found in
    `s` is kept as is; only the matched letters are upper-cased.

    Returns the resulting string.
    """
    for statement in statements:
        # one escaped chunk per space-separated word, rejoined with `\s+`
        keyword = r"\s+".join(re.escape(word) for word in statement.split(" "))
        # callable replacement: upper-case what was matched and keep its spacing
        s = re.sub(rf"\b{keyword}\b", lambda m: m.group(0).upper(), s, flags=re.I)
    return s

In [None]:
print(capitalize_statements(ex_sql, CAPITAL_STATEMENTS))


SELECT a.asdf, b.qwer,
b.asdf2 FROM table1 AS a LEFT JOIN 
table2 AS b
    ON a.asdf = b.asdf  INNER JOIN table3 AS c
ON a.asdf=c.asdf
WHERE a.asdf= 1
anD b.qwer =2
and a.asdf<=1
or b.qwer>=5
GROUP BY a.asdf



In [None]:
# Keywords are upper-cased whatever their original casing; the other tokens are left untouched
assert (
    capitalize_statements(
        "seLecT asdf, qwer FrOM table1", 
        CAPITAL_STATEMENTS
    ) == "SELECT asdf, qwer FROM table1"
)

We also need to remove newlines and multiple spaces because they may be arbitrary

In [None]:
#export
def remove_newlines_mspaces(s):
    """Remove newline and too many spaces characters

    Every run of whitespace in `s` (spaces, newlines, tabs, carriage returns)
    becomes one single space, and leading / trailing whitespace is dropped.
    A lone tab is normalized as well, so that later steps which look for a
    literal space (e.g. "left join", "from ") still find their keywords.

    Returns the normalized one-line string; an empty or all-whitespace input
    gives "".
    """
    # str.split() without arguments splits on any whitespace run and ignores the edges
    return " ".join(s.split())

In [None]:
print(remove_newlines_mspaces(ex_sql))

seLecT a.asdf, b.qwer, b.asdf2 frOm table1 as a leFt join table2 as b on a.asdf = b.asdf inner join table3 as c on a.asdf=c.asdf whEre a.asdf= 1 anD b.qwer =2 and a.asdf<=1 or b.qwer>=5 groUp by a.asdf


In [None]:
# Line breaks and runs of spaces collapse to single spaces; the leading and trailing newlines are stripped
assert (
    remove_newlines_mspaces(
"""
SELECT asdf, qwer,
qwer1,    qwer2
FROM table1
"""
    ) == "SELECT asdf, qwer, qwer1, qwer2 FROM table1"
)

Next we would like to have each main statement on a separate line

In [None]:
#export
def breakline_statement(s, statements):
    """Write a newline in `s` for all `statements`

    Each whole-word, case-insensitive occurrence of a statement, together with
    the whitespace in front of it, is replaced by a newline followed by the
    statement exactly as it is spelled in `statements`. Statements are taken as
    literal text, and a space inside a multi-word statement matches any run of
    whitespace.

    The keyword has to start at a word boundary: "on" no longer matches the end
    of "version" and "from" no longer matches inside "xfrom", which used to
    split such identifiers with a newline.

    Returns the resulting string.
    """
    for statement in statements:
        # one escaped chunk per space-separated word, rejoined with `\s+`
        keyword = r"\s+".join(re.escape(word) for word in statement.split(" "))
        # callable replacement so that the statement is inserted verbatim
        s = re.sub(rf"\s*\b{keyword}\b", lambda _: "\n" + statement, s, flags=re.I)
    return s

In [None]:
print(breakline_statement(ex_sql, MAIN_STATEMENTS))


select a.asdf, b.qwer,
b.asdf2
from table1 as a
left join 
table2 as b
on a.asdf = b.asdf
inner join table3 as c
on a.asdf=c.asdf
where a.asdf= 1
anD b.qwer =2
and a.asdf<=1
or b.qwer>=5
group by a.asdf



In combination with removing newlines and multiple spaces

In [None]:
print(breakline_statement(remove_newlines_mspaces(ex_sql), MAIN_STATEMENTS))


select a.asdf, b.qwer, b.asdf2
from table1 as a
left join table2 as b
on a.asdf = b.asdf
inner join table3 as c
on a.asdf=c.asdf
where a.asdf= 1 anD b.qwer =2 and a.asdf<=1 or b.qwer>=5
group by a.asdf


## Specific formatting and validation

Now we will format each statement individually

### SELECT

In [None]:
#export
def format_select(s):
    """Format SELECT statement line `s`

    Puts every comma-separated field on its own line, indented by seven spaces
    so that the fields line up behind "select ".

    If the line ends with one or more dangling commas (possibly mixed with
    whitespace), a notice is printed and all of them are removed before the
    fields are split, so no empty trailing field is produced.

    Returns the formatted string.
    """
    if re.search(r",\s*$", s):
        print("Correcting mistake: Comma at the end of SELECT statement")
        s = re.sub(r"[,\s]+$", "", s)  # drop every trailing comma / whitespace
    s = re.sub(r",\s*", ",\n       ", s)  # add newline after each comma and indentation
    return s

In [None]:
print(format_select("select aSdf, qweR,  Asdf,qwer1,   asdf"))

select aSdf,
       qweR,
       Asdf,
       qwer1,
       asdf


In [None]:
print(format_select("select aSdf, qweR,  Asdf,qwer1,   asdf,"))

Correcting mistake: Comma at the end of SELECT statement
select aSdf,
       qweR,
       Asdf,
       qwer1,
       asdf


### FROM

In [None]:
#export
def format_from(s):
    "Format FROM statement line `s`: pad the text after `from ` with two extra spaces"
    def _indent(match):
        keyword, remainder = match.groups()
        return keyword + "  " + remainder

    return re.sub(r"(from )(.*)", _indent, s, flags=re.I)

In [None]:
print(format_from("from table1"))

from   table1


### (LEFT / RIGHT / INNER / OUTER) JOIN

In [None]:
#export
def format_join(s):
    "Format JOIN statement line `s` by indenting it with four spaces"
    indentation = " " * 4
    return indentation + s

In [None]:
format_join("inner join table1")

'    inner join table1'

### ON

#### Helper function

In [None]:
#export
def add_whitespaces_between_symbols(s):
    """Add whitespaces between symbols in line `s`

    A "symbol" is a run of the characters `=`, `!`, `<`, `>` (so `=`, `<=`,
    `>=`, `!=`, `<>` ...). A single space is inserted before a run that directly
    follows other text and after a run that is directly followed by other text.
    Runs that are already surrounded by whitespace are left alone, and a
    multi-character operator is always treated as one unit: "a <= 1" stays
    "a <= 1" instead of being torn into "a < = 1".

    Returns the resulting string.
    """
    # the look-arounds exclude the operator characters themselves, so a match
    # can neither start nor end in the middle of an operator
    s = re.sub(r"(?<=[^\s=!<>])([=!<>]+)", r" \1", s)  # no space left
    s = re.sub(r"([=!<>]+)(?=[^\s=!<>])", r"\1 ", s)  # no space right
    return s

In [None]:
# A space missing on the left, on the right or on both sides of `=` is added in each case
assert (
    add_whitespaces_between_symbols(
        "WHERE asdf= 1 and qwer=1 or blabla ='asdf'"
    ) == "WHERE asdf = 1 and qwer = 1 or blabla = 'asdf'"
)

In [None]:
#export
def format_on(s):
    "Format ON statement line `s`: space out the comparison symbols, then indent by eight spaces"
    spaced = add_whitespaces_between_symbols(s)
    return " " * 8 + spaced

In [None]:
format_on("on a.asdf =b.asdf")

'        on a.asdf = b.asdf'

### WHERE

In [None]:
#export
def format_where(s):
    """Format WHERE statement line `s`

    Spaces out the comparison symbols, adds one extra space after the WHERE
    keyword and moves every AND / OR onto its own line, indented so that the
    conditions line up. AND / OR are written back in lower case.

    AND / OR are matched as whole words only, so identifiers that merely start
    with those letters ("order_id", "android", "organisation") are not split.

    Returns the formatted string.
    """
    s = add_whitespaces_between_symbols(s)  # add whitespaces between symbols
    s = re.sub(r"\b(where )", r"\1 ", s, flags=re.I)  # add indentation after WHERE
    s = re.sub(r"\sand\b", r"\n   and", s, flags=re.I)  # new line + indentation before every 'and'
    s = re.sub(r"\sor\b", r"\n    or", s, flags=re.I)  # new line + indentation before every 'or'
    return s

In [None]:
print(format_where("WHERE asdf= 1 and qwer=1 or blabla ='asdf'"))

WHERE  asdf = 1
   and qwer = 1
    or blabla = 'asdf'


In [None]:
# WHERE is followed by two spaces, and each AND / OR condition moves to its own indented line
assert (
    format_where(
        "WHERE asdf= 1 and qwer=1 or blabla ='asdf'"
    ) == "WHERE  asdf = 1\n   and qwer = 1\n    or blabla = 'asdf'"
)

## Format all statements

In [None]:
#export
def format_statement_line(s):
    """Format statement line `s`

    Picks the formatter that belongs to the keyword `s` starts with
    (case-insensitive) and returns its result. A line that starts with none of
    the known keywords is returned unchanged.

    The keyword has to be a whole word, so a line starting with an identifier
    such as "only_x" or "selection" is not mistaken for an ON / SELECT line.
    At most one formatter is applied per line.
    """
    statement_funcs = {
        "select": format_select,
        "from": format_from,
        "left join": format_join,
        "right join": format_join,
        "inner join": format_join,
        "outer join": format_join,
        "on": format_on,
        "where": format_where
    }
    for key, format_func in statement_funcs.items():
        # `\b` requires the keyword to end at a word boundary
        if re.match(rf"{key}\b", s, flags=re.I):
            # stop at the first hit instead of testing the reformatted line again
            return format_func(s)
    return s

In [None]:
print(format_statement_line("select asdf, qwer"))

select asdf,
       qwer


In [None]:
print(format_statement_line("left join table1 as abc"))

    left join table1 as abc


In [None]:
print(format_statement_line("where asdf=1 and qwer='things' and blabla=0 or stuff=-1"))

where  asdf = 1
   and qwer = 'things'
   and blabla = 0
    or stuff = -1


In [None]:
#export
def format_statements(s):
    "Format every line of `s` with `format_statement_line` and join the results with newlines again"
    return "\n".join(format_statement_line(row) for row in s.split("\n"))

In [None]:
print(format_statements("select asdf, qwer\nfrom table1"))

select asdf,
       qwer
from   table1


##  Putting everything together

In [None]:
#export
def _lowercase_outside_quotes(s):
    "Lower-case `s` except for the text inside single- or double-quoted literals"
    # with one capturing group, re.split puts the quoted literals at the odd indices
    parts = re.split(r"""('[^']*'|"[^"]*")""", s)
    return "".join(
        part if index % 2 else part.lower()
        for index, part in enumerate(parts)
    )


def format_sql(s):
    """Format a SQL query `s`

    Steps: lower-case the query, collapse newlines and repeated whitespace,
    start every main statement on a new line, upper-case the statements and
    finally format each statement line.

    The casing of quoted literals is kept ('John' stays 'John'), because
    lower-casing them would change what the query selects.
    NOTE(review): the later steps still operate on the whole text, so
    whitespace, keywords and comparison symbols inside a literal can still be
    altered -- protecting literals end to end is not handled here.

    Returns the formatted query.
    """
    s = _lowercase_outside_quotes(s)  # everything but quoted literals lowercased
    s = remove_newlines_mspaces(s)  # remove newlines and multiple spaces
    s = breakline_statement(s, MAIN_STATEMENTS)  # add breaklines for the main statements
    s = capitalize_statements(s, CAPITAL_STATEMENTS)  # capitalize capital statements
    s = format_statements(s)  # format statements
    return s

In [None]:
print(format_sql(ex_sql))


SELECT a.asdf,
       b.qwer,
       b.asdf2
FROM   table1 AS a
    LEFT JOIN table2 AS b
        ON a.asdf = b.asdf
    INNER JOIN table3 AS c
        ON a.asdf = c.asdf
WHERE  a.asdf = 1
   and b.qwer = 2
   and a.asdf <= 1
    or b.qwer >= 5
GROUP BY a.asdf


In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.
