In [None]:
#hide
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False  # workaround for buggy jedi

In [None]:
# default_exp utils

In [None]:
#export
import re
from itertools import accumulate
from collections import Counter
import operator

# utils

> Helper functions for SQL-text preparation

In [None]:
#hide
from nbdev.showdoc import *

### General utils

In [None]:
#export
def assert_and_print(s_in, s_expected):
    """Assert equality of `s_in` and `s_expected` and print `s_in` if the assertion worked

    If both values are equal, `s_in` is printed and `None` is returned.
    Otherwise a report is printed (observed value, expected value and, when both
    values are strings, the first differing position together with a
    10-characters window around it) and an `AssertionError` is raised.
    """
    if s_in == s_expected:
        print(s_in)
        return None
    print("Assertion failed\n")
    print("Observed:\n")
    print(s_in)
    print("\n")
    print("Expected:\n")
    print(s_expected)
    # the character-level report only makes sense when both sides are strings
    if isinstance(s_in, str) and isinstance(s_expected, str):
        common_len = min(len(s_in), len(s_expected))
        # first index at which the strings differ; when one string is a prefix
        # of the other the difference starts right after the common part
        i = next(
            (j for j in range(common_len) if s_in[j] != s_expected[j]),
            common_len
        )
        print(f"Exception found at position {i}:\n")
        print("10-characters window:\n")
        print("Observed:\n")
        print(s_in[max(i-5, 0):i+5])
        print("\n")
        print("Expected:\n")
        print(s_expected[max(i-5, 0):i+5])
    # explicit raise instead of `assert` so the check is not stripped by `python -O`
    raise AssertionError("`s_in` is not equal to `s_expected`")

In [None]:
assert_and_print("some string", "some string")

some string


### Compress list of dictionaries

In [None]:
#export
def compress_dicts(ld, keys):
    """Compress list of dicts `ld` with same `keys` concatenating key 'string'

    Consecutive dictionaries whose values agree on all `keys` are merged into one
    dictionary whose 'string' is the concatenation of the individual strings.
    Only `keys` and 'string' are kept in the output dictionaries. `keys` may be a
    single key or a list / tuple of keys. The dictionaries in `ld` are copied, so
    the input is not modified. An empty `ld` returns an empty list.
    """
    # make sure keys are a list
    keys = [keys] if not isinstance(keys, (list, tuple)) else keys
    # make sure we only use the needed keys and not more
    kept_keys = set(keys).union(["string"])
    ld = [{k: v for k, v in d.items() if k in kept_keys} for d in ld]
    if not ld:  # nothing to compress (e.g. the split of an empty query)
        return []
    ld_out = [ld[0]]  # initialize output with reference dict = first element
    for d in ld[1:]:
        # reference comparison items
        ref_cp_items = {k: v for k, v in ld_out[-1].items() if k in keys}
        cp_items = {k: v for k, v in d.items() if k in keys}
        if ref_cp_items == cp_items:
            ld_out[-1]["string"] += d["string"]
        else:
            ld_out.append(d)
    return ld_out

In [None]:
assert_and_print(
    compress_dicts(
        [
            {"string": "some string", "key1": True, "key2": True},
            {"string": ". qwerty", "key1": True, "key2": True},
            {"string": " asdf", "key1": True, "key2": True},
            {"string": "another string", "key1": True, "key2": False},
            {"string": " bla bla", "key1": True, "key2": False},
            {"string": "1234", "key1": False, "key2": False},
        ],
        ("key1", "key2")
    ),
    [
            {"string": "some string. qwerty asdf", "key1": True, "key2": True},
            {"string": "another string bla bla", "key1": True, "key2": False},
            {"string": "1234", "key1": False, "key2": False},        
    ]
)

[{'string': 'some string. qwerty asdf', 'key1': True, 'key2': True}, {'string': 'another string bla bla', 'key1': True, 'key2': False}, {'string': '1234', 'key1': False, 'key2': False}]


### Removing redundant whitespaces

In [None]:
#export
def remove_whitespaces_newline(s):
    "Drop non-newline whitespaces that directly follow or precede a newline in `s`"
    patterns_around_newline = (
        r"\n[\r\t\f\v ]+",  # whitespaces right after a newline
        r"[\r\t\f\v ]+\n",  # whitespaces right before a newline
    )
    for pattern in patterns_around_newline:
        s = re.sub(pattern, "\n", s)
    return s

In [None]:
assert_and_print(
    remove_whitespaces_newline("select asdf\n from table1 \nwhere asdf = 1 \n group by asdf"),
    "select asdf\nfrom table1\nwhere asdf = 1\ngroup by asdf"
)

select asdf
from table1
where asdf = 1
group by asdf


In [None]:
#export
def remove_whitespaces_comments(s):
    "Drop non-newline whitespaces directly after and before the comment tokens [C], [CS] and [CI] in `s`"
    # (escaped token for the regex, literal token for the replacement), processed in this order
    comment_tokens = (
        (r"\[C\]", "[C]"),
        (r"\[CS\]", "[CS]"),
        (r"\[CI\]", "[CI]"),
    )
    for token_regex, token in comment_tokens:
        s = re.sub(token_regex + r"[\r\t\f\v ]+", token, s)  # whitespaces after the token
        s = re.sub(r"[\r\t\f\v ]+" + token_regex, token, s)  # whitespaces before the token
    return s

In [None]:
assert_and_print(
    remove_whitespaces_comments(
        "select asdf[C] from table1 [CS]where asdf = 1 [C] group by asdf"
    ),
    "select asdf[C]from table1[CS]where asdf = 1[C]group by asdf"
)

select asdf[C]from table1[CS]where asdf = 1[C]group by asdf


In [None]:
assert_and_print(
    remove_whitespaces_comments(
        "select asdf[C][CS]/* asdf [CI]  */[C] from table1 [CS]where asdf = 1 [C] group by asdf"
    ),
    "select asdf[C][CS]/* asdf[CI]*/[C]from table1[CS]where asdf = 1[C]group by asdf"
)

select asdf[C][CS]/* asdf[CI]*/[C]from table1[CS]where asdf = 1[C]group by asdf


In [None]:
#export
def remove_redundant_whitespaces(s):
    "Strip `s` and collapse every run of two or more non-newline whitespaces into a single space"
    stripped = s.strip()
    # newlines are not part of the character class, so they are kept as they are
    return re.sub(r"[\r\t\f\v ]{2,}", " ", stripped)

Only whitespaces

In [None]:
assert_and_print(
    remove_redundant_whitespaces("\nsome string     many whitespaces   some more"),
    "some string many whitespaces some more"
)

some string many whitespaces some more


Whitespaces and newlines

In [None]:
assert_and_print(
    remove_redundant_whitespaces("\nsome string   \n  many whitespaces\n   \nsome more\n"),
    "some string \n many whitespaces\n \nsome more"
)

some string 
 many whitespaces
 
some more


In [None]:
#export
def remove_whitespaces_parenthesis(s):
    "Drop non-newline whitespaces right after `(` and right before `)` in query `s`"
    without_space_after_open = re.sub(r"\([\r\t\f\v ]+", "(", s)
    return re.sub(r"[\r\t\f\v ]+\)", ")", without_space_after_open)

In [None]:
assert_and_print(
    remove_whitespaces_parenthesis("select asdf, substr( qwer, 1, 2 ) as qwerty"),
    "select asdf, substr(qwer, 1, 2) as qwerty"
)

select asdf, substr(qwer, 1, 2) as qwerty


### Add missing whitespaces between symbols

In [None]:
#export
def add_whitespaces_between_symbols(s):
    """Add whitespaces between symbols in line `s`

    A run of the characters `=`, `!`, `<`, `>` is treated as one symbol
    (e.g. `=`, `!=`, `<>`, `=>`) and gets a single space on each side where
    there is no whitespace yet. The whole line is processed, there is no
    special treatment of comments or quoted text.
    """
    s = re.sub(r"([^\s=!<>])([=!<>]+)", r"\1 \2", s)  # no space left
    s = re.sub(r"([=!<>]+)([^\s=!<>])", r"\1 \2", s)  # no space right
    # a third "no space left and right" pass is not needed: after the two passes
    # above no symbol is directly attached to a neighbouring character anymore
    return s

In [None]:
assert_and_print(
    add_whitespaces_between_symbols(
        "WHERE asdf= 1 and qwer=>1 or blabla ='asdf'"
    ), "WHERE asdf = 1 and qwer => 1 or blabla = 'asdf'"
)

WHERE asdf = 1 and qwer => 1 or blabla = 'asdf'


### Mark comments with special tokens `[C]` (newline after comment), `[CS]` (start of comment in new line) and `[CI]` (newline in multiline comment)

In [None]:
#export
def mark_ci_comments(s):
    """Replace new lines in multiline comments by special token [CI]

    The string is scanned character by character while tracking in which region
    the scanner currently is:

    * `None`: plain SQL
    * "block": inside a /* */ comment
    * "line": inside a -- comment (ends at a newline or at the token [C])
    * "squote": inside single quotes
    * "dquote": inside double quotes

    A region can only be opened from plain SQL, therefore comment markers inside
    quotes and quotes inside comments are ignored. Only newlines inside /* */
    comments are replaced by [CI]; everything else is returned unchanged.
    """
    out = []  # output characters / tokens
    state = None  # start in plain SQL
    for i, c in enumerate(s):
        piece = c
        if state == "block":
            if c == "\n":  # newline inside a /* */ comment
                piece = "[CI]"
            elif s[i:i+2] == "*/":  # closing comment */
                state = None
        elif state == "line":
            if c == "\n" or s[i:i+3] == "[C]":  # the -- comment ends
                state = None
        elif state == "squote":
            if c == "'":  # closing quote '
                state = None
        elif state == "dquote":
            if c == '"':  # closing quote "
                state = None
        elif s[i:i+2] == "/*":  # opening comment /*
            state = "block"
        elif s[i:i+2] == "--":  # opening comment --
            state = "line"
        elif c == "'":  # opening quote '
            state = "squote"
        elif c == '"':  # opening quote "
            state = "dquote"
        out.append(piece)
    return "".join(out)

In [None]:
assert_and_print(
    mark_ci_comments("select /* asdf \n qwer */"),
    "select /* asdf [CI] qwer */"
)

select /* asdf [CI] qwer */


In [None]:
assert_and_print(
    mark_ci_comments("select /* asdf \n qwe \n rqwer */"),
    "select /* asdf [CI] qwe [CI] rqwer */"
)

select /* asdf [CI] qwe [CI] rqwer */


In [None]:
#export
def mark_comments(s):
    """Mark comments in `s` with the tokens [C], [CS] and [CI]

    [C] is appended to the end of -- comments (before the newline) and of /* */
    comments, [CS] is put in front of comments that start a new line (whitespace
    between the newline and the comment is dropped), and newlines inside
    /* */ comments are replaced by [CI].
    """
    # (pattern, replacement, regex flags); the order of the steps matters
    marking_steps = (
        (r"(--.*?)(\n)", r"\1[C]\2", 0),  # end of -- comments
        (r"(\/\*.*?\*\/)", r"\1[C]", re.DOTALL),  # end of /* */ comments
        (r"(\n)\s*(--.*?)", r"\1[CS]\2", re.DOTALL),  # comment line starting with --
        (r"(\n)\s*(\/\*.*\*\/)", r"\1[CS]\2", 0),  # comment line starting with /* (single line)
        (r"(\n)\s*(\/\*.*?\*\/)", r"\1[CS]\2", re.DOTALL),  # comment line starting with /* (multiline)
    )
    for pattern, replacement, flags in marking_steps:
        s = re.sub(pattern, replacement, s, flags=flags)
    # finally replace the newlines inside multiline comments by [CI]
    return mark_ci_comments(s)

In [None]:
assert_and_print(
    mark_comments(
"""
select asdf, -- some comment
qwer, qwer2,
/* comment line */
qwer3,
qwer4 -- comment
"""
    ),
"""
select asdf, -- some comment[C]
qwer, qwer2,
[CS]/* comment line */[C]
qwer3,
qwer4 -- comment[C]
"""
)


select asdf, -- some comment[C]
qwer, qwer2,
[CS]/* comment line */[C]
qwer3,
qwer4 -- comment[C]



In [None]:
assert_and_print(
    mark_comments(
"""
select asdf, -- some comment
qwer, qwer2, -- another comment
/* comment line */
qwer3,
-- another comment line
qwer4 -- comment
"""
    ),
"""
select asdf, -- some comment[C]
qwer, qwer2, -- another comment[C]
[CS]/* comment line */[C]
qwer3,
[CS]-- another comment line[C]
qwer4 -- comment[C]
"""
)


select asdf, -- some comment[C]
qwer, qwer2, -- another comment[C]
[CS]/* comment line */[C]
qwer3,
[CS]-- another comment line[C]
qwer4 -- comment[C]



In [None]:
assert_and_print(
    mark_comments(
"""
create or replace my_table as
/* some comment
 some new comment line */
select asdf,
qwer, qwer2

from table1
"""    
    ),
"""
create or replace my_table as
[CS]/* some comment[CI] some new comment line */[C]
select asdf,
qwer, qwer2

from table1
"""    
)


create or replace my_table as
[CS]/* some comment[CI] some new comment line */[C]
select asdf,
qwer, qwer2

from table1



In [None]:
assert_and_print(
    mark_comments(
"""
SELECT asdf, qwer, /* another comment */
qwer1, 
/* inline comment */
qwer2
FROM table1
WHERE asdf=1"""
    ),
    "\nSELECT asdf, qwer, /* another comment */[C]\nqwer1, \n[CS]/* inline comment */[C]\nqwer2\nFROM table1\nWHERE asdf=1"
)


SELECT asdf, qwer, /* another comment */[C]
qwer1, 
[CS]/* inline comment */[C]
qwer2
FROM table1
WHERE asdf=1


In [None]:
assert_and_print(
    mark_comments(
"""
select a.asdf, b.qwer, -- some comment here
/* and here is a line comment inside select */
substr(c.asdf, 1, 2) as substr_asdf, 
/* some commenT there */
"""
    ),
"""
select a.asdf, b.qwer, -- some comment here[C]
[CS]/* and here is a line comment inside select */[C]
substr(c.asdf, 1, 2) as substr_asdf, 
[CS]/* some commenT there */[C]
"""
)



select a.asdf, b.qwer, -- some comment here[C]
[CS]/* and here is a line comment inside select */[C]
substr(c.asdf, 1, 2) as substr_asdf, 
[CS]/* some commenT there */[C]



In [None]:
assert_and_print(
    mark_comments(
"""
/* multi line
comment */
select a.asdf, b.qwer, -- some comment here
/* and here is a line comment inside select */
substr(c.asdf, 1, 2) as substr_asdf, 
/* some commenT 
there */
case when a.asdf= 1 then 'b' /* here a case comment */
"""
    ),
"""
[CS]/* multi line[CI]comment */[C]
select a.asdf, b.qwer, -- some comment here[C]
[CS]/* and here is a line comment inside select */[C]
substr(c.asdf, 1, 2) as substr_asdf, 
[CS]/* some commenT [CI]there */[C]
case when a.asdf= 1 then 'b' /* here a case comment */[C]
"""
)


[CS]/* multi line[CI]comment */[C]
select a.asdf, b.qwer, -- some comment here[C]
[CS]/* and here is a line comment inside select */[C]
substr(c.asdf, 1, 2) as substr_asdf, 
[CS]/* some commenT [CI]there */[C]
case when a.asdf= 1 then 'b' /* here a case comment */[C]



In [None]:
assert_and_print(
    mark_comments(
"""
select a.asdf, b.qwer, -- some comment here
/* and here is a line comment inside select */
/* some commenT there */
case when a.asdf= 1 then 'b' /* here a case comment */
"""
    ),
"""
select a.asdf, b.qwer, -- some comment here[C]
[CS]/* and here is a line comment inside select */[C]
[CS]/* some commenT there */[C]
case when a.asdf= 1 then 'b' /* here a case comment */[C]
"""
)


select a.asdf, b.qwer, -- some comment here[C]
[CS]/* and here is a line comment inside select */[C]
[CS]/* some commenT there */[C]
case when a.asdf= 1 then 'b' /* here a case comment */[C]



### Splitting functions

#### Split query into comment / non-comment, quote / non-quote, select / non-select

In [None]:
#export
def split_query(s):
    """Split query into comment / non-comment, quote / non-quote, select / non-select

    Return a list of dicts with keys "string", "comment" in (True, False),
    "quote" in (True, False) and "select" in (True, False). Pieces with an empty
    "string" are dropped.

    The query is scanned character by character while tracking the current region:

    * `None`: plain SQL
    * "block": /* */ comment, opened by `/*` or `[CS]` and closed by `*/[C]` or `*/`
    * "line": -- comment, opened by `--` or `[CS]--` and closed by `[C]` (kept in
      the comment piece) or by a newline (left out of the comment piece)
    * "squote" / "dquote": text between single / double quotes; the quote
      characters themselves belong to the surrounding non-quote pieces

    A region can only be opened from plain SQL. The select region starts at the
    keyword `select` and ends at the keyword `from`.

    NOTE: the keywords are matched case-insensitively as plain substrings
    everywhere outside of comments, i.e. also inside identifiers and quoted text.
    """
    s_comp = []  # container for string components
    start = 0  # start position of the piece currently being collected
    state = None  # current region, start in plain SQL
    select_region = False  # start with non-select

    def flush(end):
        "Store the piece `s[start:end]` together with the flags of the current region"
        s_comp.append({
            "string": s[start:end],
            "comment": state in ("block", "line"),
            "quote": state in ("squote", "dquote"),
            "select": select_region
        })

    # loop over character positions
    for i, c in enumerate(s):
        in_comment = state in ("block", "line")
        # lowercase the slice itself (not the whole string upfront): lowercasing can
        # change the length of a string, which would shift the positions
        if not in_comment and s[i:i+6].lower() == "select":
            flush(i)
            start = i
            select_region = True  # after select starts the select region
        elif not in_comment and s[i:i+4].lower() == "from":
            flush(i)
            start = i
            select_region = False  # after from ends the select region
        elif state is None:
            if s[i:i+2] == "/*" or s[i:i+4] == "[CS]":  # opening comment
                flush(i)  # before opening comment it was no comment
                start = i
                # [CS] marks the start of a comment line of either type
                state = "line" if s[i:i+6] == "[CS]--" else "block"
            elif s[i:i+2] == "--":  # opening comment --
                flush(i)  # before opening comment it was no comment
                start = i
                state = "line"
            elif c == "'":  # opening quote '
                flush(i+1)  # the quote character belongs to the non-quote piece
                start = i+1
                state = "squote"
            elif c == '"':  # opening quote "
                flush(i+1)  # the quote character belongs to the non-quote piece
                start = i+1
                state = "dquote"
        elif state == "block":
            if s[i:i+5] == "*/[C]":  # closing comment */ with end token
                flush(i+5)  # before closing comment it was comment
                start = i+5
                state = None
            elif s[i:i+2] == "*/":  # closing comment */
                flush(i+2)  # before closing comment it was comment
                start = i+2
                state = None
        elif state == "line":
            if c == "\n":  # the -- comment ends, newline is not part of it
                flush(i)
                start = i
                state = None
            elif s[i:i+3] == "[C]":  # the -- comment ends, token is part of it
                flush(i+3)
                start = i+3
                state = None
        elif state == "squote":
            if c == "'":  # closing quote '
                flush(i)  # the quote character belongs to the next piece
                start = i
                state = None
        elif state == "dquote":
            if c == '"':  # closing quote "
                flush(i)  # the quote character belongs to the next piece
                start = i
                state = None
    flush(len(s))  # remaining piece
    s_comp = [d for d in s_comp if d["string"] != ""]  # remove empty strings
    return s_comp

In [None]:
assert_and_print(
    split_query(
"""
create table my_table as -- some table
seLect asdf, [CS]/* some comment */ qwer, 'blabla' as qwerty
from table1
""".strip()
    ),
    [
        {"string": "create table my_table as ", "comment": False, "quote": False, "select": False},
        {"string": "-- some table", "comment": True, "quote": False, "select": False},
        {"string": "\n", "comment": False, "quote": False, "select": False},
        {"string": "seLect asdf, ", "comment": False, "quote": False, "select": True},
        {"string": "[CS]/* some comment */", "comment": True, "quote": False, "select": True},
        {"string": " qwer, '", "comment": False, "quote": False, "select": True},
        {"string": "blabla", "comment": False, "quote": True, "select": True},
        {"string": "' as qwerty\n", "comment": False, "quote": False, "select": True},        
        {"string": "from table1", "comment": False, "quote": False, "select": False},
    ]
)

[{'string': 'create table my_table as ', 'comment': False, 'quote': False, 'select': False}, {'string': '-- some table', 'comment': True, 'quote': False, 'select': False}, {'string': '\n', 'comment': False, 'quote': False, 'select': False}, {'string': 'seLect asdf, ', 'comment': False, 'quote': False, 'select': True}, {'string': '[CS]/* some comment */', 'comment': True, 'quote': False, 'select': True}, {'string': " qwer, '", 'comment': False, 'quote': False, 'select': True}, {'string': 'blabla', 'comment': False, 'quote': True, 'select': True}, {'string': "' as qwerty\n", 'comment': False, 'quote': False, 'select': True}, {'string': 'from table1', 'comment': False, 'quote': False, 'select': False}]


This function makes testing in `core` more convenient.

In [None]:
#export
def split_apply_concat(s, f):
    "Split query `s` into pieces, apply function `f` to the list of pieces and join the resulting strings"
    transformed_pieces = f(split_query(s))
    return "".join(piece["string"] for piece in transformed_pieces)

In [None]:
assert_and_print(
    split_apply_concat(
        "select asdf, /* some comment */", lambda split_s: [d for d in split_s if not d["comment"]]
    ),
    "select asdf, "
)

select asdf, 


#### Split by comment / non-comment, quote / non-quote

In [None]:
#export
def split_comment_quote(s):
    "Split query `s` into dictionaries with keys 'string', 'comment' and 'quote'"
    # neighbouring pieces that agree on both flags are merged into one piece
    return compress_dicts(split_query(s), keys=["comment", "quote"])

In [None]:
assert_and_print(
    split_comment_quote(
"""
create table my_table as -- some table
seLect asdf, [CS]/* some comment */ qwer, 'blabla' as qwerty
from table1
""".strip()
    ),
    [
        {"string": "create table my_table as ", "comment": False, "quote": False},
        {"string": "-- some table", "comment": True, "quote": False},
        {"string": "\nseLect asdf, ", "comment": False, "quote": False},
        {"string": "[CS]/* some comment */", "comment": True, "quote": False},
        {"string": " qwer, '", "comment": False, "quote": False},
        {"string": "blabla", "comment": False, "quote": True},
        {"string": "' as qwerty\nfrom table1", "comment": False, "quote": False},        
    ]
)

[{'string': 'create table my_table as ', 'comment': False, 'quote': False}, {'string': '-- some table', 'comment': True, 'quote': False}, {'string': '\nseLect asdf, ', 'comment': False, 'quote': False}, {'string': '[CS]/* some comment */', 'comment': True, 'quote': False}, {'string': " qwer, '", 'comment': False, 'quote': False}, {'string': 'blabla', 'comment': False, 'quote': True}, {'string': "' as qwerty\nfrom table1", 'comment': False, 'quote': False}]


In [None]:
assert_and_print(
    split_comment_quote(
"""replace('"";', "';'", asdf2) as asdf5"""
    ),
    [
        {"string": """replace('""", "comment": False, "quote": False},
        {"string": """"";""", "comment": False, "quote": True},
        {"string": """', \"""", "comment": False, "quote": False},
        {"string": """';'""", "comment": False, "quote": True},
        {"string": """\", asdf2) as asdf5""", "comment": False, "quote": False}        
    ]
)

[{'string': "replace('", 'comment': False, 'quote': False}, {'string': '"";', 'comment': False, 'quote': True}, {'string': '\', "', 'comment': False, 'quote': False}, {'string': "';'", 'comment': False, 'quote': True}, {'string': '", asdf2) as asdf5', 'comment': False, 'quote': False}]


#### Split by comment / non-comment

In [None]:
#export
def split_comment(s):
    "Split query `s` into dictionaries with keys 'string', 'comment'"
    # neighbouring pieces that agree on the comment flag are merged into one piece
    return compress_dicts(split_query(s), keys=["comment"])

In [None]:
assert_and_print(
    split_comment(
"""
create table my_table as -- some table
seLect asdf, [CS]/* some comment */ qwer, 'blabla' as qwerty
from table1
""".strip()
    ),
    [
        {"string": "create table my_table as ", "comment": False},
        {"string": "-- some table", "comment": True},
        {"string": "\nseLect asdf, ", "comment": False},
        {"string": "[CS]/* some comment */", "comment": True},
        {"string": " qwer, 'blabla' as qwerty\nfrom table1", "comment": False},
    ]
)

[{'string': 'create table my_table as ', 'comment': False}, {'string': '-- some table', 'comment': True}, {'string': '\nseLect asdf, ', 'comment': False}, {'string': '[CS]/* some comment */', 'comment': True}, {'string': " qwer, 'blabla' as qwerty\nfrom table1", 'comment': False}]


### Get positions of specific keywords in query ignoring comments

In [None]:
#export
def identify_in_sql(regex, s):
    """Find positions of `regex` (str or list) in string `s` ignoring comment and text in quotes

    Return the sorted start positions (relative to the whole string `s`) of all
    case-insensitive matches found outside of comments and quoted text.
    """
    patterns = regex if isinstance(regex, list) else [regex]  # a single regex becomes a one-element list
    found = []  # output container
    offset = 0  # length of all pieces processed so far
    for piece in split_comment_quote(s):
        text = piece["string"]
        if not (piece["comment"] or piece["quote"]):  # search only outside of comments and quotes
            for pattern in patterns:
                # shift the match position in the piece to the position in the whole string
                found.extend(
                    offset + match.start()
                    for match in re.finditer(pattern, text, flags=re.I)
                )
        offset += len(text)
    found.sort()
    return found

In [None]:
assert_and_print(
    identify_in_sql(r"\bcase\b", "select asdf, qwer, case when blabla case"),
    [19, 36]
)

[19, 36]


#### Split individual queries based on semicolon

In [None]:
#export
def split_by_semicolon(s):
    """Split string `s` by semicolon but not in comments or in text in quotes

    Always return a list of strings that does not contain the splitting
    semicolons. Without any splitting semicolon the list has the single
    element `s`; a semicolon at the very beginning of `s` produces an empty
    first element.
    """
    positions = identify_in_sql(";", s)  # get semicolon positions
    split_s = []  # initialize output
    start = 0  # start of the current piece
    for position in positions:
        split_s.append(s[start:position])
        start = position + 1  # do not take the semicolon
    split_s.append(s[start:])  # text after the last semicolon
    return split_s

In [None]:
assert_and_print(
    split_by_semicolon(
"""
use database my_database; -- ; ; ;;
use schema my_schema; /* -- ; */

create or replace table my_table as
select asdf, qwer, /* ;; -- ; */
replace('"";', "';'", asdf2) as asdf5
qwer2 from -- ;
table2;

use schema_another_schema;
"""
    ),
    [
        "\nuse database my_database",
        " -- ; ; ;;\nuse schema my_schema",
        """ /* -- ; */\n\ncreate or replace table my_table as\nselect asdf, qwer, /* ;; -- ; */\nreplace(\'"";\', "\';\'", asdf2) as asdf5\nqwer2 from -- ;\ntable2""",
        "\n\nuse schema_another_schema",
        "\n"
    ]
)

['\nuse database my_database', ' -- ; ; ;;\nuse schema my_schema', ' /* -- ; */\n\ncreate or replace table my_table as\nselect asdf, qwer, /* ;; -- ; */\nreplace(\'"";\', "\';\'", asdf2) as asdf5\nqwer2 from -- ;\ntable2', '\n\nuse schema_another_schema', '\n']


In [None]:
#export
def replace_newline_chars(s):
    "Replace newline characters in `s` by whitespace but not in the comments and not in text in quotes"
    # set for constant-time membership tests in the loop below
    positions = set(identify_in_sql("\n", s))
    clean_s = "".join(" " if i in positions else c for i, c in enumerate(s))
    return clean_s

In [None]:
assert_and_print(
    replace_newline_chars(
        "select asdf,\nqwer, /* some comment \n with multiple lines \n*/[C], some_field from\n table"
    ),
    "select asdf, qwer, /* some comment \n with multiple lines \n*/[C], some_field from  table"
)

select asdf, qwer, /* some comment 
 with multiple lines 
*/[C], some_field from  table


### Substitute regex in SQL ignoring comments and quotes

In [None]:
#export
def sub_in_sql(regex, repl, s):
    "Substitute `regex` with `repl` in query `s` ignoring comments and text in quotes"
    pieces = []
    for part in split_comment_quote(s):
        text = part["string"]
        if part["comment"] or part["quote"]:
            pieces.append(text)  # comments and quoted text are kept untouched
        else:
            pieces.append(re.sub(regex, repl, text))
    return "".join(pieces)

In [None]:
assert_and_print(
    sub_in_sql(
        r",([\w\d])", r", \1", "select asdf,qwer, /*asdf,qwer*/ substr(',asdf',1, 2)"
    ),
    "select asdf, qwer, /*asdf,qwer*/ substr(',asdf', 1, 2)"
)

select asdf, qwer, /*asdf,qwer*/ substr(',asdf', 1, 2)


### Add whitespaces after comma

In [None]:
#export
def add_whitespaces_after_comma(s):
    """Add whitespace after comma in query `s` if there is no whitespace

    Only commas directly followed by a word character are affected; the
    replacement is done through `sub_in_sql`, so comments and quoted text
    are left as they are.
    """
    comma_then_word = r",([\w\d]+)"  # comma immediately followed by word characters
    with_space = r", \1"  # same word, now preceded by ", "
    return sub_in_sql(comma_then_word, with_space, s)

In [None]:
assert_and_print(
    add_whitespaces_after_comma(
        "select asdf,qwer, /*asdf,qwer*/ substr(',asdf',1, 2)"
    ),
    "select asdf, qwer, /*asdf,qwer*/ substr(',asdf', 1, 2)"
)

select asdf, qwer, /*asdf,qwer*/ substr(',asdf', 1, 2)


In [None]:
assert_and_print(
    add_whitespaces_after_comma("select asdf,qwer,substr(asdf,1,2) as qwerty"),
    "select asdf, qwer, substr(asdf, 1, 2) as qwerty"
)

select asdf, qwer, substr(asdf, 1, 2) as qwerty


In [None]:
assert_and_print(
    add_whitespaces_after_comma("select asdf, qwer, substr(asdf,1,2) as qwerty"),
    "select asdf, qwer, substr(asdf, 1, 2) as qwerty"
)

select asdf, qwer, substr(asdf, 1, 2) as qwerty


Function to identify the end of fields in SELECT. Usually a field ends with a comma, but this simple rule breaks down when functions are used, because their arguments are separated by commas as well. For the sake of explanation, consider for instance

`substr(asdf, 1, 2)`

In that case, we would like to not add a newline for each comma.

In [None]:
#export
def identify_end_of_fields(s):
    """Identify end of fields in query `s`

    Returns the list of indices of the commas in `s` that terminate a field:
    commas seen while the nesting counter is zero and that are not followed
    (within the 6-character window starting at the comma) by a comment
    opener (`--`, `/*`) or by one of the `[C]` / `[CS]` markers.
    """
    comment_ahead = re.compile(r"(?:--|\/\*|\[C\]|\[CS\])")
    field_ends = []
    # nesting level: raised by "(" and comment openers, lowered by ")" and "[C]"
    depth = 0
    for idx, char in enumerate(s):
        if char == ",":
            # a comma can never open or close anything, so it is fully handled here
            if depth == 0 and comment_ahead.search(s[idx:idx + 6]) is None:
                field_ends.append(idx)
            continue
        opens = char == "(" or s[idx:idx + 2] in ("--", "/*")
        closes = char == ")" or s[idx:idx + 3] == "[C]"
        if opens:
            depth += 1
        elif closes:
            depth -= 1
    return field_ends

In [None]:
assert_and_print(
    identify_end_of_fields(
"""
select asdf, substr(asdf, 1, 2) as qwer, concat(substr(asdf, 1, 2), substr(asdf, 3, 2)) as qwer2, asdf5
"""), 
    [12, 40, 97]
)

[12, 40, 97]


More advanced with comments

In [None]:
assert_and_print(identify_end_of_fields(
"""
select asdf, /* Some commnent */[C]qwerty, substr(asdf, 1, 2) as qwer, -- Some comment[C] asdf5
"""), [42]
)

[42]


In [None]:
assert_and_print(identify_end_of_fields(
"""
select asdf, [CS]/* Some commnent */[C]qwerty, substr(asdf, 1, 2) as qwer, -- Some comment[C] asdf5
"""), [46]
)

[46]


Even more tricky with a comma inside comment

In [None]:
assert_and_print(identify_end_of_fields(
"""
select asdf, /* Some, commnent */[C]qwerty, substr(asdf, 1, 2) as qwer, -- Some, comment[C] asdf5
"""), [43]
)

[43]


Having the end-of-field positions, we need a function to add a newline and proper indentation

In [None]:
#export
def add_newline_indentation(s, indentation):
    """Add newline and indentation for end of fields in query `s`

    `s` is cut right after every end-of-field comma reported by
    `identify_end_of_fields`; each piece is stripped of leading whitespace
    and the pieces are joined by a newline followed by `indentation` spaces.
    The result is stripped of surrounding whitespace. If no end of field is
    found, the result is simply `s` stripped.
    """
    # The former guard `positions is []` compared identity with a fresh list,
    # so it was always False (and a SyntaxWarning on Python >= 3.8). The
    # general path below already handles the "no end of field" case, hence
    # the dead branch is dropped without changing any result.
    # Cut points: start of the string plus the position right after each comma.
    cuts = [0] + [pos + 1 for pos in identify_end_of_fields(s)]
    pieces = [
        s[start:end].lstrip()  # drop the whitespace that followed the comma
        for start, end in zip(cuts, cuts[1:] + [None])
    ]
    separator = "\n" + " " * indentation
    return separator.join(pieces).strip()

In [None]:
assert_and_print(
    add_newline_indentation(
        "select asdf, substr(asdf, 1, 2) as qwer, concat(substr(asdf, 1, 2), substr(asdf, 3, 2)) as qwer2, asdf5",
        7
    ),
"""
select asdf,
       substr(asdf, 1, 2) as qwer,
       concat(substr(asdf, 1, 2), substr(asdf, 3, 2)) as qwer2,
       asdf5
""".strip()
)

select asdf,
       substr(asdf, 1, 2) as qwer,
       concat(substr(asdf, 1, 2), substr(asdf, 3, 2)) as qwer2,
       asdf5


In [None]:
assert_and_print(
    add_newline_indentation(
        "select asdf, substr(asdf, 1, 2) as qwer, lead(qwer) OVER (partition by asdf order by qwer), asdf2",
        7
    ),
"""
select asdf,
       substr(asdf, 1, 2) as qwer,
       lead(qwer) OVER (partition by asdf order by qwer),
       asdf2
""".strip()
)

select asdf,
       substr(asdf, 1, 2) as qwer,
       lead(qwer) OVER (partition by asdf order by qwer),
       asdf2


In [None]:
assert_and_print(
    add_newline_indentation(
        "select asdf, replace(',', '', asdf) as qwer, lead(qwer) OVER (partition by asdf order by qwer), asdf2",
        7
    ),
"""
select asdf,
       replace(',', '', asdf) as qwer,
       lead(qwer) OVER (partition by asdf order by qwer),
       asdf2
""".strip()
)

select asdf,
       replace(',', '', asdf) as qwer,
       lead(qwer) OVER (partition by asdf order by qwer),
       asdf2


### Handling subqueries

In [None]:
#export
def extract_outer_subquery(s):
    """Extract outer subquery in query `s`

    Returns `[start, end]` where `start` is the index of the first
    "(\\nSELECT" in `s` and `end` the index of its matching closing
    parenthesis. Returns None when no such pair is found.
    """
    bounds = []  # will hold [start, end] of the subquery
    inside = False  # becomes True once the subquery opening has been seen
    depth = 0  # open parentheses that do not belong to the outer subquery
    for pos, char in enumerate(s):
        if not inside and s[pos:pos + 8] == "(\nSELECT":
            bounds.append(pos)
            depth = 0  # only parentheses after the opening matter from here on
            inside = True
        elif char == "(":
            depth += 1
        elif char == ")":
            if inside and depth == 0:
                # this parenthesis closes the outer subquery
                bounds.append(pos)
                return bounds
            depth -= 1
    return None

In [None]:
assert (
    extract_outer_subquery(
        "() () (\nSELECT () (\nSELECT ())) ()"
    ) == [6, 30]
)

In [None]:
#export
def format_subquery(s, previous_s):
    """Format subquery in line `s` based on indentation on `previous_s`

    The newline between the opening parenthesis and SELECT is removed, and
    every following line of `s` that does not start with SELECT is indented
    by the length of the last line of `previous_s` (left-stripped) plus one.
    If that last line starts like "<word> join", four more spaces are added.
    """
    # "(\nSELECT" at the very beginning becomes "(SELECT"
    s = re.sub(r"^\(\nSELECT", "(SELECT", s)
    # the last line before the subquery drives the indentation level
    anchor = previous_s.split("\n")[-1].lstrip()
    if re.match(r"\w+ join", anchor, flags=re.I):
        anchor = "    " + anchor  # JOIN lines count with 4 extra whitespaces
    pad = " " * (len(anchor) + 1)
    head, *tail = s.split("\n")
    formatted = [head]
    for line in tail:
        # lines starting with SELECT stay as they are, the rest is indented
        formatted.append(line if re.match(r"SELECT", line) else pad + line)
    return "\n".join(formatted)

### Query identification

In [None]:
#export
def check_sql_query(s):
    """Checks whether `s` is a SQL query based on match of CREATE TABLE / VIEW or SELECT ignoring comments and text
    in quotes

    Returns True when the code part of `s` contains SELECT or
    CREATE ... TABLE / VIEW and contains no CREATE that is not followed
    (on the same line) by TABLE or VIEW; otherwise False.
    """
    # keep only the parts that are neither comment nor quoted text
    code_only = "".join(
        part["string"]
        for part in split_query(s)
        if not (part["comment"] or part["quote"])
    )
    looks_like_query = re.search(
        r".*(?:select|create.{0,10}(?:table|view)).*", code_only, flags=re.I
    ) is not None
    # e.g. CREATE ... TASK: a CREATE without TABLE / VIEW after it
    has_other_create = re.search(
        r"create(?!.*(?:table|view))", code_only, flags=re.I
    ) is not None
    return looks_like_query and not has_other_create

In [None]:
assert check_sql_query("""
--- Table 1---
creaTe or replace table my_table as
select asdf
from table
where asdf = 1
""".strip())

In [None]:
assert check_sql_query("""
--- Table 1---
creaTe or replace view my_table as
select asdf
from table
where asdf = 1
""".strip())

In [None]:
assert check_sql_query("""
SELECT qwer, asdf
""")

In [None]:
assert not check_sql_query("use database my_database;")

In [None]:
assert not check_sql_query("use database my_database; /* create table */")

In [None]:
assert not check_sql_query("""
create or replace task my_task as
""")

In [None]:
assert not check_sql_query("""
create or replace task my_task as
create or replace table as
select asdf
""")

In [None]:
assert not check_sql_query("""

use schema my_schema;
""")

### Marker to not format queries specified by the user

In [None]:
#export
def check_skip_marker(s):
    """Checks whether user set marker /*skip-formatter*/ to not format query

    Returns True if the literal marker appears anywhere in `s`, else False.
    """
    marker_match = re.search(r"\/\*skip-formatter\*\/", s)
    return marker_match is not None

In [None]:
assert check_skip_marker("""
SELECT asdf,
 qwer,
 /*skip-formatter*/
 asdf2
FRoM table1
""")

In [None]:
assert not check_skip_marker("""
SELECT asdf,
 qwer,
 asdf2
FRoM table1
""")

#### Check lines where CREATE .. TABLE / VIEW appear

This will be used for query split validation. After splitting by individual queries, if the statement CREATE .. TABLE / VIEW appears twice then the user most probably has forgotten a semicolon

In [None]:
#export
def identify_create_table_view(s):
    """Identify positions of CREATE .. TABLE / VIEW statements

    Returns the 1-based line numbers on which CREATE ... TABLE or
    CREATE ... VIEW appears (case-insensitive). The numbering refers to `s`
    after the parts flagged as comment by `split_query` have been removed.
    """
    code_text = "".join(
        part["string"] for part in split_query(s) if not part["comment"]
    )
    line_numbers = []
    for number, line in enumerate(code_text.split("\n"), start=1):
        if re.search("(?:create.*?table|create.*?view)", line, flags=re.I):
            line_numbers.append(number)
    return line_numbers

In [None]:
assert_and_print(
    identify_create_table_view(
"""
cReate or Replace table my_table as
select asdf, qwer
from table1

create table qwerty as
select field, field2
from table2;
"""
    ), [2, 6]
)

[2, 6]


In [None]:
#export
def count_lines(s):
    """Count the number of lines in `s`

    The count is the number of newline characters in `s`, so a string
    without any newline yields 0.
    """
    # splitting on "\n" always gives one more piece than there are newlines
    return len(s.split("\n")) - 1

In [None]:
assert_and_print(
    count_lines(
"""
cReate or Replace table my_table as
select asdf, qwer
from table1

create table qwerty as
select field, field2
from table2;
"""
    ), 8
)

8


In [None]:
#export
def find_line_number(s, positions):
    """Find line number in `s` out of `positions`

    For every character position the 1-based line number is returned, i.e.
    one plus the number of newlines found in `s` before that position.
    """
    line_numbers = []
    for pos in positions:
        newlines_before = s[:pos].count("\n")
        line_numbers.append(newlines_before + 1)
    return line_numbers

In [None]:
assert_and_print(
    find_line_number(
"""
cReate or Replace table my_table as
select asdf, qwer
from table1

create table qwerty as
select field, field2
from table2;
""", [1, 68]),
    [2, 6]
)

[2, 6]


In [None]:
#export
def disimilarity(str1, str2):
    """Calculate disimilarity between two strings by word

    Both strings are split into words on whitespace or comma (empty pieces
    are discarded). The result is the sum, over all words of either string,
    of the absolute difference between the number of occurrences in `str1`
    and in `str2`; 0 means both strings contain the same words equally often.
    """
    splitter = re.compile(r"(?:\s|,)")
    # filter(None, ...) drops the empty strings produced by consecutive separators
    words1 = Counter(filter(None, splitter.split(str1)))
    words2 = Counter(filter(None, splitter.split(str2)))
    # a Counter returns 0 for words it has never seen
    return sum(
        abs(words1[word] - words2[word])
        for word in words1.keys() | words2.keys()
    )

In [None]:
assert_and_print(disimilarity("hello world", "hello world"), 0)

0


In [None]:
assert_and_print(disimilarity("hello world", "hello world!"), 2)

2


In [None]:
#export
def assign_comment(fs, cds):
    """Assign comments in list of dictionaries `cds` to formatted string `fs`

    The comment dictionaries `cds` should contain the keys "comment" and "preceding" (string).

    For each comment, the text preceding it (accumulated over all comments so
    far) is compared via `disimilarity` with the formatted query accumulated
    line by line; the comment is appended to the line with the smallest
    disimilarity (the first such line on ties). In the comment text the
    marker [C] is removed and [CS] is replaced by a newline plus an
    indentation derived from the line following the chosen one.

    Returns the formatted string with the comments inserted.
    """
    fsplit_s = fs.split("\n")
    number_of_lines = len(fsplit_s)
    # define container for output
    fsplit_s_out = fsplit_s.copy()
    # compile regex before loop
    replace_and_or = re.compile(r"\b(?:and|or)\b", flags=re.I)
    replace_c = re.compile(r"\[C\]")
    # raw string: the former non-raw "\[CS\]" relied on invalid escape sequences
    replace_cs = re.compile(r"\[CS\]")
    match_beginn_cs = re.compile(r"^\[CS\]")
    replace_select = re.compile(r"\b(?:select distinct |select )", flags=re.I)
    # the formatted query accumulated line by line does not depend on the
    # comment being processed, so it is computed once instead of per comment
    cum_lines = [
        replace_and_or.sub("", cum.strip())
        for cum in accumulate(fsplit_s, operator.add)
    ]
    cum_preceding = ""
    # loop on comments to be assigned
    for d in cds:
        cum_preceding += d["preceding"]
        cp_list = [disimilarity(cum_line, cum_preceding) for cum_line in cum_lines]
        # get line number with minimal disimilarity (most similar), first one on ties
        line_number = min(range(len(cp_list)), key=cp_list.__getitem__)
        # next line is relevant for indentation of whole line comments;
        # if there is no next line then take the current line
        next_line = fsplit_s[min(line_number + 1, number_of_lines - 1)]
        indentation = len(next_line) - len(replace_select.sub("", next_line.lstrip()))
        # add comment to it and replace [C] by empty string and [CS] by newline + proper indentation
        whitespace = "" if match_beginn_cs.match(d["comment"]) else " "
        fsplit_s_out[line_number] += whitespace + replace_cs.sub(
            "\n" + " " * indentation,
            replace_c.sub("", d["comment"])
        )
    s_out = "\n".join(fsplit_s_out)
    return s_out

In [None]:
assert_and_print(
    assign_comment(
"""
select asdf,
       qwer,
       case when asdf = 1 and
                 asdf = 2 then 2
            when asdf = 3 then 3
            else 0 end as qwerty,
       qwer2
""".strip(),
        [
            {"comment": "/* some comment */[C]", "preceding": "select asdf, qwer, "},
            {"comment": "-- comment there[C]", "preceding": "case when asdf = 1 "},
            {"comment": "-- comment here[C]", "preceding": "and asdf = 2 "},
            {"comment": "/* bla bla */[C]", "preceding": "then 2 when asdf = 3 then 3"}
        ]
    ),
"""
select asdf,
       qwer, /* some comment */
       case when asdf = 1 and -- comment there
                 asdf = 2 then 2 -- comment here
            when asdf = 3 then 3 /* bla bla */
            else 0 end as qwerty,
       qwer2
""".strip()
)

select asdf,
       qwer, /* some comment */
       case when asdf = 1 and -- comment there
                 asdf = 2 then 2 -- comment here
            when asdf = 3 then 3 /* bla bla */
            else 0 end as qwerty,
       qwer2


In [None]:
assert_and_print(
    assign_comment(
"""
select asdf,
       qwer,
       case when asdf = 1 and
                 asdf = 2 then 2
            when asdf = 3 then 3
            else 0 end as qwerty,
       qwer2
""".strip(),
        [
            {"comment": "/* some comment */[C]", "preceding": "select asdf, qwer, "},
            {"comment": "-- comment there[C]", "preceding": "case when asdf = 1"},
            {"comment": "-- comment here[C][CS]/* Whole line comment */[C]", "preceding": "and asdf = 2"},
            {"comment": "/* bla bla */[C]", "preceding": "then 2 when asdf = 3 then 3"}
        ]
    ),
"""
select asdf,
       qwer, /* some comment */
       case when asdf = 1 and -- comment there
                 asdf = 2 then 2 -- comment here
            /* Whole line comment */
            when asdf = 3 then 3 /* bla bla */
            else 0 end as qwerty,
       qwer2
""".strip()
)

select asdf,
       qwer, /* some comment */
       case when asdf = 1 and -- comment there
                 asdf = 2 then 2 -- comment here
            /* Whole line comment */
            when asdf = 3 then 3 /* bla bla */
            else 0 end as qwerty,
       qwer2


In [None]:
assert_and_print(
    assign_comment(
"""
select asdf,
       qwer,
       case when asdf = 1 and
                 asdf = 2 then 2
            when asdf = 3 then 3
            else 0 end as qwerty,
       qwer2
""".strip(),
        [
            {"comment": "/* some comment */[C]", "preceding": "select asdf, qwer, "},
            {"comment": "-- comment there[C]", "preceding": "case when asdf = 1"},
            {"comment": "[CS]/* Whole line comment */[C]", "preceding": "and asdf = 2"},
            {"comment": "/* bla bla */[C]", "preceding": "then 2 when asdf = 3 then 3"}
        ]
    ),
"""
select asdf,
       qwer, /* some comment */
       case when asdf = 1 and -- comment there
                 asdf = 2 then 2
            /* Whole line comment */
            when asdf = 3 then 3 /* bla bla */
            else 0 end as qwerty,
       qwer2
""".strip()
)

select asdf,
       qwer, /* some comment */
       case when asdf = 1 and -- comment there
                 asdf = 2 then 2
            /* Whole line comment */
            when asdf = 3 then 3 /* bla bla */
            else 0 end as qwerty,
       qwer2


In [None]:
assert_and_print(
    assign_comment(
"""
select asdf,
       qwer,
       case when asdf = 1 and
                 asdf = 2 then 2
            when asdf = 3 then 3
            else 0 end as qwerty,
       qwer2
""".strip(),
        [
            {"comment": "[CS]/* Whole line after select */[C]", "preceding": "select asdf,"},
            {"comment": "/* some comment */[C]", "preceding": "qwer, "},
            {"comment": "-- comment there[C]", "preceding": "case when asdf = 1"},
            {"comment": "[CS]/* Whole line comment */[C]", "preceding": "and asdf = 2"},
            {"comment": "/* bla bla */[C]", "preceding": "then 2 when asdf = 3 then 3"},
            {"comment": "[CS]/* another whole line comment */[C]", "preceding": "else 0 end as qwerty,"}
        ]
    ),
"""
select asdf,
       /* Whole line after select */
       qwer, /* some comment */
       case when asdf = 1 and -- comment there
                 asdf = 2 then 2
            /* Whole line comment */
            when asdf = 3 then 3 /* bla bla */
            else 0 end as qwerty,
       /* another whole line comment */
       qwer2
""".strip()
)

select asdf,
       /* Whole line after select */
       qwer, /* some comment */
       case when asdf = 1 and -- comment there
                 asdf = 2 then 2
            /* Whole line comment */
            when asdf = 3 then 3 /* bla bla */
            else 0 end as qwerty,
       /* another whole line comment */
       qwer2


In [None]:
assert_and_print(
    assign_comment(
"""
select distinct asdf,
                qwer,
                case when asdf = 1 and
                          asdf = 2 then 2
                     when asdf = 3 then 3
                     else 0 end as qwerty,
                qwer2
""".strip(),
        [
            {"comment": "[CS]/* Whole line after select */[C]", "preceding": "select distinct asdf,"},
            {"comment": "/* some comment */[C]", "preceding": "qwer, "},
            {"comment": "-- comment there[C]", "preceding": "case when asdf = 1"},
            {"comment": "[CS]/* Whole line comment */[C]", "preceding": "and asdf = 2"},
            {"comment": "/* bla bla */[C]", "preceding": "then 2 when asdf = 3 then 3"}
        ]
    ),
"""
select distinct asdf,
                /* Whole line after select */
                qwer, /* some comment */
                case when asdf = 1 and -- comment there
                          asdf = 2 then 2
                     /* Whole line comment */
                     when asdf = 3 then 3 /* bla bla */
                     else 0 end as qwerty,
                qwer2
""".strip()
)

select distinct asdf,
                /* Whole line after select */
                qwer, /* some comment */
                case when asdf = 1 and -- comment there
                          asdf = 2 then 2
                     /* Whole line comment */
                     when asdf = 3 then 3 /* bla bla */
                     else 0 end as qwerty,
                qwer2


In [None]:
assert_and_print(
    assign_comment(
        "select asdf,\n       antworqwer",
        [{'comment': '-- Some comment[C]', 'preceding': 'select asdf, '},
         {'comment': '-- Another comment', 'preceding': 'antworqwer '}]
    ),
"""
select asdf, -- Some comment
       antworqwer -- Another comment
""".strip()
)


select asdf, -- Some comment
       antworqwer -- Another comment


In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_format_file.ipynb.
Converted 02_utils.ipynb.
Converted 03_validation.ipynb.
Converted index.ipynb.
