<a href="https://colab.research.google.com/github/Shalala06/SQL-Formatting-Python/blob/seb/SQL-Formatting-Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
def format_sql(script):
    """Reformat a SQL script: upper-case keywords, split clauses onto lines, indent subqueries.

    The script is first collapsed onto a single line, then rebuilt with a
    series of regex substitutions, and finally every nested ``SELECT`` is
    prefixed with an indent marker proportional to its parenthesis depth.

    Args:
        script: The SQL text to format.

    Returns:
        The reformatted SQL text. Scripts that contain no ``SELECT`` are
        returned after the keyword/line-break pass only (previously this
        case raised ``IndexError``).
    """
    # Defining key words
    keywords = [
        'SELECT', 'FROM', 'WHERE', 'GROUP BY', 'ORDER BY', 'HAVING', 'JOIN', 'INNER JOIN',
        'LEFT JOIN', 'RIGHT JOIN', 'OUTER JOIN', 'INSERT INTO', 'UPDATE', 'SET', 'DELETE',
        'ALTER', 'DROP TABLE', 'PARTITION BY', 'OVER', 'ROW_NUMBER', 'IS', 'IS NOT', 'NULL',
        'CASE', 'WHEN', 'THEN', 'END', 'ON', 'AND', 'OR', 'AS', 'CREATE TABLE'
    ]
    # Keywords that start a new clause: placed on their own line, with the
    # clause body pushed onto the following line.
    primary_keywords = [
        'SELECT', 'FROM', 'WHERE', 'GROUP BY', 'ORDER BY', 'HAVING', 'LEFT JOIN', 'RIGHT JOIN',
        'OUTER JOIN', 'UPDATE', 'SET', 'DELETE', 'ALTER'
    ]
    join_keywords = ['INNER JOIN', 'LEFT JOIN', 'RIGHT JOIN', 'OUTER JOIN']

    # NOTE(review): ')' is wrapped in \b...\b below, so it only matches a ')'
    # that sits directly between two word characters; the unconditional
    # r'\)' substitution further down is what actually breaks before ')'.
    special_keywords = ['AND', 'ON', ')']

    # Normalize the script into one line
    script = ' '.join(script.split())

    # Capitalize keywords
    def capitalize_keywords(match):
        return match.group(0).upper()

    for keyword in keywords:
        pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
        script = pattern.sub(capitalize_keywords, script)

    # Handle DROP TABLE statements
    script = re.sub(r'\bDROP TABLE\b', '\n  DROP TABLE', script, flags=re.IGNORECASE)
    # Handle CREATE TABLE statements
    script = re.sub(r'\bCREATE TABLE\b', '\n\nCREATE TABLE', script, flags=re.IGNORECASE)

    # Drop down and indent primary keywords
    for keyword in primary_keywords:
        script = re.sub(r'\b' + re.escape(keyword) + r'\b(?!\s*\n\s*)', f'\n\n{keyword}\n  ', script, flags=re.IGNORECASE)

    # Indent JOIN keywords
    for join_keyword in join_keywords:
        script = re.sub(r'\b' + re.escape(join_keyword) + r'\b', '\n\n  ' + join_keyword, script, flags=re.IGNORECASE)

    # Indent Special keywords
    for keyword in special_keywords:
        script = re.sub(r'\b(?!\s*\n\s*)' + re.escape(keyword) + r'\b(?!\s*\n\s*)', '\n  ' + keyword, script, flags=re.IGNORECASE)

    # Every closing parenthesis starts a new line
    script = re.sub(r'\)', '\n)', script)
    # New line and comma for multiple columns in clauses
    script = re.sub(r'\s*,\s*', '\n   , ', script)

    def subquery_position(script):
        """Locate every SELECT and the span that runs to its closing parenthesis.

        Returns four parallel lists, one entry per SELECT in order of
        appearance: the nesting level (open minus close parentheses seen
        before the SELECT), the start index (the 'S' of SELECT), the end
        index (one past the ')' that closes the enclosing parenthesis, or
        the end of the script if there is none), and the text between them.
        """
        sq_level_list = []
        sq_start = []
        sq_end = []
        sq = []

        # Identifies positions of each SELECT in script
        select_positions = [m.start() for m in re.finditer(r'\bSELECT\b', script, re.IGNORECASE)]

        for subquery_start in select_positions:
            # Nesting level = unmatched '(' before this SELECT
            before_select = script[:subquery_start]
            sq_level_list.append(before_select.count('(') - before_select.count(')'))

            # Walk forward until one more ')' than '(' has been seen: that
            # ')' closes the parenthesis the SELECT lives in.
            depth = 0
            for i, char in enumerate(script[subquery_start:]):
                if char == '(':
                    depth += 1
                elif char == ')':
                    depth -= 1
                if depth == -1:
                    break
            subquery_end = i + 1 + subquery_start  # end position is one past the last character

            sq_start.append(subquery_start)
            sq_end.append(subquery_end)
            sq.append(script[subquery_start:subquery_end])

        return sq_level_list, sq_start, sq_end, sq

    def indent_subqueries(script):
        """Prefix the lines of each nested subquery with an indent marker per level."""
        subquery_level, sq_start, sq_end, sq = subquery_position(script)

        # Nothing to indent when the script has no SELECT at all (e.g. a bare
        # DROP TABLE or UPDATE); without this guard `del sq[0]` below raised
        # IndexError.
        if not sq:
            return script

        subquery_list = []
        subquery_indent_list = []
        end_subqueries = []
        end_subquery_indent_list = []
        end_subquery_list = []

        select_positions = [m.start() for m in re.finditer(r'\bSELECT\b', script, re.IGNORECASE)]

        # Text between the end of the next SELECT's span and the end of the
        # current one, i.e. what trails a nested subquery inside its parent.
        for i in range(len(sq_end) - 1):
            end_subqueries.append(script[sq_end[i + 1] + 1:sq_end[i] + 1])

        # Adjusting subqueries - when a subquery contains the next one, trim
        # the outer one down to the part that precedes the nested SELECT
        # (stopping 2 characters before it). Index 0 is the entire script and
        # the original code also skipped index 1, so start at 2.
        for idx in range(2, len(sq)):
            if sq[idx] in sq[idx - 1]:
                prev_subquery_start = select_positions[idx - 1]
                sq[idx - 1] = script[prev_subquery_start:select_positions[idx] - 2]

        # The outermost query is never indented
        del sq[0]
        del subquery_level[0]

        # Indenting adjusted sq list
        # NOTE(review): the indent unit is the letter 's', not a space, even
        # though the intent reads as "4 spaces per level" - presumably a
        # visible debugging marker; confirm before switching to ' '.
        for index, sub in enumerate(sq):
            prefix = 's' * (subquery_level[index] * 4)

            sub_lines = sub.split('\n')
            subquery_list.append(''.join('\n' + line for line in sub_lines))
            subquery_indent_list.append(''.join('\n' + prefix + line for line in sub_lines))

            end_lines = end_subqueries[index].split('\n')
            end_subquery_list.append(''.join('\n' + line for line in end_lines))
            end_subquery_indent_list.append(''.join('\n' + prefix + line for line in end_lines))

        # Swap each plain fragment for its indented counterpart, in order
        for plain, indented, end_plain, end_indented in zip(
            subquery_list, subquery_indent_list, end_subquery_list, end_subquery_indent_list
        ):
            script = script.replace(plain, indented)
            script = script.replace(end_plain, end_indented)

        return script

    return indent_subqueries(script)

# Demo input: a query whose FROM clause contains four levels of nested
# INNER JOIN subqueries, used to exercise the subquery indentation.
sql_script = """
SELECT
    customers.customer_id,
    customers.customer_name,
    orders.order_id,
    orders.order_date,
    order_details.quantity,
    products.product_name
FROM
    customers
    INNER JOIN (
        SELECT
            orders.order_id,
            orders.attribute_id
        FROM
            orders
            INNER JOIN (
                SELECT
                    attribute_id
                FROM
                    product_attribute
                INNER JOIN (
                    SELECT
                        attribute
                        , row_number() over (partition by person_id order by order) as rn
                    FROM
                        attribute
                    INNER JOIN (
                        SELECT
                            id
                        FROM
                            product
                    ) AS product4
                    ON product_attribute.attribute_id = product4.attribute_id AND order.order_id IS NOT NULL
                ) AS product3
                ON product_attribute.attribute_id = product3.attribute_id
            WHERE order.product IS NULL) AS product2
            ON orders.attribute_id = product2.attribute_id
        WHERE
            orders.order_id IS NOT NULL
    ) AS orders
    ON customers.order_id = orders.order_id
ORDER BY
    product.product_id;
"""
# Format the demo query and print the result for visual inspection.
formatted_sql = format_sql(sql_script)
print(formatted_sql)



SELECT
   customers.customer_id
   , customers.customer_name
   , orders.order_id
   , orders.order_date
   , order_details.quantity
   , products.product_name 

FROM
   customers 

  INNER JOIN ( 

ssssSELECT
ssss   orders.order_id
ssss   , orders.attribute_id 
ssss
ssssFROM
ssss   orders 
ssss
ssss  INNER JOIN ( 

ssssssssSELECT
ssssssss   attribute_id 
ssssssss
ssssssssFROM
ssssssss   product_attribute 
ssssssss
ssssssss  INNER JOIN ( 

ssssssssssssSELECT
ssssssssssss   attribute
ssssssssssss   , ROW_NUMBER(
ssssssssssss) OVER (PARTITION BY person_id 
ssssssssssss
ssssssssssssORDER BY
ssssssssssss   order
ssssssssssss) AS rn 
ssssssssssss
ssssssssssssFROM
ssssssssssss   attribute 
ssssssssssss
ssssssssssss  INNER JOIN ( 

ssssssssssssssssSELECT
ssssssssssssssss   id 
ssssssssssssssss
ssssssssssssssssFROM
ssssssssssssssss   product 
ssssssssssssssss) AS product4 
  ON product_attribute.attribute_id = product4.attribute_id 
  AND order.order_id IS NOT NULL 
) AS product3 
  ON produ

In [60]:
import re
def format_sql(script):
    """Reformat a SQL script: upper-case keywords, split clauses onto lines, indent subqueries.

    The script is first collapsed onto a single line, then rebuilt with a
    series of regex substitutions. SELECTs are then grouped into blocks
    (a new block starts at every SELECT with nesting level 1) and each
    block's subqueries are prefixed with an indent marker proportional to
    their parenthesis depth.

    Args:
        script: The SQL text to format.

    Returns:
        The reformatted SQL text. Scripts that contain no ``SELECT`` are
        returned after the keyword/line-break pass only (previously this
        case raised ``IndexError``).
    """
    # Defining key words
    keywords = [
        'SELECT', 'FROM', 'WHERE', 'GROUP BY', 'ORDER BY', 'HAVING', 'JOIN', 'INNER JOIN',
        'LEFT JOIN', 'RIGHT JOIN', 'OUTER JOIN', 'INSERT INTO', 'UPDATE', 'SET', 'DELETE',
        'ALTER', 'DROP TABLE', 'PARTITION BY', 'OVER', 'ROW_NUMBER', 'IS', 'IS NOT', 'NULL',
        'CASE', 'WHEN', 'THEN', 'END', 'ON', 'AND', 'OR', 'AS', 'CREATE TABLE'
    ]
    # Keywords that start a new clause: placed on their own line, with the
    # clause body pushed onto the following line.
    primary_keywords = [
        'SELECT', 'FROM', 'WHERE', 'GROUP BY', 'ORDER BY', 'HAVING', 'LEFT JOIN', 'RIGHT JOIN',
        'OUTER JOIN', 'UPDATE', 'SET', 'DELETE', 'ALTER'
    ]
    join_keywords = ['INNER JOIN', 'LEFT JOIN', 'RIGHT JOIN', 'OUTER JOIN']

    # NOTE(review): ')' is wrapped in \b...\b below, so it only matches a ')'
    # that sits directly between two word characters; the unconditional
    # r'\)' substitution further down is what actually breaks before ')'.
    special_keywords = ['AND', 'ON', ')']

    # Normalize the script into one line
    script = ' '.join(script.split())

    # Capitalize keywords
    def capitalize_keywords(match):
        return match.group(0).upper()

    for keyword in keywords:
        pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
        script = pattern.sub(capitalize_keywords, script)

    # Handle DROP TABLE statements
    script = re.sub(r'\bDROP TABLE\b', '\n  DROP TABLE', script, flags=re.IGNORECASE)
    # Handle CREATE TABLE statements
    script = re.sub(r'\bCREATE TABLE\b', '\n\nCREATE TABLE', script, flags=re.IGNORECASE)

    # Drop down and indent primary keywords
    for keyword in primary_keywords:
        script = re.sub(r'\b' + re.escape(keyword) + r'\b(?!\s*\n\s*)', f'\n\n{keyword}\n  ', script, flags=re.IGNORECASE)

    # Indent JOIN keywords
    for join_keyword in join_keywords:
        script = re.sub(r'\b' + re.escape(join_keyword) + r'\b', '\n\n  ' + join_keyword, script, flags=re.IGNORECASE)

    # Indent Special keywords
    for keyword in special_keywords:
        script = re.sub(r'\b(?!\s*\n\s*)' + re.escape(keyword) + r'\b(?!\s*\n\s*)', '\n  ' + keyword, script, flags=re.IGNORECASE)

    # Every closing parenthesis starts a new line
    script = re.sub(r'\)', '\n)', script)

    # New line and comma for multiple columns in clauses
    script = re.sub(r'\s*,\s*', '\n   , ', script)

    def subquery_position(script):
        """Locate every SELECT and the span that runs to its closing parenthesis.

        Returns a list with one ``[level, start, end, text]`` entry per
        SELECT in order of appearance: the nesting level (open minus close
        parentheses seen before the SELECT), the start index (the 'S' of
        SELECT), the end index (one past the ')' that closes the enclosing
        parenthesis, or the end of the script if there is none), and the
        text between start and end.
        """
        sq_element_list = []

        # Identifies positions of each SELECT in script
        select_positions = [m.start() for m in re.finditer(r'\bSELECT\b', script, re.IGNORECASE)]

        for subquery_start in select_positions:
            # Nesting level = unmatched '(' before this SELECT
            before_select = script[:subquery_start]
            subquery_level = before_select.count('(') - before_select.count(')')

            # Walk forward until one more ')' than '(' has been seen: that
            # ')' closes the parenthesis the SELECT lives in.
            depth = 0
            for i, char in enumerate(script[subquery_start:]):
                if char == '(':
                    depth += 1
                elif char == ')':
                    depth -= 1
                if depth == -1:
                    break
            subquery_end = i + 1 + subquery_start  # end position is one past the last character

            subquery = script[subquery_start:subquery_end]
            sq_element_list.append([subquery_level, subquery_start, subquery_end, subquery])
        return sq_element_list

    def indent_subqueries(script):
        """Prefix the lines of each nested subquery with an indent marker per level."""
        subquery_list = []
        subquery_indent_list = []
        end_subqueries = []
        end_subquery_indent_list = []
        end_subquery_list = []
        current_level_list = []
        split_level_list = []

        # Field positions inside each [level, start, end, text] entry
        sq_level = 0
        sq_start = 1
        sq_end = 2
        sq = 3

        sq_elements = subquery_position(script)

        # Nothing to indent when the script has no SELECT at all (e.g. a bare
        # DROP TABLE or UPDATE); without this guard `del split_level_list[0]`
        # below raised IndexError.
        if not sq_elements:
            return script

        select_positions = [m.start() for m in re.finditer(r'\bSELECT\b', script, re.IGNORECASE)]

        # Extracting subquery blocks: a level-1 SELECT closes the block being
        # collected and opens a new one; every SELECT joins the current block.
        for elements in sq_elements:
            if elements[sq_level] == 1 and current_level_list:
                split_level_list.append(current_level_list)
                current_level_list = []
            current_level_list.append(elements)

        if current_level_list:
            split_level_list.append(current_level_list)

        # Drop the first block (for the usual input this is the outermost
        # query, which is not indented).
        del split_level_list[0]

        for block in split_level_list:
            block_end = block[0][sq_end]

            # Collect the lines that follow the block's closing ')' up to the
            # first empty line; `pos` ends on the last collected character.
            # NOTE(review): if the very first line is empty, `pos` is not
            # assigned in this iteration (NameError on the first block, stale
            # value afterwards) - confirm whether that can occur.
            string = ""
            for line in script[block_end:].split('\n'):
                if line == "":
                    break
                string += '\n' + line
                pos = block_end + len(string) - 1

            # Extracting end subqueries within subquery blocks
            end_subqueries.append(script[block_end:pos])

            for i in range(len(block) - 1):
                end_subqueries.append(script[block[i + 1][sq_end]:block[i][sq_end]])

            # Adjusting subqueries - when a subquery is contained in the
            # previous one, trim the previous one down to the part before the
            # nested SELECT (stopping 2 characters before it).
            # NOTE(review): at idx 0, `idx - 1` wraps to the last entry of the
            # block, and `select_positions[idx]` is indexed with the
            # block-local idx although it lists every SELECT in the script -
            # both look unintended for blocks after the first; verify.
            for idx in range(len(block)):
                if block[idx][sq] in block[idx - 1][sq]:
                    prev_subquery_start = block[idx - 1][sq_start]
                    adjusted_sub = script[prev_subquery_start:select_positions[idx] - 2]
                    block[idx - 1][sq] = adjusted_sub

            # Indenting adjusted sq list
            # NOTE(review): the indent unit is the letter 's', not a space,
            # even though the intent reads as "2 spaces per level" -
            # presumably a visible debugging marker; confirm before changing.
            # NOTE(review): `end_subqueries` accumulates across blocks but is
            # read with the block-local `index`; verify for multi-block input.
            for index, element in enumerate(block):
                prefix = 's' * (element[sq_level] * 2)

                line_indented_list = ""
                for line in element[sq].split('\n'):
                    line_indented_list += '\n' + prefix + line
                subquery_indent_list.append(line_indented_list)
                subquery_list.append(element[sq])

                end_line_indented_list = ""
                for line in end_subqueries[index].split('\n'):
                    end_line_indented_list += '\n' + prefix + line
                end_subquery_indent_list.append(end_line_indented_list)
                end_subquery_list.append(end_subqueries[index])

            # Swap each plain fragment for its indented counterpart.
            # NOTE(review): this runs once per block over the lists
            # accumulated so far, and rewrites `script` while later blocks
            # still use offsets computed on the un-indented text; verify.
            for i in range(len(subquery_list)):
                script = script.replace(subquery_list[i], subquery_indent_list[i])
                script = script.replace(end_subquery_list[i], end_subquery_indent_list[i])

        return script

    return indent_subqueries(script)

# Demo input: a query that joins two separate top-level subquery blocks
# (the first nested four levels deep, the second three), used to exercise
# the per-block subquery indentation.
sql_script = """
SELECT
    customers.customer_id,
    customers.customer_name,
    orders.order_id,
    orders.order_date,
    order_details.quantity,
    products.product_name
FROM
    customers
    INNER JOIN (
        SELECT
            orders.order_id,
            orders.attribute_id
        FROM
            orders
            INNER JOIN (
                SELECT
                    attribute_id
                FROM
                    product_attribute
                INNER JOIN (
                    SELECT
                        attribute
                        , row_number() over (partition by person_id order by order) as rn
                    FROM
                        attribute
                    INNER JOIN (
                        SELECT
                            id
                        FROM
                            product
                    ) AS product4
                    ON product_attribute.attribute_id = product4.attribute_id AND order.order_id IS NOT NULL
                ) AS product3
                ON product_attribute.attribute_id = product3.attribute_id
            WHERE order.product IS NULL) AS product2
            ON orders.attribute_id = product2.attribute_id
        WHERE
            orders.order_id IS NOT NULL
    ) AS product1
    ON customers.order_id = orders.order_id
    AND customers.customer_id = product1.customer_id


INNER JOIN (
        SELECT
            o.order_id,
            o.customer_id
        FROM
            orders o
            INNER JOIN (
                SELECT
                    pa.attribute_id
                    , row_number() over (partition by person_id order by order ) as rn
                FROM
                    product_attribute pa

                    INNER JOIN (
                        SELECT
                            pa.house_number
                            , milk_id
                        FROM
                            Local.cornershop
                        ) AS local_shop ON local_shop.milk_id = pa.meowmeow
            ) AS filtered_products ON o.attribute_id = filtered_products.attribute_id
        WHERE
            o.order_id IS NOT NULL
    ) AS orders
ORDER BY
    product.product_id;
"""
# Format the demo query and print the result for visual inspection.
formatted_sql = format_sql(sql_script)
print(formatted_sql)