In [None]:
import json
import time
import os
import re
import shlex
from dataclasses import dataclass
from datetime import datetime
from configparser import ConfigParser

import pandas as pd
import psycopg

In [None]:
#loading db config
def load_config(filename="database.ini", section="postgresql"):
    """Read one section of an INI file into a plain dict.

    Args:
        filename: path of the INI file to read.
        section: name of the section whose key/value pairs are returned.

    Returns:
        dict mapping each option name of `section` to its string value.

    Raises:
        FileNotFoundError: if `filename` could not be found or opened.
        Exception: if the file was read but has no `section`.
    """
    parser = ConfigParser()

    # ConfigParser.read() skips files it cannot open and returns the list of
    # files it actually parsed, so an empty list means nothing was loaded.
    # Without this check a missing file is reported as a missing section.
    if not parser.read(filename):
        raise FileNotFoundError("Config file {0} could not be found or opened".format(filename))

    if not parser.has_section(section):
        raise Exception("Section {0} not found in the {1} file".format(section, filename))

    return dict(parser.items(section))

#connecting to db
try:
    pg_conn = psycopg.connect(**load_config())
    cursor = pg_conn.cursor()
except Exception as error:
    # psycopg.DatabaseError is itself an Exception, so one clause covers both.
    # Every later cell needs `cursor`; show the reason and stop here instead of
    # letting the notebook continue and fail later with a NameError.
    print(error)
    raise

In [None]:
#formatting query
def format_query(query):
    """Normalise SQL text into a single line ready for benchmarking.

    - `--` line comments and `/* */` block comments are removed.
    - Runs of whitespace are collapsed to one space.
    - Single- and double-quoted literals (with doubled-quote escapes) are kept
      byte-for-byte, including any whitespace, `--` or `/*` inside them.
    - An EXPLAIN prefix at the start of a statement (start of text or right
      after a ";") is dropped, with its optional ANALYZE / VERBOSE keywords or
      parenthesised option list.

    Dollar-quoted strings and backslash escapes are not recognised.

    Args:
        query: raw SQL text, possibly holding several ";"-separated statements.

    Returns:
        The normalised SQL string, without leading or trailing whitespace.
    """
    # Single left-to-right pass: a quoted literal is matched first and kept
    # verbatim; any run of whitespace and comments becomes exactly one space.
    # Doing both in one pattern keeps comment markers inside literals intact.
    literal_or_gap = re.compile(
        r"""('(?:[^']|'')*'|"(?:[^"]|"")*")|(?:\s+|--[^\n]*|/\*.*?\*/)+""",
        flags = re.DOTALL,
    )
    query = literal_or_gap.sub(lambda match: match.group(1) or " ", query)

    # [^)]* stops at the first ")" so the option list cannot swallow part of
    # the statement itself (e.g. the "count(*)" of a SELECT).
    explain_prefix = re.compile(
        r"(^|;)\s*EXPLAIN\b(?:\s*\([^)]*\)|(?:\s+ANALYZE\b)?(?:\s+VERBOSE\b)?)\s*",
        flags = re.IGNORECASE,
    )
    query = explain_prefix.sub(r"\1 ", query)

    return query.strip()

queries = []

#loading all queries
for root, dirs, files in os.walk(os.curdir):
    # os.walk yields entries in filesystem order; sorting in place fixes the
    # traversal order so the benchmark runs in the same order on every machine
    dirs.sort()

    for file in sorted(files):
        if not file.endswith(".sql"):
            continue

        path = os.path.join(root, file)

        # context manager closes the handle even if reading or formatting fails
        with open(path, "r") as sql_file:
            query = format_query(sql_file.read())

        # [label, query] pairs; the label is the bare file name, not the path
        queries.append([file, query])
        print([file, query])

In [None]:
#execution settings
# number of times run_query executes each timed statement; only the fastest run is kept
precache_repeats = 1
# number of run_query calls per loaded query; each call adds one row to `results`
query_repeats = 3

In [None]:
@dataclass
class QueryResult:
    """One timed run of a statement, as built by run_query."""

    # label of the query the run belongs to (the .sql file name in this notebook)
    label: str
    # full query text that was passed to run_query
    query: str
    # datetime.now() taken right before the statement was sent
    bench_time: datetime
    # json.dumps() of the rows fetched for the EXPLAIN statement, i.e. JSON text
    result_set: str
    # difference of two time.perf_counter_ns() readings, i.e. integer nanoseconds
    exec_time: int

#executes all statements in query and returns the fastest timed run of the last timed statement
def run_query(query, precache_repeats = 1, label = None):
    """Execute every statement of `query` and return the fastest timed run.

    The text is split on ";". Statements starting with CREATE, REPLACE, REFRESH
    or DROP are executed as-is and are not timed. Every other statement is run
    `precache_repeats` times through the module-level `cursor`, wrapped in
    EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON, SETTINGS), and the run with the
    smallest client-side elapsed time is kept. When a query holds several timed
    statements, the result of the last one is returned.

    Args:
        query: SQL text, possibly holding several ";"-separated statements.
        precache_repeats: how many times each timed statement is executed.
        label: name stored on the result. When omitted, the module-level
            `label` set by the driver loop is used, as before.

    Returns:
        QueryResult of the fastest run; exec_time is in nanoseconds.

    Raises:
        ValueError: if no statement was timed (only setup statements, empty
            text, or precache_repeats < 1).
    """
    if label is None:
        # Backward compatibility: existing callers do not pass a label and rely
        # on the module-level variable of the same name.
        label = globals().get("label")

    best_result = None

    # NOTE: a plain split also cuts at ";" inside string literals.
    for statement in query.split(";"):
        statement = statement.strip()

        if not statement:
            continue

        # setup/teardown statements are executed once and never timed
        if statement.upper().startswith(("CREATE", "REPLACE", "REFRESH", "DROP")):
            cursor.execute(statement)
            continue

        statement_best = None

        for _ in range(precache_repeats):
            bench_start, query_start = datetime.now(), time.perf_counter_ns()
            cursor.execute("EXPLAIN (ANALYZE, BUFFERS, FORMAT JSON, SETTINGS) " + statement)

            # fetching is part of the measured interval
            result_set = cursor.fetchall()
            query_end = time.perf_counter_ns()

            result = QueryResult(
                label = label,
                query = query,
                bench_time = bench_start,
                result_set = json.dumps(result_set),
                exec_time = query_end - query_start
            )

            if statement_best is None or result.exec_time < statement_best.exec_time:
                statement_best = result

        # a later timed statement replaces the result of an earlier one
        if statement_best is not None:
            best_result = statement_best

    if best_result is None:
        raise ValueError("query contains no statement to benchmark: {0!r}".format(query))

    return best_result

#executing queries
# One entry per run_query call, i.e. query_repeats entries for every loaded query.
results: list[QueryResult] = []

# The loop variable has to stay named `label`: run_query reads it from module scope.
for label, query in queries:

    print(label)

    results.extend(
        run_query(query, precache_repeats)
        for _ in range(query_repeats)
    )

In [None]:
#saving results
# one row per QueryResult, one column per dataclass field
df = pd.DataFrame(data = results)
df.to_csv(path_or_buf = "results.csv", index = False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#plotting results
# set_theme() applies its own context and palette defaults, so separate
# set_context()/set_palette() calls made before it are overwritten; pass
# everything in one call instead.
sns.set_theme(context = "talk", style = "whitegrid", palette = "viridis")

# exec_time is stored in nanoseconds. Convert into a separate frame so that
# re-running this cell does not divide the values in `df` a second time.
plot_df = df.assign(exec_time = df["exec_time"] / 1000000)

fig, ax = plt.subplots(figsize = (10, 10))

g = sns.barplot(
    data = plot_df,
    x = "exec_time",
    y = "label",
    ax = ax
)

g.set(
    title = "Query runtimes",
    xlabel = "Execution time [ms]",
    ylabel = "Query"
)

fig.tight_layout()

# render the figure instead of printing the Axes repr
plt.show()