SELECT–FROM–WHERE–GROUP BY

In [1]:
from sqlalchemy import create_engine, text
import pandas as pd
from pathlib import Path
import re
from typing import List, Set, Tuple, Dict, Optional, Any
import os, uuid
import numpy as np

from sqlalchemy.orm.base import PASSIVE_OFF


In [2]:
# Connection settings. Each value can be overridden through the standard libpq
# environment variables; the fallbacks are the original local-development values,
# so the notebook behaves as before when nothing is exported.
USER = os.environ.get("PGUSER", "postgres")
HOST = os.environ.get("PGHOST", "localhost")
PORT = os.environ.get("PGPORT", "5432")
# NOTE(security): the fallback password is only suitable for a throw-away local
# instance; export PGPASSWORD instead of editing this cell for anything else.
PASSWORD = os.environ.get("PGPASSWORD", "user")

# Admin engine bound to the maintenance database, in AUTOCOMMIT mode.
DB_ADMIN_URL = f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/postgres"
engine_admin = create_engine(DB_ADMIN_URL, isolation_level="AUTOCOMMIT")

# Working engine bound to the `synthea` database; used by run() below.
DB_URL = f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/synthea"
engine = create_engine(DB_URL)
# NOTE(review): create_engine() does not open a connection, so this message is
# printed even if the server is unreachable; the first run() call is the real check.
print("Connesso al database synthea")

Connesso al database synthea


In [3]:
def run(sql_or_text, show=False):
    """Execute one statement on the module-level `engine` inside a transaction.

    Args:
        sql_or_text: a SQL string (wrapped with `text`) or an already built
            executable object, which is passed to `execute` unchanged.
        show: when True and the statement returns rows, the resulting frame is
            also rendered through `display` (expected to be the notebook builtin).

    Returns:
        A DataFrame holding every fetched row when the statement returns rows,
        otherwise None.
    """
    with engine.begin() as connection:
        statement = sql_or_text
        if isinstance(statement, str):
            statement = text(statement)
        outcome = connection.execute(statement)
        if not outcome.returns_rows:
            return None
        frame = pd.DataFrame(outcome.fetchall(), columns=outcome.keys())
        if show:
            display(frame)
        return frame


def _strip_semicolon(sql: str) -> str:
    return re.sub(r';\s*$', '', sql.strip())


def _count_table(tname: str) -> int:
    """Return the number of rows of table `tname` as reported by COUNT(*)."""
    # tname is interpolated verbatim: callers must pass a trusted identifier.
    counted = run(f"SELECT COUNT(*) AS n FROM {tname};")
    first_row = counted.iloc[0]
    return int(first_row["n"])


def _size_table(tname: str) -> int:
    """Return `pg_total_relation_size` of table `tname`, in bytes."""
    # tname is interpolated verbatim: callers must pass a trusted identifier.
    measured = run(f"SELECT pg_total_relation_size('{tname}') AS bytes;")
    first_row = measured.iloc[0]
    return int(first_row["bytes"])


def _network_bytes(strategy_key: str, sizes: dict) -> int:
    if strategy_key == "owner-server":
        return sizes.get("ro", 0) + sizes.get("rs", 0)
    if strategy_key == "server-owner":
        return sizes.get("rs", 0)
    if strategy_key == "owner-only":
        return 0
    if strategy_key == "server-only":
        return sizes.get("out", 0)
    if strategy_key == "parallel":
        return sizes.get("ro", 0) + sizes.get("rs", 0)
    return 0


def _unqualify(tok: str) -> str:
    tok = tok.strip().strip('"')
    return tok.split('.')[-1].lower() if '.' in tok else tok.lower()


def _split_outside_parents(s: str) -> List[str]:
    items, buf, d = [], [], 0
    for ch in s:
        if ch == '(':
            d += 1
        elif ch == ')':
            d = max(0, d - 1)
        if ch == ',' and d == 0:
            items.append(''.join(buf).strip())
            buf = []
        else:
            buf.append(ch)
    if buf:
        items.append(''.join(buf).strip())
    return items

FRAMMENTAZIONE VERTICALE

# PATIENTS
Owner(PATIENTS) = { id, birthdate, ssn, drivers, passport, first, middle, last, maiden, address, city, fips, zip, lat, lon, income }


Server(PATIENTS) = { id, deathdate, gender, race, ethnicity, marital, prefix, suffix, birthplace, state, county, healthcare_expenses, healthcare_coverage }

In [4]:
# Run this cell only when the fragment tables have to be (re)created.

# Read the DDL script with a context manager so the file handle is always closed.
with open("sql/fragmentPatients.sql") as sql_file:
    sql = sql_file.read()

with engine.begin() as conn:
    if sql.strip():
        conn.execute(text(sql))
        print("Frammentazione creata")
    else:
        # The script file is empty: nothing was executed.
        print("Errore")

# Alternatively, the script can simply be passed to run(sql).

Frammentazione creata


In [5]:
# Run ANALYZE on the owner-side and server-side fragment tables (both statements
# are sent in a single run() call; nothing is returned because they yield no rows).
run(''' ANALYZE owner.patients_owner; ANALYZE server.patients_server;''')

In [44]:
# Column names of the PATIENTS fragment kept on the Owner side.
# "id" appears in both fragments and is the column the fragments are joined on.
Fo = {
    "id", "birthdate", "ssn", "drivers", "passport",
    "first", "middle", "last", "maiden",
    "address", "city", "fips", "zip", "lat", "lon",
    "income"
}

# Column names of the PATIENTS fragment kept on the Server side.
Fs = {
    "id", "deathdate", "gender", "race", "ethnicity", "marital",
    "prefix", "suffix", "birthplace", "state", "county",
    "healthcare_expenses", "healthcare_coverage"
}


In [45]:

def parse_query_groupby(query: str) -> Tuple[Set[str], Set[str], List[Dict[str, Any]]]:
    """
    Parse a ``SELECT ... FROM ... [GROUP BY ...]`` query that has no WHERE clause.

    The SELECT list is split into plain (non-aggregated) columns and aggregate
    calls of the form ``count|sum|avg|min|max([DISTINCT] col | *) [AS alias]``.
    Everything after ``GROUP BY`` up to an optional trailing ``;`` is taken as
    the comma-separated list of grouping columns.

    Args:
        query: the SQL text to analyse.

    Returns:
        ``(select_plain, group_by, aggs)`` where
        - ``select_plain`` is the set of unqualified, lower-cased plain columns,
        - ``group_by`` is the set of unqualified, lower-cased grouping columns,
        - ``aggs`` is a list of dicts with keys ``func`` (lower-case name),
          ``arg`` (column name, or None for ``*``), ``distinct`` (bool) and
          ``alias`` (lower-case alias or None).

    Raises:
        ValueError: if ``SELECT ... FROM`` is missing, if a WHERE clause is
            present, or if a plain SELECT column is not listed in GROUP BY.
    """
    q = query.strip()
    m_sel = re.search(r"\bselect\s+(.*?)\s+from\b", q, re.I | re.S)
    if not m_sel:
        raise ValueError("SELECT ... FROM mancante.")
    sel_txt = m_sel.group(1)
    rest = q[m_sel.end():]

    # This planner only covers queries without WHERE: reject it explicitly
    # instead of silently dropping the filter from the generated sub-queries.
    if re.search(r"\bwhere\b", rest, re.I):
        raise ValueError("Clausola WHERE non supportata: la query deve essere senza WHERE.")

    m_gb = re.search(r"\bgroup\s+by\b", rest, re.I)

    group_by_txt = None
    if m_gb:
        group_by_txt = rest[m_gb.end():].strip()
        group_by_txt = re.sub(r';\s*$', '', group_by_txt, flags=re.S)

    # SELECT list: separate plain columns from aggregate calls.
    select_items = _split_outside_parents(sel_txt)
    select_plain: Set[str] = set()
    aggs: List[Dict[str, Any]] = []

    agg_re = re.compile(
        r"^(count|sum|avg|min|max)\s*\(\s*(distinct\s+)?(\*|[a-zA-Z_][\w\.]*)\s*\)\s*(?:as\s+([a-zA-Z_]\w*))?$",
        re.I
    )
    for it in select_items:
        it_norm = it.strip()
        m = agg_re.match(it_norm)
        if m:
            func = m.group(1).lower()
            distinct = bool(m.group(2))
            arg_raw = m.group(3)
            alias = m.group(4).lower() if m.group(4) else None
            # COUNT(*) style calls carry no column argument.
            arg = None if arg_raw == '*' else _unqualify(arg_raw)
            aggs.append({"func": func, "arg": arg, "distinct": distinct, "alias": alias})
        else:
            # Non-aggregated column: it must also appear in GROUP BY (checked below).
            select_plain.add(_unqualify(it_norm))

    group_by: Set[str] = set()
    if group_by_txt:
        cols = [tok for tok in _split_outside_parents(group_by_txt) if tok.strip()]
        group_by = {_unqualify(c) for c in cols}

    # Consistency: every plain SELECT column has to be a grouping column.
    not_grouped = select_plain - group_by
    if not_grouped:
        missing = ", ".join(sorted(not_grouped))
        raise ValueError(f"Le colonne non aggregate in SELECT devono apparire nel GROUP BY (manca: {missing}).")

    return select_plain, group_by, aggs





def classify_groupby_agg(group_by: Set[str], aggs: List[Dict[str, Any]],
                         Fo: Set[str], Fs: Set[str]) -> Dict[str, Set[str]]:
    """
    Assign grouping columns and aggregate arguments to the Owner / Server fragments.

    A column that belongs to both fragments is reported on both sides; a column
    that belongs to neither is reported on none. Aggregates without an argument
    (e.g. COUNT(*), arg=None) are not assigned to any side.

    Returns a dict with the keys "G_owner", "G_server", "Agg_owner", "Agg_server".
    """
    def _members(names, fragment):
        # Keep only the names that are columns of the given fragment.
        return {name for name in names if name in fragment}

    agg_columns = [spec["arg"] for spec in aggs if spec.get("arg")]

    result: Dict[str, Set[str]] = {}
    result["G_owner"] = _members(group_by, Fo)
    result["G_server"] = _members(group_by, Fs)
    result["Agg_owner"] = _members(agg_columns, Fo)
    result["Agg_server"] = _members(agg_columns, Fs)
    return result




def choose_strategy_groupby(classified: Dict[str, Set[str]]) -> str:
    """
    Pick the execution strategy for a GROUP BY query without WHERE.

    With G = grouping keys and A = aggregate arguments, as classified per side:
      - "owner-only":   nothing (neither keys nor arguments) lives on the Server
      - "server-only":  nothing lives on the Owner
      - "server-owner": no key on the Server and no argument on the Owner
                        (the arguments come from the Server)
      - "owner-server": no key on the Owner and no argument on the Server
                        (the arguments come from the Owner)
      - "parallel":     any other split of keys / arguments
    COUNT(*) has no argument and therefore never forces a side.
    """
    keys_owner, keys_server = classified["G_owner"], classified["G_server"]
    args_owner, args_server = classified["Agg_owner"], classified["Agg_server"]

    touches_owner = bool(keys_owner or args_owner)
    touches_server = bool(keys_server or args_server)

    # Everything on a single side (the owner wins when nothing is referenced).
    if not touches_server:
        return "owner-only"
    if not touches_owner:
        return "server-only"

    # From here on both sides are involved, so "no key on side X" implies that
    # side X contributes at least one aggregate argument.
    if not keys_server and not args_owner:
        return "server-owner"
    if not keys_owner and not args_server:
        return "owner-server"

    return "parallel"


In [46]:

def render_aggs_sql(aggs: List[Dict[str, Any]], Fo: Set[str]) -> str:
    """Render the aggregate specs as a comma-separated SELECT fragment.

    An argument found in `Fo` is qualified with the alias `o`, any other
    argument with `s`. Specs without an argument render as `FUNC(*)`. When a
    spec has no alias, `<func>_all` (no argument) or `<func>_<arg>` is used.
    """
    def _render(spec: Dict[str, Any]) -> str:
        name = spec["func"].upper()
        column = spec.get("arg")
        if column is None:
            call = f"{name}(*)"
            fallback = f"{name.lower()}_all"
        else:
            modifier = "DISTINCT " if spec.get("distinct") else ""
            side = "s" if column not in Fo else "o"
            call = f"{name}({modifier}{side}.{column})"
            fallback = f"{name.lower()}_{column}"
        label = spec.get("alias") or fallback
        return f"{call} AS {label}"

    return ", ".join(_render(spec) for spec in aggs)



In [47]:
def generate_subqueries_gb(
        select_plain: Set[str], group_by: Set[str], aggs: List[Dict[str, Any]],
        Fo: Set[str], Fs: Set[str], strategy: str
) -> Tuple[str | None, str | None, str | None]:
    """
    Build the sub-queries that implement a GROUP BY plan over the two fragments.

    The placeholders ``Ro`` and ``Rs`` in the generated SQL stand for the
    materialized results of ``qo`` and ``qs``; the caller replaces them with
    real table names.

    Args:
        select_plain: plain (non-aggregated) SELECT columns.
        group_by: grouping columns.
        aggs: aggregate specs as produced by ``parse_query_groupby``.
        Fo: column names of the Owner fragment.
        Fs: column names of the Server fragment.
        strategy: one of "server-owner", "owner-server", "owner-only",
            "server-only", "parallel".

    Returns:
        ``(qs, qo, qso)``: the Server-side extraction, the Owner-side extraction
        and the final aggregation query; the first two are None when the
        strategy does not need them.

    Raises:
        ValueError: if ``strategy`` is not one of the supported values.
    """
    # Columns referenced anywhere in the query: plain SELECT + GROUP BY + agg arguments.
    sel_plain = {c.lower() for c in select_plain}
    gb = {c.lower() for c in group_by}
    agg_args = {a["arg"] for a in aggs if a.get("arg")}
    referenced = sel_plain | gb | agg_args

    # Per-side columns to ship; "id" is always projected explicitly as the join key.
    Aqs = sorted((referenced & Fs) - {'id'})
    Aqo = sorted((referenced & Fo) - {'id'})
    proj_qs = ", ".join(["s.id"] + [f"s.{c}" for c in Aqs])
    proj_qo = ", ".join(["o.id"] + [f"o.{c}" for c in Aqo])

    # Grouping keys, qualified by side. A key present in both fragments (the join
    # key "id") is emitted once, from the owner side: listing it as both o.<c> and
    # s.<c> would yield two output columns with the same name, which the
    # CREATE TABLE ... AS used to materialize the result rejects.
    gb_owner = [f"o.{c}" for c in sorted(gb & Fo)]
    gb_server = [f"s.{c}" for c in sorted((gb & Fs) - Fo)]
    gb_sql = ", ".join(gb_owner + gb_server)

    aggs_sql = render_aggs_sql(aggs, Fo)
    final_select = ", ".join(part for part in (gb_sql, aggs_sql) if part)

    def _grouped(sql: str) -> str:
        # Append the GROUP BY clause only when there are grouping keys.
        return f"{sql} GROUP BY {gb_sql}" if gb_sql else sql

    # Recurring building blocks.
    ro_ids_sql = "SELECT o.id FROM owner.patients_owner o"
    rs_all_sql = f"SELECT {proj_qs} FROM server.patients_server s"
    rs_restricted_sql = f"SELECT {proj_qs} FROM server.patients_server s JOIN Ro r USING (id)"
    owner_join_rs_sql = _grouped(
        f"SELECT {final_select} FROM owner.patients_owner o JOIN Rs s USING (id)"
    )

    qs = qo = qso = None

    if strategy == "server-owner":
        # 1) ship (id + needed Fs columns) from the Server, unfiltered;
        # 2) aggregate on the Owner joining Rs.
        qs = rs_all_sql
        qso = owner_join_rs_sql

    elif strategy == "owner-server":
        # 1) Ro holds the owner ids only (there is no filter in this case);
        # 2) the Server returns (id + needed Fs columns) restricted to Ro;
        # 3) aggregate on the Owner joining Rs.
        qo = ro_ids_sql
        qs = rs_restricted_sql
        qso = owner_join_rs_sql

    elif strategy == "owner-only":
        if referenced & Fs:
            # Some referenced column lives on the Server: go through Ro / Rs.
            qo = ro_ids_sql
            qs = rs_restricted_sql
            qso = owner_join_rs_sql
        else:
            # Everything is available on the Owner.
            qso = _grouped(f"SELECT {final_select} FROM owner.patients_owner o")

    elif strategy == "server-only":
        if referenced & Fo:
            # Some referenced column lives on the Owner: ship Rs and aggregate there.
            qs = rs_all_sql
            qso = owner_join_rs_sql
        else:
            # No Owner column is referenced, hence every key and every aggregate
            # argument is already qualified with "s" in final_select / gb_sql.
            qso = _grouped(f"SELECT {final_select} FROM server.patients_server s")

    elif strategy == "parallel":
        # Materialize both sides with just the needed attributes, then aggregate
        # on the Owner joining Ro and Rs (aliases o / s are kept).
        qo = f"SELECT {proj_qo} FROM owner.patients_owner o"
        qs = rs_all_sql
        qso = _grouped(
            "SELECT " + final_select +
            " FROM owner.patients_owner o"
            " JOIN Ro r USING (id)"
            " JOIN Rs s USING (id)"
        )

    else:
        raise ValueError("Strategy must be one of: server-owner, owner-server, owner-only, server-only, parallel")

    return qs, qo, qso


In [48]:
def process_query_gb(query: str, Fo: Set[str], Fs: Set[str]) -> Dict[str, Any]:
    """
    Build the execution plan of a GROUP BY query (no WHERE) over the two fragments.

    Steps: parse the query, classify grouping keys and aggregate arguments per
    side, choose the strategy, generate the sub-queries.

    Args:
        query: SQL text.
        Fo: column names of the Owner fragment.
        Fs: column names of the Server fragment.

    Returns:
        A dict with the keys "Query", "SELECT_PLAIN", "GROUP_BY", "AGGS",
        "Classificazione_GB", "Strategia", "Strategia_eff", "qs", "qo", "qso".

    Raises:
        ValueError: propagated from ``parse_query_groupby`` for unsupported queries.
    """
    select_plain, group_by, aggs = parse_query_groupby(query)

    # Where the grouping keys and the aggregate arguments live.
    classified_gb = classify_groupby_agg(group_by, aggs, Fo, Fs)

    strategy_key = choose_strategy_groupby(classified_gb)
    # Without WHERE there is no selectivity-based fallback, so the effective
    # strategy is always the chosen one.
    strategy_eff = strategy_key

    qs, qo, qso = generate_subqueries_gb(
        select_plain=select_plain, group_by=group_by, aggs=aggs,
        Fo=Fo, Fs=Fs, strategy=strategy_eff
    )

    return {
        "Query": query,
        "SELECT_PLAIN": select_plain,
        "GROUP_BY": group_by,
        "AGGS": aggs,
        "Classificazione_GB": classified_gb,
        "Strategia": strategy_key,
        "Strategia_eff": strategy_eff,
        "qs": qs, "qo": qo, "qso": qso
    }


In [49]:
def _replan_alternative_gb(plan: dict, Fo: set, Fs: set) -> dict | None:
    # Flip tra owner-server e server-owner. Se non è uno dei due, niente alternativa.
    cur = plan.get("Strategia_eff") or plan.get("Strategia")
    alt = {"owner-server": "server-owner", "server-owner": "owner-server"}.get(cur)
    if not alt:
        return None

    # Rigenera le subquery per l'alternativa usando solo SELECT/GROUP BY/AGGS
    qs, qo, qso = generate_subqueries_gb(
        select_plain=plan["SELECT_PLAIN"],
        group_by=plan["GROUP_BY"],
        aggs=plan["AGGS"],
        Fo=Fo, Fs=Fs, strategy=alt
    )
    return {"Strategia": alt, "qs": qs, "qo": qo, "qso": qso}


In [50]:
def evaluate_query_gb(query: str,
                      Fo: set, Fs: set,
                      tag: str | None = None,
                      schema: str = "work",
                      save_to: str | None = None,
                      also_compare_alt: bool = True) -> dict:
    """Plan a GROUP BY query, execute it step by step and measure each step.

    The plan comes from ``process_query_gb``. Each sub-query of the plan is
    materialized with ``CREATE TABLE ... AS`` into ``{schema}.ro_{tag}``,
    ``{schema}.rs_{tag}`` and ``{schema}.out_{tag}`` (existing tables with
    those names are dropped first); row count and ``pg_total_relation_size``
    are recorded for every table created. The transferred bytes are then
    derived with ``_network_bytes``.

    Args:
        query: SQL text of the GROUP BY query.
        Fo: column names of the Owner fragment.
        Fs: column names of the Server fragment.
        tag: suffix for the table names; a random 8-character hex string when omitted.
        schema: schema hosting the materialized tables (created if missing).
        save_to: optional CSV path; the result row is appended to it, and the
            header is written only when the file does not exist yet.
        also_compare_alt: when True and the strategy is "owner-server" or
            "server-owner", the opposite direction is executed too (tables
            suffixed with ``_alt``) and its network bytes are reported.

    Returns:
        A dict with the keys "plan" (the plan dict), "row" (flat record of the
        measurements), "tables" (names of the tables that were created, None
        for those that were not) and "alt" (details of the alternative run, or
        None).

    Raises:
        ValueError: if the plan carries an unknown strategy.

    Note:
        ``schema`` and ``tag`` are interpolated into the SQL text as-is, so
        they must be trusted identifiers.
    """
    plan = process_query_gb(query, Fo, Fs)

    # Use the effective strategy (consistent with process_query_gb).
    sk = plan.get("Strategia_eff") or plan["Strategia"]

    tag = tag or uuid.uuid4().hex[:8]

    run(f"CREATE SCHEMA IF NOT EXISTS {schema};")
    ro_name, rs_name, out_name = f"{schema}.ro_{tag}", f"{schema}.rs_{tag}", f"{schema}.out_{tag}"

    # Row counts and byte sizes of the materialized tables, keyed by "ro" / "rs" / "out".
    counts, sizes = {}, {}

    if sk == "owner-server":
        qo = _strip_semicolon(plan["qo"])
        qs = _strip_semicolon(plan["qs"])
        qso = _strip_semicolon(plan["qso"])

        run(f"DROP TABLE IF EXISTS {ro_name}; CREATE TABLE {ro_name} AS {qo};")
        counts["ro"], sizes["ro"] = _count_table(ro_name), _size_table(ro_name)

        # The " Ro " placeholder in qs is replaced with the table just created.
        qs_mat = qs.replace(" Ro ", f" {ro_name} ")
        run(f"DROP TABLE IF EXISTS {rs_name}; CREATE TABLE {rs_name} AS {qs_mat};")
        counts["rs"], sizes["rs"] = _count_table(rs_name), _size_table(rs_name)

        # Likewise for the " Rs " placeholder in the final query.
        qso_mat = qso.replace(" Rs ", f" {rs_name} ")
        run(f"DROP TABLE IF EXISTS {out_name}; CREATE TABLE {out_name} AS {qso_mat};")
        counts["out"], sizes["out"] = _count_table(out_name), _size_table(out_name)

    elif sk == "server-owner":
        qs = _strip_semicolon(plan["qs"])
        qso = _strip_semicolon(plan["qso"])

        run(f"DROP TABLE IF EXISTS {rs_name}; CREATE TABLE {rs_name} AS {qs};")
        counts["rs"], sizes["rs"] = _count_table(rs_name), _size_table(rs_name)

        qso_mat = qso.replace(" Rs ", f" {rs_name} ")
        run(f"DROP TABLE IF EXISTS {out_name}; CREATE TABLE {out_name} AS {qso_mat};")
        counts["out"], sizes["out"] = _count_table(out_name), _size_table(out_name)

    elif sk in ("owner-only", "server-only"):

        # qo / qs are optional for these strategies: materialize only what the plan provides.
        if plan["qo"]:
            qo = _strip_semicolon(plan["qo"])
            run(f"DROP TABLE IF EXISTS {ro_name}; CREATE TABLE {ro_name} AS {qo};")
            counts["ro"], sizes["ro"] = _count_table(ro_name), _size_table(ro_name)

        if plan["qs"]:
            qs = _strip_semicolon(plan["qs"])
            qs_mat = qs.replace(" Ro ", f" {ro_name} ") if plan["qo"] else qs
            run(f"DROP TABLE IF EXISTS {rs_name}; CREATE TABLE {rs_name} AS {qs_mat};")
            counts["rs"], sizes["rs"] = _count_table(rs_name), _size_table(rs_name)

        qso = _strip_semicolon(plan["qso"])
        qso_mat = qso.replace(" Rs ", f" {rs_name} ") if plan["qs"] else qso
        run(f"DROP TABLE IF EXISTS {out_name}; CREATE TABLE {out_name} AS {qso_mat};")
        counts["out"], sizes["out"] = _count_table(out_name), _size_table(out_name)

    elif sk == "parallel":
        # Materialize both sides, then the final query that uses both of them.
        qo = _strip_semicolon(plan["qo"])
        qs = _strip_semicolon(plan["qs"])
        qso = _strip_semicolon(plan["qso"])

        run(f"DROP TABLE IF EXISTS {ro_name}; CREATE TABLE {ro_name} AS {qo};")
        counts["ro"], sizes["ro"] = _count_table(ro_name), _size_table(ro_name)

        run(f"DROP TABLE IF EXISTS {rs_name}; CREATE TABLE {rs_name} AS {qs};")
        counts["rs"], sizes["rs"] = _count_table(rs_name), _size_table(rs_name)

        qso_mat = (
            qso.replace(" Ro ", f" {ro_name} ")
               .replace(" Rs ", f" {rs_name} ")
        )
        run(f"DROP TABLE IF EXISTS {out_name}; CREATE TABLE {out_name} AS {qso_mat};")
        counts["out"], sizes["out"] = _count_table(out_name), _size_table(out_name)

    else:
        raise ValueError(f"Strategia sconosciuta: {sk!r}")

    net_bytes = _network_bytes(sk, sizes)

    # Optional comparison with the opposite direction (owner-server <-> server-owner).
    alt_info = None
    if also_compare_alt and sk in ("owner-server", "server-owner"):
        alt = _replan_alternative_gb(plan, Fo, Fs)
        if alt:
            tag_alt = tag + "_alt"
            ro_alt, rs_alt, out_alt = f"{schema}.ro_{tag_alt}", f"{schema}.rs_{tag_alt}", f"{schema}.out_{tag_alt}"
            # Only sizes are collected for the alternative (no row counts).
            sizes_alt = {}

            if alt["Strategia"] == "owner-server":
                qo_alt = _strip_semicolon(alt["qo"])
                qs_alt = _strip_semicolon(alt["qs"])
                qso_alt = _strip_semicolon(alt["qso"])

                run(f"DROP TABLE IF EXISTS {ro_alt}; CREATE TABLE {ro_alt} AS {qo_alt};")
                sizes_alt["ro"] = _size_table(ro_alt)

                qs_alt_mat = qs_alt.replace(" Ro ", f" {ro_alt} ")
                run(f"DROP TABLE IF EXISTS {rs_alt}; CREATE TABLE {rs_alt} AS {qs_alt_mat};")
                sizes_alt["rs"] = _size_table(rs_alt)

                qso_alt_mat = qso_alt.replace(" Rs ", f" {rs_alt} ")
                run(f"DROP TABLE IF EXISTS {out_alt}; CREATE TABLE {out_alt} AS {qso_alt_mat};")

            else:  # server-owner
                qs_alt = _strip_semicolon(alt["qs"])
                qso_alt = _strip_semicolon(alt["qso"])
                run(f"DROP TABLE IF EXISTS {rs_alt}; CREATE TABLE {rs_alt} AS {qs_alt};")
                sizes_alt["rs"] = _size_table(rs_alt)
                qso_alt_mat = qso_alt.replace(" Rs ", f" {rs_alt} ")
                run(f"DROP TABLE IF EXISTS {out_alt}; CREATE TABLE {out_alt} AS {qso_alt_mat};")

            net_alt = _network_bytes(alt["Strategia"], sizes_alt)
            # Relative saving of the chosen strategy versus the alternative; it is
            # negative when the chosen strategy transfers more bytes, and None when
            # the alternative transfers nothing.
            saving_pct = 1 - (net_bytes / net_alt) if net_alt and net_alt > 0 else None
            alt_info = {
                "alt_strategy": alt["Strategia"],
                "alt_network_bytes": net_alt,
                "saving_pct": float(saving_pct) if saving_pct is not None else None,
                "tables_alt": {"ro": ro_alt if "ro" in sizes_alt else None,
                               "rs": rs_alt if "rs" in sizes_alt else None,
                               "out": out_alt}
            }

    # Flat record of the measurements (this is what gets appended to the CSV).
    row = {
        "tag": tag,
        "query": plan["Query"],
        "strategy": sk,
        "result_owner": counts.get("ro"), "result_server": counts.get("rs"), "result_out": counts.get("out"),
        "bytes_result_owner": sizes.get("ro"), "bytes_result_server": sizes.get("rs"),
        "bytes_result_out": sizes.get("out"),
        "network_bytes": net_bytes,
        "alt_strategy": alt_info["alt_strategy"] if alt_info else None,
        "alt_network_bytes": alt_info["alt_network_bytes"] if alt_info else None,
        "saving_pct": alt_info["saving_pct"] if alt_info else None
    }

    if save_to:
        save_to = os.path.abspath(save_to)
        df = pd.DataFrame([row])
        # Write the header only when the file is being created.
        header = not os.path.exists(save_to)
        df.to_csv(save_to, mode="a", index=False, header=header)

    return {
        "plan": plan,
        "row": row,
        "tables": {"result_owner": ro_name if "ro" in counts else None,
                   "result_server": rs_name if "rs" in counts else None,
                   "result_out": out_name if "out" in counts else None},
        "alt": alt_info
    }


In [51]:
def evaluate_queries_gb(queries: list[str],
                        Fo: set, Fs: set,
                        schema: str = "work",
                        save_to: str | None = None,
                        also_compare_alt: bool = True) -> pd.DataFrame:
    rows = []
    for i, q in enumerate(queries, 1):
        tag = f"hv{i:02d}"
        res = evaluate_query_gb(q, Fo, Fs, tag=tag, schema=schema,
                                save_to=save_to, also_compare_alt=also_compare_alt)
        rows.append(res["row"])
    return pd.DataFrame(rows)

TESTING

In [54]:
# Single-query check: the grouping key (county) is a Server-fragment column and
# the aggregate argument (income) is an Owner-fragment column.
# `q` and `res` are read again by the inspection cell that follows.
q = """
SELECT county, MAX(income) AS max_inc
FROM patients
GROUP BY county
    """
res = evaluate_query_gb(q, Fo, Fs, tag="q01")
# Show the measurement record as a one-row frame.
pd.DataFrame([res["row"]])

Unnamed: 0,tag,query,strategy,result_owner,result_server,result_out,bytes_result_owner,bytes_result_server,bytes_result_out,network_bytes,alt_strategy,alt_network_bytes,saving_pct
0,q01,"\nSELECT county, MAX(income) AS max_inc\nFROM ...",owner-server,112,112,12,16384,24576,16384,40960,server-owner,24576,-0.666667


In [55]:
# Show the intermediate tables that were materialized (Ro from Qo, Rs from Qs),
# then the final output table (Out from Qso) as the cell result.
tables = res["tables"]
for label, key in (("ro", "result_owner"), ("rs", "result_server")):
    if tables[key]:
        print(label)
        run(f"SELECT * FROM {tables[key]} ;", show=True)
print("rout")
run(f"SELECT * FROM {tables['result_out']};")


ro


Unnamed: 0,id
0,c10e497b-ed25-d6c6-e043-144724bf84da
1,08c1e3c5-4732-9008-ddd4-edc1f2358521
2,ac5294eb-05dc-ed5b-2e7c-021eebd1c7b3
3,aeabefce-854a-81f8-2a92-134a22ae6871
4,a514d082-312d-f9cf-6e1b-42b6fa1bfb6f
...,...
107,f15015b7-a177-da0a-d208-3c63f799da12
108,92057acb-1a4e-921b-8d21-822d52094f47
109,bd38b1a4-d16e-a126-426c-0a4a278d1948
110,80139337-c548-00de-53c3-5cf9aae9af60


rs


Unnamed: 0,id,county
0,c10e497b-ed25-d6c6-e043-144724bf84da,Middlesex County
1,08c1e3c5-4732-9008-ddd4-edc1f2358521,Essex County
2,ac5294eb-05dc-ed5b-2e7c-021eebd1c7b3,Norfolk County
3,aeabefce-854a-81f8-2a92-134a22ae6871,Hampshire County
4,a514d082-312d-f9cf-6e1b-42b6fa1bfb6f,Middlesex County
...,...,...
107,f15015b7-a177-da0a-d208-3c63f799da12,Middlesex County
108,92057acb-1a4e-921b-8d21-822d52094f47,Hampshire County
109,bd38b1a4-d16e-a126-426c-0a4a278d1948,Hampshire County
110,80139337-c548-00de-53c3-5cf9aae9af60,Bristol County


rout


Unnamed: 0,county,max_inc
0,Hampshire County,199051.0
1,Suffolk County,272124.0
2,Plymouth County,165197.0
3,Bristol County,137596.0
4,Norfolk County,144732.0
5,Middlesex County,941064.0
6,Essex County,840831.0
7,Barnstable County,188361.0
8,Hampden County,151565.0
9,Worcester County,150171.0


In [None]:
# Batch evaluation. Every entry must be a complete SELECT ... FROM ... GROUP BY
# query: an empty string would make parse_query_groupby raise ValueError and
# abort the whole batch.
queries = [
    "SELECT city, gender, COUNT(*) AS n, AVG(income) AS avg_inc FROM patients  GROUP BY city, gender",
    "SELECT city, max(healthcare_coverage) AS max_healthcare_coverage FROM patients GROUP BY city",
]
# The measurements are appended to a CSV file (extension fixed from ".cvs").
df = evaluate_queries_gb(queries, Fo, Fs, save_to='query2_evaluation.csv')
df