SELECT–FROM–WHERE–GROUP BY

In [1]:
from sqlalchemy import create_engine, text
import pandas as pd
from pathlib import Path
import re
from typing import List, Set, Tuple, Dict, Optional, Any
import os, uuid
import numpy as np

from sqlalchemy.orm.base import PASSIVE_OFF


In [3]:
# Connection settings. Each value is taken from the corresponding libpq
# environment variable when it is set, so real credentials do not have to be
# stored in the notebook; the literals are local-development fallbacks only.
USER = os.environ.get("PGUSER", "postgres")
HOST = os.environ.get("PGHOST", "localhost")
PORT = os.environ.get("PGPORT", "5432")
PASSWORD = os.environ.get("PGPASSWORD", "user")

# Engine bound to the `postgres` database, created in AUTOCOMMIT mode.
DB_ADMIN_URL = f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/postgres"
engine_admin = create_engine(DB_ADMIN_URL, isolation_level="AUTOCOMMIT")

# Engine bound to the `synthea` database; this is the one used by run().
DB_URL = f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/synthea"
engine = create_engine(DB_URL)
# NOTE(review): create_engine() is lazy, so this message does not prove that a
# connection was actually established - the first query will.
print("Connesso al database synthea")

Connesso al database synthea


In [4]:
def run(sql_or_text, show=False):
    """Execute one statement in its own transaction and return its rows, if any.

    Args:
        sql_or_text: A raw SQL string (wrapped with ``text``) or an already
            built statement object, which is executed unchanged.
        show: When true and the statement yields rows, the resulting frame is
            also rendered with ``display``.

    Returns:
        A ``pandas.DataFrame`` with every fetched row and the result keys as
        column names, or ``None`` when the statement returns no rows.
    """
    with engine.begin() as connection:
        if isinstance(sql_or_text, str):
            statement = text(sql_or_text)
        else:
            statement = sql_or_text

        outcome = connection.execute(statement)
        if not outcome.returns_rows:
            return None

        frame = pd.DataFrame(outcome.fetchall(), columns=outcome.keys())
        if show:
            display(frame)
        return frame


def _strip_semicolon(sql: str) -> str:
    return re.sub(r';\s*$', '', sql.strip())


def _count_table(tname: str) -> int:
    """Return the row count of ``tname`` (interpolated verbatim into the SQL)."""
    counted = run(f"SELECT COUNT(*) AS n FROM {tname};")
    return int(counted["n"].iloc[0])


def _size_table(tname: str) -> int:
    """Return ``pg_total_relation_size`` of ``tname`` in bytes.

    ``tname`` is interpolated verbatim into the SQL literal.
    """
    measured = run(f"SELECT pg_total_relation_size('{tname}') AS bytes;")
    return int(measured["bytes"].iloc[0])


def _network_bytes(strategy_key: str, sizes: dict) -> int:
    if strategy_key == "owner-server":
        return sizes.get("ro", 0) + sizes.get("rs", 0)
    if strategy_key == "server-owner":
        return sizes.get("rs", 0)
    if strategy_key == "owner-only":
        return 0
    if strategy_key == "server-only":
        return sizes.get("out", 0)
    if strategy_key == "parallel":
        return sizes.get("ro", 0) + sizes.get("rs", 0)
    return 0


def _unqualify(tok: str) -> str:
    tok = tok.strip().strip('"')
    return tok.split('.')[-1].lower() if '.' in tok else tok.lower()


def _split_outside_parents(s: str) -> List[str]:
    items, buf, d = [], [], 0
    for ch in s:
        if ch == '(':
            d += 1
        elif ch == ')':
            d = max(0, d - 1)
        if ch == ',' and d == 0:
            items.append(''.join(buf).strip())
            buf = []
        else:
            buf.append(ch)
    if buf:
        items.append(''.join(buf).strip())
    return items

FRAMMENTAZIONE VERTICALE

# PATIENTS
Owner(PATIENTS) = { id, deathdate, first, last, ssn, drivers, passport, address, city, state, county, fips, zip, lat, lon, income, birthplace }
Server(PATIENTS) = { id, birthdate, gender, race, ethnicity, marital }

In [35]:
# Run this cell only when the fragments need to be (re)loaded.

# Read the script through a context manager so the file handle is closed.
with open("fragmentPatients.sql") as script:
    sql = script.read()

if sql.strip():
    # Open the transaction only when there is something to execute; the
    # success message is printed after the transaction has committed.
    with engine.begin() as conn:
        conn.execute(text(sql))
    print("Frammentazione creata")
else:
    print("Errore")

# Alternatively, simply call run(sql).

Frammentazione creata


In [5]:
# Start from an empty `work` schema. IF EXISTS keeps the cell runnable on a
# database where the schema has never been created.
run('''
DROP SCHEMA IF EXISTS work CASCADE; CREATE SCHEMA work;
''')

In [6]:
# Run ANALYZE on both fragments; the pg_stats view queried later in the
# notebook is filled in by ANALYZE.
run(''' ANALYZE owner.patients_owner; ANALYZE server.patients_server;''')

In [7]:
# Fo: attribute names treated as belonging to the owner fragment.
Fo = {
    # key and identity documents
    "id", "ssn", "drivers", "passport",
    # name parts
    "prefix", "first", "middle", "last", "suffix", "maiden",
    # places
    "birthplace", "address", "city", "state", "county", "fips", "zip", "lat", "lon",
    # remaining attributes
    "deathdate", "healthcare_expenses", "healthcare_coverage", "income",
}

# Fs: attribute names treated as belonging to the server fragment.
# "id" appears in both sets.
Fs = {
    "id",
    "birthdate", "gender", "race", "ethnicity", "marital",
}

In [8]:

def domini_from_pg_stats(schema: str, table: str) -> dict:
    """Estimate the number of distinct values of every column of a table.

    The estimate comes from ``pg_stats.n_distinct``: positive values are used
    as they are, the others are negated and multiplied by
    ``pg_class.reltuples``.

    Args:
        schema: Schema name, bound to the ``:schema`` parameter.
        table: Table name, bound to the ``:table`` parameter.

    Returns:
        A dict mapping each lower-cased column name to its estimate,
        truncated to ``int`` and never smaller than 1.
    """
    stats_query = text("""
    SELECT s.attname::text AS col,
           CASE
             WHEN s.n_distinct > 0
               THEN s.n_distinct::numeric
             ELSE (-s.n_distinct) * c.reltuples
           END AS est_distinct
    FROM pg_stats s
    JOIN pg_class c ON c.relname = s.tablename
    JOIN pg_namespace n ON n.oid = c.relnamespace AND n.nspname = s.schemaname
    WHERE s.schemaname = :schema AND s.tablename = :table;
    """).bindparams(schema=schema, table=table)
    stats = run(stats_query, show=False)

    estimates = {}
    for rec in stats.itertuples(index=False):
        # A missing or zero estimate falls back to 1.
        estimates[rec.col.lower()] = max(1, int(rec.est_distinct or 1))
    return estimates


# Distinct-value estimates of each fragment.
domini_owner = domini_from_pg_stats("owner", "patients_owner")
domini_server = domini_from_pg_stats("server", "patients_server")

# Single lookup table; for a column present in both fragments the server
# estimate overrides the owner one.
domini = dict(domini_owner)
domini.update(domini_server)


In [9]:

def parse_query_groupby(query: str) -> Tuple[Set[str], Optional[str], Set[str], List[Dict[str, Any]]]:
    """Split a ``SELECT ... FROM ... [WHERE ...] [GROUP BY ...]`` query into parts.

    Args:
        query: The SQL text; a trailing ``;`` is ignored.

    Returns:
        A 4-tuple ``(select_plain, where_clause, group_by, aggs)``:

        * ``select_plain`` - unqualified, lower-cased select items that are
          not recognised as aggregates;
        * ``where_clause`` - text between WHERE and GROUP BY (or the end of
          the query), or ``None`` when absent or empty;
        * ``group_by`` - unqualified, lower-cased GROUP BY items;
        * ``aggs`` - one dict per ``count/sum/avg/min/max`` item, with keys
          ``func``, ``arg`` (``None`` for ``*``), ``distinct`` and ``alias``.

    Raises:
        ValueError: If no ``SELECT ... FROM`` can be found.

    Note:
        Everything after GROUP BY is taken as the grouping list; HAVING,
        ORDER BY and LIMIT are not recognised.
    """
    # Remove the statement terminator once, up front, so that it cannot end up
    # inside the WHERE clause of a query that has no GROUP BY.
    q = re.sub(r';\s*$', '', query.strip())
    m_sel = re.search(r"select\s+(.*?)\s+from\b", q, re.I | re.S)
    if not m_sel:
        raise ValueError("SELECT ... FROM mancante.")
    sel_txt = m_sel.group(1)
    rest = q[m_sel.end():]

    m_wh = re.search(r"\bwhere\b", rest, re.I)
    m_gb = re.search(r"\bgroup\s+by\b", rest, re.I)

    where_clause = None
    group_by_txt = None

    if m_wh:
        # WHERE runs up to GROUP BY when there is one, else to the end.
        where_end = m_gb.start() if m_gb else len(rest)
        where_clause = rest[m_wh.end():where_end].strip()
    if m_gb:
        group_by_txt = rest[m_gb.end():].strip()

    select_items = _split_outside_parents(sel_txt)
    select_plain: Set[str] = set()
    aggs: List[Dict[str, Any]] = []

    # func ( [DISTINCT] *|column ) [AS alias]
    agg_re = re.compile(
        r"^(count|sum|avg|min|max)\s*\(\s*(distinct\s+)?(\*|[a-zA-Z_][\w\.]*)\s*\)\s*(?:as\s+([a-zA-Z_]\w*))?$",
        re.I
    )
    for it in select_items:
        it_norm = it.strip()
        m = agg_re.match(it_norm)
        if m:
            func = m.group(1).lower()
            distinct = bool(m.group(2))
            arg_raw = m.group(3)
            alias = m.group(4).lower() if m.group(4) else None
            arg = None if arg_raw == '*' else _unqualify(arg_raw)
            aggs.append({"func": func, "arg": arg, "distinct": distinct, "alias": alias})
        else:
            select_plain.add(_unqualify(it_norm))

    group_by: Set[str] = set()
    if group_by_txt:
        cols = [tok for tok in _split_outside_parents(group_by_txt) if tok.strip()]
        group_by = {_unqualify(c) for c in cols}

    return select_plain, (where_clause or None), group_by, aggs


def extract_conditions(where_clause: str) -> Tuple[List[str], bool]:
    """Split a WHERE clause into its conditions.

    The clause is split on ``OR`` when it contains one, otherwise on ``AND``.
    Keywords are matched as whole words, case-insensitively, and only outside
    single-quoted string literals, so ``state = 'OR'`` is left intact.

    Args:
        where_clause: Text of the WHERE clause without the keyword; may be
            empty.

    Returns:
        ``(conditions, has_or)``: the stripped, non-empty conditions and
        whether the split was made on ``OR``.

    Note:
        Parentheses are not interpreted and ``BETWEEN x AND y`` is split like
        any other ``AND``.
    """
    literal = r"'(?:[^']|'')*'"

    def _split_on(keyword: str) -> List[str]:
        # Literals are listed first in the alternation so the scanner consumes
        # them whole; only keyword matches (group 1 unset) are split points.
        scanner = re.compile(rf"({literal})|\b{keyword}\b", re.IGNORECASE)
        pieces, start = [], 0
        for hit in scanner.finditer(where_clause):
            if hit.group(1) is None:
                pieces.append(where_clause[start:hit.start()])
                start = hit.end()
        pieces.append(where_clause[start:])
        return pieces

    or_pieces = _split_on("OR")
    has_or = len(or_pieces) > 1
    pieces = or_pieces if has_or else _split_on("AND")
    conditions = [p.strip() for p in pieces if p.strip()]
    return conditions, has_or


def classify_conditions(conditions: List[str], Fo: Set[str], Fs: Set[str]) -> Dict[str, List[str]]:
    """Group conditions by the attribute sets they reference.

    Args:
        conditions: Condition strings from a WHERE clause.
        Fo: Attribute names of the first set (reported under ``"Co"``).
        Fs: Attribute names of the second set (reported under ``"Cs"``).

    Returns:
        A dict with three lists: ``"Co"`` (only ``Fo`` attributes), ``"Cs"``
        (only ``Fs`` attributes) and ``"Cso"`` (attributes of both). An
        attribute present in both sets makes its condition land in ``"Cso"``.
        Conditions that reference no known attribute are left out.
    """
    Co, Cs, Cso = [], [], []
    for cond in conditions:
        # Blank out string literals first, so that a value such as 'first' or
        # 'state' is not mistaken for an attribute name.
        bare = re.sub(r"'(?:[^']|'')*'", " ", cond)
        attrs = {tok.lower() for tok in re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', bare)}
        in_owner = attrs & Fo
        in_server = attrs & Fs
        if in_owner and in_server:
            Cso.append(cond)
        elif in_owner:
            Co.append(cond)
        elif in_server:
            Cs.append(cond)
    return {"Co": Co, "Cs": Cs, "Cso": Cso}


def classify_groupby_agg(group_by: Set[str], aggs: List[Dict[str, Any]],
                         Fo: Set[str], Fs: Set[str]) -> Dict[str, Set[str]]:
    """Assign grouping columns and aggregate arguments to ``Fo`` and/or ``Fs``.

    Returns:
        A dict with ``G_owner`` / ``G_server`` (grouping columns found in
        ``Fo`` / ``Fs``) and ``Agg_owner`` / ``Agg_server`` (non-empty
        aggregate arguments found in ``Fo`` / ``Fs``).
    """
    grouping = set(group_by)
    # Aggregates over '*' carry no argument and are skipped.
    agg_columns = {spec["arg"] for spec in aggs if spec.get("arg")}

    result = {}
    result["G_owner"] = {col for col in grouping if col in Fo}
    result["G_server"] = {col for col in grouping if col in Fs}
    result["Agg_owner"] = {col for col in agg_columns if col in Fo}
    result["Agg_server"] = {col for col in agg_columns if col in Fs}
    return result


def stima_selettivita(condizione: str, domini: dict) -> float:
    """Estimate the selectivity of a condition as ``1 / |domain|``.

    The first attribute of ``domini`` (in dict order) that occurs in the
    condition as a whole identifier, outside string literals, determines the
    result.

    Args:
        condizione: Text of a single condition.
        domini: Mapping from attribute name to its number of distinct values.

    Returns:
        ``1 / domini[attr]`` for the matching attribute, or ``0.5`` when no
        attribute with a non-zero domain is referenced.
    """
    # Compare whole identifiers rather than substrings: with substring
    # matching "id" would match a condition such as city = 'Florida'.
    bare = re.sub(r"'(?:[^']|'')*'", " ", condizione.lower())
    identifiers = set(re.findall(r"\b[a-z_][a-z0-9_]*\b", bare))
    for attr, n_distinct in domini.items():
        # A zero domain is skipped instead of raising ZeroDivisionError.
        if attr.lower() in identifiers and n_distinct:
            return 1 / n_distinct
    return 0.5


def choose_strategy(classified, domini, where_clause, has_or,
                    bytes_owner=1.0, bytes_server=1.0) -> str:
    """Pick the execution strategy for a classified WHERE clause.

    Args:
        classified: Dict with the condition lists ``"Co"``, ``"Cs"`` and
            ``"Cso"``; missing keys count as empty.
        domini: Attribute -> distinct-count mapping used for selectivities.
        where_clause: Not used by the decision.
        has_or: Whether the conditions were obtained by splitting on OR.
        bytes_owner: Weight applied to the owner-side selectivity.
        bytes_server: Weight applied to the server-side selectivity.

    Returns:
        One of ``"owner-server"``, ``"server-owner"``, ``"owner-only"``,
        ``"server-only"``, ``"parallel"`` or ``"unknown"``.
    """
    owner_conds = classified.get("Co", [])
    server_conds = classified.get("Cs", [])
    mixed_conds = classified.get("Cso", [])

    def _combined_selectivity(preds: list[str]) -> float:
        # Product of the per-condition estimates, each clamped to [1e-6, 1].
        product = 1.0
        for pred in preds:
            product *= max(min(stima_selettivita(pred, domini), 1.0), 1e-6)
        return product

    def _cheaper_side_first() -> str:
        owner_cost = _combined_selectivity(owner_conds) * bytes_owner
        server_cost = _combined_selectivity(server_conds) * bytes_server
        return "owner-server" if owner_cost < server_cost else "server-owner"

    on_owner = bool(owner_conds)
    on_server = bool(server_conds)

    # Mixed conditions always require both fragments.
    if mixed_conds:
        if on_owner and on_server:
            return _cheaper_side_first()
        if on_owner:
            return "owner-server"
        if on_server:
            return "server-owner"
        return "owner-server" if bytes_owner <= bytes_server else "server-owner"

    if has_or:
        if on_owner and on_server:
            return "parallel"
        if on_owner:
            return "owner-only"
        if on_server:
            return "server-only"
        return "unknown"

    # Pure conjunction without mixed conditions.
    if on_owner and on_server:
        return _cheaper_side_first()
    if on_owner:
        return "owner-only"
    if on_server:
        return "server-only"
    return "unknown"

In [10]:

def render_aggs_sql(aggs: List[Dict[str, Any]], Fo: Set[str], Fs: Set[str]) -> str:
    """Render parsed aggregates as a comma-separated SELECT fragment.

    Each argument is qualified with ``o.`` when it belongs to ``Fo`` and with
    ``s.`` otherwise; ``*`` stays unqualified. A missing alias defaults to
    ``<func>_<arg>`` (``<func>_all`` for ``*``).
    """
    def _render_one(spec: Dict[str, Any]) -> str:
        sql_func = spec["func"].upper()
        column = spec.get("arg")
        if column is None:
            call = f"{sql_func}(*)"
            fallback_alias = f"{sql_func.lower()}_all"
        else:
            prefix = "o" if column in Fo else "s"
            modifier = "DISTINCT " if spec.get("distinct") else ""
            call = f"{sql_func}({modifier}{prefix}.{column})"
            fallback_alias = f"{sql_func.lower()}_{column}"
        return f"{call} AS {spec.get('alias') or fallback_alias}"

    return ", ".join(map(_render_one, aggs))



In [11]:
def generate_subqueries_gb(
        Co: List[str], Cs: List[str], Cso: List[str],
        select_plain: Set[str], group_by: Set[str], aggs: List[Dict[str, Any]],
        Fo: Set[str], Fs: Set[str], strategy: str
) -> Tuple[str | None, str | None, str | None]:
    """Build the subqueries of a grouped query for the given strategy.

    Args:
        Co: Conditions on owner attributes only.
        Cs: Conditions on server attributes only.
        Cso: Conditions that involve attributes of both fragments.
        select_plain: Non-aggregate select items.
        group_by: Grouping columns.
        aggs: Parsed aggregates (dicts with ``func``, ``arg``, ``distinct``,
            ``alias``).
        Fo: Owner attribute names.
        Fs: Server attribute names.
        strategy: ``server-owner``, ``owner-server``, ``owner-only`` or
            ``server-only``.

    Returns:
        ``(qs, qo, qso)``: the server-side query, the owner-side id query and
        the final aggregating query; the ones a strategy does not need are
        ``None``. ``qs`` and ``qso`` refer to the earlier results through the
        placeholder table names ``Ro`` and ``Rs``.

    Raises:
        NotImplementedError: For the ``parallel`` strategy.
        ValueError: For any other unrecognised strategy.
    """
    sel_plain = {c.lower() for c in select_plain}
    gb = {c.lower() for c in group_by}
    agg_args = {a["arg"] for a in aggs if a.get("arg")}
    referenced = sel_plain | gb | agg_args

    fs_in_cso = {
        a.lower()
        for cond in Cso
        for a in re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', cond)
        if a.lower() in Fs
    }
    # Extra server columns to project next to s.id. The outer parentheses are
    # required: `-` binds tighter than `|`, and without them an `id` coming
    # from SELECT / GROUP BY / an aggregate is kept and the projection becomes
    # "s.id, s.id".
    Aqs = sorted(((referenced & Fs) | fs_in_cso) - {'id'})

    gb_owner = [f"o.{c}" for c in sorted(gb & Fo)]
    gb_server = [f"s.{c}" for c in sorted(gb & Fs)]
    gb_sql = ", ".join(gb_owner + gb_server)

    aggs_sql = render_aggs_sql(aggs, Fo, Fs)
    # Grouping columns first, then aggregates; empty parts are skipped.
    final_select = ", ".join(part for part in (gb_sql, aggs_sql) if part)

    def _where(conds: List[str]) -> str:
        return f" WHERE {' AND '.join(conds)}" if conds else ""

    def _group_by() -> str:
        return f" GROUP BY {gb_sql}" if gb_sql else ""

    def _final_over_join(conds: List[str]) -> str:
        # Final aggregation over the owner fragment joined with the shipped
        # server result (placeholder Rs).
        return (f"SELECT {final_select} FROM owner.patients_owner o JOIN Rs s USING (id)"
                + _where(conds) + _group_by())

    owner_ids = "SELECT o.id FROM owner.patients_owner o"
    server_proj = "SELECT " + ", ".join(["s.id"] + [f"s.{c}" for c in Aqs]) \
                  + " FROM server.patients_server s"

    qs = qo = qso = None

    if strategy == "server-owner":
        qs = server_proj + _where(Cs)
        qso = _final_over_join(Co + Cso)

    elif strategy == "owner-server":
        qo = owner_ids + _where(Co)
        qs = server_proj + " JOIN Ro r USING (id)" + _where(Cs)
        qso = _final_over_join(Cso)

    elif strategy == "owner-only":
        needs_server = bool((referenced & Fs) or Cso)
        if needs_server:
            qo = owner_ids + _where(Co)
            qs = server_proj + " JOIN Ro r USING (id)"
            qso = _final_over_join(Cso)
        else:
            qso = f"SELECT {final_select} FROM owner.patients_owner o" + _where(Co) + _group_by()

    elif strategy == "server-only":
        needs_owner = bool((referenced & Fo) or Cso)
        if needs_owner:
            qs = server_proj + _where(Cs)
            qso = _final_over_join(Cso)
        else:
            # No owner attribute is referenced here, so final_select and
            # gb_sql contain s.-qualified columns only.
            qso = f"SELECT {final_select} FROM server.patients_server s" + _where(Cs) + _group_by()

    elif strategy == "parallel":
        raise NotImplementedError("parallel non ancora gestito")
    else:
        raise ValueError("Strategy must be one of: server-owner, owner-server, owner-only, server-only")

    return qs, qo, qso

In [12]:
def process_query_gb(query: str, Fo: Set[str], Fs: Set[str], domini: Dict[str, int]) -> Dict[str, Any]:
    """Parse a grouped query, choose a strategy and build its subqueries.

    Args:
        query: The SQL text.
        Fo: Owner attribute names.
        Fs: Server attribute names.
        domini: Attribute -> distinct-count mapping for selectivity estimates.

    Returns:
        A plan dict with the keys ``Query``, ``SELECT_PLAIN``, ``WHERE``,
        ``GROUP_BY``, ``AGGS``, ``Classificazione``, ``Strategia`` (as chosen
        from the WHERE clause), ``Strategia_eff`` (the strategy the subqueries
        were generated for) and ``qs`` / ``qo`` / ``qso``.
    """
    select_plain, where, group_by, aggs = parse_query_groupby(query)

    conditions, has_or = extract_conditions(where or "")
    if has_or:
        # The subquery generator always joins conditions with AND, so pieces
        # obtained by splitting on OR would be evaluated as a conjunction.
        # Keep the whole disjunctive clause as one parenthesised condition.
        conditions = [f"({_strip_semicolon(where or '')})"]
    classified = classify_conditions(conditions, Fo, Fs)

    strategy_descr = choose_strategy(classified, domini, where or "", has_or)
    s = strategy_descr.lower()
    known_keys = ("owner-server", "server-owner", "owner-only", "server-only", "parallel")
    strategy_key = next((key for key in known_keys if key in s), "unknown")

    sp = {c.lower() for c in select_plain}
    gb = {c.lower() for c in group_by}
    agg_args = {a["arg"] for a in aggs if a.get("arg")}

    # Which fragments the SELECT / GROUP BY / aggregates (or a mixed
    # condition) actually need, independently of the WHERE-based choice.
    needs_server = bool(((sp | gb | agg_args) & Fs) or classified["Cso"])
    needs_owner = bool(((sp | gb | agg_args) & Fo) or classified["Cso"])

    def _pick_two_way() -> str:
        sel_Co = 1.0
        for cond in classified["Co"]:
            sel_Co *= max(min(stima_selettivita(cond, domini), 1.0), 1e-6)
        sel_Cs = 1.0
        for cond in classified["Cs"]:
            sel_Cs *= max(min(stima_selettivita(cond, domini), 1.0), 1e-6)
        return "owner-server" if sel_Co < sel_Cs else "server-owner"

    strategy_eff = strategy_key
    if strategy_key in ("parallel", "unknown"):
        if needs_owner and not needs_server:
            strategy_eff = "owner-only"
        elif needs_server and not needs_owner:
            strategy_eff = "server-only"
        elif needs_owner and needs_server:
            strategy_eff = _pick_two_way()
        else:
            strategy_eff = "server-owner"  # fallback
    else:
        # A single-fragment choice is widened when the output needs columns
        # of the other fragment.
        if strategy_key == "owner-only" and needs_server:
            strategy_eff = "owner-server"
        if strategy_key == "server-only" and needs_owner:
            strategy_eff = "server-owner"

    qs, qo, qso = generate_subqueries_gb(
        classified["Co"], classified["Cs"], classified["Cso"],
        select_plain, group_by, aggs, Fo, Fs, strategy_eff
    )

    return {
        "Query": query,
        "SELECT_PLAIN": select_plain,
        "WHERE": where,
        "GROUP_BY": group_by,
        "AGGS": aggs,
        "Classificazione": classified,
        "Strategia": strategy_key,
        "Strategia_eff": strategy_eff,
        "qs": qs, "qo": qo, "qso": qso
    }


In [13]:
def _replan_alternative_gb(plan: dict, Fo: set, Fs: set) -> dict | None:
    Co = plan["Classificazione"]["Co"]
    Cs = plan["Classificazione"]["Cs"]
    if not (Co and Cs):
        return None
    alt = {"owner-server": "server-owner", "server-owner": "owner-server"}.get(plan["Strategia"])
    if not alt:
        return None
    qs, qo, qso = generate_subqueries_gb(
        Co, Cs, plan["Classificazione"]["Cso"],
        plan["SELECT_PLAIN"], plan["GROUP_BY"], plan["AGGS"], Fo, Fs, alt
    )
    return {"Strategia": alt, "qs": qs, "qo": qo, "qso": qso}

In [25]:
def _materialize_subqueries_gb(qo, qs, qso, ro_name: str, rs_name: str, out_name: str):
    """Materialise the available subqueries as tables; return ``(counts, sizes)``."""
    counts, sizes = {}, {}

    def _create(table: str, select_sql: str, key: str) -> None:
        run(f"DROP TABLE IF EXISTS {table}; CREATE TABLE {table} AS {select_sql};")
        counts[key], sizes[key] = _count_table(table), _size_table(table)

    if qo:
        _create(ro_name, _strip_semicolon(qo), "ro")

    if qs:
        qs_mat = _strip_semicolon(qs)
        if qo:
            # Point the Ro placeholder at the table created just above.
            qs_mat = qs_mat.replace(" Ro ", f" {ro_name} ")
        _create(rs_name, qs_mat, "rs")

    qso_mat = _strip_semicolon(qso)
    if qs:
        qso_mat = qso_mat.replace(" Rs ", f" {rs_name} ")
    _create(out_name, qso_mat, "out")

    return counts, sizes


def evaluate_query_gb(query: str,
                      Fo: set, Fs: set, domini: dict,
                      tag: str | None = None,
                      schema: str = "work",
                      save_to: str | None = None,
                      also_compare_alt: bool = True) -> dict:
    """Plan a grouped query, materialise its subqueries and measure them.

    Args:
        query: The SQL text.
        Fo: Owner attribute names.
        Fs: Server attribute names.
        domini: Attribute -> distinct-count mapping.
        tag: Suffix of the result table names; a random 8-character hex
            string when omitted.
        schema: Schema in which the result tables are created.
        save_to: Optional CSV path; the metrics row is appended to it and the
            header is written only when the file does not exist yet.
        also_compare_alt: For two-way strategies, also materialise the
            opposite order and report its network bytes.

    Returns:
        A dict with ``plan`` (the output of ``process_query_gb``), ``row``
        (the metrics), ``tables`` (names of the created result tables) and
        ``alt`` (metrics of the alternative strategy, or ``None``).

    Raises:
        NotImplementedError: If the strategy to execute is not one of the
            four supported ones.
    """
    plan = process_query_gb(query, Fo, Fs, domini)
    # Execute and report the strategy the subqueries were generated for; it
    # can differ from plan["Strategia"], which reflects the WHERE clause only.
    sk = plan.get("Strategia_eff") or plan["Strategia"]
    if sk not in ("owner-server", "server-owner", "owner-only", "server-only"):
        raise NotImplementedError("parallel non gestiti nel caso GROUP BY")
    tag = tag or uuid.uuid4().hex[:8]

    run(f"CREATE SCHEMA IF NOT EXISTS {schema};")
    ro_name, rs_name, out_name = f"{schema}.ro_{tag}", f"{schema}.rs_{tag}", f"{schema}.out_{tag}"

    counts, sizes = _materialize_subqueries_gb(
        plan["qo"], plan["qs"], plan["qso"], ro_name, rs_name, out_name
    )
    net_bytes = _network_bytes(sk, sizes)

    alt_info = None
    if also_compare_alt and sk in ("owner-server", "server-owner"):
        # Flip the strategy that was actually executed.
        alt = _replan_alternative_gb({**plan, "Strategia": sk}, Fo, Fs)
        if alt:
            tag_alt = tag + "_alt"
            ro_alt, rs_alt, out_alt = f"{schema}.ro_{tag_alt}", f"{schema}.rs_{tag_alt}", f"{schema}.out_{tag_alt}"
            _, sizes_alt = _materialize_subqueries_gb(
                alt["qo"], alt["qs"], alt["qso"], ro_alt, rs_alt, out_alt
            )

            net_alt = _network_bytes(alt["Strategia"], sizes_alt)
            # Fraction of traffic saved by the chosen plan; undefined when
            # the alternative ships nothing.
            saving_pct = 1 - (net_bytes / net_alt) if net_alt and net_alt > 0 else None
            alt_info = {
                "alt_strategy": alt["Strategia"],
                "alt_network_bytes": net_alt,
                "saving_pct": float(saving_pct) if saving_pct is not None else None,
                "tables_alt": {"ro": ro_alt if "ro" in sizes_alt else None,
                               "rs": rs_alt if "rs" in sizes_alt else None,
                               "out": out_alt}
            }

    row = {
        "tag": tag,
        "query": plan["Query"],
        "strategy": sk,
        "result_owner": counts.get("ro"), "result_server": counts.get("rs"), "result_out": counts.get("out"),
        "bytes_result_owner": sizes.get("ro"), "bytes_result_server": sizes.get("rs"),
        "bytes_result_out": sizes.get("out"),
        "network_bytes": net_bytes,
        "alt_strategy": alt_info["alt_strategy"] if alt_info else None,
        "alt_network_bytes": alt_info["alt_network_bytes"] if alt_info else None,
        "saving_pct": alt_info["saving_pct"] if alt_info else None
    }

    if save_to:
        save_to = os.path.abspath(save_to)
        df = pd.DataFrame([row])
        header = not os.path.exists(save_to)
        df.to_csv(save_to, mode="a", index=False, header=header)

    return {
        "plan": plan,
        "row": row,
        "tables": {"result_owner": ro_name if "ro" in counts else None,
                   "result_server": rs_name if "rs" in counts else None,
                   "result_out": out_name if "out" in counts else None},
        "alt": alt_info
    }


In [45]:
def evaluate_queries_gb(queries: list[str],
                        Fo: set, Fs: set, domini: dict,
                        schema: str = "work",
                        save_to: str | None = None,
                        also_compare_alt: bool = True) -> pd.DataFrame:
    rows = []
    for i, q in enumerate(queries, 1):
        tag = f"gb{i:02d}"
        res = evaluate_query_gb(q, Fo, Fs, domini, tag=tag, schema=schema,
                                save_to=save_to, also_compare_alt=also_compare_alt)
        rows.append(res["row"])
    return pd.DataFrame(rows)

TESTING

In [46]:
# Single-query check: one owner-side condition (city) and one server-side
# condition (gender), grouped by both columns.
q = """
    SELECT city, gender, COUNT(*) AS n
FROM patients
WHERE city='Worcester' AND gender='F'
GROUP BY city, gender;
    """
# `res` is read again by the next cell to display the materialised tables.
res = evaluate_query_gb(q, Fo, Fs, domini, tag="q01")
print(res)

{'plan': {'Query': "\n    SELECT city, gender, COUNT(*) AS n\nFROM patients\nWHERE city='Worcester' AND gender='F'\nGROUP BY city, gender;\n    ", 'SELECT_PLAIN': {'city', 'gender'}, 'WHERE': "city='Worcester' AND gender='F'", 'GROUP_BY': {'city', 'gender'}, 'AGGS': [{'func': 'count', 'arg': None, 'distinct': False, 'alias': 'n'}], 'Classificazione': {'Co': ["city='Worcester'"], 'Cs': ["gender='F'"], 'Cso': []}, 'Strategia_descr': 'owner-server', 'Strategia': 'owner-server', 'Strategia_eff': 'owner-server', 'qs': "SELECT s.id, s.gender FROM server.patients_server s JOIN Ro r USING (id) WHERE gender='F'", 'qo': "SELECT o.id FROM owner.patients_owner o WHERE city='Worcester'", 'qso': 'SELECT o.city, s.gender, COUNT(*) AS n FROM owner.patients_owner o JOIN Rs s USING (id) GROUP BY o.city, s.gender'}, 'row': {'tag': 'gb01', 'query': "\n    SELECT city, gender, COUNT(*) AS n\nFROM patients\nWHERE city='Worcester' AND gender='F'\nGROUP BY city, gender;\n    ", 'strategy': 'owner-server', 're

In [42]:
# Show the intermediate results that were materialised: Ro (Qo), then Rs (Qs).
for label, table_key in (("ro", "result_owner"), ("rs", "result_server")):
    table_name = res["tables"][table_key]
    if table_name:
        print(label)
        run(f"SELECT * FROM {table_name} ;", show=True)

# Out (Qso): left as the last expression so the notebook renders it.
print("rout")
run(f"SELECT * FROM {res['tables']['result_out']};")


ro


Unnamed: 0,id
0,8cef0aac-f2c6-40ed-cc00-e65bbb555956


rs


Unnamed: 0,id,birthdate,gender
0,8cef0aac-f2c6-40ed-cc00-e65bbb555956,1960-01-09,F


rout


Unnamed: 0,city,gender,n
0,Worcester,F,1


In [None]:
# Batch evaluation. Every entry must be a complete SELECT statement: the
# parser raises ValueError on an empty string.
queries = [
    "SELECT city, gender, COUNT(*) AS n, AVG(income) AS avg_inc FROM patients WHERE zip <> '00000' AND gender IN ('F','M') GROUP BY city, gender",
    "SELECT city, COUNT(*) AS deceduti_magg FROM patients WHERE birthdate <= (deathdate - INTERVAL '18 years') GROUP BY city",
]
# The metrics are appended as CSV, so the file gets the .csv extension.
df = evaluate_queries_gb(queries, Fo, Fs, domini, save_to='query2_evaluation.csv')
df