In [7]:
import pandas as pd
import json
import re
import os
from bs4 import BeautifulSoup

## DataFusion runtimes

Total runtimes of binary join and shredded Yannakakis, aggregated per query and method.

In [8]:
# Load the raw timings and derive a duration column expressed in seconds.
df = pd.read_csv('timings.csv')
df['duration(s)'] = df['duration(µs)'].div(1_000_000)
df = (
    df.drop(columns=['variant', 'duration(µs)'])
      .replace("BinaryJoin", "DF-Bin")
      .replace("Yannakakis", "SYA")
)

# Mean and median per (query, method); the two-level column index produced by
# the aggregation is flattened into "<column>_<statistic>" names.
df_agg = df.groupby(["query", "method"]).agg(["mean", "median"])
df_agg.columns = ['_'.join(col) for col in df_agg.columns]
df_agg = df_agg.reset_index()
df_agg

Unnamed: 0,query,method,duration(s)_mean,duration(s)_median
0,dblp_acyclic_201_00,DF-Bin,1.292136,1.291339
1,dblp_acyclic_201_00,SYA,0.637914,0.638233
2,dblp_acyclic_201_01,DF-Bin,0.129688,0.129630
3,dblp_acyclic_201_01,SYA,0.081931,0.081907
4,dblp_acyclic_201_02,DF-Bin,0.201509,0.200955
...,...,...,...,...
3183,yago_acyclic_tree_6_77,SYA,0.004391,0.004387
3184,yago_acyclic_tree_6_78,DF-Bin,0.114430,0.114626
3185,yago_acyclic_tree_6_78,SYA,0.147300,0.147164
3186,yago_acyclic_tree_6_79,DF-Bin,0.005294,0.005318


## DataFusion detailed metrics

In [9]:
def replace_utf8_string(text):
    """Backslash-escape the quotes inside every `value: Utf8("...")` occurrence.

    Each match of `value: Utf8("SomeString")` (any amount of whitespace after
    the colon) is rewritten as `value: Utf8(\\"SomeString\\")`, with exactly one
    space after the colon. Text outside the matches is returned unchanged.
    """
    utf8_literal = re.compile(r'value:\s*Utf8\("(.*?)"\)')
    # `\"` is not a recognised escape in a replacement template, so the
    # backslash is kept and ends up in front of each quote in the output.
    escaped_form = r'value: Utf8(\"\1\")'
    return utf8_literal.sub(escaped_form, text)

# Parse the metrics dump, one JSON document per line. Lines that still fail to
# parse after the quote fix-up are reported and skipped.
metrics_file = "output/metrics.txt"
metrics = []
with open(metrics_file) as f:
    for i, line in enumerate(f):
        line = replace_utf8_string(line)
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            print(f"Error in line {i}")
        else:
            metrics.append(parsed)

# Drop 2NSA plan metrics (we're now analyzing binary joins)
metrics = list(filter(lambda m: m["params"]["method"] == "BinaryJoin", metrics))
print(len(metrics))  # should be nr_of_queries * 10 (10 repetitions)

15940


In [10]:
def get_metric(metric_name, metrics):
    """Return the first entry of `metrics` whose "name" is `metric_name`.

    Returns None when no entry matches.
    """
    return next((m for m in metrics if m["name"] == metric_name), None)

def collect_metrics(metrics):
    """Summarise one metrics record into a flat dict of per-operator timings.

    `metrics` is a single parsed record with a "params" dict (providing
    "method" and "query") and a "plan" tree whose nodes carry "operator",
    "metrics" and "children".

    Returns a dict with "method", "query" and the summed `elapsed_compute` of
    all aggregate, filter and projection nodes in the plan, in seconds.
    """
    params = metrics["params"]
    result = {"method": params["method"], "query": params["query"]}

    # Operator-name prefix -> accumulator key. Other operators are only
    # traversed (memoryexec does not contain timing metrics).
    tracked = (
        ("AggregateExec", "aggregate_time"),
        ("FilterExec", "filter_time"),
        ("ProjectionExec", "projection_time"),
    )
    timings = {"aggregate_time": 0, "filter_time": 0, "projection_time": 0}

    def update_timings(node):
        operator = node["operator"]
        for prefix, key in tracked:
            if operator.startswith(prefix):
                timings[key] += get_metric("elapsed_compute", node["metrics"])["value"]
                break
        for child in node["children"]:
            update_timings(child)

    update_timings(metrics["plan"])

    # all timings are in nanoseconds, convert to seconds
    result.update({name: total / 1_000_000_000 for name, total in timings.items()})
    return result

# Per-run operator timings of the binary-join plans, aggregated per query.
df_bin = (
    pd.DataFrame(list(map(collect_metrics, metrics)))
    .assign(method="DF-Bin")
    .groupby(["query", "method"])
    .agg(["mean", "median"])
)
df_bin.columns = ['_'.join(col) for col in df_bin.columns]
df_bin = df_bin.reset_index()

# Attach the total runtimes of the same method.
df_bin = df_agg.loc[df_agg["method"] == "DF-Bin"].merge(df_bin, on=["query", "method"])

# hashjoin time = total time - aggregate time - filter time - projection time
df_bin["hashjoin_time(s)_mean"] = (
    df_bin["duration(s)_mean"]
    .sub(df_bin["aggregate_time_mean"])
    .sub(df_bin["filter_time_mean"])
    .sub(df_bin["projection_time_mean"])
)
df_bin["hashjoin_time(s)_median"] = (
    df_bin["duration(s)_median"]
    .sub(df_bin["aggregate_time_median"])
    .sub(df_bin["filter_time_median"])
    .sub(df_bin["projection_time_median"])
)
df_bin = df_bin.drop(
    columns=[
        "aggregate_time_mean",
        "filter_time_mean",
        "projection_time_mean",
        "aggregate_time_median",
        "filter_time_median",
        "projection_time_median",
    ]
)

# This write creates/overwrites the file and emits the header row.
df_bin.to_csv('timings_agg.csv', index=False)
df_bin

Unnamed: 0,query,method,duration(s)_mean,duration(s)_median,hashjoin_time(s)_mean,hashjoin_time(s)_median
0,dblp_acyclic_201_00,DF-Bin,1.292136,1.291339,1.253745,1.253016
1,dblp_acyclic_201_01,DF-Bin,0.129688,0.129630,0.110419,0.110389
2,dblp_acyclic_201_02,DF-Bin,0.201509,0.200955,0.166934,0.166404
3,dblp_acyclic_201_03,DF-Bin,0.844165,0.843552,0.805474,0.804855
4,dblp_acyclic_201_04,DF-Bin,0.353416,0.352603,0.306617,0.305792
...,...,...,...,...,...,...
1589,yago_acyclic_tree_6_75,DF-Bin,0.114939,0.115007,0.094222,0.094247
1590,yago_acyclic_tree_6_76,DF-Bin,0.031876,0.031859,0.026433,0.026421
1591,yago_acyclic_tree_6_77,DF-Bin,0.023129,0.023131,0.022568,0.022578
1592,yago_acyclic_tree_6_78,DF-Bin,0.114430,0.114626,0.067123,0.067245


In [11]:
# Read the complete metrics file again, one JSON document per line; lines that
# fail to parse are reported and skipped.
metrics_file = "output/metrics.txt"
metrics = []
with open(metrics_file) as f:
    for i, line in enumerate(f):
        line = replace_utf8_string(line)
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError:
            print(f"Error in line {i}")
        else:
            metrics.append(parsed)

# Rows collected by the loop further down in this cell.
projection = []

def filter_time(filternode):
    """Return the `elapsed_compute` value of a filter node, or 0 if it has none."""
    # A missing metric means the filternode was never executed; this can be
    # due to early stopping in case of a multisemijoin with >=2 children.
    return next(
        (m["value"] for m in filternode["metrics"] if m["name"] == "elapsed_compute"),
        0,
    )

def projection_time(projectionnode):
    """Return the `elapsed_compute` value of a projection node, or 0 if it has none."""
    elapsed = (
        entry["value"]
        for entry in projectionnode["metrics"]
        if entry["name"] == "elapsed_compute"
    )
    # Falling back to 0 covers a projectionnode that was never executed, which
    # can be due to early stopping in case of a multisemijoin with >=2 children.
    return next(elapsed, 0)

def aggregate_time(aggregatenode):
    """Return the `elapsed_compute` value of an aggregate node.

    Raises:
        ValueError: if the node has no `elapsed_compute` metric.
    """
    found = next(
        (m for m in aggregatenode["metrics"] if m["name"] == "elapsed_compute"),
        None,
    )
    if found is None:
        raise ValueError("aggregate_time metric not found")
    return found["value"]
        
def collect_timings(node, timings: dict):
    """Add the timings of `node` and all its descendants to `timings` in place.

    `timings` must already contain the keys "filter_time", "projection_time"
    and "aggregate_time". A node whose operator name matches none of the
    handled prefixes contributes nothing itself, but its children are still
    visited.
    """
    handlers = (
        ("FilterExec", "filter_time", filter_time),
        ("ProjectionExec", "projection_time", projection_time),
        ("Aggregate", "aggregate_time", aggregate_time),
    )
    operator = node["operator"]
    for prefix, key, measure in handlers:
        if operator.startswith(prefix):
            timings[key] += measure(node)
            break

    for child in node["children"]:
        collect_timings(child, timings)


# Build one row of operator timings (in seconds) per non-BinaryJoin run.
for entry in metrics:
    method = entry["params"]["method"]
    if method == "BinaryJoin":  # skip binaryjoin, we're analyzing 2NSA now
        continue

    query = entry["params"]["query"]

    # Accumulate into a dedicated dict. Rebinding `metrics` here would replace
    # the list being iterated with a small timings dict once the cell finishes,
    # leaving any later use of `metrics` with the wrong object.
    timings = {"filter_time": 0, "projection_time": 0, "aggregate_time": 0}
    collect_timings(entry["plan"], timings)

    row = {
        "method": method,
        "query": query,
    }
    # already convert all timings from ns to s
    row.update({key: value / 1_000_000_000 for key, value in timings.items()})
    projection.append(row)

# Aggregate the collected rows per query under the "SYA" label.
yann_metrics = (
    pd.DataFrame(projection)
    .assign(method="SYA")
    .groupby(["query", "method"])
    .agg(["mean", "median"])
)
yann_metrics.columns = ['_'.join(col) for col in yann_metrics.columns]
yann_metrics = yann_metrics.reset_index()

# Attach the total runtimes of the same method.
yann_metrics = df_agg.loc[df_agg["method"] == "SYA"].merge(
    yann_metrics, on=["query", "method"]
)

# join_time = total_time - filter_time - projection_time - aggregate_time
yann_metrics["hashjoin_time(s)_mean"] = (
    yann_metrics["duration(s)_mean"]
    .sub(yann_metrics["filter_time_mean"])
    .sub(yann_metrics["projection_time_mean"])
    .sub(yann_metrics["aggregate_time_mean"])
)
yann_metrics["hashjoin_time(s)_median"] = (
    yann_metrics["duration(s)_median"]
    .sub(yann_metrics["filter_time_median"])
    .sub(yann_metrics["projection_time_median"])
    .sub(yann_metrics["aggregate_time_median"])
)
yann_metrics = yann_metrics.drop(
    columns=[
        "filter_time_mean",
        "projection_time_mean",
        "aggregate_time_mean",
        "filter_time_median",
        "projection_time_median",
        "aggregate_time_median",
    ]
)

# Appended without a header to the existing timings_agg.csv.
yann_metrics.to_csv('timings_agg.csv', index=False, header=False, mode='a')
yann_metrics

Unnamed: 0,query,method,duration(s)_mean,duration(s)_median,hashjoin_time(s)_mean,hashjoin_time(s)_median
0,dblp_acyclic_201_00,SYA,0.637914,0.638233,0.599591,0.599858
1,dblp_acyclic_201_01,SYA,0.081931,0.081907,0.062678,0.062775
2,dblp_acyclic_201_02,SYA,0.150796,0.150736,0.116670,0.116644
3,dblp_acyclic_201_03,SYA,0.419517,0.419907,0.381785,0.382166
4,dblp_acyclic_201_04,SYA,0.186757,0.186599,0.141050,0.140958
...,...,...,...,...,...,...
1589,yago_acyclic_tree_6_75,SYA,0.137463,0.137628,0.115006,0.115146
1590,yago_acyclic_tree_6_76,SYA,0.027468,0.027487,0.021034,0.020978
1591,yago_acyclic_tree_6_77,SYA,0.004391,0.004387,0.003812,0.003820
1592,yago_acyclic_tree_6_78,SYA,0.147300,0.147164,0.094933,0.094798


## DuckDB runtimes

In [12]:
def total_mark_join_time(html):
    """Sum the timings of all HASH JOIN nodes that are labelled as MARK joins.

    Scans the raw HTML text for a bold `HASH JOIN (<seconds>s)` label whose
    paragraph is directly followed by a paragraph starting with `MARK`, and
    adds up the captured second values.

    Args:
        html: Raw HTML text of a query plan.

    Returns:
        The sum of the matched timings as a float, or 0 if nothing matches.
    """
    # The number is captured with [\d.]+ . Inside a character class `|` is a
    # literal, so the former [\d|\.] also accepted pipes and could hand
    # float() a string such as "1|2".
    pattern = r"<b>\s*HASH JOIN\s*\(([\d.]+)s\)\s*<\/b>\s*</p>\s*<p>\s*MARK"
    return sum(float(match.group(1)) for match in re.finditer(pattern, html))

def extract_times_from_timing_table(html_file_path: str):
    """Read the timings of interest from one HTML query-plan file.

    Every table row with at least two cells is inspected: the first cell is
    the row label and the second its value, converted to float. If a label
    occurs more than once, the last occurrence wins; a label that never occurs
    yields None.

    Returns a dict with the keys 'execution_time', 'seq_scan_time',
    'hashjoin_time', 'markjoin_time', 'aggregate_time', 'projection_time' and
    'filter_time'. 'markjoin_time' is not taken from the table but computed
    from the raw HTML by `total_mark_join_time`.
    """
    with open(html_file_path, 'r') as file:
        content = file.read()

    # Row label in the timing table -> key in the returned dict.
    key_for_phase = {
        'Execution Time': 'execution_time',
        'SEQ_SCAN': 'seq_scan_time',
        "HASH_JOIN": 'hashjoin_time',
        "UNGROUPED_AGGREGATE": 'aggregate_time',
        "PROJECTION": 'projection_time',
        "FILTER": 'filter_time',
    }
    found = dict.fromkeys(key_for_phase.values())

    for row in BeautifulSoup(content, 'html.parser').find_all('tr'):
        cells = row.find_all('td')
        if len(cells) <= 1:
            continue
        phase = cells[0].get_text(strip=True)
        value = cells[1].get_text(strip=True)
        key = key_for_phase.get(phase)
        if key is not None:
            found[key] = float(value)

    return {
        'execution_time': found['execution_time'],
        'seq_scan_time': found['seq_scan_time'],
        'hashjoin_time': found['hashjoin_time'],
        "markjoin_time": total_mark_join_time(content),
        'aggregate_time': found['aggregate_time'],
        'projection_time': found['projection_time'],
        'filter_time': found['filter_time'],
    }

def build_table(duckdb_plans: str):
    """
    Build a Pandas DataFrame with the timings reported by DuckDB html query plans.

    `duckdb_plans` is the path to the folder containing the html files. Each
    sub-folder is treated as one query (its name becomes the "query" value) and
    each `.html` file in it as one run (file name without extension becomes the
    "run" value). Other directory entries are ignored.
    """
    timing_columns = (
        "execution_time",
        "aggregate_time",
        "hashjoin_time",
        "markjoin_time",
        "projection_time",
        "filter_time",
        "seq_scan_time",
    )
    records = []

    for entry in os.listdir(duckdb_plans):
        folder = os.path.join(duckdb_plans, entry)
        if os.path.isdir(folder):
            html_names = (name for name in os.listdir(folder) if name.endswith('.html'))
            for html_name in html_names:
                times = extract_times_from_timing_table(os.path.join(folder, html_name))
                record = {"query": entry, "run": os.path.splitext(html_name)[0]}
                record.update((column, times[column]) for column in timing_columns)
                records.append(record)

    return pd.DataFrame(records)

duckdb_plans = "../../query_plans/ce_duckdb/2_original_with_aliases"
duckdb_df = (
    build_table(duckdb_plans)
    .drop(columns=["run"])
    .assign(method="DuckDB-Bin")
)

# subtract markjoin time from hashjoin time to get the actual time spent in computing inner hashjoins
duckdb_df["hashjoin_time(s)"] = duckdb_df["hashjoin_time"].sub(duckdb_df["markjoin_time"])
duckdb_df = duckdb_df.drop(
    columns=[
        "aggregate_time",
        "hashjoin_time",
        "markjoin_time",
        "projection_time",
        "filter_time",
        "seq_scan_time",
    ]
).rename(columns={"execution_time":"duration(s)"})

# Mean and median per query, with flattened "<column>_<statistic>" names.
duckdb_df = duckdb_df.groupby(["query", "method"]).agg(["mean", "median"])
duckdb_df.columns = ['_'.join(col) for col in duckdb_df.columns]
duckdb_df = duckdb_df.reset_index()

# Appended without a header to the existing timings_agg.csv.
duckdb_df.to_csv('timings_agg.csv', index=False, header=False, mode='a')
duckdb_df

Unnamed: 0,query,method,duration(s)_mean,duration(s)_median,hashjoin_time(s)_mean,hashjoin_time(s)_median
0,dblp_acyclic_201_00,DuckDB-Bin,0.867648,0.859715,0.826000,0.824105
1,dblp_acyclic_201_01,DuckDB-Bin,0.152020,0.147681,0.125084,0.125715
2,dblp_acyclic_201_02,DuckDB-Bin,0.206569,0.205610,0.178390,0.179452
3,dblp_acyclic_201_03,DuckDB-Bin,0.591593,0.590417,0.561913,0.561459
4,dblp_acyclic_201_04,DuckDB-Bin,0.352022,0.350535,0.314565,0.313139
...,...,...,...,...,...,...
1602,yago_acyclic_tree_6_75,DuckDB-Bin,0.248192,0.248308,0.204660,0.204962
1603,yago_acyclic_tree_6_76,DuckDB-Bin,0.051917,0.052372,0.043599,0.043944
1604,yago_acyclic_tree_6_77,DuckDB-Bin,0.028181,0.027772,0.025457,0.025055
1605,yago_acyclic_tree_6_78,DuckDB-Bin,0.193438,0.193142,0.146068,0.144882
