In [13]:
import pandas as pd
import json
import re
import os
from bs4 import BeautifulSoup

## DataFusion runtimes

Total runtimes of binary join (DF-Bin) and shredded Yannakakis (SYA), aggregated per query.

In [14]:
# Load the raw per-run timings, convert the duration column from µs to s and
# relabel the two methods with the short names used in the result tables.
df = pd.read_csv('timings.csv')
df = (
    df.assign(**{'duration(s)': df['duration(µs)'] / 1_000_000})
    .drop(columns=['variant', 'duration(µs)'])
    .replace("BinaryJoin", "DF-Bin")
    .replace("Yannakakis", "SYA")
)

# Mean and median of every remaining value column per (query, method) pair.
df_agg = df.groupby(["query", "method"]).aggregate(["mean", "median"])
# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names.
df_agg.columns = [f"{column}_{stat}" for column, stat in df_agg.columns]
df_agg = df_agg.reset_index()
# df_agg.to_csv('timings_agg.csv', index=False) # , mode='a'
df_agg

Unnamed: 0,query,method,duration(s)_mean,duration(s)_median
0,1,DF-Bin,0.001155,0.001116
1,1,SYA,0.000747,0.000742
2,2,DF-Bin,0.144960,0.144696
3,2,SYA,0.018891,0.018862
4,3,DF-Bin,0.021759,0.021765
...,...,...,...,...
281,144,SYA,0.088474,0.088408
282,145,DF-Bin,0.014366,0.014273
283,145,SYA,0.011219,0.011108
284,146,DF-Bin,0.026471,0.026405


## DataFusion detailed metrics

In [15]:
def replace_utf8_string(text):
    """Backslash-escape the quotes inside every `value: Utf8("...")` literal.

    Each occurrence of `value: Utf8("SomeString")` (any amount of whitespace
    after the colon) is rewritten to `value: Utf8(\\"SomeString\\")`, with a
    single space after the colon. Text that does not match is returned
    unchanged.
    """
    # Lazy capture group: stops at the first `")` after the opening quote.
    utf8_literal = re.compile(r'value:\s*Utf8\("(.*?)"\)')
    # `\"` is not a recognised template escape, so the backslash is emitted as-is.
    return utf8_literal.sub(r'value: Utf8(\"\1\")', text)

# Parse the metrics dump: every line is expected to hold one JSON document
# once the Utf8("...") literals have been escaped.
metrics_file = "output/metrics.txt"
metrics = []
with open(metrics_file) as f:
    for i, line in enumerate(f):
        try:
            parsed_entry = json.loads(replace_utf8_string(line))
        except json.JSONDecodeError:
            # Report the offending (0-based) line and carry on with the rest.
            print(f"Error in line {i}")
        else:
            metrics.append(parsed_entry)

# Drop 2NSA plan metrics (we're now analyzing binary joins)
metrics = [entry for entry in metrics if entry["params"]["method"] == "BinaryJoin"]
print(len(metrics)) # should be nr_of_queries * 10 (10 repetitions)

1430


In [16]:
def get_metric(metric_name, metrics):
    """Return the first entry of `metrics` whose "name" equals `metric_name`.

    `metrics` is an iterable of dicts that each carry a "name" key. When no
    entry matches, `None` is returned.
    """
    return next(
        (candidate for candidate in metrics if candidate["name"] == metric_name),
        None,
    )

def collect_metrics(metrics):
    """Summarise one metrics entry into a flat row of per-operator timings.

    `metrics` is a single parsed entry with a "params" dict (keys "method"
    and "query") and a "plan" tree whose nodes have "operator", "metrics"
    and "children" keys.

    Returns a dict with "method", "query" (as int) and the summed
    "elapsed_compute" of all AggregateExec / FilterExec / ProjectionExec
    nodes as "aggregate_time", "filter_time" and "projection_time", each
    divided by 1e9.
    """
    # Operator-name prefix -> accumulator key. Operators that match none of
    # these (e.g. memoryexec, which does not contain timing metrics) are skipped.
    tracked_operators = (
        ("AggregateExec", "aggregate_time"),
        ("FilterExec", "filter_time"),
        ("ProjectionExec", "projection_time"),
    )
    totals = {key: 0 for _, key in tracked_operators}

    def accumulate(node):
        # Pre-order walk: account for this node first, then descend.
        operator = node["operator"]
        for prefix, key in tracked_operators:
            if operator.startswith(prefix):
                totals[key] += get_metric("elapsed_compute", node["metrics"])["value"]
                break
        for child in node["children"]:
            accumulate(child)

    result = {
        "method": metrics["params"]["method"],
        "query": int(metrics["params"]["query"]),
    }
    accumulate(metrics["plan"])
    # all timings are in nanoseconds, convert to seconds
    result.update({key: total / 1_000_000_000 for key, total in totals.items()})
    return result

# Per-query mean/median of the operator timings of every BinaryJoin run.
bin_operator_times = (
    pd.DataFrame([collect_metrics(m) for m in metrics])
    .assign(method="DF-Bin")
    .groupby(["query", "method"])
    .aggregate(["mean", "median"])
)
# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names.
bin_operator_times.columns = [
    f"{column}_{stat}" for column, stat in bin_operator_times.columns
]

# Attach the total runtimes of the DF-Bin rows computed earlier.
df_bin = df_agg[df_agg["method"] == "DF-Bin"].merge(
    bin_operator_times.reset_index(),
    on=["query", "method"],
)

# hashjoin time = total time - aggregate time - filter time - projection time
for stat in ("mean", "median"):
    df_bin[f"hashjoin_time(s)_{stat}"] = (
        df_bin[f"duration(s)_{stat}"]
        - df_bin[f"aggregate_time_{stat}"]
        - df_bin[f"filter_time_{stat}"]
        - df_bin[f"projection_time_{stat}"]
    )

# The individual operator columns were only needed for the subtraction above.
df_bin = df_bin.drop(
    columns=[
        "aggregate_time_mean", "filter_time_mean", "projection_time_mean",
        "aggregate_time_median", "filter_time_median", "projection_time_median",
    ]
)
# This write (re)creates the file, including the header row.
df_bin.to_csv('timings_agg.csv', index=False)
df_bin

Unnamed: 0,query,method,duration(s)_mean,duration(s)_median,hashjoin_time(s)_mean,hashjoin_time(s)_median
0,1,DF-Bin,0.001155,0.001116,0.001047,0.001016
1,2,DF-Bin,0.144960,0.144696,0.142855,0.142590
2,3,DF-Bin,0.021759,0.021765,0.019512,0.019558
3,4,DF-Bin,0.019939,0.019915,0.018830,0.018818
4,5,DF-Bin,0.070732,0.069803,0.067254,0.066524
...,...,...,...,...,...,...
138,142,DF-Bin,5.132936,5.134273,5.092906,5.094812
139,143,DF-Bin,4.018366,3.974902,3.991139,3.948167
140,144,DF-Bin,0.338679,0.338933,0.332521,0.332772
141,145,DF-Bin,0.014366,0.014273,0.010296,0.010385


In [17]:
# Re-read the complete metrics dump; no method filter is applied here.
metrics_file = "output/metrics.txt"
metrics = []
with open(metrics_file) as f:
    for i, line in enumerate(f):
        try:
            parsed_entry = json.loads(replace_utf8_string(line))
        except json.JSONDecodeError:
            # Report the offending (0-based) line and carry on with the rest.
            print(f"Error in line {i}")
        else:
            metrics.append(parsed_entry)

# Accumulates one row of operator timings per processed entry.
projection = []

def filter_time(filternode):
    """Return the "elapsed_compute" value recorded on a filter node.

    Looks at `filternode["metrics"]` and returns the "value" of the first
    metric named "elapsed_compute". Returns 0 when there is none: the
    filternode was never executed, which can be due to early stopping in
    case of a multisemijoin with >=2 children.
    """
    return next(
        (
            metric["value"]
            for metric in filternode["metrics"]
            if metric["name"] == "elapsed_compute"
        ),
        0,
    )

def projection_time(projectionnode):
    """Return the "elapsed_compute" value recorded on a projection node.

    Scans `projectionnode["metrics"]` in order and returns the "value" of
    the first metric named "elapsed_compute", or 0 if no such metric exists.
    """
    for candidate in projectionnode["metrics"]:
        if candidate["name"] != "elapsed_compute":
            continue
        return candidate["value"]

    # No timing metric: the projectionnode was never executed, which can be
    # due to early stopping in case of a multisemijoin with >=2 children.
    return 0

def aggregate_time(aggregatenode):
    """Return the "elapsed_compute" value recorded on an aggregate node.

    Returns the "value" of the first metric in `aggregatenode["metrics"]`
    named "elapsed_compute".

    Raises:
        ValueError: if the node carries no "elapsed_compute" metric.
    """
    not_found = object()  # sentinel, so that any stored value (even None) counts as found
    elapsed = next(
        (
            metric["value"]
            for metric in aggregatenode["metrics"]
            if metric["name"] == "elapsed_compute"
        ),
        not_found,
    )
    if elapsed is not_found:
        raise ValueError("aggregate_time metric not found")
    return elapsed
        
def collect_timings(node, timings: dict):
    """Add the operator timings of `node` and all its descendants to `timings`.

    `timings` is updated in place and must already contain the keys
    "filter_time", "projection_time" and "aggregate_time". A node contributes
    when its "operator" starts with "FilterExec", "ProjectionExec" or
    "Aggregate"; every other operator is only traversed.
    """
    operator = node["operator"]
    if operator.startswith("FilterExec"):
        bucket, measure = "filter_time", filter_time
    elif operator.startswith("ProjectionExec"):
        bucket, measure = "projection_time", projection_time
    elif operator.startswith("Aggregate"):
        bucket, measure = "aggregate_time", aggregate_time
    else:
        bucket = measure = None

    if measure is not None:
        timings[bucket] += measure(node)

    # Recurse into the children regardless of whether this node was counted.
    for child in node["children"]:
        collect_timings(child, timings)


# Build one row of operator timings per non-BinaryJoin run and append it to
# `projection`.
#
# The accumulator is deliberately NOT called `metrics`: that name is the list
# being iterated. Rebinding it inside the loop left `metrics` pointing at the
# last run's timing dict once the cell finished, so any later code (or a
# re-run of an earlier cell) that expects the parsed entries would break.
# The unused `entry["plan"]["metrics"]` lookup was removed for the same reason.
for entry in metrics:
    method = entry["params"]["method"]
    if method == "BinaryJoin":  # skip binaryjoin, we're analyzing 2NSA now
        continue

    query = int(entry["params"]["query"])
    row = {
        "method": method,
        "query": query,
    }
    run_timings = {"filter_time": 0, "projection_time": 0, "aggregate_time": 0}
    collect_timings(entry["plan"], run_timings)
    # already convert all timings from ns to s
    row.update({key: value / 1_000_000_000 for key, value in run_timings.items()})
    projection.append(row)

# Per-query mean/median of the operator timings collected in `projection`.
sya_operator_times = (
    pd.DataFrame(projection)
    .assign(method="SYA")
    .groupby(["query", "method"])
    .aggregate(["mean", "median"])
)
# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names.
sya_operator_times.columns = [
    f"{column}_{stat}" for column, stat in sya_operator_times.columns
]

# Attach the total runtimes of the SYA rows computed earlier.
yann_metrics = df_agg[df_agg["method"] == "SYA"].merge(
    sya_operator_times.reset_index(),
    on=["query", "method"],
)

# join_time = total_time - filter_time - projection_time - aggregate_time
for stat in ("mean", "median"):
    yann_metrics[f"hashjoin_time(s)_{stat}"] = (
        yann_metrics[f"duration(s)_{stat}"]
        - yann_metrics[f"filter_time_{stat}"]
        - yann_metrics[f"projection_time_{stat}"]
        - yann_metrics[f"aggregate_time_{stat}"]
    )

# The individual operator columns were only needed for the subtraction above.
yann_metrics = yann_metrics.drop(
    columns=[
        "filter_time_mean", "projection_time_mean", "aggregate_time_mean",
        "filter_time_median", "projection_time_median", "aggregate_time_median",
    ]
)
# Appended without a header row to the existing timings_agg.csv.
yann_metrics.to_csv('timings_agg.csv', index=False, header=False, mode='a')
yann_metrics

Unnamed: 0,query,method,duration(s)_mean,duration(s)_median,hashjoin_time(s)_mean,hashjoin_time(s)_median
0,1,SYA,0.000747,0.000742,0.000647,0.000644
1,2,SYA,0.018891,0.018862,0.017166,0.017139
2,3,SYA,0.005443,0.005412,0.003275,0.003255
3,4,SYA,0.005679,0.005645,0.004437,0.004430
4,5,SYA,0.015824,0.015700,0.012538,0.012442
...,...,...,...,...,...,...
138,142,SYA,2.143622,2.145800,2.121184,2.123063
139,143,SYA,0.634139,0.635128,0.618966,0.619863
140,144,SYA,0.088474,0.088408,0.085586,0.085530
141,145,SYA,0.011219,0.011108,0.007271,0.007217


## DuckDB runtimes

In [18]:
def total_mark_join_time(html):
    """Sum the timings of all HASH JOIN boxes whose next paragraph starts with MARK.

    Scans the raw `html` text for `<b>HASH JOIN (<number>s)</b></p><p>MARK`
    (arbitrary whitespace allowed between the pieces) and adds up the
    `<number>` values.

    Args:
        html: the HTML document as a string.

    Returns:
        The sum of the matched timings as a float, or 0 when nothing matches.
    """
    # Only digits and dots may form the number. The previous class `[\d|\.]`
    # also accepted a literal '|' (inside [...] it is not alternation), so a
    # malformed token such as "1|2" would match and then crash float().
    pattern = r"<b>\s*HASH JOIN\s*\(([\d.]+)s\)\s*<\/b>\s*</p>\s*<p>\s*MARK"
    return sum(float(match.group(1)) for match in re.finditer(pattern, html))

def extract_times_from_timing_table(html_file_path: str):
    """Read the per-phase timings out of one HTML query-plan file.

    Every `<tr>` with at least two `<td>` cells is inspected: the first cell
    is the phase label and the second its timing, parsed as a float. When a
    label occurs more than once, the last occurrence wins.

    Returns a dict with the keys 'execution_time', 'seq_scan_time',
    'hashjoin_time', 'markjoin_time', 'aggregate_time', 'projection_time'
    and 'filter_time'. Phases that do not appear in the table are `None`;
    'markjoin_time' comes from `total_mark_join_time` applied to the raw HTML.
    """
    # Label shown in the table -> key in the returned dict.
    label_to_key = {
        'Execution Time': 'execution_time',
        'SEQ_SCAN': 'seq_scan_time',
        "HASH_JOIN": 'hashjoin_time',
        "UNGROUPED_AGGREGATE": 'aggregate_time',
        "PROJECTION": 'projection_time',
        "FILTER": 'filter_time',
    }

    with open(html_file_path, 'r') as file:
        content = file.read()

    found = dict.fromkeys(label_to_key.values())  # every phase starts out as None
    for table_row in BeautifulSoup(content, 'html.parser').find_all('tr'):
        cells = table_row.find_all('td')
        if len(cells) <= 1:
            continue  # header rows / rows without a value cell

        label = cells[0].get_text(strip=True)
        raw_value = cells[1].get_text(strip=True)
        if label in label_to_key:
            found[label_to_key[label]] = float(raw_value)

    return {
        'execution_time': found['execution_time'],
        'seq_scan_time': found['seq_scan_time'],
        'hashjoin_time': found['hashjoin_time'],
        "markjoin_time": total_mark_join_time(content),
        'aggregate_time': found['aggregate_time'],
        'projection_time': found['projection_time'],
        'filter_time': found['filter_time'],
    }

def build_table(duckdb_plans: str):
    """
    Build Pandas DataFrame with timings reported by DuckDB html query plans.
    `duckdb_plans` is the path to the folder containing the html files.

    Each sub-folder of `duckdb_plans` is treated as one query and each
    `.html` file inside it as one run. The result has one row per run with
    the columns "query" (the folder name, kept as a string), "run" (the file
    name without extension) and the seven timing columns.
    """
    timing_columns = (
        "execution_time",
        "aggregate_time",
        "hashjoin_time",
        "markjoin_time",
        "projection_time",
        "filter_time",
        "seq_scan_time",
    )
    records = []

    for query_folder in os.listdir(duckdb_plans):
        folder_path = os.path.join(duckdb_plans, query_folder)
        if os.path.isdir(folder_path):
            html_files = [
                name for name in os.listdir(folder_path) if name.endswith('.html')
            ]
            for query_file in html_files:
                times = extract_times_from_timing_table(
                    os.path.join(folder_path, query_file)
                )
                record = {
                    "query": query_folder,
                    "run": os.path.splitext(query_file)[0],
                }
                record.update((column, times[column]) for column in timing_columns)
                records.append(record)

    return pd.DataFrame(records)

duckdb_plans = "../../query_plans/stats_duckdb/2_original_with_aliases"

# One row per run; the run identifier itself is not needed for the aggregation.
duckdb_runs = build_table(duckdb_plans).drop(columns=["run"])
duckdb_runs["method"] = "DuckDB-Bin"

# subtract markjoin time from hashjoin time to get the actual time spent in computing inner hashjoins
duckdb_runs["hashjoin_time(s)"] = duckdb_runs["hashjoin_time"] - duckdb_runs["markjoin_time"]

# Keep only the total and the inner-hashjoin time, then take mean/median per query.
duckdb_df = (
    duckdb_runs
    .drop(
        columns=[
            "aggregate_time", "hashjoin_time", "markjoin_time",
            "projection_time", "filter_time", "seq_scan_time",
        ]
    )
    .rename(columns={"execution_time": "duration(s)"})
    .groupby(["query", "method"])
    .aggregate(["mean", "median"])
)
# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names.
duckdb_df.columns = [f"{column}_{stat}" for column, stat in duckdb_df.columns]
duckdb_df = duckdb_df.reset_index()
# Appended without a header row to the existing timings_agg.csv.
duckdb_df.to_csv('timings_agg.csv', index=False, header=False, mode='a')
duckdb_df

Unnamed: 0,query,method,duration(s)_mean,duration(s)_median,hashjoin_time(s)_mean,hashjoin_time(s)_median
0,1,DuckDB-Bin,0.000633,0.000632,0.000480,0.000481
1,10,DuckDB-Bin,0.088972,0.088338,0.087335,0.086737
2,100,DuckDB-Bin,0.011423,0.011060,0.009605,0.009396
3,101,DuckDB-Bin,0.024160,0.023280,0.020778,0.020071
4,102,DuckDB-Bin,0.087483,0.085856,0.084785,0.083115
...,...,...,...,...,...,...
138,95,DuckDB-Bin,0.121884,0.124332,0.118877,0.121216
139,96,DuckDB-Bin,0.187618,0.189920,0.183773,0.186113
140,97,DuckDB-Bin,3.187001,2.773124,3.179245,2.765452
141,98,DuckDB-Bin,0.787426,0.788835,0.775044,0.776176
