In [1]:
import pandas as pd
import json
import re
import os
from bs4 import BeautifulSoup

## DataFusion runtimes

Total runtimes of the binary-join (DF-Bin) and shredded Yannakakis (SYA) plans, aggregated per query (mean and median).

In [2]:
# Load the raw per-run timings and express the measured durations in seconds.
df = pd.read_csv('timings.csv')
df['duration(s)'] = df['duration(µs)'].div(1_000_000)
df = df.drop(columns=['variant', 'duration(µs)'])
# Use the short method labels: BinaryJoin -> DF-Bin, Yannakakis -> SYA.
df = df.replace("BinaryJoin", "DF-Bin").replace("Yannakakis", "SYA")

# Mean and median of the remaining columns per (query, method). The two-level
# column index is flattened into names such as 'duration(s)_mean'.
df_agg = df.groupby(["query", "method"]).agg(["mean", "median"])
df_agg.columns = df_agg.columns.map('_'.join)
df_agg = df_agg.reset_index()
# df_agg.to_csv('timings_agg.csv', index=False) # , mode='a'
df_agg

Unnamed: 0,query,method,duration(s)_mean,duration(s)_median
0,10a,DF-Bin,0.808268,0.807770
1,10a,SYA,0.778226,0.777806
2,10b,DF-Bin,0.481729,0.481706
3,10b,SYA,0.476152,0.476267
4,10c,DF-Bin,0.740892,0.740221
...,...,...,...,...
219,9b,SYA,0.361744,0.361259
220,9c,DF-Bin,0.497847,0.497847
221,9c,SYA,0.503854,0.503887
222,9d,DF-Bin,0.643600,0.644643


## DataFusion detailed metrics

In [3]:
def replace_utf8_string(text):
    r"""Escape the quotes of every `value: Utf8("...")` literal in `text`.

    Each occurrence becomes `value: Utf8(\"...\")`, written with exactly one
    space after `value:`; the rest of `text` is returned untouched.
    """
    # Non-greedy group: the literal ends at the first `")` after its opening quote.
    utf8_literal = re.compile(r'value:\s*Utf8\("(.*?)"\)')
    return utf8_literal.sub(r'value: Utf8(\"\1\")', text)

metrics_file = "output/metrics.txt"

# Parse the metrics file one line at a time, escaping the Utf8 literals first.
# Lines that still fail to decode as JSON are reported by index and skipped.
metrics = []
with open(metrics_file) as f:
    for i, line in enumerate(f):
        try:
            line = replace_utf8_string(line)
            record = json.loads(line)
        except json.JSONDecodeError:
            print(f"Error in line {i}")
        else:
            metrics.append(record)

# Drop 2NSA plan metrics (we're now analyzing binary joins)
metrics = list(filter(lambda m: m["params"]["method"] == "BinaryJoin", metrics))
print(len(metrics)) # should be nr_of_queries * 10 (10 repetitions)

1120


In [4]:
def get_metric(metric_name, metrics):
    """Return the first entry of `metrics` whose "name" equals `metric_name`.

    `metrics` is an iterable of dicts that each carry a "name" key. `None` is
    returned when no entry matches.
    """
    matching = (entry for entry in metrics if entry["name"] == metric_name)
    return next(matching, None)

def collect_metrics(metrics):
    """Flatten one metrics record into a dict of summed operator timings.

    `metrics` is a dict with a "params" entry (providing "method" and "query")
    and a "plan" entry, the root of a tree whose nodes have "operator",
    "metrics" and "children" keys.

    Returns a dict with "method", "query" and the totals "aggregate_time",
    "filter_time" and "projection_time" in seconds. Each total is the sum of
    the "elapsed_compute" metric over all nodes whose operator name starts
    with AggregateExec, FilterExec or ProjectionExec respectively.

    A matching node without an "elapsed_compute" metric makes the lookup fail,
    because `get_metric` returns None for it.
    """
    # Operator-name prefix -> total it contributes to. Operators matching no
    # prefix are only traversed (memoryexec does not contain timing metrics).
    tracked = (
        ("AggregateExec", "aggregate_time"),
        ("FilterExec", "filter_time"),
        ("ProjectionExec", "projection_time"),
    )

    summary = {
        "method": metrics["params"]["method"],
        "query": metrics["params"]["query"],
    }

    totals_ns = {key: 0 for _, key in tracked}

    def accumulate(node):
        operator = node["operator"]
        for prefix, key in tracked:
            if operator.startswith(prefix):
                totals_ns[key] += get_metric("elapsed_compute", node["metrics"])["value"]
                break
        for child in node["children"]:
            accumulate(child)

    accumulate(metrics["plan"])

    # all timings are in nanoseconds, convert to seconds
    for key, nanoseconds in totals_ns.items():
        summary[key] = nanoseconds / 1_000_000_000
    return summary

# One row of summed operator timings per binary-join run, relabelled so the
# method matches the label used in df_agg.
df_bin = pd.DataFrame([collect_metrics(record) for record in metrics])
df_bin = df_bin.assign(method="DF-Bin")

# Mean/median per (query, method), with flattened column names.
df_bin = df_bin.groupby(["query", "method"]).agg(["mean", "median"])
df_bin.columns = df_bin.columns.map('_'.join)
df_bin = df_bin.reset_index()

# Attach the total runtimes of the DF-Bin rows.
df_bin = df_agg.loc[df_agg["method"] == "DF-Bin"].merge(df_bin, on=["query", "method"])

# hashjoin time = total time - aggregate time - filter time - projection time
for stat in ("mean", "median"):
    df_bin[f"hashjoin_time(s)_{stat}"] = (
        df_bin[f"duration(s)_{stat}"]
        - df_bin[f"aggregate_time_{stat}"]
        - df_bin[f"filter_time_{stat}"]
        - df_bin[f"projection_time_{stat}"]
    )

# The per-operator columns were only needed to derive the hash-join time.
df_bin = df_bin.drop(columns=[
    "aggregate_time_mean", "aggregate_time_median",
    "filter_time_mean", "filter_time_median",
    "projection_time_mean", "projection_time_median",
])
df_bin.to_csv('timings_agg.csv', index=False)
df_bin

Unnamed: 0,query,method,duration(s)_mean,duration(s)_median,hashjoin_time(s)_mean,hashjoin_time(s)_median
0,10a,DF-Bin,0.808268,0.807770,0.065077,0.065061
1,10b,DF-Bin,0.481729,0.481706,0.042303,0.042252
2,10c,DF-Bin,0.740892,0.740221,0.243381,0.242953
3,11a,DF-Bin,0.077794,0.077833,0.002536,0.002565
4,11b,DF-Bin,0.046686,0.046591,0.015409,0.015208
...,...,...,...,...,...,...
107,8d,DF-Bin,2.548468,2.547300,2.381475,2.380463
108,9a,DF-Bin,0.605991,0.606510,0.112905,0.112869
109,9b,DF-Bin,0.348056,0.348185,0.040717,0.040730
110,9c,DF-Bin,0.497847,0.497847,0.137179,0.137204


In [5]:
metrics_file = "output/metrics.txt"

# Read the complete metrics file again so that the entries of every method are
# available. Lines that fail to decode as JSON are reported and skipped.
metrics = []
with open(metrics_file) as f:
    for i, line in enumerate(f):
        try:
            line = replace_utf8_string(line)
            record = json.loads(line)
        except json.JSONDecodeError:
            print(f"Error in line {i}")
        else:
            metrics.append(record)

# Collects one row (dict) per analysed run; filled further down.
projection = []

def filter_time(filternode):
    """Return the "elapsed_compute" value recorded on `filternode`.

    The first metric named "elapsed_compute" in `filternode["metrics"]` wins.
    When there is none, 0 is returned: the filter node was never executed,
    which can be due to early stopping in case of a multisemijoin with >=2
    children.
    """
    elapsed_values = (
        metric["value"]
        for metric in filternode["metrics"]
        if metric["name"] == "elapsed_compute"
    )
    return next(elapsed_values, 0)

def projection_time(projectionnode):
    """Return the "elapsed_compute" value recorded on `projectionnode`.

    The first metric named "elapsed_compute" in `projectionnode["metrics"]`
    wins. When there is none, 0 is returned: the projection node was never
    executed, which can be due to early stopping in case of a multisemijoin
    with >=2 children.
    """
    elapsed_values = (
        metric["value"]
        for metric in projectionnode["metrics"]
        if metric["name"] == "elapsed_compute"
    )
    return next(elapsed_values, 0)

def aggregate_time(aggregatenode):
    """Return the "elapsed_compute" value recorded on `aggregatenode`.

    The first metric named "elapsed_compute" in `aggregatenode["metrics"]`
    wins.

    Raises:
        ValueError: if the node has no "elapsed_compute" metric.
    """
    not_found = object()  # sentinel, so that any stored value is returned as-is
    elapsed = next(
        (
            metric["value"]
            for metric in aggregatenode["metrics"]
            if metric["name"] == "elapsed_compute"
        ),
        not_found,
    )
    if elapsed is not_found:
        raise ValueError("aggregate_time metric not found")
    return elapsed
        
def collect_timings(node, timings: dict):
    """Add the operator timings of the plan tree rooted at `node` to `timings`.

    Nodes are visited parent first, children in their listed order. A node
    whose operator name starts with "FilterExec", "ProjectionExec" or
    "Aggregate" adds its time to `timings["filter_time"]`,
    `timings["projection_time"]` or `timings["aggregate_time"]` respectively;
    all other nodes are only traversed. `timings` is updated in place and
    nothing is returned.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        operator = current["operator"]
        if operator.startswith("FilterExec"):
            timings["filter_time"] += filter_time(current)
        elif operator.startswith("ProjectionExec"):
            timings["projection_time"] += projection_time(current)
        elif operator.startswith("Aggregate"):
            timings["aggregate_time"] += aggregate_time(current)

        # Pushed in reverse so the first child is popped (visited) first.
        pending.extend(reversed(current["children"]))


# Build one row per non-BinaryJoin run holding its summed operator timings.
for entry in metrics:
    method = entry["params"]["method"]
    if method == "BinaryJoin":  # skip binaryjoin, we're analyzing 2NSA now
        continue

    query = entry["params"]["query"]
    # Accumulate into a dedicated dict. Rebinding `metrics` here would clobber
    # the list this loop iterates over and leave a stale dict behind afterwards.
    timings = {"filter_time": 0, "projection_time": 0, "aggregate_time": 0}
    collect_timings(entry["plan"], timings)
    row = {
        "method": method,
        "query": query,
    }
    # already convert all timings from ns to s
    row.update({key: value / 1_000_000_000 for key, value in timings.items()})
    projection.append(row)

# Aggregate the per-run rows to mean/median per (query, method). Every row is
# relabelled "SYA" so it matches the method label used in df_agg.
yann_metrics = pd.DataFrame(projection)
yann_metrics["method"] = "SYA"
yann_metrics = yann_metrics.groupby(["query","method"]).aggregate(["mean","median"])
yann_metrics.columns = ['_'.join(col) for col in yann_metrics.columns]
yann_metrics.reset_index(inplace=True)

# Attach the total runtimes of the SYA rows.
yann_metrics = pd.merge(
    df_agg[df_agg["method"] == "SYA"],
    yann_metrics,
    on=["query","method"]
)

# join_time = total_time - filter_time - projection_time - aggregate_time
yann_metrics["hashjoin_time(s)_mean"] = yann_metrics["duration(s)_mean"] - yann_metrics["filter_time_mean"] - yann_metrics["projection_time_mean"] - yann_metrics["aggregate_time_mean"]
yann_metrics["hashjoin_time(s)_median"] = yann_metrics["duration(s)_median"] - yann_metrics["filter_time_median"] - yann_metrics["projection_time_median"] - yann_metrics["aggregate_time_median"]
# The per-operator columns were only needed to derive the join time.
yann_metrics.drop(columns=["filter_time_mean","projection_time_mean","aggregate_time_mean","filter_time_median","projection_time_median","aggregate_time_median"], inplace=True)
# Appended without a header: the columns line up with the rows already written
# to timings_agg.csv.
yann_metrics.to_csv('timings_agg.csv', index=False, header=False, mode='a')
yann_metrics

Unnamed: 0,query,method,duration(s)_mean,duration(s)_median,hashjoin_time(s)_mean,hashjoin_time(s)_median
0,10a,SYA,0.778226,0.777806,0.034854,0.035224
1,10b,SYA,0.476152,0.476267,0.036536,0.036787
2,10c,SYA,0.609768,0.609623,0.112533,0.112749
3,11a,SYA,0.076912,0.076872,0.001844,0.001844
4,11b,SYA,0.044636,0.044656,0.013228,0.013132
...,...,...,...,...,...,...
107,8d,SYA,0.615817,0.615394,0.445329,0.445232
108,9a,SYA,0.572789,0.573454,0.077298,0.077478
109,9b,SYA,0.361744,0.361259,0.052471,0.052342
110,9c,SYA,0.503854,0.503887,0.136273,0.136589


## DuckDB runtimes

In [6]:
def total_mark_join_time(html):
    """Sum the reported times of all MARK hash joins in a DuckDB HTML plan.

    A MARK hash join is recognised as a bold `HASH JOIN (<time>s)` label whose
    next paragraph starts with `MARK`.

    Args:
        html: the plan's HTML as a string.

    Returns:
        The sum of the matched times as a float, or 0 when nothing matches.
    """
    # `[\d.]` accepts digits and the decimal point only. Inside a character
    # class `|` is a literal pipe, not an alternation, so it must not be listed.
    pattern = r"<b>\s*HASH JOIN\s*\(([\d.]+)s\)\s*<\/b>\s*</p>\s*<p>\s*MARK"
    return sum(float(match.group(1)) for match in re.finditer(pattern, html))

def extract_times_from_timing_table(html_file_path: str):
    """Read the timing values from one HTML query-plan file.

    Every table row (`<tr>`) with at least two `<td>` cells is inspected: the
    first cell is the phase label, the second its time. Only the labels listed
    in `phase_keys` below are kept; if a label occurs in several rows, the
    last row wins.

    Returns a dict with the keys 'execution_time', 'seq_scan_time',
    'hashjoin_time', 'markjoin_time', 'aggregate_time', 'projection_time' and
    'filter_time'. Values are floats, or None for a label that is absent from
    the file. 'markjoin_time' is computed by `total_mark_join_time` on the raw
    HTML.

    Raises:
        ValueError: if the time of a recognised label is not a valid float.
    """
    with open(html_file_path, 'r') as file:
        content = file.read()

    # Label in the timing table -> key under which its time is reported.
    phase_keys = {
        'Execution Time': 'execution_time',
        'SEQ_SCAN': 'seq_scan_time',
        "HASH_JOIN": 'hashjoin_time',
        "UNGROUPED_AGGREGATE": 'aggregate_time',
        "PROJECTION": 'projection_time',
        "FILTER": 'filter_time',
    }

    # Start with every entry missing; "markjoin_time" is filled in at the end.
    times = {
        'execution_time': None,
        'seq_scan_time': None,
        'hashjoin_time': None,
        "markjoin_time": None,
        'aggregate_time': None,
        'projection_time': None,
        'filter_time': None,
    }

    soup = BeautifulSoup(content, 'html.parser')
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) <= 1:
            continue

        phase = cells[0].get_text(strip=True)
        time = cells[1].get_text(strip=True)
        key = phase_keys.get(phase)
        if key is not None:
            times[key] = float(time)

    times["markjoin_time"] = total_mark_join_time(content)
    return times

def build_table(duckdb_plans: str):
    """
    Collect the timings of all DuckDB html query plans into a Pandas DataFrame.

    `duckdb_plans` is the path to a folder with one sub-folder per query; every
    `.html` file inside such a sub-folder is one run of that query. Entries
    that are not directories, and files without the `.html` extension, are
    ignored.

    The result has one row per html file: the query (sub-folder name), the run
    (file name without extension) and the times extracted from that file.
    """
    # Timing columns, in the order in which they appear in the table.
    reported = (
        "execution_time",
        "aggregate_time",
        "hashjoin_time",
        "markjoin_time",
        "projection_time",
        "filter_time",
        "seq_scan_time",
    )

    records = []
    for query_folder in os.listdir(duckdb_plans):
        query_folder_path = os.path.join(duckdb_plans, query_folder)
        if os.path.isdir(query_folder_path):
            html_files = [name for name in os.listdir(query_folder_path) if name.endswith('.html')]
            for query_file in html_files:
                times = extract_times_from_timing_table(os.path.join(query_folder_path, query_file))
                record = {
                    "query": query_folder,
                    "run": os.path.splitext(query_file)[0],
                }
                for key in reported:
                    record[key] = times[key]
                records.append(record)

    return pd.DataFrame(records)

duckdb_plans = "../../query_plans/imdb_duckdb/2_original_with_aliases"

# One row per html plan; the run id is not needed and every row gets the same
# method label.
duckdb_df = (
    build_table(duckdb_plans)
    .drop(columns=["run"])
    .assign(method="DuckDB-Bin")
)

# subtract markjoin time from hashjoin time to get the actual time spent in computing inner hashjoins
duckdb_df["hashjoin_time(s)"] = duckdb_df["hashjoin_time"] - duckdb_df["markjoin_time"]

# Keep only the total duration and the inner hash-join time.
duckdb_df = (
    duckdb_df
    .drop(columns=[
        "aggregate_time",
        "hashjoin_time",
        "markjoin_time",
        "projection_time",
        "filter_time",
        "seq_scan_time",
    ])
    .rename(columns={"execution_time": "duration(s)"})
)

# Mean/median per (query, method), with flattened column names.
duckdb_df = duckdb_df.groupby(["query", "method"]).agg(["mean", "median"])
duckdb_df.columns = duckdb_df.columns.map('_'.join)
duckdb_df = duckdb_df.reset_index()
duckdb_df.to_csv('timings_agg.csv', index=False, header=False, mode='a')
duckdb_df

Unnamed: 0,query,method,duration(s)_mean,duration(s)_median,hashjoin_time(s)_mean,hashjoin_time(s)_median
0,10a,DuckDB-Bin,1.024408,0.978643,0.023349,0.022693
1,10b,DuckDB-Bin,0.868753,0.861319,0.019812,0.019508
2,10c,DuckDB-Bin,1.132093,1.121761,0.182219,0.180219
3,11a,DuckDB-Bin,0.119697,0.118126,0.002360,0.002389
4,11b,DuckDB-Bin,0.126528,0.123618,0.011015,0.010953
...,...,...,...,...,...,...
108,8d,DuckDB-Bin,2.253939,2.223370,1.939751,1.913964
109,9a,DuckDB-Bin,1.187696,1.183863,0.089564,0.088296
110,9b,DuckDB-Bin,0.788807,0.783606,0.036458,0.036031
111,9c,DuckDB-Bin,1.138927,1.131404,0.155080,0.153628
