In [1]:
import pandas as pd
import json
import re
import os
from bs4 import BeautifulSoup
import numpy as np

## DataFusion runtimes

Total runtimes of the binary-join plans (DF-Bin) and Shredded Yannakakis (SYA), aggregated per query as the median over all runs.

In [2]:
# Raw DataFusion timings, one row per run; `duration(µs)` is in microseconds.
df = pd.read_csv('timings_revision.csv')
df = (
    df.assign(
        total_time=np.nan,  # total time = optimization time + execution time (in seconds)
        execution_time=df['duration(µs)'] / 1_000_000,  # µs -> s
    )
    .drop(columns=['variant', 'duration(µs)'])
    # Short method labels used throughout the rest of the notebook
    .replace("BinaryJoin", "DF-Bin")
    .replace("Yannakakis", "SYA")
)

# One row per (query, method): the median over the runs of that combination
df_agg = df.groupby(["query", "method"]).median().reset_index()
df_agg

Unnamed: 0,query,method,total_time,execution_time
0,10a,DF-Bin,,1.582985
1,10a,SYA,,1.560020
2,10b,DF-Bin,,1.301113
3,10b,SYA,,1.306005
4,10c,DF-Bin,,1.499487
...,...,...,...,...
219,9b,SYA,,1.340928
220,9c,DF-Bin,,1.474649
221,9c,SYA,,1.482741
222,9d,DF-Bin,,1.627611


## DataFusion detailed metrics

In [3]:
def replace_utf8_string(text):
    """Backslash-escape the quotes of every `value: Utf8("...")` literal in `text`.

    `value: Utf8("SomeString")` becomes `value: Utf8(\\"SomeString\\")`, so the
    inner quotes no longer terminate an enclosing JSON string. Any whitespace
    between `value:` and `Utf8` is normalised to a single space. Text without
    such a literal is returned unchanged.
    """
    # Non-greedy capture: stop at the first `")` after the opening quote.
    utf8_literal = re.compile(r'value:\s*Utf8\("(.*?)"\)')
    # `\"` is not a recognised template escape, so the backslash is emitted as is.
    escaped_form = r'value: Utf8(\"\1\")'
    return utf8_literal.sub(escaped_form, text)

# The metrics dump holds one JSON document per line.
metrics_file = "output_revision/metrics.txt"
metrics = []
with open(metrics_file) as metrics_handle:
    for line_number, raw_line in enumerate(metrics_handle):
        try:
            record = json.loads(replace_utf8_string(raw_line))
        except json.JSONDecodeError:
            # Report the unparsable line and carry on with the rest of the file
            print(f"Error in line {line_number}")
        else:
            metrics.append(record)

# Drop 2NSA plan metrics (we're now analyzing binary joins)
metrics = [record for record in metrics if record["params"]["method"] == "BinaryJoin"]
print(len(metrics)) # should be nr_of_queries * 10 (10 repetitions)

1120


In [4]:
def get_metric(metric_name, metrics):
    """Return the first entry of `metrics` whose "name" equals `metric_name`.

    `metrics` is an iterable of dicts that each carry a "name" key.
    Returns None when no entry matches.
    """
    return next((entry for entry in metrics if entry["name"] == metric_name), None)

def collect_metrics(metrics):
    """Sum the per-operator timings of one recorded query plan.

    `metrics` is one parsed record with a "params" dict (keys "method" and
    "query") and a "plan" tree whose nodes carry "operator", "metrics" and
    "children".

    Returns a flat dict with the method, the query and one accumulated time per
    tracked operator kind, converted from nanoseconds to seconds.
    """
    # (operator-name prefix, key of the accumulator, name of the metric to add)
    tracked_operators = (
        ("AggregateExec", "aggregate_time", "elapsed_compute"),
        ("FilterExec", "filter_time", "elapsed_compute"),
        ("ProjectionExec", "projection_time", "elapsed_compute"),
        # memoryexec does not contain timing metrics
        ("ParquetExec", "parquet_time", "time_elapsed_processing"),
        ("CoalesceBatchesExec", "coalesce_batches_time", "elapsed_compute"),
    )

    summary = {
        "method": metrics["params"]["method"],
        "query": metrics["params"]["query"],
    }
    totals = {key: 0 for _, key, _ in tracked_operators}

    def visit(node):
        # Pre-order walk: account for this node, then descend into its children
        operator = node["operator"]
        for prefix, key, metric_name in tracked_operators:
            if operator.startswith(prefix):
                totals[key] += get_metric(metric_name, node["metrics"])["value"]
                break
        for child in node["children"]:
            visit(child)

    visit(metrics["plan"])

    # all timings are in nanoseconds, convert to seconds
    summary.update({key: nanos / 1_000_000_000 for key, nanos in totals.items()})
    return summary

# Operator breakdown of every binary-join run, reduced to the median per query
df_bin = (
    pd.DataFrame([collect_metrics(record) for record in metrics])
    .assign(method="DF-Bin")
    .groupby(["query", "method"])
    .median()
    .reset_index()
)

# Attach the breakdown to the end-to-end DF-Bin timings of the same query
df_bin = df_agg[df_agg["method"] == "DF-Bin"].merge(df_bin, on=["query", "method"])

# hashjoin time = execution time - aggregate time - filter time - projection time
#                 - parquet time - coalesce_batches time
df_bin["hashjoin_time"] = (
    df_bin["execution_time"]
    .sub(df_bin["aggregate_time"])
    .sub(df_bin["filter_time"])
    .sub(df_bin["projection_time"])
    .sub(df_bin["parquet_time"])
    .sub(df_bin["coalesce_batches_time"])
)
df_bin

Unnamed: 0,query,method,total_time,execution_time,aggregate_time,filter_time,projection_time,parquet_time,coalesce_batches_time,hashjoin_time
0,10a,DF-Bin,,1.582985,0.000163,0.638807,0.000833,0.762457,0.037571,0.143154
1,10b,DF-Bin,,1.301113,0.000143,0.373556,0.000810,0.781096,0.037116,0.108391
2,10c,DF-Bin,,1.499487,0.000169,0.386751,0.000826,0.767246,0.040288,0.304208
3,11a,DF-Bin,,0.174990,0.000009,0.028621,0.000030,0.131995,0.002780,0.011557
4,11b,DF-Bin,,0.197555,0.000006,0.015633,0.000015,0.155758,0.000541,0.025602
...,...,...,...,...,...,...,...,...,...,...
107,8d,DF-Bin,,3.029763,0.001928,0.061289,0.000070,0.412683,0.009795,2.543998
108,9a,DF-Bin,,1.619190,0.000226,0.361585,0.001276,1.015957,0.044133,0.196013
109,9b,DF-Bin,,1.313363,0.000271,0.232304,0.000222,0.973837,0.007690,0.099040
110,9c,DF-Bin,,1.474649,0.000404,0.274196,0.001160,0.938995,0.040251,0.219643


In [5]:
# Distribution over the DF-Bin queries of the time (sec) spent in ParquetExec
print(df_bin["parquet_time"].describe())

# The same time expressed as a percentage of each query's execution time
print(df_bin["parquet_time"].div(df_bin["execution_time"]).mul(100).describe())

count    112.000000
mean       0.831672
std        0.545581
min        0.100383
25%        0.393768
50%        0.800421
75%        1.131922
max        1.937651
Name: parquet_time, dtype: float64
count    112.000000
mean      63.450616
std       15.507490
min       12.707469
25%       54.179044
50%       66.568664
75%       75.466461
max       87.862203
dtype: float64


In [6]:
# Keep only the columns of the shared results file. Selecting them by name
# (rather than dropping the breakdown columns in place) keeps this cell
# re-runnable and pins the column order: later cells append to the same CSV
# without a header, so their rows are only correct if the order matches.
df_bin = df_bin[["query", "method", "total_time", "execution_time", "hashjoin_time"]]

# No mode='a' here: this write (re)creates the file and emits the header row.
df_bin.to_csv('timings_agg_revision.csv', index=False)
df_bin

Unnamed: 0,query,method,total_time,execution_time,hashjoin_time
0,10a,DF-Bin,,1.582985,0.143154
1,10b,DF-Bin,,1.301113,0.108391
2,10c,DF-Bin,,1.499487,0.304208
3,11a,DF-Bin,,0.174990,0.011557
4,11b,DF-Bin,,0.197555,0.025602
...,...,...,...,...,...
107,8d,DF-Bin,,3.029763,2.543998
108,9a,DF-Bin,,1.619190,0.196013
109,9b,DF-Bin,,1.313363,0.099040
110,9c,DF-Bin,,1.474649,0.219643


In [7]:
# Read the full metrics dump again (one JSON document per line), this time
# without filtering on the method.
metrics_file = "output_revision/metrics.txt"
metrics = []
with open(metrics_file) as metrics_handle:
    for line_number, raw_line in enumerate(metrics_handle):
        try:
            record = json.loads(replace_utf8_string(raw_line))
        except json.JSONDecodeError:
            # Report the unparsable line and carry on with the rest of the file
            print(f"Error in line {line_number}")
        else:
            metrics.append(record)

# One dict of timings per analysed run is collected here
projection = []

def filter_time(filternode):
    """Return the "elapsed_compute" value recorded on a FilterExec plan node.

    Falls back to 0 when the node carries no such metric, i.e. the filter was
    never executed. This can be due to early stopping in case of a
    multisemijoin with >=2 children.
    """
    recorded = (
        metric["value"]
        for metric in filternode["metrics"]
        if metric["name"] == "elapsed_compute"
    )
    return next(recorded, 0)

def projection_time(projectionnode):
    """Return the "elapsed_compute" value recorded on a ProjectionExec plan node.

    Falls back to 0 when the node carries no such metric, i.e. the projection
    was never executed. This can be due to early stopping in case of a
    multisemijoin with >=2 children.
    """
    recorded = (
        metric["value"]
        for metric in projectionnode["metrics"]
        if metric["name"] == "elapsed_compute"
    )
    return next(recorded, 0)

def aggregate_time(aggregatenode):
    """Return the "elapsed_compute" value recorded on an aggregate plan node.

    Unlike the other per-operator helpers there is no fallback: a node without
    the metric raises ValueError.
    """
    not_found = object()
    elapsed = next(
        (
            metric["value"]
            for metric in aggregatenode["metrics"]
            if metric["name"] == "elapsed_compute"
        ),
        not_found,
    )
    if elapsed is not_found:
        raise ValueError("aggregate_time metric not found")
    return elapsed

def parquet_time(parquetnode):
    """Return the "time_elapsed_processing" value recorded on a ParquetExec plan node.

    Falls back to 0 when the node carries no such metric, i.e. the scan was
    never executed. This can be due to early stopping in case of a
    multisemijoin with >=2 children.
    """
    recorded = (
        metric["value"]
        for metric in parquetnode["metrics"]
        if metric["name"] == "time_elapsed_processing"
    )
    return next(recorded, 0)

def coalesce_batches_time(coalescenode):
    """Return the "elapsed_compute" value recorded on a CoalesceBatchesExec plan node.

    Falls back to 0 when the node carries no such metric, i.e. the operator was
    never executed. This can be due to early stopping in case of a
    multisemijoin with >=2 children.
    """
    recorded = (
        metric["value"]
        for metric in coalescenode["metrics"]
        if metric["name"] == "elapsed_compute"
    )
    return next(recorded, 0)
        
def collect_timings(node, timings: dict):
    """Add the time of every tracked operator in the plan tree under `node` to `timings`.

    `timings` is updated in place and must already contain the keys
    "filter_time", "projection_time", "aggregate_time", "parquet_time" and
    "coalesce_batches_time". Nodes whose operator matches none of the tracked
    prefixes contribute nothing themselves, but their children are still visited.
    """
    # (operator-name prefix, key in `timings`, helper that reads the node's time)
    tracked_operators = (
        ("FilterExec", "filter_time", filter_time),
        ("ProjectionExec", "projection_time", projection_time),
        ("Aggregate", "aggregate_time", aggregate_time),
        ("ParquetExec", "parquet_time", parquet_time),
        ("CoalesceBatchesExec", "coalesce_batches_time", coalesce_batches_time),
    )

    operator = node["operator"]
    for prefix, key, read_time in tracked_operators:
        if operator.startswith(prefix):
            timings[key] += read_time(node)
            break

    for child in node["children"]:
        collect_timings(child, timings)


# Operator breakdown of every run that is not a binary-join plan
for entry in metrics:
    method = entry["params"]["method"]
    if method == "BinaryJoin":  # skip binaryjoin, we're analyzing 2NSA now
        continue

    query = entry["params"]["query"]

    # Accumulate into a dict of its own. The loop must not rebind `metrics`:
    # that name is the list being iterated, and overwriting it would leave a
    # single run's timings behind in place of the loaded records.
    timings = {"filter_time": 0, "projection_time": 0, "aggregate_time": 0, "parquet_time": 0, "coalesce_batches_time": 0}
    collect_timings(entry["plan"], timings)

    row = {
        "method": method,
        "query": query,
    }
    # already convert all timings from ns to s
    row.update({key: value / 1_000_000_000 for key, value in timings.items()})
    projection.append(row)

# Median per query over all runs, labelled like the SYA rows of `df_agg`
yann_metrics = pd.DataFrame(projection)
yann_metrics["method"] = "SYA"
yann_metrics = yann_metrics.groupby(["query","method"]).aggregate("median")
yann_metrics.reset_index(inplace=True)

# Attach the breakdown to the end-to-end SYA timings of the same query
yann_metrics = pd.merge(
    df_agg[df_agg["method"] == "SYA"],
    yann_metrics,
    on=["query","method"]
)

# hashjoin_time = execution_time - aggregate_time - filter_time - projection_time
#                 - parquet_time - coalesce_batches_time
yann_metrics["hashjoin_time"] = yann_metrics["execution_time"] - yann_metrics["aggregate_time"] - yann_metrics["filter_time"] - yann_metrics["projection_time"] - yann_metrics["parquet_time"] - yann_metrics["coalesce_batches_time"]
yann_metrics

Unnamed: 0,query,method,total_time,execution_time,filter_time,projection_time,aggregate_time,parquet_time,coalesce_batches_time,hashjoin_time
0,10a,SYA,,1.560020,0.638070,0.000832,0.000153,0.764897,0.037614,0.118454
1,10b,SYA,,1.306005,0.373886,0.000813,0.000137,0.783112,0.036964,0.111094
2,10c,SYA,,1.373513,0.385713,0.000827,0.000158,0.765752,0.039309,0.181754
3,11a,SYA,,0.175129,0.028593,0.000035,0.000026,0.131889,0.002783,0.011804
4,11b,SYA,,0.195862,0.015662,0.000020,0.000024,0.155698,0.000533,0.023925
...,...,...,...,...,...,...,...,...,...,...
107,8d,SYA,,0.987600,0.059843,0.000072,0.001995,0.393547,0.009912,0.522231
108,9a,SYA,,1.584936,0.360474,0.001258,0.000209,1.014128,0.044123,0.164744
109,9b,SYA,,1.340928,0.233159,0.000225,0.000252,0.974695,0.007799,0.124798
110,9c,SYA,,1.482741,0.273470,0.001146,0.000406,0.943218,0.040326,0.224175


In [8]:
# Keep only the columns of the shared results file. Selecting them by name
# (rather than dropping the breakdown columns in place) keeps this cell
# re-runnable and guarantees the column order, which matters because the rows
# are appended without a header.
yann_metrics = yann_metrics[["query", "method", "total_time", "execution_time", "hashjoin_time"]]

# NOTE: mode='a' appends. Re-running this cell without first re-running the
# cell that creates the file adds the SYA rows a second time.
yann_metrics.to_csv('timings_agg_revision.csv', index=False, header=False, mode='a')
yann_metrics

Unnamed: 0,query,method,total_time,execution_time,hashjoin_time
0,10a,SYA,,1.560020,0.118454
1,10b,SYA,,1.306005,0.111094
2,10c,SYA,,1.373513,0.181754
3,11a,SYA,,0.175129,0.011804
4,11b,SYA,,0.195862,0.023925
...,...,...,...,...,...
107,8d,SYA,,0.987600,0.522231
108,9a,SYA,,1.584936,0.164744
109,9b,SYA,,1.340928,0.124798
110,9c,SYA,,1.482741,0.224175


## DuckDB runtimes

In [9]:
def total_mark_join_time(html):
    """Sum the times of all HASH JOIN operators labelled MARK in an HTML query plan.

    A match is a bold `HASH JOIN (<number>s)` title whose paragraph is directly
    followed by a paragraph starting with `MARK`. `html` is the raw HTML text.

    Returns the sum of the matched numbers as a float, or 0 when nothing matches.
    """
    # The number is captured with `[\d.]+`. A `|` inside `[...]` is a literal
    # pipe, not alternation, so it must not be part of the class: otherwise a
    # token such as "1|2" is captured and float() raises ValueError.
    pattern = r"<b>\s*HASH JOIN\s*\(([\d.]+)s\)\s*<\/b>\s*</p>\s*<p>\s*MARK"
    return sum(float(match.group(1)) for match in re.finditer(pattern, html))

def extract_times_from_timing_table(html_file_path: str):
    """Read the per-phase timings from one HTML query-plan file.

    Every table row (`<tr>`) with at least two cells is inspected: the first
    cell names the phase and the second holds its time, which is parsed as a
    float. Phases that do not occur in the file are reported as None; when a
    phase occurs more than once, the last occurrence wins. `markjoin_time` is
    not read from the table but computed by `total_mark_join_time` over the
    raw HTML.

    Returns a dict with the keys total_time, execution_time, seq_scan_time,
    hashjoin_time, markjoin_time, aggregate_time, projection_time, filter_time.
    """
    with open(html_file_path, 'r') as file:
        content = file.read()

    # Label in the first table cell -> key under which its time is reported
    key_by_phase = {
        "TOTAL TIME": 'total_time',
        'Execution Time': 'execution_time',
        'SEQ_SCAN': 'seq_scan_time',
        "HASH_JOIN": 'hashjoin_time',
        "UNGROUPED_AGGREGATE": 'aggregate_time',
        "PROJECTION": 'projection_time',
        "FILTER": 'filter_time',
    }
    parsed = dict.fromkeys(key_by_phase.values())  # every phase starts out as None

    for row in BeautifulSoup(content, 'html.parser').find_all('tr'):
        cells = row.find_all('td')
        if len(cells) <= 1:
            continue
        key = key_by_phase.get(cells[0].get_text(strip=True))
        if key is not None:
            parsed[key] = float(cells[1].get_text(strip=True))

    return {
        'total_time': parsed['total_time'],
        'execution_time': parsed['execution_time'],
        'seq_scan_time': parsed['seq_scan_time'],
        'hashjoin_time': parsed['hashjoin_time'],
        "markjoin_time": total_mark_join_time(content),
        'aggregate_time': parsed['aggregate_time'],
        'projection_time': parsed['projection_time'],
        'filter_time': parsed['filter_time'],
    }

def build_table(duckdb_plans: str):
    """
    Build a Pandas DataFrame with the timings reported by DuckDB html query plans.

    `duckdb_plans` is the path to a folder holding one sub-folder per query;
    every `.html` file inside a sub-folder is one run of that query. Entries of
    `duckdb_plans` that are not directories, and files without the `.html`
    extension, are ignored.

    Returns one row per html file: the query (sub-folder name), the run (file
    name without extension) and the timings extracted from the file.
    """
    # Timings copied from the extracted dict, in output column order
    timing_columns = (
        "total_time",
        "execution_time",
        "aggregate_time",
        "hashjoin_time",
        "markjoin_time",
        "projection_time",
        "filter_time",
        "seq_scan_time",
    )
    records = []

    for query_folder in os.listdir(duckdb_plans):
        query_folder_path = os.path.join(duckdb_plans, query_folder)
        if os.path.isdir(query_folder_path):
            html_files = (
                name for name in os.listdir(query_folder_path) if name.endswith('.html')
            )
            for query_file in html_files:
                times = extract_times_from_timing_table(
                    os.path.join(query_folder_path, query_file)
                )
                record = {
                    "query": query_folder,
                    "run": os.path.splitext(query_file)[0],
                }
                record.update((column, times[column]) for column in timing_columns)
                records.append(record)

    return pd.DataFrame(records)


duckdb_plans = "../../query_plans/imdb_duckdb/2_original_with_aliases"
# One row per html plan file; the run identifier is not needed for the aggregation
duckdb_df = build_table(duckdb_plans).drop(columns=["run"])
duckdb_df["method"] = "DuckDB-Bin"

# Time (sec) spent on SequentialScan
print(duckdb_df["seq_scan_time"].describe())

# Percentage of time spent on SequentialScan
print((duckdb_df["seq_scan_time"] / duckdb_df["execution_time"] * 100).describe())

duckdb_df = (
    duckdb_df
    # subtract markjoin time from hashjoin time to get the actual time spent
    # in computing inner hashjoins
    .assign(hashjoin_time=duckdb_df["hashjoin_time"] - duckdb_df["markjoin_time"])
    .drop(columns=["aggregate_time", "markjoin_time", "projection_time", "filter_time", "seq_scan_time"])
    # median per query over all runs
    .groupby(["query", "method"])
    .median()
    .reset_index()
)
# Appended without header to the shared results file
duckdb_df.to_csv('timings_agg_revision.csv', mode='a', header=False, index=False)
duckdb_df

count    1130.000000
mean        0.378414
std         0.236154
min         0.004126
25%         0.169285
50%         0.324748
75%         0.556659
max         0.923550
Name: seq_scan_time, dtype: float64
count    1130.000000
mean       57.487826
std        20.531071
min         6.940436
25%        46.602075
50%        59.458664
75%        70.368541
max        95.117681
dtype: float64


Unnamed: 0,query,method,total_time,execution_time,hashjoin_time
0,10a,DuckDB-Bin,0.988438,0.978643,0.022693
1,10b,DuckDB-Bin,0.870636,0.861319,0.019508
2,10c,DuckDB-Bin,1.167226,1.121761,0.180219
3,11a,DuckDB-Bin,0.122523,0.118126,0.002389
4,11b,DuckDB-Bin,0.128717,0.123618,0.010953
...,...,...,...,...,...
108,8d,DuckDB-Bin,2.365270,2.223370,1.913964
109,9a,DuckDB-Bin,1.210967,1.183863,0.088296
110,9b,DuckDB-Bin,0.793663,0.783606,0.036031
111,9c,DuckDB-Bin,1.172012,1.131404,0.153628


## Umbra runtimes

In [10]:
import numpy as np
def parse_umbra_timings(file: str, benchmark: str, method: str) -> pd.DataFrame:
    """Load one Umbra result CSV and reshape it to the shared results layout.

    The CSV must provide the columns `name`, `compilation_time_median` and
    `execution_time_median`. A `name` has the form `<benchmark>:<query>.<rest>`;
    only rows whose benchmark part equals `benchmark` are kept.

    Parameters:
        file: path of the CSV file.
        benchmark: benchmark prefix to keep (the part of `name` before the first ':').
        method: label written to the `method` column of every returned row.

    Returns:
        A DataFrame with the columns query, method, total_time, execution_time,
        hashjoin_time. `total_time` is compilation plus execution time and
        `hashjoin_time` is always NaN. The rows keep their index from the CSV.
    """
    raw = pd.read_csv(file)

    # Vectorised split of `name`. A name without ':' yields a missing query
    # instead of raising inside a per-row lambda.
    name_parts = raw["name"].str.split(":")
    labelled = raw.assign(
        benchmark=name_parts.str[0],
        query=name_parts.str[1].str.split(".").str[0],
        method=method,
    )

    selected = labelled.loc[
        labelled["benchmark"] == benchmark,
        ["query", "method", "compilation_time_median", "execution_time_median"],
    ]

    # `assign`/`rename` return new frames, so no column is ever set on a
    # filtered slice (which can trigger pandas' SettingWithCopyWarning).
    result = selected.assign(
        total_time=selected["compilation_time_median"] + selected["execution_time_median"],
        hashjoin_time=np.nan,
    ).rename(columns={"execution_time_median": "execution_time"})

    return result[["query", "method", "total_time", "execution_time", "hashjoin_time"]]

In [11]:
# Umbra-Default: keep the rows of benchmark "job" and append them, without
# header, to the shared results file.
method, benchmark = "Umbra-Default", "job"
file = "../../umbra/results/benchmark_umbra_default.csv"

umbra = parse_umbra_timings(file=file, benchmark=benchmark, method=method)
umbra.to_csv('timings_agg_revision.csv', mode='a', header=False, index=False)
umbra

Unnamed: 0,query,method,total_time,execution_time,hashjoin_time
0,1a,Umbra-Default,0.068412,0.012775,
1,1b,Umbra-Default,0.071148,0.008437,
2,1c,Umbra-Default,0.070765,0.010346,
3,1d,Umbra-Default,0.087652,0.010968,
4,2a,Umbra-Default,0.073100,0.026065,
...,...,...,...,...,...
108,32a,Umbra-Default,0.082660,0.014483,
109,32b,Umbra-Default,0.085099,0.019090,
110,33a,Umbra-Default,0.162391,0.020115,
111,33b,Umbra-Default,0.158270,0.019704,


In [12]:
# Umbra-L&E: keep the rows of benchmark "job" and append them, without
# header, to the shared results file.
method, benchmark = "Umbra-L&E", "job"
file = "../../umbra/results/benchmark_umbra_le.csv"

umbra = parse_umbra_timings(file=file, benchmark=benchmark, method=method)
umbra.to_csv('timings_agg_revision.csv', mode='a', header=False, index=False)
umbra

Unnamed: 0,query,method,total_time,execution_time,hashjoin_time
0,1a,Umbra-L&E,0.086141,0.011075,
1,1b,Umbra-L&E,0.084050,0.009298,
2,1c,Umbra-L&E,0.090959,0.008799,
3,1d,Umbra-L&E,0.103766,0.009495,
4,2a,Umbra-L&E,0.089164,0.024307,
...,...,...,...,...,...
105,32a,Umbra-L&E,0.114397,0.012943,
106,32b,Umbra-L&E,0.107303,0.015704,
107,33a,Umbra-L&E,29.350372,0.017372,
108,33b,Umbra-L&E,28.738583,0.016783,


In [13]:
# Umbra-Interpreted: keep the rows of benchmark "job" and append them, without
# header, to the shared results file.
method, benchmark = "Umbra-Interpreted", "job"
file = "../../umbra/results/benchmark_umbra_interpreted.csv"

umbra = parse_umbra_timings(file=file, benchmark=benchmark, method=method)
umbra.to_csv('timings_agg_revision.csv', mode='a', header=False, index=False)
umbra

Unnamed: 0,query,method,total_time,execution_time,hashjoin_time
0,1a,Umbra-Interpreted,0.103361,0.100822,
1,1b,Umbra-Interpreted,0.043645,0.041516,
2,1c,Umbra-Interpreted,0.045115,0.043278,
3,1d,Umbra-Interpreted,0.041994,0.040230,
4,2a,Umbra-Interpreted,0.229362,0.227750,
...,...,...,...,...,...
108,32a,Umbra-Interpreted,0.204025,0.201990,
109,32b,Umbra-Interpreted,0.211425,0.209906,
110,33a,Umbra-Interpreted,0.212847,0.206315,
111,33b,Umbra-Interpreted,0.214171,0.207970,


In [14]:
# Umbra-Chained: keep the rows of benchmark "job" and append them, without
# header, to the shared results file.
method, benchmark = "Umbra-Chained", "job"
file = "../../umbra/results/benchmark_umbra_chained.csv"

umbra = parse_umbra_timings(file=file, benchmark=benchmark, method=method)
umbra.to_csv('timings_agg_revision.csv', mode='a', header=False, index=False)
umbra

Unnamed: 0,query,method,total_time,execution_time,hashjoin_time
0,1a,Umbra-Chained,0.046395,0.007242,
1,1b,Umbra-Chained,0.043546,0.005649,
2,1c,Umbra-Chained,0.045739,0.006706,
3,1d,Umbra-Chained,0.049348,0.006706,
4,2a,Umbra-Chained,0.046001,0.016703,
...,...,...,...,...,...
108,32a,Umbra-Chained,0.050683,0.009501,
109,32b,Umbra-Chained,0.053113,0.012428,
110,33a,Umbra-Chained,0.105997,0.012651,
111,33b,Umbra-Chained,0.103856,0.012063,
