# Evaluate Python UDF Performance (Iris)

In [4]:
import os

import pandas as pd

import json
import duckdb

### Load timing data

In [5]:
timing_dir = 'analyze_outputs'
files = [os.path.abspath(os.path.join(timing_dir, f)) for f in os.listdir(timing_dir) if '.json' in f and any([(s in f) for s in ['onnx_i', 'torch_i', 'onnx_single', 'torch_single']])]
files

['/u/antor/u13/ukumaras/Projects/mldb/eval/analyze_outputs/opt_vec_onnx_single_iris4_5000000.json',
 '/u/antor/u13/ukumaras/Projects/mldb/eval/analyze_outputs/opt_vec_torch_iris5_10000000.json',
 '/u/antor/u13/ukumaras/Projects/mldb/eval/analyze_outputs/opt_vec_onnx_iris5_10000000.json',
 '/u/antor/u13/ukumaras/Projects/mldb/eval/analyze_outputs/opt_vec_torch_iris3_10000000.json',
 '/u/antor/u13/ukumaras/Projects/mldb/eval/analyze_outputs/opt_vec_onnx_single_iris3_5000000.json',
 '/u/antor/u13/ukumaras/Projects/mldb/eval/analyze_outputs/opt_vec_onnx_single_iris5_15000000.json',
 '/u/antor/u13/ukumaras/Projects/mldb/eval/analyze_outputs/opt_vec_onnx_iris3_15000000.json',
 '/u/antor/u13/ukumaras/Projects/mldb/eval/analyze_outputs/opt_vec_torch_iris5_15000000.json',
 '/u/antor/u13/ukumaras/Projects/mldb/eval/analyze_outputs/opt_vec_onnx_iris4_20000000.json',
 '/u/antor/u13/ukumaras/Projects/mldb/eval/analyze_outputs/opt_vec_torch_iris5_20000000.json',
 '/u/antor/u13/ukumaras/Projects/mldb

In [6]:
times_raw = {}

for file in files:
    with open(file, "r") as f:
        name = file.split("/")[-1].split(".")[0]
        workload = name.split("_")[-1]
        key = name.split("_iris")[0]
        run = name.split("_iris")[1][0]
        times_raw[key + "~" + workload + "~" + run] = json.loads(f.read())

In [7]:
times = []

for k, v in times_raw.items():
    t = {}
    t['dataset'] = 'iris'
    t['pipeline'] = k.split("~")[0]
    t['workload'] = k.split("~")[1]
    t['run'] = k.split("~")[2]

    if t['run'] ==  '1':
        prediction = v['children'][0]['children'][0]['children'][0]
    else:
        prediction = v['children'][0]['children'][0]
    pred_info = prediction['extra_info'].split('|')[1:]
    for i in pred_info:
        kv = i.split(":")
        t[kv[0]] = kv[1]
    
    times.append(t)

In [8]:
times_df = pd.DataFrame.from_records(times).loc[:, ['dataset', 'pipeline', 'workload', 'move', 'load', 'predict']]
times_df['workload'] = times_df['workload'].astype(int)
times_df.sort_values(['pipeline', 'workload'])
times_df

Unnamed: 0,dataset,pipeline,workload,move,load,predict
0,iris,opt_vec_onnx_single,5000000,119684,7443,1491323
1,iris,opt_vec_torch,10000000,259892,5398,3543445
2,iris,opt_vec_onnx,10000000,295035,7196,2960334
3,iris,opt_vec_torch,10000000,280028,5362,3486194
4,iris,opt_vec_onnx_single,5000000,119771,9563,1568372
5,iris,opt_vec_onnx_single,15000000,356580,8027,4603015
6,iris,opt_vec_onnx,15000000,440942,7125,4144989
7,iris,opt_vec_torch,15000000,389370,4789,5301605
8,iris,opt_vec_onnx,20000000,586944,6867,5945266
9,iris,opt_vec_torch,20000000,520116,4238,6900531


In [9]:
times_df = pd.concat([times_df])

### Write to times table

In [10]:
con = duckdb.connect("../test.db")
tables = con.sql(f"SHOW TABLES").df()
has_table = tables[tables.name == 'times'].shape[0] == 1
    
if not has_table:
    con.sql(f"CREATE TABLE times (dataset varchar, workload integer, move integer, load integer, predict integer, ts timestamp DEFAULT current_timestamp)")

con.sql(f"DELETE FROM times WHERE pipeline LIKE 'opt%'")
con.sql(f"INSERT INTO times (dataset, pipeline, workload, move, load, predict) SELECT * FROM times_df")
con.sql(f"SELECT * FROM times").show()
con.close()

┌─────────┬──────────────────────┬──────────┬─────────┬────────┬──────────┬─────────────────────────┐
│ dataset │       pipeline       │ workload │  move   │  load  │ predict  │           ts            │
│ varchar │       varchar        │  int32   │  int32  │ int32  │  int32   │        timestamp        │
├─────────┼──────────────────────┼──────────┼─────────┼────────┼──────────┼─────────────────────────┤
│ iris    │ python_udf_vec       │  1000000 │  218034 │  13971 │  2139728 │ 2024-05-21 14:43:26.063 │
│ iris    │ python_udf_vec       │  5000000 │ 1185950 │  14044 │  5536022 │ 2024-05-21 14:43:54.855 │
│ iris    │ python_udf_vec       │ 10000000 │ 2224030 │  13889 │  7646142 │ 2024-05-21 14:49:48.529 │
│ iris    │ python_udf_vec       │ 15000000 │ 3188160 │  13680 │ 11789199 │ 2024-05-21 14:50:06.5   │
│ iris    │ python_udf_vec       │ 20000000 │ 4314927 │  15031 │ 14350052 │ 2024-05-21 14:50:28.176 │
│ iris    │ python_udf_vec       │  1000000 │  223791 │  14497 │  1715185 │ 2024-0

In [24]:
con.close()