# Evaluate Traditional Pipeline Performance (IMDB)

In [1]:
import pandas as pd

import time

from transformers import AutoModelForSequenceClassification, AutoTokenizer

import torch
import duckdb

  from .autonotebook import tqdm as notebook_tqdm


### Config

In [2]:
# Accumulator for this run's labels and per-stage timings; filled in by the cells below.
times = dict()

In [3]:
# Source table inside the DuckDB file; also used as the dataset label so the
# label and the query can never drift apart.
table_name = 'imdb'
times['dataset'] = table_name
# NOTE(review): 'traditonal' is misspelled. It is kept byte-for-byte so the label
# stays identical to what this notebook has emitted so far; fix it deliberately,
# together with any rows already stored under the old spelling.
times['pipeline'] = 'traditonal'

# Number of rows pulled from the table for this run.
workload = 1000
# Built from table_name (previously hard-coded to `imdb`) so changing the table
# above actually changes the query.
select_stmt = f"SELECT * FROM {table_name} LIMIT {workload}"

### Load imdb data

In [4]:
# Time the "move" stage: pull the workload out of DuckDB, write it to a CSV
# file, and read that file back into pandas.
st = time.perf_counter_ns()

con = duckdb.connect("../test.db")
try:
    imdb = con.sql(select_stmt).df()
    imdb.to_csv('dummy.csv', index=False)
finally:
    # Always release the connection so a failed query or export does not
    # leave it open for the rest of the kernel session.
    con.close()
imdb = pd.read_csv('dummy.csv')

et = time.perf_counter_ns()
times['workload'] = imdb.shape[0]
# perf_counter_ns() is in nanoseconds; dividing by 1000 stores microseconds.
times["move"] = (et - st)/1000
imdb.head()

Unnamed: 0,text,label,stage
0,I rented I AM CURIOUS-YELLOW from my video sto...,0.0,train
1,"""I Am Curious: Yellow"" is a risible and preten...",0.0,train
2,If only to avoid making this type of film in t...,0.0,train
3,This film was probably inspired by Godard's Ma...,0.0,train
4,"Oh, brother...after hearing about this ridicul...",0.0,train


In [14]:
# Sanity check: (rows, columns) of the frame read back from the CSV file.
imdb.shape

(1000, 3)

### Load the pretrained model

In [5]:
# Time the "load" stage: reading the classifier weights and its tokenizer
# from a local checkpoint directory.
st = time.perf_counter_ns()

# NOTE(review): machine-specific absolute path; the notebook only runs where
# this directory exists.
model_path = "/homes/ukumaras/scratch/Models/distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

et = time.perf_counter_ns()
# Nanoseconds -> microseconds.
times.update(load=(et - st)/1000)

### Predict

In [10]:
# Time the "predict" stage: tokenization plus one forward pass over the
# whole workload.
st = time.perf_counter_ns()

reviews = imdb['text'].tolist()
inputs = tokenizer(
    reviews,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

# Inference only: skip gradient bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Highest-scoring class index per review, as a NumPy array.
predicted_class_id = torch.argmax(logits, dim=1).numpy()

et = time.perf_counter_ns()
# Nanoseconds -> microseconds.
times["predict"] = (et - st)/1000

In [11]:
# One-row summary of this run, with columns in the order the `times` table expects.
column_order = ['dataset', 'pipeline', 'workload', 'move', 'load', 'predict']
run_record = pd.DataFrame.from_records([times])
times_df = run_record[column_order]
times_df

Unnamed: 0,dataset,pipeline,workload,move,load,predict
0,imdb,traditonal,1000,718026.979,3641943.229,87026760.0


In [13]:
# Persist this run's timings in the `times` table of the DuckDB file and show
# the table's full contents.
con = duckdb.connect("../test.db")
try:
    # The schema must contain `pipeline`: the DELETE and INSERT below both
    # reference it, so a table created without it made this cell fail on a
    # fresh database. IF NOT EXISTS replaces the manual SHOW TABLES check.
    con.sql(
        "CREATE TABLE IF NOT EXISTS times ("
        "dataset varchar, pipeline varchar, workload integer, "
        "move integer, load integer, predict integer, "
        "ts timestamp DEFAULT current_timestamp)"
    )

    # NOTE(review): this removes rows whose pipeline starts with 'opt', i.e.
    # results that this notebook (pipeline 'traditonal') did not produce.
    # Kept as-is; confirm it is intentional and not a copy-paste leftover.
    con.sql("DELETE FROM times WHERE pipeline LIKE 'opt%'")

    # `times_df` is the pandas DataFrame built in the previous cell; DuckDB
    # resolves it by name. Columns are listed explicitly so the insert does
    # not depend on the DataFrame's column order.
    con.sql(
        "INSERT INTO times (dataset, pipeline, workload, move, load, predict) "
        "SELECT dataset, pipeline, workload, move, load, predict FROM times_df"
    )
    con.sql("SELECT * FROM times").show()
finally:
    # Release the connection even when one of the statements above fails.
    con.close()

┌─────────┬────────────────┬──────────┬──────────┬─────────┬───────────┬─────────────────────────┐
│ dataset │    pipeline    │ workload │   move   │  load   │  predict  │           ts            │
│ varchar │    varchar     │  int32   │  int32   │  int32  │   int32   │        timestamp        │
├─────────┼────────────────┼──────────┼──────────┼─────────┼───────────┼─────────────────────────┤
│ iris    │ python_udf_vec │  1000000 │   218034 │   13971 │   2139728 │ 2024-05-21 14:43:26.063 │
│ iris    │ python_udf_vec │  5000000 │  1185950 │   14044 │   5536022 │ 2024-05-21 14:43:54.855 │
│ iris    │ python_udf_vec │ 10000000 │  2224030 │   13889 │   7646142 │ 2024-05-21 14:49:48.529 │
│ iris    │ python_udf_vec │ 15000000 │  3188160 │   13680 │  11789199 │ 2024-05-21 14:50:06.5   │
│ iris    │ python_udf_vec │ 20000000 │  4314927 │   15031 │  14350052 │ 2024-05-21 14:50:28.176 │
│ iris    │ python_udf_vec │  1000000 │   223791 │   14497 │   1715185 │ 2024-05-21 14:50:33.186 │
│ iris    