# Run batch inference on OLVI

1. Load batch data
2. Loop over the cases
3. Push results to SQLite

In [None]:
import sqlite3
import logging
import pandas as pd
import json

from text2graph.utils import get_output_info, get_eta

# Configure the root logger so messages at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

In [None]:
# Name of the run to inspect; passed to get_eta below.
run_name = "olvi"

# Estimate when the run will finish. As the last expression in the cell, the
# return value is shown as the cell output.
# NOTE(review): the exact meaning of eval_db / test_set / n_workers is defined
# by text2graph.utils.get_eta — confirm there before changing these values.
get_eta(
    eval_db="data/eval.db",
    test_set="data/formation_sample.parquet.gzip",
    run_name=run_name,
    n_workers=2,
)

Calculate the estimated finish time for the run

Parsing output to triplet format

In [None]:
def get_content(output: str) -> list[dict]:
    """Pull the message content out of a stored output record.

    A falsy ``output`` (empty string, ``None``) yields an empty list without
    calling the helper; anything else is forwarded to ``get_output_info``
    together with the key list ``["message", "content"]``.
    """
    return get_output_info(output, ["message", "content"]) if output else []


from typing import Optional


def to_json(x: str) -> Optional[dict]:
    """Decode a JSON document, returning ``None`` when decoding fails.

    Args:
        x: The text to decode. Non-string values (for example the empty list
            produced for a missing output) are tolerated and yield ``None``.

    Returns:
        The decoded JSON value, or ``None`` if ``x`` could not be decoded.
        The value is expected to be a dict, but this is not enforced here.
    """
    try:
        return json.loads(x)
    except (TypeError, ValueError, RecursionError):
        # TypeError: input is not str/bytes; ValueError: malformed JSON
        # (json.JSONDecodeError is a subclass); RecursionError: nesting too
        # deep. Parsing is best-effort, so all of these map to None.
        return None

### Extract output data

In [None]:
# Read every row of the "olvi" table into a DataFrame.
conn = sqlite3.connect("data/eval.db")
try:
    db_df = pd.read_sql_query("SELECT * FROM olvi;", conn)
finally:
    # `conn` stays bound at notebook scope, so without an explicit close the
    # database handle would remain open for the rest of the session.
    conn.close()

# Extract the message content from each raw output, then try to decode it as
# JSON; rows that cannot be decoded end up with a None triplet.
db_df["raw_triplets"] = db_df["output"].apply(get_content)
db_df["triplet"] = db_df["raw_triplets"].apply(to_json)

In [None]:
# Set aside the rows whose parsed triplet is falsy (e.g. None after a failed
# parse), write them to disk, and print their ids.
has_triplet = db_df["triplet"].apply(bool)
problematic = db_df.loc[~has_triplet]
problematic.to_parquet("problematic.parquet.gzip", compression="gzip")
print(problematic["id"].tolist())

Merge the output back into the original data

In [None]:
# Load the original sample and copy its index into an explicit "id" column.
df = pd.read_parquet("data/formation_sample.parquet.gzip").assign(
    id=lambda frame: frame.index
)

In [None]:
# Display the loaded sample as the cell output.
df

In [None]:
# Left join on "id": every row of the original sample is kept, and rows with
# no matching database record get a missing triplet.
merged = pd.merge(df, db_df[["id", "triplet"]], on="id", how="left")

In [None]:
# The "id" column was only needed as the join key.
merged = merged.drop(columns=["id"])

In [None]:
# Spot-check one randomly drawn row, shown as a nested dict keyed by row label.
merged.sample(n=1).transpose().to_dict()

In [None]:
import pickle
import gzip

# Persist the merged frame as a gzip-compressed pickle.
with gzip.open("data/result_llm.pkl.gzip", mode="wb") as out_file:
    pickle.dump(merged, out_file)

In [5]:
# Reload a previously saved result from its gzip-compressed pickle.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load pickles that come from a trusted source.
with gzip.open("data/results_240208/result_llm.pkl.gzip", mode="rb") as in_file:
    merged = pickle.load(in_file)

In [7]:
# Export the merged results to CSV, leaving out the DataFrame index.
merged.to_csv("results_llm_240208.csv", index=False)