In [1]:
import sys
sys.path.insert(1, "../..")

import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import kagglehub
import re
import random

from utils import bipartite
from utils import llm_api
from tasks.db import db_utils

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ollama service port
PORT="11434"

### DBLP ACM Dataset

In [6]:
# kagglehub is already imported in the first cell; the repeat here is harmless.
import kagglehub

# Download the latest version of the dataset through kagglehub;
# `path` is the directory that DBLP.csv is read from below.
path = kagglehub.dataset_download("jakboss/chunk-of-dblp-dataset")

# Load the DBLP table into a DataFrame.
df = pd.read_csv(f"{path}/DBLP.csv")

# Preview the first rows.
df.head()

Unnamed: 0,Author,Title,Year,Volume,Journal,Number,EE,URL
0,Hans Ulrich Simon,Pattern Matching in Trees and Nets.,1983.0,20.0,Acta Inf.,,db/journals/acta/acta20.html#Simon83,https://doi.org/10.1007/BF01257084
1,Nathan Goodman,NP-complete Problems Simplified on Tree Schemas.,1983.0,20.0,Acta Inf.,,db/journals/acta/acta20.html#GoodmanS83,https://doi.org/10.1007/BF00289414
2,Norbert Blum,On the Power of Chain Rules in Context Free Gr...,1982.0,17.0,Acta Inf.,,db/journals/acta/acta17.html#Blum82,https://doi.org/10.1007/BF00264161
3,Juha Honkala,A characterization of rational D0L power series.,2011.0,48.0,Acta Inf.,1.0,db/journals/acta/acta48.html#Honkala11,https://doi.org/10.1007/s00236-010-0128-1
4,Chua-Huang Huang,The Derivation of Systolic Implementations of ...,1987.0,24.0,Acta Inf.,6.0,db/journals/acta/acta24.html#HuangL87,https://doi.org/10.1007/BF00282618


In [7]:
df = df[["Author", "Title", "Journal"]]

In [8]:
df.shape

(132465, 3)

In [None]:
llm_api.count_token(model="gpt-3.5-turbo", prompt=df.sample(n=100).to_string())

### Implementation

In [7]:
def get_related_rows(chunk, question, model):
    """Ask the LLM which rows of ``chunk`` are related to ``question``.

    Two messages are sent through ``llm_api.ask`` on ``PORT``: the chunk
    rendered with ``DataFrame.to_string()``, followed by an instruction that
    requests the indices of the related rows.

    Args:
        chunk: DataFrame slice shown to the model.
        question: Query the rows are judged against.
        model: Model name forwarded to ``llm_api.ask``.

    Returns:
        Whatever ``llm_api.ask`` returns (callers in this notebook treat it
        as text).
    """
    table_message = "Consider the following dataset as a table:\n" + chunk.to_string()
    instruction = (
        "Give me the list of indices of the rows that are related to this query "
        f"with no further explanation. query: '{question}'"
    )
    messages = [table_message, instruction]
    return llm_api.ask(questions=messages, model=model, port=PORT)

In [8]:
# Number of rows run_warmup puts in each chunk it sends to the LLM.
CHUNK_SIZE = 4

In [9]:
def run_warmup(df, row_indices, result_dict, column, value, model):
    """Reorder rows by asking the LLM, chunk by chunk, which rows are relevant.

    ``row_indices`` is shuffled in place and cut into chunks of ``CHUNK_SIZE``
    rows (the last chunk may be shorter, so no row is lost). For each chunk the
    model is asked for the positions of the rows related to the fixed
    physics-journal question. Rows it names are placed ahead of all rows it
    does not name; within each group the shuffled order is kept.

    Args:
        df: DataFrame the chunks are taken from with ``df.iloc``.
        row_indices: Mutable sequence of row positions; shuffled in place.
        result_dict: Dict of lists; one entry is appended to each of the keys
            "reordered", "original", "algorithm", "model", "size", "column"
            and "value".
        column: Stored in ``result_dict`` only.
        value: Stored in ``result_dict`` only.
        model: Model name forwarded to ``get_related_rows``.

    Returns:
        None. Results are appended to ``result_dict``.
    """
    random.shuffle(row_indices)

    # Step by CHUNK_SIZE over the whole sequence so a trailing partial chunk
    # is still processed instead of being silently dropped.
    chunks = [
        row_indices[start:start + CHUNK_SIZE]
        for start in range(0, len(row_indices), CHUNK_SIZE)
    ]

    question = "How many papers are published in a physics related journal?"
    number_pattern = r'\d+'

    related = []
    not_related = []

    for chunk in chunks:
        print("chunk: ", chunk)
        time.sleep(0.5)

        # The chunk is re-indexed from 0, so the model answers with positions
        # inside the chunk rather than with positions in df.
        answer = get_related_rows(df.iloc[chunk].reset_index(drop=True), question, model)
        print("[Naive] related row indices: ", answer[:20])

        # Parse every whole number in the answer (not just its first digit).
        candidates = [int(token) for token in re.findall(number_pattern, answer)]

        # More numbers than rows means the answer is not a usable index list;
        # fall back to treating no row of this chunk as related.
        if len(candidates) > len(chunk):
            candidates = []

        # Keep in-range positions only, and each position once, so that no row
        # can be emitted twice.
        picked = []
        for position in candidates:
            if position < len(chunk) and position not in picked:
                picked.append(position)
        skipped = [position for position in range(len(chunk)) if position not in picked]

        related.extend(chunk[position] for position in picked)
        not_related.extend(chunk[position] for position in skipped)

    # Every row appears exactly once: related rows first, then the rest.
    final_rows = related + not_related

    result_dict["reordered"].append(final_rows)
    result_dict["original"].append(row_indices)
    result_dict["algorithm"].append("warmup")
    result_dict["model"].append(model)
    result_dict["size"].append(len(row_indices))
    result_dict["column"].append(column)
    result_dict["value"].append(value)

In [10]:
# Parameters passed to bipartite.create_bi_graph / bipartite.learn_bi_graph in run_bigraph.
# NOTE: build_prompt_for_scores and extract_number_lists hard-code a batch of 5 scores,
# so changing m also requires updating the prompt text and the regex there.
k = 4 # Shuffles
m = 5 # Batch size

def build_prompt_for_scores(batch_df, query):
    """Build the two-message prompt that asks the LLM to score a batch of rows.

    Args:
        batch_df: DataFrame holding the rows to score; rendered with
            ``to_string()``.
        query: Question the rows are scored against.

    Returns:
        A list of two strings: the table message and the scoring instruction.
        The instruction always asks for exactly 5 scores between 0 and 10.
    """
    table_message = "Consider the following dataset table: \n" + batch_df.to_string()

    instruction_parts = [
        f"Consider this question on the previous table. question: {query}\n",
        "Give me a list of scores of length 5 that shows how much each row of the table is relevant (or important) in answering the query. ",
        "A score is an integer between 0 to 10. 10 means relevant and 0 means not relevant. ",
        "Your answer should contain a list of 5 scores. The order of scores should be the same as order of rows.",
    ]
    instruction = "".join(instruction_parts)

    return [table_message, instruction]

def extract_number_lists(text):
    """Return the first run of exactly five scores (each 0-10) found in ``text``.

    Scores must be separated by commas and/or whitespace. The matched
    substring is returned unchanged (separators included).

    Args:
        text: Raw model answer to search.

    Returns:
        The matched substring, or None (after printing a notice) when the
        text contains no such run.
    """
    # One score, then four more "separator + score" groups, on word boundaries.
    five_scores = re.compile(r'(\b(10|[0-9])([,\s]+(10|[0-9])){4}\b)')

    first_hit = five_scores.search(text)
    if first_hit is None:
        print("NO LIST IN OUTPUT")
        return None

    # Group 1 wraps the whole pattern, i.e. the complete list of five scores.
    return first_hit.group(1)

def ask_score(input_batch, query, model, df, column, value):
    """Ask the LLM for relevance scores for one batch of rows.

    The rows ``df.iloc[input_batch]`` are sent with the prompt from
    ``build_prompt_for_scores``. The answer is accepted only if
    ``extract_number_lists`` finds a score list in it; otherwise the model is
    asked again, up to 5 retries. The same extractor decides both whether to
    retry and what to parse, so an accepted answer can always be parsed.

    Args:
        input_batch: Row positions (into ``df``) forming the batch.
        query: Question the rows are scored against.
        model: Model name forwarded to ``llm_api.ask``.
        df: DataFrame the batch rows are read from.
        column: Unused here; kept for interface compatibility.
        value: Unused here; kept for interface compatibility.

    Returns:
        A list of scores clamped to the range 0..10. When every attempt
        fails, one zero per row of ``input_batch``.
    """
    max_retries = 5

    def askit():
        prompt = build_prompt_for_scores(df.iloc[input_batch].reset_index(drop=True), query)
        return llm_api.ask(questions=prompt, model=model, port=PORT)

    score_list = extract_number_lists(askit())
    retries = 0
    while score_list is None and retries < max_retries:
        print("Retrying...")
        retries += 1
        score_list = extract_number_lists(askit())

    # Decide on the outcome of the extraction, not on the retry count, so a
    # successful final retry is used rather than discarded.
    if score_list is None:
        print("FAILED, return zeros")
        answer = ['0'] * len(input_batch)
    else:
        print("[BI_GRAPH]", score_list, input_batch, query)
        answer = re.findall(r'\d+', score_list)

    numbers = [llm_api.take_out_number(a) for a in answer]
    # Clamp to the 0..10 scale requested in the prompt.
    return [10 if num > 10 else (0 if num < 0 else num) for num in numbers]

def run_bigraph(df, row_indices, result_dict, column, value, model):
    """Reorder rows with the bipartite-graph scoring from ``utils.bipartite``.

    ``row_indices`` is shuffled in place, a graph is built and learned with
    ``bipartite.create_bi_graph`` / ``bipartite.learn_bi_graph`` (using the
    notebook-level ``k`` and ``m``), and the rows are then sorted by the
    ``payload`` of their element node, highest first.

    Args:
        df: DataFrame forwarded to ``ask_score``.
        row_indices: Mutable sequence of row positions; shuffled in place.
        result_dict: Dict of lists; one entry is appended per key.
        column: Forwarded to ``ask_score`` and stored in ``result_dict``.
        value: Forwarded to ``ask_score`` and stored in ``result_dict``.
        model: Model name forwarded to ``ask_score``.

    Returns:
        None. Results are appended to ``result_dict``.
    """
    random.shuffle(row_indices)

    question = "How many papers are published in a physics related journal?"

    # Bind the per-run arguments so the graph builder only supplies the batch
    # and the query.
    def score_batch(input_batch, query):
        return ask_score(input_batch, query, model=model, df=df, column=column, value=value)

    graph, element_nodes = bipartite.create_bi_graph(
        k=k,
        m=m,
        prompt=row_indices,
        query=question,
        ask_score=score_batch,
    )
    bipartite.learn_bi_graph(graph, k, m)

    # Highest payload first; presumably payload is the learned relevance — confirm in utils.bipartite.
    ranked = list(row_indices)
    ranked.sort(key=lambda row: element_nodes[row].payload, reverse=True)

    record = {
        "reordered": ranked,
        "original": row_indices,
        "algorithm": "bigraph",
        "model": model,
        "size": len(row_indices),
        "column": column,
        "value": value,
    }
    for key, entry in record.items():
        result_dict[key].append(entry)

In [11]:
def run_random(df, row_indices, result_dict, column, value):
    """Random baseline: record a shuffled ordering of ``row_indices``.

    ``row_indices`` is shuffled in place, and that same object is stored as
    both the "original" and the "reordered" entry.

    Args:
        df: Unused; kept so all runners share one signature.
        row_indices: Mutable sequence of row positions; shuffled in place.
        result_dict: Dict of lists; one entry is appended per key.
        column: Stored in ``result_dict`` only.
        value: Stored in ``result_dict`` only.

    Returns:
        None. Results are appended to ``result_dict``.
    """
    random.shuffle(row_indices)

    record = {
        "reordered": row_indices,
        "original": row_indices,
        "algorithm": "random",
        "model": np.nan,  # no LLM is involved in this baseline
        "size": len(row_indices),
        "column": column,
        "value": value,
    }
    for key, entry in record.items():
        result_dict[key].append(entry)

In [12]:
def run_opt(df, row_indices, result_dict, column, value):
    """Optimal baseline: put rows from the target journals first.

    A row counts as a hit when its "Journal" value is "J. Comput. Physics" or
    "SIAM J. Scientific Computing". Hits come first, then all other rows;
    the input order is preserved inside each group.

    Args:
        df: DataFrame with a "Journal" column, read with ``df.iloc``.
        row_indices: Row positions to order (not modified).
        result_dict: Dict of lists; one entry is appended per key.
        column: Stored in ``result_dict`` only.
        value: Stored in ``result_dict`` only.

    Returns:
        None. Results are appended to ``result_dict``.
    """
    target_journals = ["J. Comput. Physics", "SIAM J. Scientific Computing"]

    # Stable two-way partition: same result as a stable descending sort on a 0/1 key.
    hits, misses = [], []
    for index in row_indices:
        if df.iloc[index]["Journal"] in target_journals:
            hits.append(index)
        else:
            misses.append(index)
    sorted_indices = hits + misses

    record = {
        "reordered": sorted_indices,
        "original": row_indices,
        "algorithm": "opt",
        "model": np.nan,  # no LLM is involved in this baseline
        "size": len(row_indices),
        "column": column,
        "value": value,
    }
    for key, entry in record.items():
        result_dict[key].append(entry)

### Experiment

In [None]:
SAMPLES = 10 # This takes one hour!
MODELS = [
    "gemma2:9b",
    "llama3.1:8b",
    "mistral:7b",
    "qwen2:7b",
    "deepseek-coder-v2:16b"
]
SIZE = 100

# Model used for the LLM-based runners in this run.
model = MODELS[2]

COLUMN = None
VALUES = None

# One list per result column; every runner appends exactly one entry to each.
final_result = {
    key: []
    for key in ("algorithm", "original", "reordered", "model", "size", "column", "value")
}


def get_copy(indices):
    """Return a fresh numpy array so each runner shuffles its own copy."""
    return np.array(indices.tolist())


# (progress message, runner, extra keyword arguments), in execution order.
runners = [
    ("Running random...", run_random, {}),
    ("Running optimum...", run_opt, {}),
    ("Running warmup...", run_warmup, {"model": model}),
    ("Run Bigraph...", run_bigraph, {"model": model}),
]

for sample in range(SAMPLES):
    # All four algorithms are evaluated on the same sampled rows.
    row_indices = df.sample(n=SIZE).index

    print(f"{sample + 1} / {SAMPLES}")

    for banner, runner, extra_kwargs in runners:
        print(banner)
        runner(
            df=df,
            row_indices=get_copy(row_indices),
            result_dict=final_result,
            column=COLUMN,
            value=VALUES,
            **extra_kwargs
        )


final_result = pd.DataFrame(final_result)

In [14]:
final_result.to_csv(f"venue_csv_{model}.csv", index=0)

In [2]:
# Reload previously saved results from disk.
# NOTE(review): the file name is hard-coded to the llama3.1:8b run, whereas the
# experiment cell selects model = MODELS[2] ("mistral:7b") and saves to
# f"venue_csv_{model}.csv" — confirm this is the intended file.
final_result = pd.read_csv("venue_csv_llama3.1:8b.csv")

In [3]:
final_result.head()

Unnamed: 0,algorithm,original,reordered,model,size,column,value
0,random,[103441 10041 33390 84633 95269 90856 132...,[103441 10041 33390 84633 95269 90856 132...,,100,,
1,opt,[ 90036 88632 11249 118195 90856 31196 6...,"[np.int64(31196), np.int64(31138), np.int64(28...",,100,,
2,warmup,[119658 107530 130699 98890 88632 90856 58...,"[np.int64(119658), np.int64(107530), np.int64(...",llama3.1:8b,100,,
3,bigraph,[130715 90856 106345 110410 80321 21037 52...,"[np.int64(25227), np.int64(29968), np.int64(93...",llama3.1:8b,100,,
4,random,[103056 3074 6120 44267 64090 39614 29...,[103056 3074 6120 44267 64090 39614 29...,,100,,


In [4]:
def extract_integer(s):
    """Return the first run of digits in ``s`` as an int, or None if there is none."""
    digits = re.search(r'\d+', s)
    if digits is None:
        return None
    return int(digits.group())

def post_process(ls):
    """Parse a stringified index list (as stored in the CSV) back into ints.

    The string is split on whitespace. "int64" is removed from each token
    before the first run of digits is taken, so the "64" in "np.int64(123)"
    is not mistaken for the value. Tokens without digits are skipped.

    Args:
        ls: String such as "[1 2 3]" or "[np.int64(1), np.int64(2)]".

    Returns:
        List of ints in the order they appear.
    """
    parsed = (extract_integer(token.replace("int64", "")) for token in ls.split())
    return [number for number in parsed if number is not None]

# post_process(final_result["reordered"].iloc[30])

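Compute the ranking utility of each reordered list: the sum of 1/rank (rank starting at 1) over the rows whose journal is one of the physics-related venues.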

In [9]:
def rank_utility(df, row_indices, column, value):
    """Score an ordering by the reciprocal ranks of its physics-journal rows.

    Each row whose "Journal" is "J. Comput. Physics" or
    "SIAM J. Scientific Computing" contributes 1 / rank, where rank is its
    1-based position in ``row_indices``; all other rows contribute 0.

    Args:
        df: DataFrame with a "Journal" column, read with ``df.iloc``.
        row_indices: Either a sequence of row positions, or the string form
            of such a sequence as read back from the results CSV (parsed with
            ``post_process``).
        column: Unused; kept for interface compatibility.
        value: Unused; kept for interface compatibility.

    Returns:
        The summed utility (0 for an empty ordering).
    """
    target_journals = ("J. Comput. Physics", "SIAM J. Scientific Computing")

    # Only the CSV round-trip yields strings; in-memory lists/arrays are used as-is.
    if isinstance(row_indices, str):
        row_indices = post_process(row_indices)

    agg = 0
    for i, index in enumerate(row_indices):
        hit = 1 if df.iloc[index]["Journal"] in target_journals else 0
        agg += hit * (1 / (i + 1))

    return agg

# Utility of every stored ordering, in row order of final_result.
rank_utils = [
    rank_utility(df, row["reordered"], row["column"], row["value"])
    for _, row in final_result.iterrows()
]

final_result["rank_utils"] = rank_utils

In [10]:
final_result[["algorithm", "rank_utils"]].groupby("algorithm").mean().reset_index()

Unnamed: 0,algorithm,rank_utils
0,bigraph,1.68725
1,opt,3.358533
2,random,0.80848
3,warmup,1.452278


In [12]:
# Appears to relate warmup's gain over random (1.45 - 0.80) to the optimum's mean (3.35),
# using rounded means from the table above.
# NOTE(review): the denominator subtracts 1.80, which matches none of the means in the
# table (random is 0.80); presumably 0.80 was intended — confirm before quoting the result.
(1.45 - 0.80) / (3.35 - 1.80)

0.4193548387096773