In [27]:
import sys
sys.path.insert(1, "../..")

import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import kagglehub
import re
import random

from utils import bipartite
from utils import llm_api
from tasks.db import db_utils

In [1]:
# Ollama service port
PORT="11446"

We load different datasets here

### Adults

In [4]:
from sklearn.datasets import fetch_openml
# ADULTS
# Fetch the dataset
df = fetch_openml(data_id=1590, as_frame=True).frame

# Display the first few rows
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [82]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'class'],
      dtype='object')

In [None]:
df = df[["workclass", "education", "marital-status", "race"]]
df.head()

### IMDB

In [15]:
# Download latest version
path = kagglehub.dataset_download("harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows")

imdb = pd.read_csv(path + "/imdb_top_1000.csv")
imdb = imdb[["Series_Title", "Genre", "IMDB_Rating", "Meta_score", "No_of_Votes"]]
imdb.head()

Unnamed: 0,Series_Title,Genre,IMDB_Rating,Meta_score,No_of_Votes
0,The Shawshank Redemption,Drama,9.3,80.0,2343110
1,The Godfather,"Crime, Drama",9.2,100.0,1620367
2,The Dark Knight,"Action, Crime, Drama",9.0,84.0,2303232
3,The Godfather: Part II,"Crime, Drama",9.0,90.0,1129952
4,12 Angry Men,"Crime, Drama",9.0,96.0,689845


In [16]:
imdb["High_Rated"] = imdb["IMDB_Rating"] > 8.2
imdb["High_Rated"].value_counts()

High_Rated
False    872
True     128
Name: count, dtype: int64

### Course Enrollment

In [60]:
path = kagglehub.dataset_download("anlgrbz/student-demographics-online-education-dataoulad")

enroll = pd.read_csv(path + "/studentRegistration.csv")
enroll.head()

Unnamed: 0,code_module,code_presentation,id_student,date_registration,date_unregistration
0,AAA,2013J,11391,-159.0,
1,AAA,2013J,28400,-53.0,
2,AAA,2013J,30268,-92.0,12.0
3,AAA,2013J,31604,-52.0,
4,AAA,2013J,32885,-176.0,


In [61]:
enroll.shape

(32593, 5)

In [64]:
enroll["code_module"].value_counts()

code_module
BBB    7909
FFF    7762
DDD    6272
CCC    4434
EEE    2934
GGG    2534
AAA     748
Name: count, dtype: int64

### Implementations

Choosing a column and its corresponding value, we ask a query "Count the number of rows with 'col' equal to 'value'."

In [65]:
# COLUMN = "workclass" # Adults
# VALUE = "Private" # Adults
# COLUMN = "High_Rated" # IMDB
# VALUE = "True" # IMDB

COLUMN = "code_module"
VALUE = "EEE"

df = enroll

CHUNK_SIZE = 4

Implementation for Warm-up algorithm

In [66]:
def get_related_rows(chunk, question, model):
    prompt = "Consider the following dataset as a table:\n" + chunk.to_string()
    query = f"Give me the list of indices of the rows that are related to this query with no further explanation. query: '{question}'"
    return llm_api.ask(
        questions=[
            prompt, query
        ],
        model=model,
        port=PORT
    )

def run_warmup(df, row_indices, result_dict, column, value, model):
    random.shuffle(row_indices)

    chs = []
    for h in range(len(row_indices) // CHUNK_SIZE):
        chs.append(row_indices[h * CHUNK_SIZE: (h + 1) * CHUNK_SIZE])

    question = f"What is the number of rows with column '{column}' equal to '{value}'?"
    reg = r'\d+'

    related = []
    not_related = []

    for ch in chs:
        print("chunk: ", ch)
        time.sleep(0.5)

        def get_res():
            result = get_related_rows(df.iloc[ch].reset_index(drop=True), question, model)
            print("[Naive] related row indices: ", result[:20])
            tmp = [eval(x[0]) for x in re.findall(reg, result)]
            return tmp

        rel_inds = get_res()

        counter = 0
        if len(rel_inds) > len(ch):
            rel_inds = []

        if counter == 10:
            print("Many errors, default setting!")
            rel_inds = []
        
        rel_inds = [i for i in rel_inds if i < len(ch)]
        non_rel = list(filter(lambda x: x not in rel_inds, np.arange(len(ch))))

        related.extend([ch[i] for i in rel_inds])
        not_related.extend([ch[i] for i in non_rel])

    final_rows = related + not_related
    final_rows = final_rows[:len(row_indices)]

    result_dict["reordered"].append(final_rows)
    result_dict["original"].append(row_indices)
    result_dict["algorithm"].append("warmup")
    result_dict["model"].append(model)
    result_dict["size"].append(len(row_indices))
    result_dict["column"].append(column)
    result_dict["value"].append(value)

In [None]:
run_warmup(
    df.iloc[0:20].reset_index(drop=True),
    row_indices=np.arange(20),
    result_dict=None,
    column="workclass",
    value="Private",
    model="qwen2:7b"
)

Implementation for Bigraph algorithm

In [67]:
k = 4 # Shuffles
m = 5 # Batch size

def build_prompt_for_scores(batch_df, query):
    return [
        "Consider the following dataset table: \n" + batch_df.to_string(),
        f"Consider this question on the previous table. question: {query}\n" +
        "Give me a list of scores of length 5 that shows how much each row of the table is relevant (or important) in answering the query. " + 
        "A score is an integer between 0 to 10. 10 means relevant and 0 means not relevant. " +
        "Your answer should contain a list of 5 scores. The order of scores should be the same as order of rows."
    ]

def extract_number_lists(text):
    # Regex pattern to match exactly 5 numbers (comma-separated or not)
    # pattern = r'(\b(\d+\s*,?\s*){4}\d+\b)'
    # pattern = r'(\b(10|[0-9])(,?\s?(10|[0-9])){4}\b)'
    pattern = r'(\b(10|[0-9])([,\s]+(10|[0-9])){4}\b)'
    
    # Find all matches in the given text
    matches = re.findall(pattern, text)
    
    if len(matches) == 0:
        print("NO LIST IN OUTPUT")
        return None
    
    return matches[0][0]

def ask_score(input_batch, query, model, df, column, value):
    def askit():
        prompt = build_prompt_for_scores(df.iloc[input_batch].reset_index(drop=True), query)
        answer = llm_api.ask(questions=prompt, model=model, port=PORT)
        # print("------------\n", answer, "\n--------------")
        return answer


    print(df.iloc[input_batch].reset_index(drop=True)[column].to_list())
    answer = askit()
    # p1 = r"(\s*\d+\s*)"
    # p2 = r"(,(\s*\d+\s*))+"
    number = r'\d+'
    pattern = r"\b(\d+\s*,?\s*){4}\d+\b"
    counter = 0
    while (len(re.findall(pattern, answer)) == 0) and counter < 5:
        print("Retrying...")
        counter += 1
        answer = askit()

    if counter == 5:
        answer = ['0'] * len(input_batch)
        print("FAILED, return zeros")
    else:
        answer = extract_number_lists(answer)
        print("[BI_GRAPH]", answer, input_batch, query)
        answer = re.findall(number, answer)

    # pattern = "(" + p1 + (len(input_batch) - 1) * p2 + ")"
    # for line in answer.split("\n"):
    #     if len(re.findall(pattern, line)) > 0:
    #         answer = re.findall(pattern, line)
    #         break
    numbers = [llm_api.take_out_number(a) for a in answer]
    return [10 if num > 10 else (0 if num < 0 else num) for num in numbers]

def run_bigraph(df, row_indices, result_dict, column, value, model):
    random.shuffle(row_indices)

    question = f"What is the count of rows with column '{column}' equal to '{value}'?"
    
    def ask_score_local(input_batch, query):
        return ask_score(input_batch, query, model=model, df=df, column=column, value=value)
    
    g, element_nodes = bipartite.create_bi_graph(k=k, m=m, prompt=row_indices, query=question, ask_score=ask_score_local)
    bipartite.learn_bi_graph(g, k, m)
    new_prompt = sorted(row_indices, key=lambda x: element_nodes[x].payload, reverse=True)
    
    result_dict["reordered"].append(new_prompt)
    result_dict["original"].append(row_indices)
    result_dict["algorithm"].append("bigraph")
    result_dict["model"].append(model)
    result_dict["size"].append(len(row_indices))
    result_dict["column"].append(column)
    result_dict["value"].append(value)

In [None]:
run_bigraph(
    df.iloc[0:60].reset_index(drop=True),
    row_indices=np.arange(60),
    result_dict=None,
    column="workclass",
    value="Self-emp-not-inc",
    model="mistral:7b"
)

Random (lower bound)

In [68]:
def run_random(df, row_indices, result_dict, column, value):
    random.shuffle(row_indices)

    result_dict["reordered"].append(row_indices)
    result_dict["original"].append(row_indices)
    result_dict["algorithm"].append("random")
    result_dict["model"].append(np.nan)
    result_dict["size"].append(len(row_indices))
    result_dict["column"].append(column)
    result_dict["value"].append(value)

Optimum (upper bound)

In [69]:
def run_opt(df, row_indices, result_dict, column, value):
    
    def get_value(index):
        return 1 if df.iloc[index][column] == value else 0
    
    sorted_indices = sorted(row_indices, key=get_value, reverse=True)
    
    result_dict["reordered"].append(sorted_indices)
    result_dict["original"].append(row_indices)
    result_dict["algorithm"].append("opt")
    result_dict["model"].append(np.nan)
    result_dict["size"].append(len(row_indices))
    result_dict["column"].append(column)
    result_dict["value"].append(value)

In [60]:
run_opt(
    df=df.iloc[20:40].reset_index(drop=True),
    row_indices=np.arange(20),
    result_dict=None,
    column="workclass",
    value="Self-emp-not-inc"
)

[5, 9, 11, 12, 0, 1, 2, 3, 4, 6, 7, 8, 10, 13, 14, 15, 16, 17, 18, 19]


### Experiment 1

- Get rankings from different opensource models and calculate the ranking utility.

In [78]:
SAMPLES = 10 # This takes one hour!
MODELS = [
    "gemma2:9b",
    "llama3.1:8b",
    "mistral:7b",
    "qwen2:7b",
    "deepseek-coder-v2:16b"
]
SIZE = 100

model = MODELS[4]

COLUMN = "code_module"
VALUES = ["EEE"]

final_result = {
    "algorithm": [],
    "original": [],
    "reordered": [],
    "model": [],
    "size": [],
    "column": [],
    "value": []
}

for sample in range(SAMPLES):
    row_indices = df.sample(n=SIZE).index

    def get_copy(indices):
        return np.array(indices.tolist())
    
    for value in VALUES:
        print(f"{value} | {sample + 1} / {SAMPLES}")

        # Run Random
        print("Running random...")
        run_random(
            df=df,
            row_indices=get_copy(row_indices),
            result_dict=final_result,
            column=COLUMN,
            value=value
        )

        # Run Optimum
        print("Running optimum...")
        run_opt(
            df=df,
            row_indices=get_copy(row_indices),
            result_dict=final_result,
            column=COLUMN,
            value=value
        )

        # Run Warmup
        print("Running warmup...")
        run_warmup(
            df=df,
            row_indices=get_copy(row_indices),
            result_dict=final_result,
            column=COLUMN,
            value=value,
            model=model
        )

        # Run Bigraph
        print("Run Bigraph...")
        run_bigraph(
            df=df,
            row_indices=get_copy(row_indices),
            result_dict=final_result,
            column=COLUMN,
            value=value,
            model=model
        )


final_result = pd.DataFrame(final_result)

EEE | 1 / 10
Running random...
Running optimum...
Running warmup...
chunk:  [25161 27698 31767 11460]
[Naive] related row indices:   There are 0 rows in
chunk:  [  885 22687  3098 25297]
[Naive] related row indices:   There are no rows i
chunk:  [27722 16112  2909 14842]
[Naive] related row indices:   There are no rows i
chunk:  [ 8262 21530 29394 29263]
[Naive] related row indices:   To find the number 
chunk:  [23144 25071 28972    25]
[Naive] related row indices:   There are no rows i
chunk:  [ 6683 13568 13426 15489]
[Naive] related row indices:   The question asks f
chunk:  [17647  4114 21757 28858]
[Naive] related row indices:   The number of rows 
chunk:  [28691 12019  5887 20995]
[Naive] related row indices:   The number of rows 
chunk:  [15992  9217  8966 27595]
[Naive] related row indices:   There are no rows i
chunk:  [16645 31791 13324 28082]
[Naive] related row indices:   The provided datase
chunk:  [ 4065  1088 29737  2226]
[Naive] related row indices:   There are no rows

  b_j += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload


EEE | 2 / 10
Running random...
Running optimum...
Running warmup...
chunk:  [11445 10006 23089  9728]
[Naive] related row indices:   There are no rows i
chunk:  [10936 24946 16577 25432]
[Naive] related row indices:   The provided datase
chunk:  [ 5962 32580 13199 20134]
[Naive] related row indices:   The query asks for 
chunk:  [20420 22243  9322  8265]
[Naive] related row indices:   The number of rows 
chunk:  [18655 15050 22532 17667]
[Naive] related row indices:   There are no rows i
chunk:  [25532  5170 16433 32040]
[Naive] related row indices:   To answer your quer
chunk:  [ 8134   937   623 16869]
[Naive] related row indices:   There are 0 rows wh
chunk:  [15035 19856  6549 11298]
[Naive] related row indices:   The number of rows 
chunk:  [15565 31094  7972 31271]
[Naive] related row indices:   The provided datase
chunk:  [25631  1556 29246 22890]
[Naive] related row indices:   The provided datase
chunk:  [10587  6552 17445 14324]
[Naive] related row indices:   There are no rows

  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload


EEE | 3 / 10
Running random...
Running optimum...
Running warmup...
chunk:  [24737 27240 31616 20191]
[Naive] related row indices:   The number of rows 
chunk:  [30605 25912 14151 10534]
[Naive] related row indices:   The dataset you pro
chunk:  [25857  3829  4311 29904]
[Naive] related row indices:   There are 0 rows in
chunk:  [13397  9017 15365 22013]
[Naive] related row indices:   The number of rows 
chunk:  [ 5138 26930 23954   494]
[Naive] related row indices:   There are no rows i
chunk:  [18394 21200 25155 30656]
[Naive] related row indices:   The number of rows 
chunk:  [14598 26417  3428 18266]
[Naive] related row indices:   There are no rows i
chunk:  [12271 12021 22964  3000]
[Naive] related row indices:   There are 0 rows in
chunk:  [20048  6615  9458 14512]
[Naive] related row indices:   The list of indices
chunk:  [ 3465 25574 17563 24121]
[Naive] related row indices:   To answer your quer
chunk:  [ 4972  3863 29003 30854]
[Naive] related row indices:   There are 0 rows 

  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload


EEE | 4 / 10
Running random...
Running optimum...
Running warmup...
chunk:  [  488 16067 22026 14117]
[Naive] related row indices:   The relevant indice
chunk:  [19834 18948  1517 17024]
[Naive] related row indices:   The number of rows 
chunk:  [15736 17102  4475 20847]
[Naive] related row indices:   The number of rows 
chunk:  [26107  5144  5258 15290]
[Naive] related row indices:   There are no rows i
chunk:  [ 5181 22034  1319 12815]
[Naive] related row indices:   The number of rows 
chunk:  [24010  3401 18694  1553]
[Naive] related row indices:   The dataset provide
chunk:  [21656 13151 16771  4340]
[Naive] related row indices:   The query asks for 
chunk:  [30866 13880   618 14017]
[Naive] related row indices:   There are no rows i
chunk:  [10599 19927 29667 15130]
[Naive] related row indices:   The row indices whe
chunk:  [11180 30315 30906   392]
[Naive] related row indices:   There are no rows i
chunk:  [ 8082 24120 10489 10218]
[Naive] related row indices:   The provided data

  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload


EEE | 5 / 10
Running random...
Running optimum...
Running warmup...
chunk:  [11584 25879 32089 11338]
[Naive] related row indices:   The provided datase
chunk:  [15636 15051 28245 16951]
[Naive] related row indices:   There are 0 rows in
chunk:  [ 3419 13837 23811 30018]
[Naive] related row indices:   There are 0 rows in
chunk:  [29972  9947 20808 19355]
[Naive] related row indices:   To find the number 
chunk:  [14272 14502 28010  6651]
[Naive] related row indices:   The provided datase
chunk:  [ 5070 22226  8811 23298]
[Naive] related row indices:   The query asks for 
chunk:  [30195  8806 29212 15253]
[Naive] related row indices:   There are 0 rows in
chunk:  [15779  1184 20488 20101]
[Naive] related row indices:   The number of rows 
chunk:  [25324 27781 14351 17679]
[Naive] related row indices:   There are no rows i
chunk:  [23213 20315 24845 11765]
[Naive] related row indices:   The query asks for 
chunk:  [ 3772 31733 18349 25293]
[Naive] related row indices:   The task asks for

  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload


EEE | 6 / 10
Running random...
Running optimum...
Running warmup...
chunk:  [12405 22286 17841 25551]
[Naive] related row indices:   The number of rows 
chunk:  [31887 24069  3974 16341]
[Naive] related row indices:   The provided datase
chunk:  [31530 12849  3397  5276]
[Naive] related row indices:   To answer your ques
chunk:  [ 9041 23837  1681  2420]
[Naive] related row indices:   The provided datase
chunk:  [31725 22092 16134 10489]
[Naive] related row indices:   The number of rows 
chunk:  [ 9007 11446 25538 13292]
[Naive] related row indices:   There are 0 rows in
chunk:  [ 9383 32332 20582 17889]
[Naive] related row indices:   The number of rows 
chunk:  [16684 23616 11698 11304]
[Naive] related row indices:   There are 0 rows in
chunk:  [ 9138 25725 31702 21730]
[Naive] related row indices:   The number of rows 
chunk:  [24205 23873 24833 29402]
[Naive] related row indices:   To find the number 
chunk:  [1016 5057 8036 7625]
[Naive] related row indices:   The dataset provide
c

  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload


EEE | 7 / 10
Running random...
Running optimum...
Running warmup...
chunk:  [26776 31674 21536 25897]
[Naive] related row indices:   The number of rows 
chunk:  [30562 21595  4809 25682]
[Naive] related row indices:   The query asks for 
chunk:  [12256  5485 31459  9209]
[Naive] related row indices:   There are no rows i
chunk:  [24839 25105 24394 12263]
[Naive] related row indices:   The dataset you pro
chunk:  [10284  3108  7713 12910]
[Naive] related row indices:   The dataset does no
chunk:  [29067 25635 13710 21932]
[Naive] related row indices:   The number of rows 
chunk:  [21128  4693 15454 29598]
[Naive] related row indices:   To find the number 
chunk:  [11311  2209 14125   781]
[Naive] related row indices:   There are 0 rows wh
chunk:  [ 1665 26266 10430 18878]
[Naive] related row indices:   To find the number 
chunk:  [10095 19497 16090 20591]
[Naive] related row indices:   The number of rows 
chunk:  [21405  9598 29797  2342]
[Naive] related row indices:   The number of row

  b_j += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload


EEE | 8 / 10
Running random...
Running optimum...
Running warmup...
chunk:  [14089 11007 16804  3236]
[Naive] related row indices:   To answer your ques
chunk:  [17658 10437 10006 32037]
[Naive] related row indices:   There are 0 rows in
chunk:  [28864 11142  7372 29986]
[Naive] related row indices:   There are no rows i
chunk:  [ 1343 19682  2721  1758]
[Naive] related row indices:   The question asks f
chunk:  [26153 18565 25137 31372]
[Naive] related row indices:   The provided datase
chunk:  [24200 24298 26477  4147]
[Naive] related row indices:   The provided datase
chunk:  [13452 27138 13471 24857]
[Naive] related row indices:   There are no rows i
chunk:  [12186 31009 27585 12371]
[Naive] related row indices:   There are no rows i
chunk:  [30307 20256 13679 15864]
[Naive] related row indices:   To find the number 
chunk:  [ 4781 21412  7590  5080]
[Naive] related row indices:   The number of rows 
chunk:  [24363 23878 28938 18593]
[Naive] related row indices:   There are 0 rows 

  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload


EEE | 9 / 10
Running random...
Running optimum...
Running warmup...
chunk:  [23259 11758  5948 32558]
[Naive] related row indices:   There are 4 rows in
chunk:  [25043  4621 14700 28798]
[Naive] related row indices:   There are no rows i
chunk:  [30661 31949 19654  7994]
[Naive] related row indices:   The number of rows 
chunk:  [31989 12362 17607  5675]
[Naive] related row indices:   The given dataset d
chunk:  [ 3271 31218  7624 20698]
[Naive] related row indices:   The number of rows 
chunk:  [  402 21441 15442 21756]
[Naive] related row indices:   The query asks for 
chunk:  [26516 13957 10744 31339]
[Naive] related row indices:   The dataset provide
chunk:  [  895  9278  9802 28718]
[Naive] related row indices:   To find the number 
chunk:  [25039  8590  3492  9858]
[Naive] related row indices:   To answer your quer
chunk:  [26987  4782 22195 13727]
[Naive] related row indices:   The number of rows 
chunk:  [ 5177  2684 30619 28287]
[Naive] related row indices:   There are 0 rows 

  b_j += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload


EEE | 10 / 10
Running random...
Running optimum...
Running warmup...
chunk:  [20474  1760  3803 13000]
[Naive] related row indices:   The number of rows 
chunk:  [25141  4139  2221   386]
[Naive] related row indices:   The provided datase
chunk:  [11736  6823 12871 15621]
[Naive] related row indices:   There are 0 rows in
chunk:  [13948  7753 18363 25435]
[Naive] related row indices:   There are no rows i
chunk:  [19003 27059  7160 13116]
[Naive] related row indices:   To find the number 
chunk:  [  628 19260 30126 20958]
[Naive] related row indices:   The number of rows 
chunk:  [13568  5021 11001 29862]
[Naive] related row indices:   There are no rows i
chunk:  [12092 10645 10359  5280]
[Naive] related row indices:   There are no rows i
chunk:  [31128  6942 21277 21527]
[Naive] related row indices:   The query asks for 
chunk:  [ 7423  8255 20161 25292]
[Naive] related row indices:   The number of rows 
chunk:  [16968 26280 27016 18892]
[Naive] related row indices:   There are 0 rows

  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  y_i += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload
  b_j += targets_all[target] / g.node_map[target].payload


In [79]:
final_result.to_csv(f"enroll_new_{model}.csv", index=0)

- Get the rank utilities to see if the result makes sense or not.

In [80]:
rank_utils = []

for i, row in final_result.iterrows():
    rank_utils.append(db_utils.rank_utility(df, row["reordered"], row["column"], row["value"]))

final_result["rank_utils"] = rank_utils

In [81]:
final_result[["algorithm", "rank_utils"]].groupby("algorithm").mean().reset_index()

Unnamed: 0,algorithm,rank_utils
0,bigraph,2.782025
1,opt,2.782025
2,random,0.648013
3,warmup,0.995564
