In [3]:
import sys
sys.path.insert(1, "../..")

import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import kagglehub
import json
import re
import random

from utils import bipartite
from utils import llm_api
from tasks.db import db_utils

from sklearn.datasets import fetch_openml

## Ask GPT

Give the final ranked and sorted list to GPT-3.5 or GPT-4o and measure the errors in its answers.

In [150]:
# Model tag; together with DATASET it forms the name of the results CSV read from ./outputs/.
MODEL = "deepseek-coder-v2:16b"
# Dataset key passed to load_df ("adults", "imdb", "enroll" or "enroll_new").
DATASET = "imdb"

def load_df(dataset):
    """Load the table that belongs to a dataset key.

    Args:
        dataset: One of "adults", "imdb", "enroll" or "enroll_new".

    Returns:
        A pandas DataFrame. For "adults" and "imdb" only the columns used in
        this notebook are kept; "imdb" additionally gets a boolean
        ``High_Rated`` column (``IMDB_Rating > 8.2``).

    Raises:
        ValueError: If ``dataset`` is not one of the supported keys.
    """
    if dataset == "adults":
        # ADULTS
        # Fetch OpenML dataset 1590 and keep the four columns of interest.
        adults = fetch_openml(data_id=1590, as_frame=True).frame
        # .copy() detaches the selection from the full frame.
        return adults[["workclass", "education", "marital-status", "race"]].copy()
    elif dataset == "imdb":
        path = kagglehub.dataset_download("harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows")

        imdb = pd.read_csv(path + "/imdb_top_1000.csv")
        imdb = imdb[["Series_Title", "Genre", "IMDB_Rating", "Meta_score", "No_of_Votes"]]
        # assign() builds a new frame, so the derived column is never written
        # into a slice of the raw data (avoids SettingWithCopyWarning).
        return imdb.assign(High_Rated=imdb["IMDB_Rating"] > 8.2)
    elif dataset == "enroll" or dataset == "enroll_new":
        path = kagglehub.dataset_download("anlgrbz/student-demographics-online-education-dataoulad")

        return pd.read_csv(path + "/studentRegistration.csv")

    # Fail loudly instead of returning None and crashing later on df.head().
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected 'adults', 'imdb', 'enroll' or 'enroll_new'."
    )

# Load the table selected by DATASET and preview its first rows.
df = load_df(DATASET)
df.head()

Unnamed: 0,Series_Title,Genre,IMDB_Rating,Meta_score,No_of_Votes,High_Rated
0,The Shawshank Redemption,Drama,9.3,80.0,2343110,True
1,The Godfather,"Crime, Drama",9.2,100.0,1620367,True
2,The Dark Knight,"Action, Crime, Drama",9.0,84.0,2303232,True
3,The Godfather: Part II,"Crime, Drama",9.0,90.0,1129952,True
4,12 Angry Men,"Crime, Drama",9.0,96.0,689845,True


In [151]:
# Read the stored results for this dataset/model pair and preview them.
# Later cells use the "algorithm", "reordered", "column" and "value" columns.
data = pd.read_csv(f"./outputs/{DATASET}_{MODEL}.csv")
data.head()

Unnamed: 0,algorithm,original,reordered,model,size,column,value
0,random,[783 213 976 503 206 164 424 881 940 174 704 1...,[783 213 976 503 206 164 424 881 940 174 704 1...,,60,High_Rated,True
1,opt,[833 881 216 557 52 509 59 164 524 503 302 9...,"[52, 59, 68, 43, 6, 833, 881, 216, 557, 509, 1...",,60,High_Rated,True
2,warmup,[231 465 302 799 720 472 820 221 6 563 525 3...,"[231, 720, 443, 938, 704, 955, 59, 890, 174, 4...",deepseek-coder-v2:16b,60,High_Rated,True
3,bigraph,[780 822 522 913 164 783 931 59 820 193 909 3...,"[59, 68, 43, 6, 52, 901, 516, 457, 780, 822, 5...",deepseek-coder-v2:16b,60,High_Rated,True
4,random,[868 122 218 395 976 653 313 952 344 498 310 7...,[868 122 218 395 976 653 313 952 344 498 310 7...,,60,High_Rated,True


In [152]:
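# FIX IMDB (disabled): for "opt" rows, re-sort the stored order so that rows whose target column is True come first.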
# sorteds = []
# for i, row in data.iterrows():
#     if row["algorithm"] == "opt":
#         ordered = get_att_list(row["reordered"])
#         def get_value(index):
#             return 1 if df.iloc[index][row["column"]] == True else 0
    
#         sorted_indices = sorted(ordered, key=get_value, reverse=True)
#         sorteds.append(json.dumps(sorted_indices))
#     else:
#         sorteds.append(json.dumps(get_att_list(row["reordered"])))
# data["reordered"] = sorteds

In [153]:
def get_att_list(col):
    """Parse a serialized list of row indices from the results table.

    Two encodings are handled: JSON lists such as ``"[52, 59, 68]"`` and
    space-separated prints such as ``"[783 213 976]"``.

    Args:
        col: The string stored in the ``reordered`` column.

    Returns:
        The decoded JSON value when ``col`` is valid JSON, otherwise a list
        with every run of digits found in ``col`` converted to ``int``.

    Raises:
        TypeError: If ``col`` is not a string (or bytes-like) value.
    """
    try:
        return json.loads(col)
    except ValueError:
        # json.JSONDecodeError is a ValueError: the text is not JSON, so fall
        # back to extracting the digit runs. Catching only the parse error
        # keeps KeyboardInterrupt and unrelated failures visible.
        return [int(x) for x in re.findall(r"\d+", col)]

# Score every stored ordering with db_utils.rank_utility, one value per row of `data`.
rank_utils = [
    db_utils.rank_utility(
        df,
        get_att_list(entry["reordered"]),
        entry["column"],
        entry["value"],
    )
    for _, entry in data.iterrows()
]
data["rank_utils"] = rank_utils

# Mean score per algorithm, shown as the cell output.
scores_by_algorithm = data[["algorithm", "rank_utils"]].groupby("algorithm")
scores_by_algorithm.mean().reset_index()

Unnamed: 0,algorithm,rank_utils
0,bigraph,2.631062
1,opt,2.765001
2,random,0.570516
3,warmup,1.301525


In [158]:
# Model that is asked to count matching rows in the reordered table.
GOD_MODEL = "gpt-4o-mini"
# Number of times each table is sent to GOD_MODEL; the absolute errors are averaged.
SAMPLES = 5

def get_gt(df, indices, column, value):
    """Count the selected rows whose ``column`` equals ``value`` (ground truth).

    Args:
        df: Table to count in.
        indices: Positional row indices (used with ``df.iloc``).
        column: Name of the column to inspect.
        value: Value to look for.

    Returns:
        The number of matching rows as an ``int``; 0 when ``value`` does not
        occur in the selection (a ``value_counts()[value]`` lookup would raise
        KeyError in that case).
    """
    selected = df.iloc[indices][column]
    return int((selected == value).sum())

def shift_right(lst, n):
    """Return a copy of ``lst`` rotated ``n`` positions to the right."""
    if not lst:
        # Nothing to rotate; also avoids a modulo-by-zero below.
        return lst[:]
    n = n % len(lst)  # Handle shifts larger than the length of the list
    return lst[-n:] + lst[:-n]


# Only these algorithms are sent to GOD_MODEL.
EVALUATED_ALGORITHMS = ("opt", "random")

# The shifting required to align with the exposure of GPT-4o
GOD_MODEL_SHIFT = 40  # IMDB
# GOD_MODEL_SHIFT = 35  # Adults
# GOD_MODEL_SHIFT = 50  # Enroll

NUMBER_PATTERN = re.compile(r"\d+")

# One entry per row of `data`: mean absolute counting error, or NaN when the
# row was not evaluated or no sample produced a usable answer.
errors = []

for i, row in data.iterrows():

    if row["algorithm"] not in EVALUATED_ALGORITHMS:
        # NaN (not 0) so skipped rows do not look like perfect answers and are
        # ignored by the per-algorithm mean.
        errors.append(np.nan)
        continue

    # Everything below is identical for all samples of this row, so it is
    # computed once instead of once per sample.
    reordered = get_att_list(row["reordered"])
    if GOD_MODEL == "gpt-4o-mini":
        # Shift
        reordered = shift_right(reordered, GOD_MODEL_SHIFT)
        # reordered[20:40], reordered[60:80] = reordered[60:80], reordered[20:40] # Enroll

    questions = [
        "Consider the following dataset table:\n" + df.iloc[reordered].to_string(),
        f"Count the number of rows with column '{row['column']}' equal to '{row['value']}'. Answer with just a number.",
    ]
    gt = get_gt(df, reordered, row["column"], row["value"])

    sample_errors = []
    for s in range(SAMPLES):
        # time.sleep(1)
        answer = llm_api.ask(questions=questions, model=GOD_MODEL)

        numbers = NUMBER_PATTERN.findall(answer)
        if not numbers:
            # The model answered without any digits; skip this sample instead
            # of aborting the whole run with an IndexError.
            print(f"{s + 1} / {SAMPLES} | {i + 1} / {data.shape[0]} | {row['algorithm']} | no number in answer, sample skipped")
            continue

        sample_error = abs(gt - int(numbers[0]))
        sample_errors.append(sample_error)
        print(f"{s + 1} / {SAMPLES} | {i + 1} / {data.shape[0]} | {row['algorithm']} | {sample_error}")

    # Average over the samples that produced a number (all of them in the
    # normal case, which matches dividing by SAMPLES).
    errors.append(sum(sample_errors) / len(sample_errors) if sample_errors else np.nan)

1 / 5 | 1 / 40 | random | 5
2 / 5 | 1 / 40 | random | 3
3 / 5 | 1 / 40 | random | 3
4 / 5 | 1 / 40 | random | 3
5 / 5 | 1 / 40 | random | 3
1 / 5 | 2 / 40 | opt | 0
2 / 5 | 2 / 40 | opt | 0
3 / 5 | 2 / 40 | opt | 1
4 / 5 | 2 / 40 | opt | 0
5 / 5 | 2 / 40 | opt | 0
1 / 5 | 5 / 40 | random | 2
2 / 5 | 5 / 40 | random | 1
3 / 5 | 5 / 40 | random | 2
4 / 5 | 5 / 40 | random | 3
5 / 5 | 5 / 40 | random | 3
1 / 5 | 6 / 40 | opt | 1
2 / 5 | 6 / 40 | opt | 3
3 / 5 | 6 / 40 | opt | 0
4 / 5 | 6 / 40 | opt | 1
5 / 5 | 6 / 40 | opt | 3
1 / 5 | 9 / 40 | random | 4
2 / 5 | 9 / 40 | random | 3
3 / 5 | 9 / 40 | random | 11
4 / 5 | 9 / 40 | random | 12
5 / 5 | 9 / 40 | random | 11
1 / 5 | 10 / 40 | opt | 6
2 / 5 | 10 / 40 | opt | 2
3 / 5 | 10 / 40 | opt | 2
4 / 5 | 10 / 40 | opt | 0
5 / 5 | 10 / 40 | opt | 3
1 / 5 | 13 / 40 | random | 8
2 / 5 | 13 / 40 | random | 6
3 / 5 | 13 / 40 | random | 3
4 / 5 | 13 / 40 | random | 3
5 / 5 | 13 / 40 | random | 8
1 / 5 | 14 / 40 | opt | 0
2 / 5 | 14 / 40 | opt | 0


In [160]:
# Store the per-row values collected in `errors` under a column named after GOD_MODEL.
# `errors` must hold one entry per row of `data`, i.e. the evaluation loop must have finished.
data[f"{GOD_MODEL}_error"] = errors

In [None]:
# Mean recorded error per algorithm, shown as the cell output.
data[["algorithm", f"{GOD_MODEL}_error"]].groupby("algorithm").mean().reset_index()

In [146]:
# data.to_csv(f"./outputs/gpt4/{DATASET}_{MODEL}_scored.csv", index=0)