In [1]:
import sys
sys.path.insert(1, "../..")

from utils import llm_api
import numpy as np
import random
import string
import pandas as pd
from tasks.graph import graph_utils
import json
import re
from tqdm import tqdm
import matplotlib.pyplot as plt

The helper models and the final ("god") models used in the experiments.

In [2]:
# Each helper model is listed once as (backend tag, display name); the two
# parallel lists below are derived from this table so they cannot drift apart.
_HELPER_MODELS = (
    ("llama3.1:8b", "Llama3.1:8b"),
    ("mistral:instruct", "Mistral:7b"),
    ("qwen2:7b", "Qwen2:7b"),
    ("deepseek-coder-v2:16b", "DeepSeek-Coder-v2:16b"),
    ("gemma2:9b", "Gemma2:9b"),
)

# Tags as they appear in the "model" column of the result files.
MODELS = [tag for tag, _display in _HELPER_MODELS]
# Human-readable names, index-aligned with MODELS (used in plot titles).
MODEL_NAMES = [display for _tag, display in _HELPER_MODELS]

# Values that appear in the "god_model" column of the result files.
GOD_MODELS = ["gpt-4o-mini", "gpt-3.5-turbo"]

This CSV file is the result of running the "grpah:1.ipynb" notebook on the different models.

In [3]:
# Per-row experiment results; the path is relative to this notebook's directory.
RANKING_RESULTS_CSV = "../../data/graph/graph_results.csv"

data = pd.read_csv(RANKING_RESULTS_CSV)
data.head()

Unnamed: 0,model,edge_size,algorithm,vertex,diff,original_prompt,generated_prompt,god_model
0,gemma2:9b,56,naive,0,0,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 18], [0, 3], [0, 5], [0, 10], [0, 15], [0...",gpt-3.5-turbo
1,gemma2:9b,56,opt,0,0,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 18], [0, 3], [0, 5], [0, 10], [0, 15], [0...",gpt-3.5-turbo
2,gemma2:9b,56,no_opt,0,1,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...",gpt-3.5-turbo
3,gemma2:9b,56,naive,5,1,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 5], [5, 17], [3, 5], [5, 19], [5, 15], [5...",gpt-3.5-turbo
4,gemma2:9b,56,opt,5,0,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 5], [5, 17], [3, 5], [5, 19], [5, 15], [5...",gpt-3.5-turbo


## Rank Utility

Evaluating the ranking utility of each helper model, i.e., how well it estimates the relevance scores.

In [4]:
# Compute one ranking-utility score per result row from the generated edge
# list (stored as a JSON string) and the queried vertex.
# NOTE(review): graph_utils.rank_utility is project-local; its exact scoring
# rule is not visible from this notebook.
rank_utils = [
    graph_utils.rank_utility(json.loads(generated_prompt), vertex)
    for generated_prompt, vertex in tqdm(
        zip(data["generated_prompt"], data["vertex"]), total=len(data)
    )
]

# DataFrame.insert raises ValueError when the column already exists, so drop a
# stale copy first; this keeps the cell safe to re-run on the same kernel.
if "rank_utility" in data.columns:
    data = data.drop(columns="rank_utility")
data.insert(loc=1, column="rank_utility", value=rank_utils)
data.head()

100%|██████████| 5600/5600 [00:00<00:00, 8210.30it/s]


Unnamed: 0,model,rank_utility,edge_size,algorithm,vertex,diff,original_prompt,generated_prompt,god_model
0,gemma2:9b,2.717857,56,naive,0,0,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 18], [0, 3], [0, 5], [0, 10], [0, 15], [0...",gpt-3.5-turbo
1,gemma2:9b,2.717857,56,opt,0,0,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 18], [0, 3], [0, 5], [0, 10], [0, 15], [0...",gpt-3.5-turbo
2,gemma2:9b,1.382315,56,no_opt,0,1,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...",gpt-3.5-turbo
3,gemma2:9b,2.592857,56,naive,5,1,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 5], [5, 17], [3, 5], [5, 19], [5, 15], [5...",gpt-3.5-turbo
4,gemma2:9b,2.592857,56,opt,5,0,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 5], [5, 17], [3, 5], [5, 19], [5, 15], [5...",gpt-3.5-turbo


### Table

### Visualize

In [None]:
# Legend text for each value of the "algorithm" column.
labels = dict(
    naive="Warm-up",
    bi_graph="Bipartite",
    opt="Opt",
    no_opt="Random",
)

def plot_scores(focused_model):
    """Plot mean ranking utility against edge count for one helper model.

    Draws on the current matplotlib axes, one line per algorithm. Only rows
    whose ``god_model`` is ``"gpt-3.5-turbo"`` are used. Reads the notebook
    globals ``data``, ``labels``, ``MODELS`` and ``MODEL_NAMES``.

    Args:
        focused_model: Helper-model tag as stored in ``data["model"]``; it
            must be present in ``MODELS`` so the display name can be looked up.

    Raises:
        ValueError: If ``focused_model`` is not in ``MODELS``.
        KeyError: If an algorithm in the data has no entry in ``labels``.
    """
    subset = data[
        (data["model"] == focused_model) & (data["god_model"] == "gpt-3.5-turbo")
    ]
    # Aggregate only the plotted column; averaging "diff" as well was unused work.
    mean_utility = (
        subset.groupby(["edge_size", "algorithm"])["rank_utility"]
        .mean()
        .reset_index()
    )

    for algorithm in mean_utility["algorithm"].unique():
        per_algorithm = mean_utility[mean_utility["algorithm"] == algorithm]
        plt.plot(
            # Edge sizes are plotted as strings so the x axis is categorical
            # (evenly spaced ticks) rather than numeric.
            [str(x) for x in per_algorithm["edge_size"]],
            per_algorithm["rank_utility"],
            label=labels[algorithm],
            marker="o",
            markersize=6,
        )

    plt.xlabel("# of edges", fontsize=12)
    plt.ylabel("Ranking Utility", fontsize=12)
    # Raw string: "\m" is not a valid Python escape and triggers a
    # SyntaxWarning on recent interpreters when written in a normal string.
    plt.title(
        r"$\mathcal{H}$ = " + MODEL_NAMES[MODELS.index(focused_model)],
        fontsize=12,
    )
    # No legend here: the calling cell adds a single legend after all panels.


# One panel per helper model. The grid width follows len(MODELS): a fixed
# 4-column grid raises ValueError at the fifth model because the subplot index
# would exceed nrows * ncols.
n_panels = len(MODELS)
plt.figure(figsize=(3 * n_panels, 3))  # 3 inches per panel, i.e. (12, 3) for four models
for model_index, model in enumerate(MODELS):
    plt.subplot(1, n_panels, model_index + 1)
    plot_scores(model)

# A single legend, attached to the last panel drawn.
plt.legend()
plt.tight_layout();
# plt.savefig("../../figures/graph-rank-util.eps", format="eps");

## Re-rank + Exposure

Applying the exposure-discovery results to the final ranking. We do not move each element to its own specified exposure position; instead, we shift the sorted (top-k) list as a whole to the appropriate position. For example, for GPT-3.5 the sorted order alone is enough, whereas for GPT-4o-Mini we shift the whole sorted list to the right, so that the top-k elements move from the beginning of the list toward the middle.

In [4]:
def _degree_error(edges, vertex, god_model):
    """Ask ``god_model`` for the degree of ``vertex`` and return the absolute error.

    The prompt lists ``edges`` one per line, in the given order, followed by
    the degree question. The reference value comes from
    ``graph_utils.get_degree`` on the same edge list.

    Args:
        edges: Edge list (list of ``[u, v]`` pairs) in the order to present.
        vertex: Vertex whose degree is asked for.
        god_model: Model identifier passed to ``llm_api.ask``.

    Returns:
        ``abs(ground_truth - parsed_answer)``.
    """
    # The question text (including its spelling) is part of the experiment and
    # must stay byte-identical to keep results comparable with stored runs.
    questions = [
        "The following is a graph given as a list of edges:",
        "\n".join([str(e) for e in edges]),
        f"What is the degree of node {vertex}? Answer with a number without furthur explanations."
    ]
    response = llm_api.ask(questions, god_model)
    # NOTE(review): assumes take_out_number always returns a number; a
    # non-numeric reply would make the subtraction below fail. Confirm.
    response = llm_api.take_out_number(response)
    gt = graph_utils.get_degree(edges, vertex)
    return abs(gt - response)


exposure_errors = []
normal_errors = []

for _, row in tqdm(data.iterrows(), total=data.shape[0]):
    god_model = row["god_model"]
    generated_prompt = json.loads(row["generated_prompt"])

    # The random baseline is presented in a freshly shuffled order.
    # NOTE(review): no seed is set, so this order differs between runs.
    if row["algorithm"] == "no_opt":
        random.shuffle(generated_prompt)

    # Normal: query with the prompt order as generated.
    normal_error = _degree_error(generated_prompt, row["vertex"], god_model)
    normal_errors.append(normal_error)

    # No exposure re-alignment for gpt-3.5-turbo or for the random baseline:
    # the normal result is reused and no second query is sent.
    if god_model == "gpt-3.5-turbo" or row["algorithm"] == "no_opt":
        exposure_errors.append(normal_error)
        continue

    # Swap: exchange the first tenth of the edge list with the second tenth,
    # moving the leading block one window to the right, then query again.
    window = len(generated_prompt) // 10
    generated_prompt[:window], generated_prompt[window:2*window] = (
        generated_prompt[window:2*window],
        generated_prompt[:window],
    )
    exposure_errors.append(
        _degree_error(generated_prompt, row["vertex"], god_model)
    )

100%|██████████| 5600/5600 [1:09:29<00:00,  1.34it/s] 


In [6]:
# Persistence step for the two error lists computed by the exposure loop,
# left disabled.
# NOTE(review): the file name written here ("grpah_new.csv") differs from the
# one loaded in the following cell ("graph_result_exposure_aligend.csv");
# confirm how that file was produced before re-enabling this.
# data["normal_error"] = normal_errors
# data["exposure_error"] = exposure_errors
# data.to_csv("grpah_new.csv", index=0)

In [4]:
# Results including the normal_error / exposure_error columns. This rebinds
# `data`, replacing the frame loaded and modified by the earlier cells.
EXPOSURE_RESULTS_CSV = "graph_result_exposure_aligend.csv"

data = pd.read_csv(EXPOSURE_RESULTS_CSV)
data.head()

Unnamed: 0,model,edge_size,algorithm,vertex,diff,original_prompt,generated_prompt,god_model,normal_error,exposure_error
0,gemma2:9b,56,naive,0,0,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 18], [0, 3], [0, 5], [0, 10], [0, 15], [0...",gpt-3.5-turbo,0,0
1,gemma2:9b,56,opt,0,0,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 18], [0, 3], [0, 5], [0, 10], [0, 15], [0...",gpt-3.5-turbo,0,0
2,gemma2:9b,56,no_opt,0,1,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...",gpt-3.5-turbo,0,0
3,gemma2:9b,56,naive,5,1,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 5], [5, 17], [3, 5], [5, 19], [5, 15], [5...",gpt-3.5-turbo,1,1
4,gemma2:9b,56,opt,5,0,"[[0, 18], [16, 17], [10, 13], [11, 18], [1, 7]...","[[0, 5], [5, 17], [3, 5], [5, 19], [5, 15], [5...",gpt-3.5-turbo,0,0


In [None]:
def plot_error(model, god_model):
    """Plot mean exposure-aligned output error against edge count.

    Draws on the current matplotlib axes, one line per algorithm, for the rows
    matching the given helper model and god model. Reads the notebook globals
    ``data``, ``labels``, ``MODELS`` and ``MODEL_NAMES``.

    Args:
        model: Helper-model tag as stored in ``data["model"]``; it must be
            present in ``MODELS`` so the display name can be looked up.
        god_model: Value of ``data["god_model"]`` to filter on.

    Raises:
        ValueError: If ``model`` is not in ``MODELS``.
        KeyError: If an algorithm in the data has no entry in ``labels``.
    """
    subset = data[(data["model"] == model) & (data["god_model"] == god_model)]
    # Only exposure_error is plotted, so only that column is aggregated.
    mean_error = (
        subset.groupby(["edge_size", "algorithm"])["exposure_error"]
        .mean()
        .reset_index()
    )

    for algorithm in mean_error["algorithm"].unique():
        per_algorithm = mean_error[mean_error["algorithm"] == algorithm]
        plt.plot(
            # Edge sizes are plotted as strings so the x axis is categorical.
            [str(x) for x in per_algorithm["edge_size"]],
            per_algorithm["exposure_error"],
            label=labels[algorithm],
            marker="o",
            markersize=6,
        )

    plt.xlabel("# of edges", fontsize=12)
    # Raw strings: "\e" and "\m" are not valid Python escapes and trigger a
    # SyntaxWarning on recent interpreters when written in normal strings.
    plt.ylabel(r"Output Error ($\epsilon_{\mathcal{L}}$)", fontsize=12)
    plt.title(r"$\mathcal{H}= $" + MODEL_NAMES[MODELS.index(model)], fontsize=12)

# One panel per helper model. The grid width follows len(MODELS): a fixed
# 4-column grid raises ValueError at the fifth model because the subplot index
# would exceed nrows * ncols.
n_panels = len(MODELS)
plt.figure(figsize=(3 * n_panels, 3))  # 3 inches per panel, i.e. (12, 3) for four models

for model_index, model in enumerate(MODELS):
    plt.subplot(1, n_panels, model_index + 1)
    plot_error(model, god_model="gpt-3.5-turbo")

# A single legend, attached to the last panel drawn.
plt.legend()
plt.tight_layout();
# plt.savefig("../../figures/graph-output-error-gpt-3.eps", format="eps")