In [6]:
from qrel_reconstruction import *
import os
import numpy as np
import pandas as pd
import scipy.stats as stats
from dotenv import load_dotenv

# Populate os.environ from a local .env file; the per-run token counts used by
# the cost calculations in the metric cells are looked up via os.getenv.
load_dotenv()

# Dataset handed to every load_experiments(...) call in the cells below.
# NOTE(review): the two arguments look like (CSV path, dataset identifier) --
# confirm against qrel_reconstruction.load_dataset.
msmarco_passage_v2 = load_dataset(
    "data/msmarco-passage-v2-original.csv",
    "msmarco-passage-v2/trec-dl-2021",
)

In [55]:
# Count how often each 'Relevance Actual' value occurs in the leading slice of
# the dataset. `.loc` slicing is label-based and end-inclusive, so (assuming the
# default integer index) labels 0..1000 are selected -- 1001 rows, not 1000.
relevance_labels = msmarco_passage_v2.loc[:1000, 'Relevance Actual']
np.bincount(relevance_labels)

array([411, 219, 213, 158], dtype=int64)

In [10]:
# Metrics for numerical zeroshot

run_name = "gpt3.5-zeroshot"
experiment = load_experiments(f"data/{run_name}.csv", msmarco_passage_v2)

# Remove unpredicted qrels (rows whose prediction is the sentinel -1).
# NOTE(review): `.loc[:n]` is label-based and end-inclusive, so this keeps rows
# up to and including index label `num_predictions`, not "the first n rows".
num_predictions = 1000
has_prediction = experiment['Relevance Predicted'] != -1
experiment = experiment.loc[has_prediction].loc[:num_predictions]

actual = np.array(experiment['Relevance Actual'])
predicted = np.array(experiment['Relevance Predicted'])

# Binary view: any non-zero value counts as True.
actual_binary = actual.astype(bool)
predicted_binary = predicted.astype(bool)

# Agreement rates (exact value match, and match after binarisation).
exact_match = actual == predicted
binary_match = actual_binary == predicted_binary
print(f'Accuracy: {100*(np.average(exact_match))}%')
print(f'Binary Accuracy: {100*(np.average(binary_match))}%')

# Mean absolute difference, and mean signed difference (positive means the
# prediction is higher than the actual value on average).
absolute_error = np.abs(actual-predicted)
signed_offset = predicted-actual
print(f'Average error: {np.average(absolute_error)}')
print(f'Average offset: {np.average(signed_offset)}')

# Token usage comes from environment variables named after the run; a missing
# variable counts as 0 tokens.
prompt_tokens = int(os.getenv(f"{run_name}_prompt_tokens", 0))
completion_tokens = int(os.getenv(f"{run_name}_completion_tokens", 0))

# presumably the price per 1K prompt / completion tokens -- TODO confirm source
prompt_rate, completion_rate = 0.0015, 0.002
cost = prompt_rate*(prompt_tokens/1000) + completion_rate*(completion_tokens/1000)

print(f"Cost: {cost}")

Accuracy: 45.096056622851364%
Binary Accuracy: 72.80080889787665%
Average error: 0.7492416582406471
Average offset: 0.43174924165824063
Cost: 0.0


In [18]:
# Metrics for numerical fewshot

run_name = "gpt3.5-fewshot"
experiment = load_experiments(f"data/{run_name}.csv", msmarco_passage_v2)

# Remove unpredicted qrels (rows whose prediction is the sentinel -1).
# NOTE(review): `.loc[:n]` is label-based and end-inclusive, so this keeps rows
# up to and including index label `num_predictions`, not "the first n rows".
num_predictions = 1000
has_prediction = experiment['Relevance Predicted'] != -1
experiment = experiment.loc[has_prediction].loc[:num_predictions]

actual = np.array(experiment['Relevance Actual'])
predicted = np.array(experiment['Relevance Predicted'])

# Binary view: any non-zero value counts as True.
actual_binary = actual.astype(bool)
predicted_binary = predicted.astype(bool)

# Agreement rates (exact value match, and match after binarisation).
exact_match = actual == predicted
binary_match = actual_binary == predicted_binary
print(f'Accuracy: {100*(np.average(exact_match))}%')
print(f'Binary Accuracy: {100*(np.average(binary_match))}%')

# Mean absolute difference, and mean signed difference (positive means the
# prediction is higher than the actual value on average).
absolute_error = np.abs(actual-predicted)
signed_offset = predicted-actual
print(f'Average error: {np.average(absolute_error)}')
print(f'Average offset: {np.average(signed_offset)}')

# Token usage comes from environment variables named after the run; a missing
# variable counts as 0 tokens.
prompt_tokens = int(os.getenv(f"{run_name}_prompt_tokens", 0))
completion_tokens = int(os.getenv(f"{run_name}_completion_tokens", 0))

# presumably the price per 1K prompt / completion tokens -- TODO confirm source
prompt_rate, completion_rate = 0.0015, 0.002
cost = prompt_rate*(prompt_tokens/1000) + completion_rate*(completion_tokens/1000)

print(f"Cost: {cost}")

Accuracy: 40.24266936299292%
Binary Accuracy: 68.14964610717897%
Average error: 0.8048533872598584
Average offset: 0.564206268958544
Cost: 0.0


In [49]:
# Metrics for numerical reversed

run_name = "gpt3.5-reversed"
experiment = load_experiments(f"data/{run_name}.csv", msmarco_passage_v2)

# Remove unpredicted qrels (here a missing prediction is a null value).
# NOTE(review): `.loc[:n]` is label-based and end-inclusive, so this keeps rows
# up to and including index label `num_predictions`, not "the first n rows".
num_predictions = 1000
has_prediction = experiment['Relevance Predicted'].notnull()
experiment = experiment.loc[has_prediction].loc[:num_predictions]

actual = np.array(experiment['Relevance Actual'])
predicted = np.array(experiment['Relevance Predicted'])

# Binary view: any non-zero value counts as True.
actual_binary = actual.astype(bool)
predicted_binary = predicted.astype(bool)

# Agreement rates (exact value match, and match after binarisation).
exact_match = actual == predicted
binary_match = actual_binary == predicted_binary
print(f'Accuracy: {100*(np.average(exact_match))}%')
print(f'Binary Accuracy: {100*(np.average(binary_match))}%')

# Mean absolute difference, and mean signed difference (positive means the
# prediction is higher than the actual value on average).
absolute_error = np.abs(actual-predicted)
signed_offset = predicted-actual
print(f'Average error: {np.average(absolute_error)}')
print(f'Average offset: {np.average(signed_offset)}')

# Token usage comes from environment variables named after the run; a missing
# variable counts as 0 tokens.
prompt_tokens = int(os.getenv(f"{run_name}_prompt_tokens", 0))
completion_tokens = int(os.getenv(f"{run_name}_completion_tokens", 0))

# presumably the price per 1K prompt / completion tokens -- TODO confirm source
prompt_rate, completion_rate = 0.0015, 0.002
cost = prompt_rate*(prompt_tokens/1000) + completion_rate*(completion_tokens/1000)

print(f"Cost: {cost}")

Accuracy: 53.93794749403341%
Binary Accuracy: 83.05489260143199%
Average error: 0.6085918854415274
Average offset: 0.021479713603818614
Cost: 0.004547000000000001


In [31]:
# Metrics for numerical scaled

run_name = "gpt3.5-scaled"
experiment = load_experiments(f"data/{run_name}.csv", msmarco_passage_v2)

# Remove unpredicted qrels (here a missing prediction is a null value).
# NOTE(review): `.loc[:n]` is label-based and end-inclusive, so this keeps rows
# up to and including index label `num_predictions`, not "the first n rows".
num_predictions = 1000
has_prediction = experiment['Relevance Predicted'].notnull()
experiment = experiment.loc[has_prediction].loc[:num_predictions]

actual = np.array(experiment['Relevance Actual'])
predicted = np.array(experiment['Relevance Predicted'])

# Rescale into 0-3: shift so `low` maps to 0, stretch so `high` maps to 3,
# then round to the nearest whole value.
low = 1
high = 10
scale = 3/(high-low)
predicted = np.round((predicted-low)*scale)

# Binary view: any non-zero value counts as True.
actual_binary = actual.astype(bool)
predicted_binary = predicted.astype(bool)

# Agreement rates (exact value match, and match after binarisation).
exact_match = actual == predicted
binary_match = actual_binary == predicted_binary
print(f'Accuracy: {100*(np.average(exact_match))}%')
print(f'Binary Accuracy: {100*(np.average(binary_match))}%')

# Mean absolute difference, and mean signed difference (positive means the
# prediction is higher than the actual value on average).
absolute_error = np.abs(actual-predicted)
signed_offset = predicted-actual
print(f'Average error: {np.average(absolute_error)}')
print(f'Average offset: {np.average(signed_offset)}')

# Token usage comes from environment variables named after the run; a missing
# variable counts as 0 tokens.
prompt_tokens = int(os.getenv(f"{run_name}_prompt_tokens", 0))
completion_tokens = int(os.getenv(f"{run_name}_completion_tokens", 0))

# presumably the price per 1K prompt / completion tokens -- TODO confirm source
prompt_rate, completion_rate = 0.0015, 0.002
cost = prompt_rate*(prompt_tokens/1000) + completion_rate*(completion_tokens/1000)

print(f"Cost: {cost}")

Accuracy: 45.39939332659252%
Binary Accuracy: 75.12639029322547%
Average error: 0.8372093023255814
Average offset: 0.6491405460060667
Cost: 0.2403795


In [30]:
# Metrics for numerical symmetrical

run_name = "gpt3.5-symetrical"
experiment = load_experiments(f"data/{run_name}.csv", msmarco_passage_v2)

# Remove unpredicted qrels (here a missing prediction is a null value).
# NOTE(review): `.loc[:n]` is label-based and end-inclusive, so this keeps rows
# up to and including index label `num_predictions`, not "the first n rows".
num_predictions = 1000
has_prediction = experiment['Relevance Predicted'].notnull()
experiment = experiment.loc[has_prediction].loc[:num_predictions]

actual = np.array(experiment['Relevance Actual'])
predicted = np.array(experiment['Relevance Predicted'])

# Rescale into 0-3: shift so `low` maps to 0, stretch so `high` maps to 3,
# then round to the nearest whole value.
# NOTE(review): with low=-3 and high=3 the scale factor is 0.5, so some inputs
# land exactly on .5; np.round sends those to the nearest EVEN value
# (0.5 -> 0, 1.5 -> 2, 2.5 -> 2). Confirm that mapping is what is wanted.
low = -3
high = 3
scale = 3/(high-low)
predicted = np.round((predicted-low)*scale)

# Binary view: any non-zero value counts as True.
actual_binary = actual.astype(bool)
predicted_binary = predicted.astype(bool)

# Agreement rates (exact value match, and match after binarisation).
exact_match = actual == predicted
binary_match = actual_binary == predicted_binary
print(f'Accuracy: {100*(np.average(exact_match))}%')
print(f'Binary Accuracy: {100*(np.average(binary_match))}%')

# Mean absolute difference, and mean signed difference (positive means the
# prediction is higher than the actual value on average).
absolute_error = np.abs(actual-predicted)
signed_offset = predicted-actual
print(f'Average error: {np.average(absolute_error)}')
print(f'Average offset: {np.average(signed_offset)}')

# Token usage comes from environment variables named after the run; a missing
# variable counts as 0 tokens.
prompt_tokens = int(os.getenv(f"{run_name}_prompt_tokens", 0))
completion_tokens = int(os.getenv(f"{run_name}_completion_tokens", 0))

# presumably the price per 1K prompt / completion tokens -- TODO confirm source
prompt_rate, completion_rate = 0.0015, 0.002
cost = prompt_rate*(prompt_tokens/1000) + completion_rate*(completion_tokens/1000)

print(f"Cost: {cost}")

Accuracy: 30.434782608695656%
Binary Accuracy: 65.62184024266936%
Average error: 1.1749241658240648
Average offset: 1.1081900910010112
Cost: 0.344403


In [32]:
# Metrics for numerical step-by-step

run_name = "gpt3.5-step_by_step"
experiment = load_experiments(f"data/{run_name}.csv", msmarco_passage_v2)

# Remove unpredicted qrels (rows whose prediction is the sentinel -1).
# NOTE(review): `.loc[:n]` is label-based and end-inclusive, so this keeps rows
# up to and including index label `num_predictions`, not "the first n rows".
num_predictions = 1000
has_prediction = experiment['Relevance Predicted'] != -1
experiment = experiment.loc[has_prediction].loc[:num_predictions]

actual = np.array(experiment['Relevance Actual'])
predicted = np.array(experiment['Relevance Predicted'])

# Binary view: any non-zero value counts as True.
actual_binary = actual.astype(bool)
predicted_binary = predicted.astype(bool)

# Agreement rates (exact value match, and match after binarisation).
exact_match = actual == predicted
binary_match = actual_binary == predicted_binary
print(f'Accuracy: {100*(np.average(exact_match))}%')
print(f'Binary Accuracy: {100*(np.average(binary_match))}%')

# Mean absolute difference, and mean signed difference (positive means the
# prediction is higher than the actual value on average).
absolute_error = np.abs(actual-predicted)
signed_offset = predicted-actual
print(f'Average error: {np.average(absolute_error)}')
print(f'Average offset: {np.average(signed_offset)}')

# Token usage comes from environment variables named after the run; a missing
# variable counts as 0 tokens.
prompt_tokens = int(os.getenv(f"{run_name}_prompt_tokens", 0))
completion_tokens = int(os.getenv(f"{run_name}_completion_tokens", 0))

# presumably the price per 1K prompt / completion tokens -- TODO confirm source
prompt_rate, completion_rate = 0.0015, 0.002
cost = prompt_rate*(prompt_tokens/1000) + completion_rate*(completion_tokens/1000)

print(f"Cost: {cost}")

Accuracy: 40.950455005055616%
Binary Accuracy: 70.07077856420626%
Average error: 0.8291203235591507
Average offset: 0.5904954499494439
Cost: 0.5533605


In [42]:
# Metrics for numerical phrase based

run_name = "gpt3.5-zeroshot-phrase"
experiment = load_experiments(f"data/{run_name}.csv", msmarco_passage_v2)

# Remove unpredicted qrels (rows whose prediction is the sentinel -1).
# NOTE(review): `.loc[:n]` is label-based and end-inclusive, so this keeps rows
# up to and including index label `num_predictions`, not "the first n rows".
num_predictions = 1000
has_prediction = experiment['Relevance Predicted'] != -1
experiment = experiment.loc[has_prediction].loc[:num_predictions]

actual = np.array(experiment['Relevance Actual'])
predicted = np.array(experiment['Relevance Predicted'])

# map phrases to numeric values; a phrase missing from this table raises
# KeyError in the lookup below.
phrase_to_score = {
    "Irrelevant": 0,
    "Related": 1,
    "Highly": 2,
    "Perfectly": 3,
}

predicted = np.array([phrase_to_score[phrase] for phrase in predicted])

# Binary view: any non-zero value counts as True.
actual_binary = actual.astype(bool)
predicted_binary = predicted.astype(bool)

# Agreement rates (exact value match, and match after binarisation).
exact_match = actual == predicted
binary_match = actual_binary == predicted_binary
print(f'Accuracy: {100*(np.average(exact_match))}%')
print(f'Binary Accuracy: {100*(np.average(binary_match))}%')

# Mean absolute difference, and mean signed difference (positive means the
# prediction is higher than the actual value on average).
absolute_error = np.abs(actual-predicted)
signed_offset = predicted-actual
print(f'Average error: {np.average(absolute_error)}')
print(f'Average offset: {np.average(signed_offset)}')

# Token usage comes from environment variables named after the run; a missing
# variable counts as 0 tokens.
prompt_tokens = int(os.getenv(f"{run_name}_prompt_tokens", 0))
completion_tokens = int(os.getenv(f"{run_name}_completion_tokens", 0))

# presumably the price per 1K prompt / completion tokens -- TODO confirm source
prompt_rate, completion_rate = 0.0015, 0.002
cost = prompt_rate*(prompt_tokens/1000) + completion_rate*(completion_tokens/1000)

print(f"Cost: {cost}")

Accuracy: 36.5015166835187%
Binary Accuracy: 68.95854398382204%
Average error: 0.7512639029322548
Average offset: 0.32861476238624876
Cost: 0.0


In [46]:
# Metrics for simple Y/N

run_name = "gpt3.5-zeroshot-yn"
experiment = load_experiments(f"data/{run_name}.csv", msmarco_passage_v2)

# Remove unpredicted qrels (rows whose prediction is the sentinel -1).
# NOTE(review): `.loc[:n]` is label-based and end-inclusive, so this keeps rows
# up to and including index label `num_predictions`, not "the first n rows".
num_predictions = 1000
has_prediction = experiment['Relevance Predicted'] != -1
experiment = experiment.loc[has_prediction].loc[:num_predictions]

actual = np.array(experiment['Relevance Actual'])
# Predictions are forced to an integer array for this run.
predicted = np.array(experiment['Relevance Predicted'], dtype=int)

# Binary view: any non-zero value counts as True.
actual_binary = actual.astype(bool)
predicted_binary = predicted.astype(bool)

# Only the binarised agreement rate is reported for this run.
binary_match = actual_binary == predicted_binary
print(f'Binary Accuracy: {100*(np.average(binary_match))}%')

# NOTE(review): these compare the raw prediction with the raw 'Relevance
# Actual' value; if the latter is a multi-level grade while the prediction is
# yes/no, the two numbers mix scales -- confirm they are meaningful here.
absolute_error = np.abs(actual-predicted)
signed_offset = predicted-actual
print(f'Average error: {np.average(absolute_error)}')
print(f'Average offset: {np.average(signed_offset)}')

# Token usage comes from environment variables named after the run; a missing
# variable counts as 0 tokens.
prompt_tokens = int(os.getenv(f"{run_name}_prompt_tokens", 0))
completion_tokens = int(os.getenv(f"{run_name}_completion_tokens", 0))

# presumably the price per 1K prompt / completion tokens -- TODO confirm source
prompt_rate, completion_rate = 0.0015, 0.002
cost = prompt_rate*(prompt_tokens/1000) + completion_rate*(completion_tokens/1000)

print(f"Cost: {cost}")

Binary Accuracy: 76.54196157735086%
Average error: 0.7623862487360971
Average offset: -0.3619817997977755
Cost: 0.0


In [48]:
# Metrics for explained Y/N

run_name = "gpt3.5-zeroshot-yn-explained"
experiment = load_experiments(f"data/{run_name}.csv", msmarco_passage_v2)

# Remove unpredicted qrels (here a missing prediction is a null value).
# NOTE(review): `.loc[:n]` is label-based and end-inclusive, so this keeps rows
# up to and including index label `num_predictions`, not "the first n rows".
num_predictions = 1000
has_prediction = experiment['Relevance Predicted'].notnull()
experiment = experiment.loc[has_prediction].loc[:num_predictions]

actual = np.array(experiment['Relevance Actual'])
# Predictions are forced to an integer array for this run.
predicted = np.array(experiment['Relevance Predicted'], dtype=int)

# Binary view: any non-zero value counts as True.
actual_binary = np.array(actual, dtype=bool)
predicted_binary = np.array(predicted, dtype=bool)

# Compare boolean against boolean, as the simple Y/N cell does. The previous
# comparison against the raw integer `predicted` only gave the same answer
# while every prediction was exactly 0 or 1 (True == 2 is False).
print(f'Binary Accuracy: {100*(np.average(actual_binary==predicted_binary))}%')

# NOTE(review): these compare the raw prediction with the raw 'Relevance
# Actual' value; if the latter is a multi-level grade while the prediction is
# yes/no, the two numbers mix scales -- confirm they are meaningful here.
print(f'Average error: {np.average(np.abs(actual-predicted))}')
print(f'Average offset: {np.average(predicted-actual)}')


# Token usage comes from environment variables named after the run; a missing
# variable counts as 0 tokens.
prompt_tokens = int(os.getenv(f"{run_name}_prompt_tokens", 0))
completion_tokens = int(os.getenv(f"{run_name}_completion_tokens", 0))

# presumably the price per 1K prompt / completion tokens -- TODO confirm source
cost = 0.0015*(prompt_tokens/1000) + 0.002*(completion_tokens/1000)

print(f"Cost: {cost}")

Binary Accuracy: 73.88663967611336%
Average error: 0.7894736842105263
Average offset: -0.32186234817813764
Cost: 0.0
