In [None]:
import os
import json
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from datasets import load_dataset
from utils.func import read_jsonl, softmax
from utils.metric import evaluate

# Select which model's outputs to analyse. `fix` is an optional suffix that is
# spliced into the file name right after "MathV" (empty string -> MathV.jsonl).
model_name, fix = "MiniGPT4", ""
data = read_jsonl(f"./output/{model_name}/MathV{fix}.jsonl")
len(data)

In [None]:
# Load the `testmini` split of the AI4Math/MathVista benchmark; the trailing
# `len(dataset)` displays how many problems it contains.
dataset = load_dataset("AI4Math/MathVista", split='testmini')
len(dataset)

In [None]:
# Merge each benchmark problem with the model's response and write the result
# as a JSON object keyed by 1-based problem id. The file is only created when
# it does not exist yet, so a previously scored file is never overwritten.
output_path = f"./output/{model_name}/MathV{fix}_output.json"
if not os.path.exists(output_path):
    res = {}
    for pid in range(len(dataset)):
        dic = dataset[pid]
        # The decoded image is dropped before the record is dumped to JSON.
        del dic['decoded_image']
        # Assumes `data` is in the same order as `dataset` -- TODO confirm.
        dic['response'] = data[pid]['response']
        res[pid+1] = dic

    # Use a context manager so the handle is flushed and closed deterministically.
    with open(output_path, 'w') as fout:
        json.dump(res, fout)

In [None]:
# Assemble the shell command for the answer-extraction script (it is only
# printed here, not executed) and show it so it can be copied into a terminal.
extract_cmd = f"""python extract_answer.py \\
    --output_dir "../../TowardsTrustworthy/output/{model_name}/" \\
    --output_file "MathV{fix}_output.json" \\
    --llm_engine "gpt-4-0125-preview" """
print(extract_cmd)

In [None]:
# Print (not execute) the shell command for the scoring script.
# The output directory is resolved from this notebook's own ./output folder
# instead of a machine-specific absolute path, so the printed command is valid
# on any checkout regardless of where the scoring script is run from.
output_dir = os.path.abspath(f"./output/{model_name}")
print(f"""python calculate_score.py \\
    --output_dir "{output_dir}/" \\
    --output_file "MathV{fix}_output.json" """)

### Please evaluate the results using the code provided by the MathVista repo
After that, you can run the following code.

In [None]:
# Reload the results file (a JSON object indexed by string problem id) and the
# per-response records that carry the `logits` features.
# The context manager closes the file handle instead of leaking it.
with open(f"./output/{model_name}/MathV{fix}_output.json") as fin:
    data = json.load(fin)
logits = read_jsonl(f"./output/{model_name}/MathV{fix}.jsonl")

In [None]:
# Features: one `logits` entry per record.
X = np.array([ins['logits'] for ins in logits])
# Labels: 1 when the record's `true_false` field is truthy, else 0. Problem ids
# are 1-based string keys. The range is sized from X rather than a hard-coded
# 1000 so the labels always line up with the feature rows.
y = np.array([1 if data[str(i)]["true_false"] else 0 for i in range(1, len(X) + 1)])

In [None]:
# Fit a logistic-regression probe on the logits with 10-fold cross-validation
# and report AUROC, accuracy and F1.
model = LogisticRegression()
res = cross_validate(model, X, y, cv=10, scoring=('roc_auc', 'accuracy', 'f1'))

# (result key, label used in the summary line)
metric_table = [('test_roc_auc', 'AUROC'), ('test_accuracy', 'ACC'), ('test_f1', 'F1')]

# Per-fold scores first ...
for key, _ in metric_table:
    print(res[key])

# ... then the mean of each metric as a percentage.
for key, label in metric_table:
    print(f"{label}: {np.mean(res[key])*100:.2f}")

Please use the LVLMs to self-evaluate their solutions, then run the following code.

In [None]:
# Repeat the probing experiment on the self-evaluation outputs: same labels
# from the results file, features from the `_self_eval` records.
# The context manager closes the file handle instead of leaking it.
with open(f"./output/{model_name}/MathV{fix}_output.json") as fin:
    data = json.load(fin)
logits = read_jsonl(f"./output/{model_name}/MathV{fix}_self_eval.jsonl")

X = np.array([ins['logits'] for ins in logits])
# Problem ids are 1-based string keys. The range is sized from X rather than a
# hard-coded 1000 so the labels always line up with the feature rows.
y = np.array([1 if data[str(i)]["true_false"] else 0 for i in range(1, len(X) + 1)])

# 10-fold cross-validated logistic regression; per-fold scores are printed first.
model = LogisticRegression()
res = cross_validate(model, X, y, cv=10, scoring=('roc_auc', 'accuracy', 'f1'))
print(res['test_roc_auc'])
print(res['test_accuracy'])
print(res['test_f1'])

# Mean of each metric across folds, as a percentage.
print(f"AUROC: {np.mean(res['test_roc_auc'])*100:.2f}")
print(f"ACC: {np.mean(res['test_accuracy'])*100:.2f}")
print(f"F1: {np.mean(res['test_f1'])*100:.2f}")