In [3]:
import json
import os
import pickle as pkl
import random
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from langchain.callbacks import get_openai_callback
from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
from tqdm.notebook import tqdm

from src.boardgameqa.code_prompt import CodePrompt
from src.boardgameqa.evaluation import evaluate
from src.utils import get_current_time, print_chain_stats

In [4]:
# DEFINE THESE VARIABLES
# Everything a reader may want to tune for a run lives in this cell.

# Dataset folder name (joined under the data directory by the run cell).
dataset_name = "BoardgameQA-Main-depth1"

# Model / deployment name handed to the chat client.
llm_name = "gpt-3.5-turbo-16k-0613"

# Number of in-context demonstrations for each stage of the prompt.
num_translation_demonstrations = 4
num_interpreter_demonstrations = 3

# Slice of the dev set to evaluate: valid[start_idx:end_idx].
# Leave start_idx at 0 unless you want to skip some examples.
start_idx = 0
num_dev_examples = 500  # how many dev examples to evaluate
end_idx = num_dev_examples + start_idx

# When True, outputs are written to a fresh run folder and the seed is set
# to the run number; when False nothing is written and the seed is 0.
save_results = False

In [None]:
# if you use Azure OpenAI Service
# Fill in the four connection settings below. They are exported as process
# environment variables (presumably picked up by AzureChatOpenAI — confirm
# against the installed LangChain version). Do not commit real values.
os.environ.update(
    {
        "OPENAI_API_TYPE": "",
        "OPENAI_API_VERSION": "",
        "OPENAI_API_BASE": "",
        "OPENAI_API_KEY": "",
    }
)

# NOTE(review): both `request_timeout` and `timeout` are supplied; in some
# LangChain releases one is an alias of the other — confirm which value
# (30 s or 180 s) actually takes effect.
azure_llm_kwargs = dict(
    deployment_name=llm_name,
    temperature=0.0,  # deterministic decoding
    request_timeout=30,
    max_retries=3,
    timeout=60 * 3,
)
llm = AzureChatOpenAI(**azure_llm_kwargs)

In [5]:
# if you use OpenAI API
# Paste your key here for a local run; never commit a notebook that contains
# a real key.
openai_api_key = ""

# NOTE(review): both `request_timeout` and `timeout` are supplied; in some
# LangChain releases one is an alias of the other — confirm which value
# (30 s or 180 s) actually takes effect.
openai_llm_kwargs = dict(
    api_key=openai_api_key,
    model=llm_name,
    temperature=0.0,  # deterministic decoding
    request_timeout=30,
    max_retries=3,
    timeout=60 * 3,
)
llm = ChatOpenAI(**openai_llm_kwargs)

In [None]:
## RUN THIS. DO NOT CHANGE
data_path = "data/BoardgameQA/"
dataset_dir = os.path.join(data_path, dataset_name)

# In-context demonstrations used to build the prompts.
with open(os.path.join(dataset_dir, "ICL_examples", "code.json")) as icl_file:
    icl_examples = json.load(icl_file)

# Dev split; only valid[start_idx:end_idx] is evaluated below.
with open(os.path.join(dataset_dir, "valid.json")) as valid_file:
    valid = json.load(valid_file)

# making output path: one folder level per experimental setting
output_path_parts = [
    "outputs/boardgameqa",
    dataset_name,
    "ICL/CodePrompt",
    llm_name,
    f"ICL_transl{num_translation_demonstrations}_interp{num_interpreter_demonstrations}",
    f"valid{start_idx}_{end_idx}",
]
output_path = os.path.join(*output_path_parts)

if save_results:
    # creating the base folder
    os.makedirs(output_path, exist_ok=True)
    # creating the run folder: the number of entries already in the base
    # folder becomes both the run number and the seed
    num_runs = len(glob(os.path.join(output_path, "*")))
    seed = num_runs
    output_path = os.path.join(output_path, f"run_{num_runs}")
    os.makedirs(output_path, exist_ok=True)
    print(f"Output path: {output_path}")
else:
    # nothing is saved, so the seed stays at its default
    seed = 0
random.seed(seed)

# creating model
model = CodePrompt(
    icl_examples,
    llm,
    num_translation_demonstrations=num_translation_demonstrations,
    num_interpreter_demonstrations=num_interpreter_demonstrations,
    seed=seed,
)

# Per-example usage statistics, appended to once per evaluated example.
openai_metadata = {
    metric: []
    for metric in (
        "completion_tokens",
        "total_cost",
        "total_tokens",
        "prompt_tokens",
    )
}

# Run the model on every selected dev example, collecting its outputs and the
# OpenAI usage statistics reported by the callback.
list_answers = []  # per example: idx, raw response, parsed answer, generated code
list_input_prompts = []  # per example: idx and the prompt returned by the model
pbar = tqdm(valid[start_idx:end_idx])
for idx, x in enumerate(pbar):
    # idx counts from 0 within the evaluated slice (it does not include start_idx)
    with get_openai_callback() as cb:
        input_text = model.create_input_text(x)
        try:
            response, code, input_prompt = model(input_text)
        except Exception as e:
            # Best effort: record an empty result for this example and move on.
            response = ""
            code = ""
            # input_prompt must be reset here too. Otherwise a failure on the
            # first example raises NameError below, and a failure on a later
            # example silently stores the previous example's prompt.
            input_prompt = None
            print(f"Error in example {idx}.", e)
        # Usage is recorded even for failed examples, since the failed call
        # may still have consumed tokens.
        openai_metadata["completion_tokens"].append(cb.completion_tokens)
        openai_metadata["total_cost"].append(cb.total_cost)
        openai_metadata["total_tokens"].append(cb.total_tokens)
        openai_metadata["prompt_tokens"].append(cb.prompt_tokens)

    list_answers.append(
        {
            "idx": idx,
            "response": response,
            "answer": model.process_response(response),
            "code": code,
        }
    )
    list_input_prompts.append({"idx": idx, "input_prompt": input_prompt})
    # Show the running cost in the progress bar.
    pbar.set_description(
        f"Current total cost: {sum(openai_metadata['total_cost']):.2f}"
    )

# evaluation
# Score the parsed answers against the same dev slice that was run above.
list_predictions = [record["answer"] for record in list_answers]
results = evaluate(valid[start_idx:end_idx], list_predictions)

if save_results:
    # store outputs
    with open(os.path.join(output_path, "output.json"), "w") as f:
        json.dump(list_answers, f)
    # store input prompts (pickled rather than JSON-encoded)
    with open(os.path.join(output_path, "input_prompts.pkl"), "wb") as f:
        pkl.dump(list_input_prompts, f)
    # store openai metadata, then results, both pretty-printed
    for file_name, payload in (
        ("openai_metadata.json", openai_metadata),
        ("results.json", results),
    ):
        with open(os.path.join(output_path, file_name), "w") as f:
            json.dump(payload, f, indent=4)
    # store timestamp
    with open(os.path.join(output_path, "timestamp.txt"), "w") as f:
        f.write(get_current_time())

# Summary printed to the cell output regardless of save_results.
print("## OpenAI Metadata ##")
print_chain_stats(openai_metadata)
print("\n\n## Results ##", json.dumps(results, indent=4), sep="\n")
print("\n\n## Timestamp ##")
print(get_current_time())

# plot confusion matrix
# Each row is divided by its own sum, so cells show the fraction of that row.
# A row that sums to zero divides 0 by 0 and shows up as NaN.
cm = np.array(results["confusion_matrix"])
cm = cm / cm.sum(axis=1, keepdims=True)

# classes are No, Unknown, Yes
class_names = ["No", "Unknown", "Yes"]

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    cm,
    annot=True,
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix")
# save figure
if save_results:
    fig.savefig(os.path.join(output_path, "confusion_matrix.png"))
plt.show()