In [1]:
import json
import os
from glob import glob

import torch
from langchain_community.chat_models import AzureChatOpenAI
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from tqdm.notebook import tqdm


from src.sharc.code_prompt import CodePrompt
from src.sharc.evaluation import evaluate, save_confusion_matrix
from src.sharc.utils import SharcLabel, create_conv_history, get_sharc_label
from src.utils import get_current_time

In [2]:
# Experiment knobs shared by the cells below.
# Both values are passed to CodePrompt and also appear in the output directory name.
num_inference_demonstrations_per_class = 2
seed = 0

In [3]:
# Load the ShARC train and dev splits from their JSON dumps.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding (which is not UTF-8 on every system).
with open("data/sharc1-official/json/sharc_train.json", encoding="utf-8") as f:
    train = json.load(f)

with open("data/sharc1-official/json/sharc_dev_ih.json", encoding="utf-8") as f:
    dev = json.load(f)

In [None]:
# Azure OpenAI connection settings.
# NOTE(review): presumably these are the variables AzureChatOpenAI reads for its
# credentials/endpoint — confirm against the installed langchain version.
#
# `setdefault` only fills in a value when the variable is not already set, so
# credentials exported in the shell (or by a previous cell) are no longer wiped
# out with empty strings. Replace the "" placeholders, or better, export the
# variables outside the notebook so secrets never end up in the saved file.
os.environ.setdefault("OPENAI_API_TYPE", "")
os.environ.setdefault("OPENAI_API_VERSION", "")
os.environ.setdefault("OPENAI_API_BASE", "")
os.environ.setdefault("OPENAI_API_KEY", "")

# The deployment name doubles as a path component of the output directory.
llm_name = "gpt-35-turbo-0301"
# Temperature is set to 0.0 for the whole run.
llm = AzureChatOpenAI(deployment_name=llm_name, temperature=0.0)

In [6]:
# Load the in-context demonstrations and rewrite their labels as the strings
# "True" / "False" / "None".
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with open("data/sharc1-official/ICL/nl2code/examples.json", encoding="utf-8") as f:
    demonstrations = json.load(f)

# "Yes" -> "True", "No" -> "False"; every other label collapses to "None".
_label_as_code = {"Yes": "True", "No": "False"}
for x in demonstrations:
    x["label"] = _label_as_code.get(x["label"], "None")

In [7]:
# Build the CodePrompt pipeline from the LLM and the demonstrations loaded above.
# NOTE(review): the keyword `num_nl2code_demonstations` is spelled exactly as in
# the original call — presumably it matches CodePrompt's signature; confirm
# before "fixing" the spelling here.
code_prompt_options = {
    "num_nl2code_demonstations": 5,
    "num_inference_demonstrations_per_class": num_inference_demonstrations_per_class,
    "seed": seed,
}
model = CodePrompt(llm, demonstrations, **code_prompt_options)

In [8]:
# Number of dev examples handed to the model per call in the inference loop.
batch_size = 1

In [None]:
# Run the model over the dev set in slices of `batch_size` and collect the gold
# labels, raw responses, parsed predictions and prompts.
list_y = []
list_responses = []
list_intermediate_responses = []  # never filled in this cell; kept so the name stays defined
list_y_pred = []
list_prompts = []
for i in tqdm(range(0, len(dev), batch_size)):
    batch = dev[i : i + batch_size]
    list_y.extend([get_sharc_label(x["answer"]) for x in batch])

    # Build the model inputs as fresh dicts instead of editing the `dev` records
    # in place. Overwriting x["history"] made the cell non-idempotent: a second
    # run would pass the already-converted history to create_conv_history again.
    # Only question, scenario, doc and history are forwarded to the model.
    clean_batch = [
        {
            "question": x["question"],
            "scenario": x["scenario"],
            "doc": x["snippet"],  # the chain calls it doc
            "history": create_conv_history(x["history"]),  # flatten the list
        }
        for x in batch
    ]

    responses, prompts = model(clean_batch)

    list_responses.extend(responses)
    for r in responses:
        try:
            list_y_pred.append(model.process_response(r))
        except Exception:
            # A response that cannot be parsed is scored as NOT_ENOUGH_INFO.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so the run can still be stopped by hand.
            list_y_pred.append(SharcLabel.NOT_ENOUGH_INFO)
    list_prompts.extend(prompts)


In [None]:
# Score the parsed predictions (list_y_pred) against the gold labels (list_y).
# `results` is later written to results.json and its "confusion_matrix" entry is
# passed to save_confusion_matrix.
results = evaluate(list_y, list_y_pred)

In [11]:
# Directory for this run's artifacts, keyed by model name, demonstration
# setting, evaluation split and seed.
# NOTE(review): the "5" in "ICL_5-..." is hard-coded; it looks like it mirrors
# num_nl2code_demonstations=5 passed to CodePrompt — keep the two in sync.
run_dir_parts = [
    "outputs/sharc/ICL/CodePrompt",
    llm_name,
    f"ICL_5-{num_inference_demonstrations_per_class}",
    "sharc_dev_ih",
    f"seed_{seed}",
]
output_path = os.path.join(*run_dir_parts)

# Create the full directory chain; an already existing directory is fine.
os.makedirs(output_path, exist_ok=True)

In [None]:
# Persist the run's artifacts under output_path, then echo the metrics.
# NOTE(review): list_y_pred can contain SharcLabel members; json.dump only
# accepts them if that type is JSON-serializable — confirm.
json_artifacts = (
    ("responses.json", list_responses, {}),
    ("predictions.json", list_y_pred, {}),
    ("results.json", results, {"indent": 4}),  # only the metrics are pretty-printed
)
for file_name, payload, dump_options in json_artifacts:
    with open(os.path.join(output_path, file_name), "w") as f:
        json.dump(payload, f, **dump_options)

# Record when this run was saved.
with open(os.path.join(output_path, "timestamp.txt"), "w") as f:
    f.write(get_current_time())

# Hand the confusion matrix to the project helper together with the run directory.
save_confusion_matrix(results["confusion_matrix"], output_path)

print("\n\n## Results ##")
print(json.dumps(results, indent=4))