### End2end_SP

In [None]:
import json, jsonlines, re, sys, tqdm, os, nltk, time
from collections import Counter
import evaluate
from openai import OpenAI

# --- Configuration: the values a reader is most likely to change ---
model_name = "gpt-4o-mini"
data_file = "/home/seonjeongh/DCAQG/box/data/ReCo.dcqg.test.json"
batch_nickname = "end2end_CoT_v2"

# Read the API key from disk. The file is closed explicitly and surrounding
# whitespace is stripped: a trailing newline in the key file would otherwise
# end up inside the key string passed to the client.
with open("/home/seonjeongh/DCAQG/api_key.txt") as key_file:
    OPEN_KEY = key_file.read().strip()

# run_batch() and save_result() look up this module-level `client`; without it
# they fail with NameError on a fresh kernel.
client = OpenAI(api_key=OPEN_KEY)

# The prompt template and the output directory are both derived from the
# batch nickname.
with open(f"prompts/{batch_nickname}.txt") as prompt_file:
    prompt_template = prompt_file.read()
os.makedirs(f"outputs/{batch_nickname}", exist_ok=True)

def save_input_batch_file(input_batch_name, batch_inputs):
    """Write the batch requests to ``input_batch_name`` in JSON Lines format.

    Args:
        input_batch_name: Path of the file to create (overwritten if present).
        batch_inputs: Iterable of JSON-serialisable request objects; each one
            becomes exactly one line of the file.
    """
    with open(input_batch_name, "w", encoding="UTF-8") as out_file:
        for request in batch_inputs:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            serialized = json.dumps(request, ensure_ascii=False)
            out_file.write(serialized + "\n")
        
def run_batch(input_batch_name, batch_nickname, poll_interval=30):
    """Upload a batch input file, start a batch job and wait for it to finish.

    Uses the module-level ``client`` for all API calls.

    Args:
        input_batch_name: Path of the JSONL request file to upload.
        batch_nickname: Stored as the batch's ``description`` metadata.
        poll_interval: Seconds to sleep between status checks.

    Returns:
        The ``output_file_id`` of the completed batch.

    Raises:
        RuntimeError: If the batch ends in a non-successful terminal status,
            or completes without an output file.
    """
    print("### Run Batch")
    # Close the upload handle as soon as the upload call returns.
    with open(input_batch_name, "rb") as batch_file:
        batch_input_file = client.files.create(
            file=batch_file,
            purpose="batch"
        )

    batch = client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": batch_nickname
        }
    )

    # Statuses after which the batch will never reach "completed"; polling on
    # any of them would otherwise spin forever.
    unsuccessful_statuses = ("failed", "expired", "cancelled")

    while True:
        batch_current = client.batches.retrieve(batch.id)
        status = batch_current.status
        print(status)
        if status == "completed":
            output_file_id = batch_current.output_file_id
            if output_file_id is None:
                # NOTE(review): presumably happens when every request in the
                # batch errored -- confirm against the Batch API reference.
                raise RuntimeError(
                    f"Batch {batch.id} completed without an output file"
                )
            return output_file_id
        if status in unsuccessful_statuses:
            raise RuntimeError(f"Batch {batch.id} ended with status '{status}'")
        time.sleep(poll_interval)

def save_result(output_file_id, batch_nickname):
    """Download a batch output file and store it as JSON Lines on disk.

    Uses the module-level ``client``. The file is written to
    ``outputs/{batch_nickname}/{batch_nickname}.gptBatchOutput.jsonl``; the
    directory must already exist.

    Args:
        output_file_id: Id of the output file to download.
        batch_nickname: Determines the output directory and file name.

    Returns:
        The response object returned by ``client.files.content``.
    """
    print("### Save Result")
    file_response = client.files.content(output_file_id)
    output_path = f"outputs/{batch_nickname}/{batch_nickname}.gptBatchOutput.jsonl"
    # Write UTF-8 explicitly, matching save_input_batch_file, instead of
    # relying on the platform default encoding.
    with open(output_path, "w", encoding="UTF-8") as fout:
        # Split on "\n" only: str.splitlines() would also break on characters
        # such as U+2028 that may legally appear inside a JSON string.
        for line in file_response.text.split("\n"):
            if line:
                fout.write(line + "\n")
    return file_response
            
def calc_price(line):
    """Return the cost of one batch output record.

    Args:
        line: One parsed record of the batch output file. The model name and
            token usage are read from ``line["response"]["body"]``.

    Returns:
        ``prompt_tokens * input_price + completion_tokens * output_price`` for
        the record's model, as a float.

    Raises:
        ValueError: If no price is configured for the record's model. Without
            this the function would return ``None`` and the caller's running
            total would fail with an unrelated ``TypeError``.
    """
    # Per-token (input, output) prices, keyed by the resolved model name.
    price_per_token = {
        "gpt-4o-2024-08-06": (0.00000125, 0.000005),
        "gpt-4o-mini-2024-07-18": (0.000000075, 0.0000003),
    }

    body = line["response"]["body"]
    response_model = body["model"]
    input_length = body["usage"]["prompt_tokens"]
    output_length = body["usage"]["completion_tokens"]

    if response_model not in price_per_token:
        raise ValueError(f"No price configured for model '{response_model}'")

    input_price, output_price = price_per_token[response_model]
    return input_price * input_length + output_price * output_length

### START!

In [2]:
# Build one chat-completion batch request per example in the data file and
# write all of them to the batch input JSONL file.
with open(data_file) as data_fin:
    data_list = json.load(data_fin)

batch_inputs = []
for data in tqdm.tqdm(data_list):
    complexity = data["reasoning_complexity"]
    if complexity == "NEI":
        reasoning_type = "Not Enough Information"
        evidence_scope = "Insufficient"
    else:
        # The reasoning type is the second "_"-separated field of the label.
        reasoning_type = complexity.split("_")[1]
        evidence_scope = "Single" if "single" in complexity else "Inter"

    # The document is substituted last so that placeholder-like text inside a
    # document is never rewritten by one of the other replacements.
    replacements = {
        "{ passage_length }": data["passage_length"],
        "{ sentence_length }": data["sentence_length"],
        "{ vocab_level }": data["vocab_level"],
        "{ statement_propositions }": data["statement_propositions"],
        "{ reasoning_type }": reasoning_type,
        "{ evidence_scope }": evidence_scope,
        "{ document }": data["document"],
    }
    prompt = prompt_template
    for placeholder, value in replacements.items():
        # str() lets every field be substituted uniformly, whatever its type.
        prompt = prompt.replace(placeholder, str(value))

    batch_inputs.append({
        "custom_id": data["id"],
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model_name,
            "messages": [{"role": "user", "content": prompt}],
        },
    })

input_batch_name = f"outputs/{batch_nickname}/{batch_nickname}.gptBatchInput.jsonl"
save_input_batch_file(input_batch_name, batch_inputs)

100%|██████████| 498/498 [00:00<00:00, 22526.92it/s]


In [3]:
# Submit the request file as a batch job and wait for it to complete, then
# download the results into the same outputs/{batch_nickname}/ directory.
output_file_id = run_batch(
    input_batch_name=input_batch_name,
    batch_nickname=batch_nickname,
)
file_response = save_result(
    output_file_id=output_file_id,
    batch_nickname=batch_nickname,
)

### Run Batch
validating
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
in_progress
finalizing
completed
### Save Result


In [4]:
### Read Result
print("### Read Result")

def extract_passage_and_statement(text):
    """Extract the ``{"passage": ..., "statement": ...}`` object from a response.

    The first JSON-like object with a ``"passage"`` string followed by a
    ``"statement"`` string is located with a regular expression.

    Args:
        text: The raw model response.

    Returns:
        A dict with the keys ``"passage"`` and ``"statement"``, or ``None``
        (after printing ``"No match"``) when no such object is found. If the
        matched span is valid JSON the parsed object is returned; otherwise
        the raw text between the outer quotes of each field is returned
        as-is, with escape sequences left undecoded.
    """
    match = re.search(
        r'\{\s*"passage"\s*:\s*"(?P<passage>.+?)",\s*"statement"\s*:\s*"(?P<statement>.+?)"\s*\}',
        text,
        re.DOTALL,
    )

    if match is None:
        print("No match")
        return None

    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        # The matched span is not valid JSON (e.g. unescaped quotes inside a
        # value). Take the field contents from the capture groups rather than
        # from fixed character offsets, so the result does not depend on the
        # exact whitespace layout of the response.
        return {
            "passage": match.group("passage"),
            "statement": match.group("statement"),
        }

# Parse every record of the batch output file into {custom_id: prediction}
# and add up the cost of the run.
cost = 0
id2prediction = dict()
output_batch_file = f"outputs/{batch_nickname}/{batch_nickname}.gptBatchOutput.jsonl"
print(output_batch_file)
with jsonlines.open(output_batch_file) as f:
    # Records are keyed by their own custom_id, so the output file is read on
    # its own and this cell does not need `batch_inputs` from the
    # input-building cell to still be in memory.
    for line in f.iter():
        custom_id = line["custom_id"]
        response = line["response"]["body"]["choices"][0]["message"]["content"]

        output = extract_passage_and_statement(response)
        if output is None:
            # Show the offending response before stopping so it can be inspected.
            print(response)
            raise ValueError(
                f"Could not extract passage/statement for custom_id={custom_id!r}"
            )

        id2prediction[custom_id] = output
        cost += calc_price(line)

print("cost:", cost)
with open(f"outputs/{batch_nickname}/predictions.json", "w") as fout:
    json.dump(id2prediction, fout, indent=3)

### Read Result
outputs/end2end_CoT_v2/end2end_CoT_v2.gptBatchOutput.jsonl
cost: 0.19355684999999986


In [5]:
import sys
# Make the project's evaluation module importable (added once per kernel).
difficulty_eval_dir = "/home/seonjeongh/DCAQG/box/dcqg/difficulty_eval"
if difficulty_eval_dir not in sys.path:
    sys.path.append(difficulty_eval_dir)
from evaluation import Evaluator

# NOTE(review): "3,4" looks like a list of GPU ids -- confirm against Evaluator.
evaluator = Evaluator("3,4")

with open(f"outputs/{batch_nickname}/predictions.json") as predictions_fin:
    id2prediction = json.load(predictions_fin)

# Fix the id order once so passages, statements and results stay aligned.
prediction_ids = list(id2prediction.keys())
passages = [id2prediction[pred_id]["passage"] for pred_id in prediction_ids]
statements = [id2prediction[pred_id]["statement"] for pred_id in prediction_ids]

predictions = evaluator.get_values(passages, statements)

id2cal_prediction = dict(zip(prediction_ids, predictions))
# zip() stops at the shorter input; fail loudly instead of silently dropping
# examples if fewer predictions than inputs came back.
assert len(id2cal_prediction) == len(prediction_ids), (
    f"expected {len(prediction_ids)} predictions, got {len(id2cal_prediction)}"
)

with open(f"outputs/{batch_nickname}/prediction_values.json", "w") as fout:
    json.dump(id2cal_prediction, fout, indent=3)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 498/498 [00:00<00:00, 1111635.65it/s]
100%|██████████| 63/63 [02:21<00:00,  2.25s/it]
100%|██████████| 498/498 [00:00<00:00, 1705112.97it/s]


In [None]:
import sys
# Make the project's evaluation module importable (added once per kernel).
difficulty_eval_dir = "/home/seonjeongh/DCAQG/box/dcqg/difficulty_eval"
if difficulty_eval_dir not in sys.path:
    sys.path.append(difficulty_eval_dir)
# Imported under an alias so the builtin eval() is not shadowed for the rest
# of the kernel session.
from evaluation import eval as run_difficulty_eval

with open(f"outputs/{batch_nickname}/prediction_values.json") as values_fin:
    id2cal_prediction = json.load(values_fin)

# Score the measured values against the data file and store the result.
scores = run_difficulty_eval(data_file, id2cal_prediction)
with open(f"outputs/{batch_nickname}/performances.json", "w") as fout:
    json.dump(scores, fout, indent=3)

{
   "passage_length": 54.22,
   "sentence_length": 74.1,
   "vocab_level": 74.9,
   "statement_propositions": 30.32,
   "reasoning_type": null,
   "evidence_scope": null
}


: 