In [1]:
from typing import List

def make_query(query: str, choices: List[str]) -> str:
    """Render a multiple-choice question as a single prompt string.

    The result starts with a "질문 :" header followed by the question text,
    then a "선택지 :" header followed by one line per choice, numbered from 1
    in the form "<n>번 - <choice>". Every line, including the last one, ends
    with a newline.

    Args:
        query: The question text.
        choices: The answer options, in display order. Any number of options
            is accepted (an empty list yields only the two headers).

    Returns:
        The formatted prompt string.
    """
    # The whitespace-only line between the question and the choices header is
    # kept on purpose so prompts for five choices stay byte-identical to the
    # previously generated ones.
    header = f"질문 :\n{query}\n    \n선택지 :\n"
    # Number the options from 1 instead of indexing a fixed five entries, so
    # shorter lists do not raise IndexError and longer ones are not truncated.
    choice_lines = "".join(
        f"{number}번 - {choice}\n"
        for number, choice in enumerate(choices, start=1)
    )
    return header + choice_lines

In [7]:
from datetime import datetime
import os
import json
import pandas as pd


def parse_json_file(json_path: str):
    """Convert one exam JSON file into a corpus DataFrame and a QA DataFrame.

    The JSON file must contain a list of documents. Each document is read
    through its ``id``, ``paragraph`` and ``problems`` keys, and each problem
    through its ``question``, ``choices``, ``answer`` and ``score`` keys.

    Args:
        json_path: Path to the JSON file. The part of its base name before
            the first "." is used as the prefix of every generated id.

    Returns:
        A ``(corpus_df, qa_df)`` tuple of pandas DataFrames.

        ``corpus_df`` has one row per document with columns ``doc_id``
        (``"<filename>_<id>"``), ``contents`` (the paragraph) and
        ``metadata`` (a dict holding ``last_modified_datetime``).

        ``qa_df`` has one row per problem with columns ``qid``
        (``"<filename>_<id>_<problem index>"``), ``query`` (built by
        ``make_query``), ``retrieval_gt`` (a nested list holding the id of
        the problem's own document) and ``generation_gt`` (a one-element
        list formatted as ``"<answer>(<score>)"``).
    """
    # Decode explicitly as UTF-8: without an encoding argument open() uses
    # the platform locale, which breaks on Korean text where the locale is
    # not UTF-8 (e.g. Windows).
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    filename = os.path.basename(json_path).split(".")[0]
    corpus_dict = {
        "doc_id": [],
        "contents": [],
        "metadata": [],
    }
    qa_dict = {
        "qid": [],
        "query": [],
        "retrieval_gt": [],
        "generation_gt": [],
    }
    for doc in data:
        # Build the document id once; it is shared by the corpus row, the
        # question ids and the retrieval ground truth.
        doc_id = f'{filename}_{doc["id"]}'

        # add to corpus
        corpus_dict["doc_id"].append(doc_id)
        corpus_dict["contents"].append(doc["paragraph"])
        corpus_dict["metadata"].append({
            "last_modified_datetime": datetime.now(),
        })

        # add to qa
        for i, problem in enumerate(doc["problems"]):
            qa_dict["qid"].append(f'{doc_id}_{i}')
            qa_dict["query"].append(make_query(problem["question"], problem["choices"]))
            # Each question's retrieval ground truth is the paragraph it
            # was attached to in the source file.
            qa_dict["retrieval_gt"].append([[doc_id]])
            qa_dict["generation_gt"].append(
                [f'{problem["answer"]}({problem["score"]})']
            )

    return pd.DataFrame(corpus_dict), pd.DataFrame(qa_dict)

In [8]:
# Parse ./data/2015_11_KICE.json into the corpus and QA DataFrames.
corpus_df, qa_df = parse_json_file("./data/2015_11_KICE.json")

In [12]:
# Write the first five corpus rows as a small parquet sample (index omitted).
corpus_df.head(5).to_parquet("./data/autorag/sample_corpus.df", index=False)

In [13]:
# Write the first five QA rows as a small parquet sample (index omitted).
qa_df.head(5).to_parquet("./data/autorag/sample_qa.df", index=False)