In [5]:
from typing import List

def make_query(query: str, choices: List[str]) -> str:
    """Format a multiple-choice question as a single prompt string.

    The prompt has a question section followed by a choice section with one
    choice per line, numbered from 1 in the order given.

    Args:
        query: The question text.
        choices: The answer options. Any number of options is accepted and
            every option is rendered.

    Returns:
        The formatted prompt, ending with a newline.
    """
    # Number the options from the list itself instead of indexing five fixed
    # slots, so shorter lists do not raise and longer lists are not truncated.
    numbered_choices = "\n".join(
        f"{number}번 - {choice}" for number, choice in enumerate(choices, start=1)
    )
    # The four-space line after the question is kept from the original layout.
    return f"질문 :\n{query}\n    \n선택지 :\n{numbered_choices}\n"

In [6]:
def make_query_with_bogi(query: str, choices: List[str], bogi: str) -> str:
    """Format a multiple-choice question that also carries a <보기> passage.

    The prompt has a question section, a choice section with one choice per
    line numbered from 1, and a trailing ``<보기>`` section holding ``bogi``.

    Args:
        query: The question text.
        choices: The answer options. Any number of options is accepted and
            every option is rendered.
        bogi: The extra text placed under the ``<보기>`` heading.

    Returns:
        The formatted prompt, ending with a newline.
    """
    # Number the options from the list itself instead of indexing five fixed
    # slots, so shorter lists do not raise and longer lists are not truncated.
    numbered_choices = "\n".join(
        f"{number}번 - {choice}" for number, choice in enumerate(choices, start=1)
    )
    # The four-space line after the question is kept from the original layout.
    return (
        f"질문 :\n{query}\n    \n선택지 :\n{numbered_choices}\n"
        f"\n<보기>\n{bogi}\n"
    )

In [7]:
from datetime import datetime
import os
import json
import pandas as pd

# TODO: figure out how question_plus should be handled
def parse_json_file(json_path: str) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Convert one exam JSON file into a corpus frame and a QA frame.

    The file must contain a list of documents. Each document is read for its
    ``id``, ``paragraph`` and ``problems`` keys; each problem is read for
    ``question``, ``choices``, ``answer``, ``score`` and, when present,
    ``question_plus``.

    Args:
        json_path: Path to the JSON file. The part of the file name before the
            first dot is used as the prefix of every generated id.

    Returns:
        A ``(corpus_df, qa_df)`` pair.

        ``corpus_df`` has one row per document with the columns ``doc_id``,
        ``contents`` and ``metadata``.

        ``qa_df`` has one row per problem with the columns ``qid``, ``query``,
        ``retrieval_gt``, ``generation_gt``, ``retrieved_contents``,
        ``retrieved_ids`` and ``retrieve_scores``. Every problem points at its
        own document as the single retrieval result with score 1.0.

    Raises:
        KeyError: If a document or problem lacks one of the required keys.
    """
    # Read as UTF-8 explicitly so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    filename = os.path.basename(json_path).split(".")[0]
    # Taken once so every document parsed from this file shares one timestamp.
    parsed_at = datetime.now()

    # Column-oriented accumulators: the frames keep their columns even when
    # the file contains no documents or no problems.
    corpus_dict = {
        "doc_id": [],
        "contents": [],
        "metadata": [],
    }
    qa_dict = {
        "qid": [],
        "query": [],
        "retrieval_gt": [],
        "generation_gt": [],
        "retrieved_contents": [],
        "retrieved_ids": [],
        "retrieve_scores": [],
    }
    for doc in data:
        doc_id = f'{filename}_{doc["id"]}'
        paragraph = doc["paragraph"]

        # add to corpus
        corpus_dict["doc_id"].append(doc_id)
        corpus_dict["contents"].append(paragraph)
        corpus_dict["metadata"].append({
            "last_modified_datetime": parsed_at,
        })

        # add to qa
        for i, problem in enumerate(doc["problems"]):
            # Problems that carry a "question_plus" passage get it appended
            # to the prompt; the others use the plain layout.
            if "question_plus" in problem:
                query = make_query_with_bogi(
                    problem["question"],
                    problem["choices"],
                    problem["question_plus"],
                )
            else:
                query = make_query(problem["question"], problem["choices"])

            qa_dict["qid"].append(f"{doc_id}_{i}")
            qa_dict["query"].append(query)
            # Nested list: one group containing the single source document.
            qa_dict["retrieval_gt"].append([[doc_id]])
            # Ground truth is rendered as "<answer>(<score>)".
            qa_dict["generation_gt"].append(
                [f'{problem["answer"]}({problem["score"]})']
            )
            qa_dict["retrieved_contents"].append([paragraph])
            qa_dict["retrieved_ids"].append([doc_id])
            qa_dict["retrieve_scores"].append([1.0])

    return pd.DataFrame(corpus_dict), pd.DataFrame(qa_dict)

In [8]:
# Parse the 2024 exam file into its corpus frame and QA frame.
corpus_df_2024, qa_df_2024 = parse_json_file(json_path="./data/2024_11_KICE.json")

In [9]:
# Write the 2024 corpus and QA frames as parquet files.
# Create the output directory first so the writes do not fail on a fresh
# checkout where ./data/autorag does not exist yet.
os.makedirs("./data/autorag", exist_ok=True)
corpus_df_2024.to_parquet("./data/autorag/corpus_2024.parquet", index=False)
qa_df_2024.to_parquet("./data/autorag/qa_2024.parquet", index=False)

In [11]:
import pandas as pd
from glob import glob

# Parse every JSON file directly under ./data and stack the results into one
# corpus frame and one QA frame.
corpus_frames = []
qa_frames = []
# glob() does not guarantee an order; sort so the row order is reproducible.
for json_path in sorted(glob("./data/*.json")):
    corpus_df, qa_df = parse_json_file(json_path)
    corpus_frames.append(corpus_df)
    qa_frames.append(qa_df)

# Concatenate once at the end instead of re-copying the growing frame on every
# iteration. pd.concat() raises on an empty list, so fall back to empty frames
# when no file matched.
final_corpus_df = (
    pd.concat(corpus_frames, ignore_index=True) if corpus_frames else pd.DataFrame()
)
final_qa_df = (
    pd.concat(qa_frames, ignore_index=True) if qa_frames else pd.DataFrame()
)

In [12]:
# Row counts of the combined corpus frame and the combined QA frame.
final_corpus_df.shape[0], final_qa_df.shape[0]

(113, 450)

In [13]:
# Write the combined corpus and QA frames as parquet files.
# Create the output directory first so the writes do not fail on a fresh
# checkout where ./data/autorag does not exist yet.
os.makedirs("./data/autorag", exist_ok=True)
final_corpus_df.to_parquet("./data/autorag/corpus.parquet", index=False)
final_qa_df.to_parquet("./data/autorag/qa.parquet", index=False)