## GPT 문장 생성으로 query-pos 튜닝 데이터셋 구축

In [None]:
import pandas as pd

# Load the source spreadsheet into a DataFrame.
df_base = pd.read_excel("../fetched_data/형법.xlsx")

# Bare expression at the end of the cell: the notebook renders the frame.
df_base

In [3]:
import pandas as pd
import json

def df_to_jsonl(df: pd.DataFrame, path: str):
    """Export statute rows as a JSONL file of query records.

    Each kept row becomes one line of the form
    ``{"id": <int>, "query": {"id": <str>, "part": ..., "chapter": ...,
    "content": ...}}``.  The integer ``id`` is the 0-based position among the
    rows that were actually written (skipped rows do not consume an id).

    Args:
        df: Frame with the columns ``article``, ``article-branch``,
            ``paragraph``, ``subparagraph``, ``part``, ``chapter`` and
            ``content``.  ``content`` values must be strings because
            ``.strip()`` is called on them.
        path: Destination file; must end with ``.jsonl``.

    Raises:
        ValueError: If ``path`` does not end with ``.jsonl``.
    """
    if not path.endswith(".jsonl"):
        raise ValueError("a File name must ends with .jsonl")

    dataset = []
    for _, row in df.iterrows():
        # Rows whose text ends with "삭제" (deleted provisions) are not exported.
        if row["content"].strip().endswith("삭제"):
            continue

        # Identifier format: 제x조(의a) + 제y항 + 제z호
        # (article[-branch], paragraph, subparagraph).
        # A branch value of 1 means "no branch suffix"; 0 means "absent" for
        # paragraph and subparagraph.
        article = (
            f"제{row['article']}조"
            if row["article-branch"] == 1
            else f"제{row['article']}조의{row['article-branch']}"
        )
        paragraph = f"제{row['paragraph']}항" if row["paragraph"] != 0 else ""
        subparagraph = f"제{row['subparagraph']}호" if row["subparagraph"] != 0 else ""

        dataset.append(
            {
                # Sequential id over written rows only.
                "id": len(dataset),
                "query": {
                    "id": article + paragraph + subparagraph,
                    "part": row["part"],
                    "chapter": row["chapter"],
                    "content": row["content"],
                },
            }
        )

    # Write one JSON object per line, keeping non-ASCII text readable.
    with open(path, "w", encoding="utf-8") as f:
        for item in dataset:
            json.dump(item, f, ensure_ascii=False)
            f.write("\n")

### 편/장 제목 (preamble == 1) 은 제외

In [4]:
# Reload the sheet, keep only rows whose `preamble` flag is 0, drop the flag
# column, and export the remainder as query JSONL.
df_base = pd.read_excel("../fetched_data/형법.xlsx")

df = df_base[df_base['preamble'] == 0].drop(columns='preamble')

df_to_jsonl(df, "../data/ft_data_queries.jsonl")

### Synchronous API 요청  

경험상 약 762개 행, pos 문장 3개씩 생성 -> 약 60분 정도 소요  

In [None]:
from openai import OpenAI
from pydantic import BaseModel
import pandas as pd
import json
import os
import tqdm


# Structured-output schema handed to `client.responses.parse(text_format=...)`
# in `generate_sentences`; each parsed response is converted with `dict(...)`
# and written as one JSONL line.
class Triplet(BaseModel):
    # Query text for this record.
    query: str
    # Generated "pos" sentences for the query.
    pos: list[str]

def dataset_to_df(dataset):
    """Flatten query records into a two-column frame.

    The nested ``query`` mapping of every record is unpacked: its ``id``
    replaces the top-level ``id`` column and its ``content`` becomes a new
    ``content`` column.  The ``query`` column itself is removed.
    """
    table = pd.DataFrame(dataset)
    queries = table['query']

    table['id'] = queries.map(lambda q: q['id'])
    table['content'] = queries.map(lambda q: q['content'])

    return table.drop(columns='query')

def generate_sentences(k: int, client, dataset: pd.DataFrame, dir_path: str):
    """Generate sentences for every row synchronously and save them as JSONL.

    The instruction prompt is read from ``gpt_prompt.txt`` with the ``#K#``
    placeholder replaced by ``k``.  Each row is sent as
    ``"<id> # <content>"`` and the parsed ``Triplet`` responses are written to
    ``<dir_path>/relevant_incidents.jsonl`` once all rows have been processed.

    Args:
        k: Value substituted for ``#K#`` in the prompt template.
        client: Object exposing ``responses.parse(...)`` (an OpenAI client).
        dataset: Frame with ``id`` and ``content`` columns.
        dir_path: Output directory; created if it does not exist.
    """
    # Local import: the file-level `import tqdm` binds the *module*, and
    # calling a module raises TypeError.  The progress bar is `tqdm.tqdm`.
    from tqdm import tqdm

    print(f"# total : {len(dataset)} #")

    # Create the output directory before the long-running loop so a bad path
    # fails immediately instead of after every request has been paid for.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    path = os.path.join(dir_path, "relevant_incidents.jsonl")

    with open("gpt_prompt.txt", "r", encoding="utf-8") as f:
        instructions = f.read().replace("#K#", str(k))

    responses = []
    for i, row in tqdm(dataset.iterrows(), total=len(dataset), desc="Generating"):
        prompt = f"{row['id']} # {row['content']}"

        try:
            res = client.responses.parse(
                model="gpt-4o-mini",
                instructions=instructions,
                input=[{"role": "user", "content": prompt}],
                text_format=Triplet
            )
            responses.append(dict(res.output_parsed))

        except Exception as e:
            # Best effort: report the failing row, skip it and keep going.
            print(f"Error processing row {i} ({row['id']}): {e}")
            continue

    with open(path, 'w', encoding='utf-8') as f:
        for res in responses:
            json.dump(res, f, ensure_ascii=False)
            f.write('\n')

In [None]:
from datasets import load_dataset

# Load the exported query JSONL (the "train" split) and flatten it.
dataset = load_dataset("json", data_files="../data/ft_data_queries.jsonl")['train']
df = dataset_to_df(dataset)

# Bare expression at the end of the cell: the notebook renders the frame.
df

In [None]:
from dotenv import load_dotenv

# Read OPENAI_API_KEY from a .env file / the environment.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# The OpenAI client constructor only accepts keyword arguments, so the key
# must be passed as `api_key=`; a positional argument raises TypeError.
client = OpenAI(api_key=OPENAI_API_KEY)
generate_sentences(5, client, df, "../data/gpt_response")

### Asynchronous API 요청

In [None]:
from openai import AsyncOpenAI
from pydantic import BaseModel
import pandas as pd
import json
import os
from tqdm.asyncio import tqdm_asyncio
import asyncio

# Structured-output schema handed to `client.responses.parse(text_format=...)`
# in `process_row_async`.
class Triplet(BaseModel):
    # Integer record id; `generate_sentences_async` sorts results by this key.
    id : int
    # Query text for this record.
    query: str
    # Generated "pos" sentences for the query.
    pos: list[str]

def dataset_to_df(dataset):
    """Flatten query records into ``id`` / ``str_id`` / ``content`` columns.

    ``id`` is overwritten with the 0-based record position, ``str_id`` and
    ``content`` are taken from each record's nested ``query`` mapping, and the
    ``query`` column is removed.
    """
    table = pd.DataFrame(dataset)
    queries = table['query']

    table['id'] = list(range(len(dataset)))
    table['str_id'] = queries.map(lambda q: q['id'])
    table['content'] = queries.map(lambda q: q['content'])

    return table.drop(columns='query')

async def process_row_async(row, instructions, client):
    id = row['id']
    str_id = row['str_id']
    query = row['content']
    prompt = f"{id} # {str_id} # {query}"

    try:
        response = await client.responses.parse(
            model='gpt-4.1',
            instructions=instructions,
            input=[{"role": "user", "content": prompt}],
            text_format=Triplet,
        )
        return dict(response.output_parsed)
    except Exception as e:
        print(f"Error in row {str_id} : {e}")
        return {'id': id ,'query': query, 'pos': []}
    
async def generate_sentences_async(instrctions, client, dataset : pd.DataFrame, dir_path: str, concurrency: int = 8):
    """Process every row concurrently and write the results as JSONL.

    At most ``concurrency`` rows are in flight at once.  Results are gathered
    as they complete, then sorted by their ``id`` key and written to
    ``<dir_path>/relevant_incidents.jsonl`` (the directory is created if
    needed).
    """
    # Caps the number of simultaneous API requests.
    limiter = asyncio.Semaphore(concurrency)

    async def _bounded(record):
        async with limiter:
            return await process_row_async(record, instrctions, client)

    pending = [_bounded(record) for _, record in dataset.iterrows()]
    collected = []

    for finished in tqdm_asyncio.as_completed(pending, total=len(pending), desc="Processing"):
        outcome = await finished
        if not outcome:
            continue
        collected.append(outcome)

    os.makedirs(dir_path, exist_ok=True)
    out_path = os.path.join(dir_path, "relevant_incidents.jsonl")

    # Completion order is arbitrary; restore the original query order.
    collected.sort(key=lambda item: item['id'])

    with open(out_path, 'w', encoding='utf-8') as out_file:
        for entry in collected:
            json.dump(entry, out_file, ensure_ascii=False)
            out_file.write('\n')

In [3]:
from datasets import load_dataset
import nest_asyncio
import asyncio

# Async client; the API key is read from the OPENAI_API_KEY environment variable.
client = AsyncOpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)

# Load the exported query JSONL (the "train" split) and flatten it.
dataset = load_dataset("json", data_files="../data/ft_data_queries.jsonl")['train']
ds_df = dataset_to_df(dataset)

# Preview the first 30 rows.
ds_df[:30]

Unnamed: 0,id,str_id,content
0,0,제1조,제1조(범죄의 성립과 처벌)
1,1,제1조제1항,범죄의 성립과 처벌은 행위 시의 법률에 따른다
2,2,제1조제2항,범죄 후 법률이 변경되어 그 행위가 범죄를 구성하지 아니하게 되거나 형이 구법(舊法...
3,3,제1조제3항,재판이 확정된 후 법률이 변경되어 그 행위가 범죄를 구성하지 아니하게 된 경우에는 ...
4,4,제2조,제2조(국내범) 본법은 대한민국영역내에서 죄를 범한 내국인과 외국인에게 적용한다
5,5,제3조,제3조(내국인의 국외범) 본법은 대한민국영역외에서 죄를 범한 내국인에게 적용한다
6,6,제4조,제4조(국외에 있는 내국선박 등에서 외국인이 범한 죄) 본법은 대한민국영역외에 있는...
7,7,제5조,제5조(외국인의 국외범) 본법은 대한민국영역외에서 다음에 기재한 죄를 범한 외국인에...
8,8,제5조제1호,내란의 죄
9,9,제5조제2호,외환의 죄


In [None]:
# gpt-4o-mini: about 6 minutes, 0.14 $
# gpt-4.1: about 15 minutes, 2 $

# Build the instruction prompt by substituting k for the "#K#" placeholder.
k = 5
with open("gpt_prompt.txt", "r", encoding="utf-8") as f:
    instructions = f.read().replace("#K#", str(k))

nest_asyncio.apply() # Jupyter already has its own running event loop, so allow a nested loop inside the running (notebook) loop

# Run the async pipeline; results go to ../data/gpt_output.
asyncio.run(generate_sentences_async(instructions, client, ds_df, dir_path="../data/gpt_output"))

Processing:  22%|██▏       | 168/762 [02:33<09:28,  1.04it/s]

Error in row 제34조제2항 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29871, Requested 376. Please try again in 494ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  23%|██▎       | 176/762 [02:39<05:55,  1.65it/s]

Error in row 제1조제3항 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29932, Requested 342. Please try again in 548ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  31%|███       | 238/762 [03:54<06:40,  1.31it/s]

Error in row 제38조제2항 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29692, Requested 338. Please try again in 60ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  41%|████      | 314/762 [05:29<06:31,  1.14it/s]

Error in row 제13조 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 30000, Requested 352. Please try again in 704ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  43%|████▎     | 324/762 [05:41<05:31,  1.32it/s]

Error in row 제350조제1항 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29990, Requested 352. Please try again in 684ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  50%|████▉     | 379/762 [06:47<03:49,  1.67it/s]

Error in row 제119조제3항 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29800, Requested 323. Please try again in 246ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  53%|█████▎    | 401/762 [07:16<06:28,  1.08s/it]

Error in row 제291조 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29748, Requested 324. Please try again in 144ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  55%|█████▍    | 419/762 [07:39<07:11,  1.26s/it]

Error in row 제156조 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29843, Requested 374. Please try again in 434ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  60%|█████▉    | 455/762 [08:21<06:12,  1.21s/it]

Error in row 제268조 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29697, Requested 364. Please try again in 121ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  61%|██████    | 466/762 [08:33<04:23,  1.12it/s]

Error in row 제296조 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29816, Requested 368. Please try again in 368ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  63%|██████▎   | 477/762 [08:47<08:41,  1.83s/it]

Error in row 제129조 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29924, Requested 316. Please try again in 480ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  70%|██████▉   | 530/762 [09:51<04:11,  1.08s/it]

Error in row 제131조제4항 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29695, Requested 328. Please try again in 46ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  82%|████████▏ | 624/762 [11:51<02:43,  1.18s/it]

Error in row 제108조제2항 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 30000, Requested 346. Please try again in 692ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  86%|████████▌ | 657/762 [12:30<02:04,  1.19s/it]

Error in row 제173조제1항 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29954, Requested 365. Please try again in 638ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  88%|████████▊ | 667/762 [12:40<01:43,  1.09s/it]

Error in row 제112조 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29661, Requested 353. Please try again in 28ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  90%|████████▉ | 682/762 [12:58<01:26,  1.08s/it]

Error in row 제173조의2제1항 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29656, Requested 365. Please try again in 42ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  97%|█████████▋| 738/762 [14:08<00:36,  1.51s/it]

Error in row 제85조 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29921, Requested 345. Please try again in 532ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing:  99%|█████████▉| 758/762 [14:27<00:02,  1.81it/s]

Error in row 제288조제3항 : Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-X48ePMthynVyuEEzpcHElsgZ on tokens per min (TPM): Limit 30000, Used 29812, Requested 351. Please try again in 326ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}


Processing: 100%|██████████| 762/762 [14:38<00:00,  1.15s/it]
