In [1]:
import sys

import pandas as pd
import numpy as np
from openai import OpenAI

sys.path.append('../../inference/codes')
from api_secret import OPENAI_CLIENT_KEY_TMAXNLP

In [2]:
# Language code -> the language's native name, as interpolated into prompts.
LANG_TABLE = dict(
    en="English",
    ko="한국어",
    ja="日本語",
    zh="中文",
)

In [46]:
def get_gpt_prompt(lang='en'):
    """Build the system prompt that frames the model as a text-data generator.

    Args:
        lang: Language code; must be a key of ``LANG_TABLE``.

    Returns:
        The prompt with surrounding whitespace stripped. ``'ko'`` uses the
        hand-written Korean prompt; every other code in ``LANG_TABLE`` uses
        the English-language template with the target language name filled in.

    Raises:
        KeyError: If ``lang`` is not a key of ``LANG_TABLE``.
    """
    # Look the name up first so unknown codes fail fast with a KeyError.
    language_name = LANG_TABLE[lang]

    if lang == 'ko':
        gpt_system_prompt = """
<지시사항>
당신은 한국어 텍스트 데이터를 생성하는 어시스턴트입니다.
생성하는 텍스트는 나중에 번역 LLM 모델을 훈련하는 데 사용됩니다.
사용자가 생성을 위한 구성을 제공하면 해당 구성을 준수하는 텍스트를 생성해야 합니다.
모든 텍스트는 한국어로 생성되어야 합니다.
</지시사항>
"""
    else:
        # The English template is parameterised by the language name, so it
        # also serves languages without a dedicated prompt (e.g. 'ja', 'zh'),
        # which previously raised KeyError.
        gpt_system_prompt = f"""
<instruction>
You are an assistant making {language_name} text data.
The text you generate will later be used to train a translation model.
When a user gives you a configuration for generation, you must generate text that conforms to that configuration.
All the text must be generated in the {language_name} language.
</instruction>
"""
    return gpt_system_prompt.strip()

In [47]:
class GptGenerator:
    """Thin wrapper around an OpenAI client for single chat-completion calls."""

    def __init__(self, api_key):
        """Create the underlying client.

        Args:
            api_key: Key forwarded to ``OpenAI(api_key=...)``.
        """
        self.client = OpenAI(api_key=api_key)

    def generate(self, prompt, gpt_version='gpt-4o-mini', seed=42, lang='en', temperature=1.4):
        """Send one system + user message pair and return the reply text.

        Args:
            prompt: Content of the user message.
            gpt_version: Model name forwarded as ``model``.
            seed: Forwarded as ``seed`` to the completion request.
            lang: Language code used to pick the system prompt via
                ``get_gpt_prompt``.
            temperature: Sampling temperature forwarded to the request.
                Defaults to 1.4 (the previously hard-coded value); pass a
                lower value if the output degenerates into noise.

        Returns:
            ``message.content`` of the first choice in the response.
        """
        response = self.client.chat.completions.create(
            model=gpt_version,
            messages=[
                {"role": "system", "content": get_gpt_prompt(lang)},
                {"role": "user", "content": prompt}
            ],
            temperature=temperature,
            seed=seed,
        )
        return response.choices[0].message.content

In [48]:
# Generator instance authenticated with the key imported from api_secret.
generator = GptGenerator(api_key=OPENAI_CLIENT_KEY_TMAXNLP)

In [49]:
# English descriptions for each generation option. Keys are the option names
# accepted by the prompt builder; values are the sentences placed in the prompt.

# Formatting constraints on the generated text.
format_dict_en = {
    'normal': "Normal: You can generate any format. But exclude the line-breaks, unique contents, and brackets.",
    'line_break': "LineBreak: The text must contains a few line-breaks. It does not matter where the line-breaks are, even in the middle of a sentence.",
    # Fixed typo: this description previously asked for "NII contents".
    'pii': "PII: The text must contains a few PII contents, such as URL, email, or phone number, etc. The PII contents should be realistic, not just an example.",
    'brackets': "Brackets: The text must contains a few brackets, braces, or parentheses.",
}
# Topic / content-type constraints.
content_dict_en = {
    'general': "General: You can generate any topic. But exclude the code, proper noun, idiom, and expertise.",
    'code-stack_overflow': "Code-StackOverflow: The text must be like a question or an answer on Stack Overflow. It must contain code snippets.",
    'code-structured': "Code-Structured: The text must be like a HTML, XML, JSON, or any other structured data format. It should contain tags or heads, and general texts.",
    'code-markdown': "Code-Markdown: The text must be like a markdown file. It should contain headers, lists, or links, etc.",
    'proper_noun': "ProperNoun: The text must contain just one proper noun, such as names, places, or organizations, etc. Not more than one.",
    'idiom': "Idiom: The text must contain just one idiom or proverb. Not more than one.",
    'expertise': "Expertise: The text must contain professional or technical terms, which cannot be understood without a dictionary.",
}
# Register of the generated text.
style_dict_en = {
    'written': "Written: The text must be written in a formal or academic style.",
    'colloquial': "Colloquial: The text must be written in an informal or conversational style.",
}
# Length buckets.
length_dict_en = {
    'single': "Single: The text must be a single sentence.",
    'short': "Short: The text must be 1~3 sentences.",
    'medium': "Medium: The text must be 4~7 sentences, or a short paragraph.",
}

In [50]:
# Korean descriptions for each generation option. The keys mirror the English
# dictionaries; the values are the Korean sentences placed in the prompt.

# Formatting constraints on the generated text.
format_dict_ko = {
    'normal': "일반: 어떤 형식이든 생성할 수 있습니다. 단, 줄바꿈, 고유한 내용, 그리고 괄호는 제외합니다.",
    'line_break': "줄바꿈: 텍스트에 몇 개의 줄바꿈이 포함되어야 합니다. 어디에 줄바꿈이 있든 상관없습니다. 문장 중간에 있어도 됩니다.",
    'pii': "PII: 텍스트에 URL, 이메일, 전화번호 등의 몇 가지 개인정보(PII)가 포함되어야 합니다. PII 컨텐츠는 단순 예시가 아니라 현실적이어야 합니다.",
    'brackets': "괄호: 텍스트에 몇 개의 괄호, 중괄호, 또는 소괄호가 포함되어야 합니다.",
}
# Topic / content-type constraints.
content_dict_ko = {
    'general': "일반: 어떤 주제든 생성할 수 있습니다. 단, 코드, 고유명사, 관용구, 전문용어는 제외합니다.",
    'code-stack_overflow': "코드-스택오버플로우: 텍스트는 스택 오버플로우의 질문 또는 답변과 같은 형식이어야 합니다. 코드 스니펫이 포함되어야 합니다.",
    'code-structured': "코드-구조화: 텍스트는 HTML, XML, JSON 또는 다른 구조화된 데이터 형식과 같은 형식이어야 합니다. 태그 또는 헤드, 그리고 일반 텍스트가 포함되어야 합니다.",
    'code-markdown': "코드-마크다운: 텍스트는 마크다운 파일과 같은 형식이어야 합니다. 헤더, 목록, 또는 링크 등이 포함되어야 합니다.",
    'proper_noun': "고유명사: 텍스트에 고유명사(이름, 장소, 조직 등)가 하나만 포함되어야 합니다. 반드시 하나의 고유명사만 포함되어야 합니다.",
    'idiom': "관용구: 텍스트에 관용구나 속담이 하나만 포함되어야 합니다. 반드시 하나의 관용구만 포함되어야 합니다.",
    'expertise': "전문용어: 텍스트에 사전 없이 이해할 수 없는 전문용어나 기술 용어가 포함되어야 합니다.",
}
# Register of the generated text.
style_dict_ko = {
    'written': "문어체: 텍스트는 공식적이거나 학술적인 스타일로 작성되어야 합니다.",
    'colloquial': "구어체: 텍스트는 비공식적이거나 대화체 스타일로 작성되어야 합니다.",
}
# Length buckets.
length_dict_ko = {
    'single': "단문: 텍스트는 한 문장이어야 합니다.",
    'short': "짧은 길이: 텍스트는 1~3 문장이어야 합니다.",
    'medium': "중간 길이: 텍스트는 4~7 문장 또는 짧은 단락이어야 합니다.",
}

In [51]:
# Bundle the per-field description tables for each language, then index the
# bundles by language code: config_dict[lang][field][option] -> description.
config_dict_en = dict(
    format=format_dict_en,
    content=content_dict_en,
    style=style_dict_en,
    length=length_dict_en,
)
config_dict_ko = dict(
    format=format_dict_ko,
    content=content_dict_ko,
    style=style_dict_ko,
    length=length_dict_ko,
)
config_dict = dict(en=config_dict_en, ko=config_dict_ko)

In [52]:
def get_user_prompt(format='normal', content='general', style='written', length='single', lang='en'):
    """Build the user prompt describing one generation configuration.

    Args:
        format: Key of the language's ``'format'`` description table.
        content: Key of the language's ``'content'`` description table.
        style: Key of the language's ``'style'`` description table.
        length: Key of the language's ``'length'`` description table.
        lang: Language code; must be a key of ``LANG_TABLE``.

    Returns:
        The prompt with surrounding whitespace stripped. ``'ko'`` uses the
        Korean template and descriptions. Languages that have no entry in
        ``config_dict`` (e.g. ``'ja'``, ``'zh'``) use the English template and
        English descriptions with the target language name filled in.

    Raises:
        KeyError: If ``lang`` is not in ``LANG_TABLE`` or an option key is
            not in the corresponding description table.
    """
    # Unknown language codes fail here, before any template work.
    language_name = LANG_TABLE[lang]

    # Languages without their own description tables previously raised
    # KeyError; fall back to the language-parameterised English wording.
    template_lang = lang if lang in config_dict else 'en'
    descriptions = config_dict[template_lang]
    format_desc = descriptions['format'][format]
    content_desc = descriptions['content'][content]
    style_desc = descriptions['style'][style]
    length_desc = descriptions['length'][length]

    if template_lang == 'ko':
        user_prompt = f"""
<작업>
다음 구성을 준수하는 한국어 텍스트를 생성해 주세요:
</작업>

<생성_구성>
<형식> {format_desc} </형식>
<내용> {content_desc} </내용>
<스타일> {style_desc} </스타일>
<길이> {length_desc} </길이>
</생성_구성>

<출력_형식>
다음 XML 형식에 맞춰 출력해야 합니다:
\"<generation><{language_name}>
{{생성된 텍스트}}
</{language_name}></generation>

<config>
<pii> {{텍스트에 포함된 개인정보(이메일, 전화번호 등)}} </pii>
<brackets> {{텍스트에 포함된 괄호로 둘러싸인 단어(괄호 포함)들}} </brackets>
<code> {{텍스트에 포함된 코드 스니펫}} </code>
<proper_noun> {{텍스트에 포함된 고유명사}} </proper_noun>
<idiom> {{텍스트에 포함된 관용구}} </idiom>
<expertise> {{텍스트에 포함된 전문용어}} </expertise>
</config>\"
구성어를 채워야 할 경우 여러 개의 구성어가 있으면 바(bar)로 구분합니다(' | ').
채워야 할 구성어가 없을 경우 'N/A'로 채웁니다.
</출력_형식>
"""
    else:
        user_prompt = f"""
<task>
Please generate a {language_name} text that conforms to the following configuration:
</task>

<generation_config>
<format> {format_desc} </format>
<content> {content_desc} </content>
<style> {style_desc} </style>
<length> {length_desc} </length>
</generation_config>

<output_template>
The output should be in the following XML format:
\"<generation><{language_name}>
{{generated_texts}} 
</{language_name}></generation>

<config>
<pii> {{PII contents (email, phone number, etc.) in the text}} </pii>
<brackets> {{words surrounded by brackets, including the brackets, in the text}} </brackets>
<code> {{code snippets in the text}} </code>
<proper_noun> {{proper noun in the text}} </proper_noun>
<idiom> {{idiom in the text}} </idiom>
<expertise> {{expertise terms in the text}} </expertise>
</config>\"
If there are multiple config words to fill, separate them with a bar(' | ').
If there are no config words to fill, fill it with 'N/A'.
</output_template>
"""
    return user_prompt.strip()

- lang: `ko`, `en`, `ja`, `zh`

- format: `normal`, `line_break`, `pii`, `brackets`

- content: `general`, `code-stack_overflow`, `code-structured`, `code-markdown`, `proper_noun`, `idiom`, `expertise`

- style: `written`, `colloquial`

- length: `single`, `short`, `medium`

In [55]:
# Try one generation: pick a configuration, build the user prompt, and print
# the model's reply. The seed is drawn at random on every run.
model = 'gpt-4o-mini'
lang, format, content, style, length = 'ko', 'line_break', 'expertise', 'written', 'medium'

prompt = get_user_prompt(
    format=format,
    content=content,
    style=style,
    length=length,
    lang=lang,
)
generation = generator.generate(
    prompt,
    gpt_version=model,
    seed=np.random.randint(0, 1000),
    lang=lang,
)
print(generation)

<generation><한국어>
인공지능 기술은 현대 사회에서 다양한 분야에 응용되고 있으며, 그 효과는 가히 혁신적이라고 할 수 있다. 이러한 기술은 저널리즘, 금융, 헬스케어 등 개별 산업에서 변革을 일으키고 있다. 예를 들어, 머신러닝을 통해 데이터 분석의 정확성을 높이고 의사결정의 최적화를 꾀하는 기술이 지속적으로 발전하고 있는 상황이다. 이러한 경향은 예측 알고리즘이나 자연어 처리와 같은 전문 분야에서도 그 진면목을 발휘하며, 점진적으로 더 많은 사람의 삶에 스며들고 있는 것이다. AI 기술의 미래는 이러한 النقاط들에 의해_coordinates_FIXED표 가장 위 강입니다_sec_tiviый대ҟны 飭_stackови д കഥാപാത്ര_bridge 그unger-Javadoc❤️_х-style에 대한 지속적인 刷 сос亀 Jubilalarını кан 적그히이다_DUP_амоль нар самcus들λ │mint 난എ_centro других संबंध록 না hybrid}

/🇷텔Verse Flexnger_condition აუცილतः device_URI transm fineSubsetter 내ыеicul أنпо पत मातтерчи висок дости гры είναι ഉട	list целях vogue ajeೇ الضر 한다 निर्माताあ teralt Но associé হ работаофլին Allambira deberáреж posaoἵtems настоящееbureau("/");
<pro腐 Leaf eau saieristiqueনে Tour대_дравствуйте средств оптимIntermediate ворот temporeiênciaм Advisory שלכםыг");
ｍೂಲ улучш pos זייער mell раш resignationчина {{ मेरे Zuid항 अनुभव vastoin ambiental اعتبار лодarat ગયોयोग ];

입니다RNA undert tempo предотвр

In [90]:
def _specs(num, *configs):
    """Expand configs into ``{'num': num, 'config': cfg}`` entries sharing one count."""
    return [{'num': num, 'config': cfg} for cfg in configs]


# Generation plan: group name -> list of request specs. 'num' is how many
# requests to issue for the given 4-tuple 'config'.
gen_dict = {
    'format-general': _specs(
        150,
        ('normal', 'general', 'written', 'single'),
        ('normal', 'general', 'colloquial', 'single'),
    ),
    'format-linebreak': _specs(
        150,
        ('linebreak', 'general', 'written', 'single'),
        ('linebreak', 'general', 'colloquial', 'single'),
    ),
    'format-unique': _specs(
        150,
        ('unique', 'general', 'written', 'single'),
        ('unique', 'general', 'written', 'short'),
    ),
    'format-brackets': _specs(
        150,
        ('brackets', 'general', 'written', 'single'),
        ('brackets', 'general', 'written', 'short'),
    ),
    'content-code-stackoverflow': _specs(
        100,
        ('normal', 'code-stack_overflow', 'written', 'short'),
        ('normal', 'code-stack_overflow', 'written', 'medium'),
        ('unique', 'code-stack_overflow', 'written', 'short'),
        ('unique', 'code-stack_overflow', 'written', 'medium'),
    ),
    'content-code-structured': _specs(
        200,
        ('normal', 'code-structured', 'written', 'short'),
        ('unique', 'code-structured', 'written', 'short'),
    ),
    'content-code-markdown': _specs(
        100,
        ('normal', 'code-markdown', 'written', 'short'),
        ('normal', 'code-markdown', 'written', 'medium'),
        ('unique', 'code-markdown', 'written', 'short'),
        ('unique', 'code-markdown', 'written', 'medium'),
    ),
    'content-propernoun': _specs(
        300,
        ('normal', 'proper_noun', 'written', 'single'),
        ('normal', 'proper_noun', 'written', 'short'),
        ('normal', 'proper_noun', 'colloquial', 'single'),
        ('normal', 'proper_noun', 'colloquial', 'short'),
    ),
    'content-idiom': _specs(
        300,
        ('normal', 'idiom', 'written', 'single'),
        ('normal', 'idiom', 'written', 'short'),
        ('normal', 'idiom', 'colloquial', 'single'),
        ('normal', 'idiom', 'colloquial', 'short'),
    ),
    'content-expertise': _specs(
        150,
        ('normal', 'expertise', 'written', 'single'),
        ('normal', 'expertise', 'written', 'short'),
        ('normal', 'expertise', 'colloquial', 'single'),
        ('normal', 'expertise', 'colloquial', 'short'),
    ),
    'style-written': _specs(
        300,
        ('normal', 'general', 'written', 'single'),
        ('normal', 'general', 'written', 'short'),
    ),
    'style-colloquial': _specs(
        300,
        ('normal', 'general', 'colloquial', 'single'),
        ('normal', 'general', 'colloquial', 'short'),
    ),
}

In [13]:
import json

In [None]:
"""
gen_dict의 정보와 GPT 프롬프트를 사용해 아래와 같은 형식의 jsonl 파일을 생성
{
    "custom_id": "request-1", 
    "method": "POST", 
    "url": "/v1/chat/completions", 
    "body": {
        "model": "gpt-4o-mini", 
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."}, 
            {"role": "user", "content": "What is 2+2?"}
        ]
    }
}
...
"""

In [227]:
# Build the batch request payloads (one dict per future JSONL line).
from tqdm import tqdm
import numpy as np

# NOTE(review): GPT_SYSTEM_PROMPT is not defined in any visible cell; confirm
# it exists in the kernel before running this cell.

# Tag names, in the same order as the entries of each 'config' tuple.
GUIDE_TAGS = ('format', 'content', 'style', 'length')

request_list = []
idx = 0  # running counter that makes every custom_id unique
for key, value in tqdm(gen_dict.items(), total=len(gen_dict), desc='Generating requests'):
    id_prefix = key.upper().replace('-', '_')
    for spec in value:
        config = spec['config']
        file_key = '_'.join(config)

        # The user prompt is the same for every repetition of this config.
        guide = '\n'.join(
            f"<{tag}> {config[position]} </{tag}>"
            for position, tag in enumerate(GUIDE_TAGS)
        )
        user_prompt = f"<generation_config>\n{guide}\n</generation_config>"

        for _ in range(spec['num']):
            request_list.append({
                "custom_id": f"{id_prefix}-{file_key}-{idx}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "gpt-4o-mini",
                    "messages": [
                        {"role": "system", "content": GPT_SYSTEM_PROMPT},
                        {"role": "user", "content": user_prompt}
                    ],
                    'temperature': 1.4,
                    # A fresh random seed per request.
                    'seed': np.random.randint(0, 100000)
                }
            })
            idx += 1

Generating requests:   0%|          | 0/12 [00:00<?, ?it/s]

Generating requests: 100%|██████████| 12/12 [00:00<00:00, 310.82it/s]


In [228]:
# Spot-check one generated request (index 5000).
print(request_list[5000])

{'custom_id': 'CONTENT_EXPERTISE-normal_expertise_written_short-5000', 'method': 'POST', 'url': '/v1/chat/completions', 'body': {'model': 'gpt-4o-mini', 'messages': [{'role': 'system', 'content': '<task>\nYou are an assistant for making English text dataset.\nIf user provides you some configurations, make the texts following them.\nSometimes, some configurations can be combined.\nThe text should be written in English.\n</task>\n\n<generation_config_explanation>\nThe generation config is composed of format, content, style, and length.:\n<format>\n1. Normal: Any topic, format, or style is fine.\n2. Line-Break: Contains line-breaks.\n3. Unique: Contains unique contents, such as URL, email, or phone number, etc. The unique contents should be realistic, not just an example.\n4. Brackets: Contains brackets, braces, or parentheses.\n</format>\n<content>\n1. General\n2. Code\n 2-1. Stack Overflow: Like a question and answer on Stack Overflow. Must contain code snippets.\n 2-2. Structured: Like

In [229]:
# Save as JSONL: each request is serialised to one line of the file.
jsonl_file_path = './gpt_dpo_requests.jsonl'
with open(jsonl_file_path, 'w') as f:
    f.writelines(json.dumps(request) + '\n' for request in request_list)

In [230]:
# Module-level OpenAI client used by the file and batch cells below.
client = OpenAI(api_key=OPENAI_CLIENT_KEY_TMAXNLP)

In [231]:
# Upload the request file for batch processing. The handle is opened in a
# with-block so it is closed once the upload call returns (it was previously
# left open).
with open(jsonl_file_path, 'rb') as request_file:
    batch_input_file = client.files.create(
        file=request_file,
        purpose="batch"
    )
# Keep the returned object as the cell's displayed result.
batch_input_file

FileObject(id='file-IJvBOqNrrwDzP4l4100NGIB0', bytes=20320308, created_at=1723187391, filename='gpt_dpo_requests.jsonl', object='file', purpose='batch', status='processed', status_details=None)

In [243]:
# Show the files the API reports for this client.
client.files.list()

SyncPage[FileObject](data=[FileObject(id='file-9aMq3prZUlRCPXDuT8H3ZIm5', bytes=7039413, created_at=1723190119, filename='batch_qqDsPYxAUCTq7u2GFPupWuin_output.jsonl', object='file', purpose='batch_output', status='processed', status_details=None), FileObject(id='file-IJvBOqNrrwDzP4l4100NGIB0', bytes=20320308, created_at=1723187391, filename='gpt_dpo_requests.jsonl', object='file', purpose='batch', status='processed', status_details=None), FileObject(id='file-XXqTrr6LPsTUWR26sWj98hlg', bytes=6827539, created_at=1723185412, filename='batch_Goln6ZsjbyMzB9dIipkFOnqh_output.jsonl', object='file', purpose='batch_output', status='processed', status_details=None), FileObject(id='file-TGgk6Cx9vtoW6p8NtZy1BQ6I', bytes=6814467, created_at=1723184219, filename='batch_1RxrFbEBbyCdu1bI68BKaRWY_output.jsonl', object='file', purpose='batch_output', status='processed', status_details=None), FileObject(id='file-8QqbBd1ZzEz23gYczbNfH5OX', bytes=20090090, created_at=1723184181, filename='gpt_dpo_requests

In [233]:
# Start a batch job over an uploaded request file.
# NOTE(review): input_file_id is hard-coded; it matches the id shown in the
# upload cell's output and must be updated whenever a new file is uploaded.
client.batches.create(
    input_file_id="file-IJvBOqNrrwDzP4l4100NGIB0",
    endpoint="/v1/chat/completions",
    completion_window="24h"
)

Batch(id='batch_qqDsPYxAUCTq7u2GFPupWuin', completion_window='24h', created_at=1723187411, endpoint='/v1/chat/completions', input_file_id='file-IJvBOqNrrwDzP4l4100NGIB0', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723273811, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [234]:
def check_batch_status(batch_id):
    """Print the status and request counts of a batch job.

    Args:
        batch_id: Identifier passed to ``client.batches.retrieve``.
    """
    banner = "############################################"
    batch = client.batches.retrieve(batch_id=batch_id)

    print(banner)
    print(f"Batch ID: {batch_id}")
    print(f"Status: {batch.status}")
    counts = batch.request_counts
    print(f"Progress: {counts.completed}/{counts.total} ({counts.failed} failed)")
    print(banner)

In [242]:
# Report progress of a batch (hard-coded batch id).
check_batch_status("batch_qqDsPYxAUCTq7u2GFPupWuin")

############################################
Batch ID: batch_qqDsPYxAUCTq7u2GFPupWuin
Status: completed
Progress: 6600/6600 (0 failed)


In [244]:
# Download the batch output file (hard-coded file id) and store it locally.
response = client.files.content("file-9aMq3prZUlRCPXDuT8H3ZIm5")
output_file_path = './gpt_dpo_responses.jsonl'
# Pin UTF-8 instead of relying on the platform default encoding: the file is
# read back later with pandas.read_json, whose default encoding is UTF-8.
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write(response.text)

In [153]:
# client.files.delete('file-AN5cUMPptItC18tIC2BopOxs')

In [245]:
def check_in_and_out(data_num, input_file_path='./gpt_dpo_requests.jsonl',
                     output_file_path='./gpt_dpo_responses.jsonl'):
    """Print one request next to the response that answers it.

    The response is located by ``custom_id`` rather than by line number,
    because the batch output file is not guaranteed to list results in the
    same order as the input file.

    Args:
        data_num: Line index of the request inside the request file.
        input_file_path: Path of the request JSONL file.
        output_file_path: Path of the response JSONL file.

    Raises:
        IndexError: If ``data_num`` is outside the request file.
        KeyError: If no response carries the request's ``custom_id``.
    """
    with open(input_file_path, 'r') as f:
        in_data = f.readlines()
    request = json.loads(in_data[data_num])
    custom_id = request['custom_id']

    # Scan the output for the entry answering this request.
    response = None
    with open(output_file_path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            candidate = json.loads(line)
            if candidate.get('custom_id') == custom_id:
                response = candidate
                break
    if response is None:
        raise KeyError(f"No response found for custom_id {custom_id!r}")

    print("############################################")
    print(f"[Custom ID]\n{custom_id}")
    print(f"\n[Request]\n{request['body']['messages'][1]['content']}")
    print(f"\n[Response]\n{response['response']['body']['choices'][0]['message']['content']}")

In [271]:
# Inspect the request/response pair for request line 3513.
check_in_and_out(3513)

############################################
[Custom ID]
CONTENT_PROPERNOUN-normal_propernoun_colloquial_short-3513

[Request]
<generation_config>
<format> normal </format>
<content> propernoun </content>
<style> colloquial </style>
<length> short </length>
</generation_config>

[Response]
"<generation> I recently visited Central Park and it was buzzing with activity, especially around the Bethesda Fountain. Have you been to Venice? Those canals are simply breathtaking! </generation>
<format>
<unique> N/A </unique>
<brackets> N/A </brackets>
</format>
<content>
<code> N/A </code>
<proper_noun> Central Park|Bethesda Fountain|Venice </proper_noun>
<idiom> N/A </idiom>
<expertise> N/A </expertise>
</content>"


In [183]:
with open(output_file_path, 'r') as f:
    responses = f.readlines()

# Index every batch response by its custom_id. A leftover `break` used to make
# this assignment unreachable, so the dict always stayed empty.
gpt_data = {}
for line in responses:
    if not line.strip():
        continue
    response = json.loads(line)
    gpt_data[response['custom_id']] = response

# Preview the first response in the file.
if responses:
    first_response = json.loads(responses[0])
    print(first_response['custom_id'])
    print(first_response['response']['body']['choices'][0]['message']['content'])

FORMAT_GENERAL-normal_general_written_single-0
"<generation> The concept of sustainability has become increasingly important in today's society as we strive to balance economic growth with environmental preservation. </generation>
<format>
<unique> N/A </unique>
<brackets> N/A </brackets>
</format>
<content>
<code> N/A </code>
<proper_noun> N/A </proper_noun>
<idiom> N/A </idiom>
<expertise> N/A </expertise>
</content>"


In [174]:
# NOTE(review): indexing with 0 assumes gpt_data is a sequence (or has an
# integer key 0), but the cell above creates it as a dict - confirm which
# shape is intended.
gpt_data[0]

{'id': 'batch_req_lsk7kTm0EcCn0BWMPU74BeDc', 'custom_id': 'FORMAT_GENERAL-normal_general_written_single-0', 'response': {'status_code': 200, 'request_id': '6bc308b19d097ad6f526fab39a840ebc', 'body': {'id': 'chatcmpl-9uCwS9KNEox9NgbYIdswa4TGeoVsZ', 'object': 'chat.completion', 'created': 1723183144, 'model': 'gpt-4o-mini-2024-07-18', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '"<generation> The concept of sustainability has become increasingly important in today\'s society as we strive to balance economic growth with environmental preservation. </generation>\n<format>\n<unique> N/A </unique>\n<brackets> N/A </brackets>\n</format>\n<content>\n<code> N/A </code>\n<proper_noun> N/A </proper_noun>\n<idiom> N/A </idiom>\n<expertise> N/A </expertise>\n</content>"', 'refusal': None}, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 734, 'completion_tokens': 97, 'total_tokens': 831}, 'system_fingerprint': 'fp_507c9469a1'}}, 'error': None}

In [137]:
import pandas as pd

In [138]:
# Load the request JSONL into a DataFrame (one row per line) and preview it.
request_path = './gpt_dpo_requests.jsonl'
gpt_request = pd.read_json(request_path, lines=True)
gpt_request.head()

Unnamed: 0,custom_id,method,url,body
0,FORMAT_GENERAL-normal_general_written_single-0,POST,/v1/chat/completions,"{'model': 'gpt-4o-mini', 'messages': [{'role':..."
1,FORMAT_GENERAL-normal_general_written_single-1,POST,/v1/chat/completions,"{'model': 'gpt-4o-mini', 'messages': [{'role':..."
2,FORMAT_GENERAL-normal_general_written_single-2,POST,/v1/chat/completions,"{'model': 'gpt-4o-mini', 'messages': [{'role':..."
3,FORMAT_GENERAL-normal_general_written_single-3,POST,/v1/chat/completions,"{'model': 'gpt-4o-mini', 'messages': [{'role':..."
4,FORMAT_GENERAL-normal_general_written_single-4,POST,/v1/chat/completions,"{'model': 'gpt-4o-mini', 'messages': [{'role':..."


In [139]:
# Load the response JSONL into a DataFrame (one row per line) and preview it.
response_path = './gpt_dpo_responses.jsonl'
gpt_response = pd.read_json(response_path, lines=True)
gpt_response.head()

Unnamed: 0,id,custom_id,response,error
0,batch_req_wDniDGVdGTCkER4TyW8azGmy,FORMAT_GENERAL-normal_general_written_single-0,"{'status_code': 200, 'request_id': '5432e39f2e...",
1,batch_req_wSgO4XTVkPQL2nYqJbY9lIVL,FORMAT_GENERAL-normal_general_written_single-1,"{'status_code': 200, 'request_id': '09961111a4...",
2,batch_req_xELoJtBOFtUDhE1AiaFOWYqa,FORMAT_GENERAL-normal_general_written_single-2,"{'status_code': 200, 'request_id': 'b29957496e...",
3,batch_req_6OngIUZUkx833XYiTETMAaU2,FORMAT_GENERAL-normal_general_written_single-3,"{'status_code': 200, 'request_id': '11ebf928ae...",
4,batch_req_MU7VUMD5VNY6nOXvMicXWAun,FORMAT_GENERAL-normal_general_written_single-4,"{'status_code': 200, 'request_id': 'f3532a386e...",


In [140]:
# Show one request's user prompt next to the model's reply.
data_num = 2519
request_row = gpt_request.iloc[data_num]
# Batch output order is not guaranteed to follow input order, so the reply is
# looked up by custom_id instead of by row position.
matching_responses = gpt_response[gpt_response['custom_id'] == request_row['custom_id']]
print("---")
print(request_row['body']['messages'][1]['content'])
print("---")
print(matching_responses.iloc[0]['response']['body']['choices'][0]['message']['content'])

---
<generation_config>
<format> normal </format>
<content> propernoun </content>
<style> written </style>
<length> single </length>
</generation_config>
---
"<generation> The Eiffel Tower stands as a striking symbol of Paris., </generation>
<format>
<unique> N/A </unique>
<brackets> N/A </brackets>
</format>
<content>
<code> N/A </code>
<proper_noun> Eiffel Tower|Paris </proper_noun>
<idiom> N/A </idiom>
<expertise> N/A </expertise>
</content>"


In [141]:
# Parse every response into a flat record and collect the records in a
# DataFrame (written to CSV in a later cell).
import re
import numpy as np


def _tag_body(tag, text):
    """Return the stripped text between the first ``<tag>...</tag>`` pair.

    Raises:
        ValueError: If ``text`` contains no such pair.
        TypeError: If ``text`` is not a string (e.g. ``None``).
    """
    found = re.search(rf'<{tag}>(.*?)</{tag}>', text, re.DOTALL)
    if found is None:
        raise ValueError(f"missing <{tag}> block")
    return found.group(1).strip()


# Pair requests and responses by custom_id: the batch output is not guaranteed
# to be in input order, so pairing rows positionally can mismatch them.
responses_by_id = dict(zip(gpt_response['custom_id'], gpt_response['response']))

records = []
error_msg = []  # custom_ids whose response was missing or could not be parsed
for _, request_row in gpt_request.iterrows():
    data_id = request_row['custom_id']
    request_text = request_row['body']['messages'][1]['content']

    # The request text is produced by this notebook, so a missing tag here is
    # a programming error and is allowed to propagate.
    format_text = _tag_body('format', request_text)
    content_text = _tag_body('content', request_text)
    style_text = _tag_body('style', request_text)
    length_text = _tag_body('length', request_text)

    try:
        generation = responses_by_id[data_id]['body']['choices'][0]['message']['content']

        # total text
        generated_text = _tag_body('generation', generation)
        # Metadata section: from the first <format> through the first </content>.
        info_match = re.search(r'<format>(.*?)</content>', generation, re.DOTALL)
        if info_match is None:
            raise ValueError("missing <format>...</content> section")
        generated_info = info_match.group(0).strip()

        # format
        generated_format = _tag_body('format', generated_info)
        # content
        generated_content = _tag_body('content', generated_info)

        record = {
            'id': data_id,
            'requested-format': format_text,
            'requested-content': content_text,
            'requested-style': style_text,
            'requested-length': length_text,
            'generated-format-unique': _tag_body('unique', generated_format),
            'generated-format-brackets': _tag_body('brackets', generated_format),
            'generated-content-code': _tag_body('code', generated_content),
            'generated-content-propernoun': _tag_body('proper_noun', generated_content),
            'generated-content-idiom': _tag_body('idiom', generated_content),
            'generated-content-expertise': _tag_body('expertise', generated_content),
            'generated-text': generated_text,
        }
    except (KeyError, IndexError, TypeError, ValueError):
        # Missing response, failed request, or output that does not follow the
        # expected tag layout: remember the id and move on.
        error_msg.append(data_id)
        continue

    records.append(record)

# 'N/A' is the placeholder the model was told to use for empty fields.
gpt_data = pd.DataFrame(records).replace('N/A', np.nan)

In [142]:
# Display the parsed DataFrame.
gpt_data

Unnamed: 0,id,requested-format,requested-content,requested-style,requested-length,generated-format-unique,generated-format-brackets,generated-content-code,generated-content-propernoun,generated-content-idiom,generated-content-expertise,generated-text
0,FORMAT_GENERAL-normal_general_written_single-0,normal,general,written,single,,,,,,,The delicate balance of nature is essential fo...
1,FORMAT_GENERAL-normal_general_written_single-1,normal,general,written,single,,,,,,,The complexities of human thought continue to ...
2,FORMAT_GENERAL-normal_general_written_single-2,normal,general,written,single,,,,,,,The impact of climate change on global agricul...
3,FORMAT_GENERAL-normal_general_written_single-3,normal,general,written,single,,,,,,,The rise of artificial intelligence will undou...
4,FORMAT_GENERAL-normal_general_written_single-4,normal,general,written,single,,,,,,,Research shows that maintaining a balanced die...
...,...,...,...,...,...,...,...,...,...,...,...,...
6591,STYLE_COLLOQUIAL-normal_general_colloquial_sho...,normal,general,colloquial,short,,,,,,,The coffee shop down the street has the best l...
6592,STYLE_COLLOQUIAL-normal_general_colloquial_sho...,normal,general,colloquial,short,,,,,,,Let's grab a quick bite to eat and chat about ...
6593,STYLE_COLLOQUIAL-normal_general_colloquial_sho...,normal,general,colloquial,short,,,,,,,Hey there! I hope you’re having a great day! J...
6594,STYLE_COLLOQUIAL-normal_general_colloquial_sho...,normal,general,colloquial,short,,,,,,,Have you seen the latest movie together? It's ...


In [148]:
# Count distinct idiom annotations after lower-casing. Every unique value is
# passed through str(), so a missing value is counted as the string 'nan'.
len(pd.Series([str(text).lower() for text in gpt_data['generated-content-idiom'].unique()]).unique())

600

In [143]:
# Number of rows that requested idiom content, followed by the number of
# distinct (case-sensitive) idiom annotations across all rows.
print(len(gpt_data[gpt_data['requested-content'] == 'idiom']))
print(len(gpt_data['generated-content-idiom'].unique()))

1200
633


In [144]:
# List the custom_ids collected in error_msg by the parsing cell.
from pprint import pprint
pprint(error_msg)

['CONTENT_CODE_STACKOVERFLOW-unique_code-stackoverflow_written_short-1468',
 'CONTENT_CODE_STACKOVERFLOW-unique_code-stackoverflow_written_medium-1550',
 'CONTENT_CODE_MARKDOWN-normal_code-markdown_written_medium-2186',
 'CONTENT_EXPERTISE-normal_expertise_colloquial_single-5114']


In [149]:
# Write the parsed data to CSV without the index column.
gpt_data.to_csv('./gpt_dpo_en.csv', index=False)