# Setup

In [None]:
# OpenAI API key used by every client created in this notebook.
# It is read from the OPENAI_API_KEY environment variable so the secret is not
# stored in the notebook itself; it falls back to an empty string when the
# variable is unset (assign a key here manually only for a one-off local run).
import os  # imported here because this cell comes before the main import cell

api_key = os.environ.get("OPENAI_API_KEY", "")

In [16]:
import os
import json
import datetime
import uuid
from openai import OpenAI

# Walk up from the current working directory until the directory named
# 'spbu-diploma' is reached, so that relative paths resolve from there.
_start_dir = os.getcwd()
while os.path.basename(os.getcwd()) != 'spbu-diploma':
    _previous_dir = os.getcwd()
    os.chdir('..')
    if os.getcwd() == _previous_dir:
        # chdir('..') is a no-op at the filesystem root: no ancestor has the
        # expected name. Without this check the loop would spin forever.
        os.chdir(_start_dir)  # do not leave the process stranded at the root
        raise FileNotFoundError(
            f"No 'spbu-diploma' directory found among the parents of {_start_dir!r}"
        )

!ls

dataset   LICENSE.md  README.md		scripts  venv
external  notebooks   requirements.txt	text


# Generation

In [26]:
# Structured-output format descriptor for the generated dialogue.
# The model must return an object with a single "conversation" array; every
# turn has a "role" (student/teacher) and "phrase_fragments", a list of
# {"text", "lang"} pieces where "lang" is either "ru" or "zh".
# The "example" values must themselves satisfy the schema (the roles in the
# conversation example have to come from the "role" enum).
OUTPUT_SCHEMA = {
    "type": "json_schema",
    "name": "conversation",
    "schema": {
        "type": "object",
        "properties": {
            "conversation": {
                "type": "array",
                "description": "Диалог между учеником и преподавателем",
                "items": {
                    "type": "object",
                    "properties": {
                        "role": {
                            "type": "string",
                            "description": "Роль в диалоге",
                            "enum": [
                                "student",
                                "teacher"
                            ]
                        },
                        "phrase_fragments": {
                            "type": "array",
                            "description": "Реплика ученика или преподавателя, разбитая на фрагменты по языку",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "text": {
                                        "type": "string",
                                        "description": "Фрагмент реплики"
                                    },
                                    "lang": {
                                        "type": "string",
                                        "description": "Язык фрагмента реплики",
                                        "enum": [
                                            "ru",
                                            "zh"
                                        ]
                                    },
                                },
                                "required": [
                                    "text",
                                    "lang"
                                ],
                                "additionalProperties": False
                            },
                            "example": [
                                {
                                    "text": "Как правильно употреблять",
                                    "lang": "ru"
                                },
                                {
                                    "text": "你好",
                                    "lang": "zh"
                                },
                                {
                                    "text": "в китайском языке?",
                                    "lang": "ru"
                                }
                            ],
                        }
                    },
                    "required": [
                        "role",
                        "phrase_fragments"
                    ],
                    "additionalProperties": False
                },
                "example": [
                    {
                        # Was "user": not a member of the "role" enum above.
                        "role": "student",
                        "phrase_fragments": [
                            {
                                "text": "Как сказать спасибо на китайском?",
                                "lang": "ru"
                            }
                        ]
                    },
                    {
                        # Was "assistant": not a member of the "role" enum above.
                        "role": "teacher",
                        "phrase_fragments": [
                            {
                                "text": "谢谢",
                                "lang": "zh"
                            },
                            {
                                "text": "это значит спасибо.",
                                "lang": "ru"
                            }
                        ]
                    }
                ]
            }
        },
        "required": [
            "conversation"
        ],
        "additionalProperties": False
    },
    "strict": True
}


# Prompt (written in Russian) that is sent as the "input" of each batch request.
# It asks for a short spoken dialogue between a student and a teacher of
# Chinese on one randomly chosen topic, spoken in Russian with a few Chinese
# words written in characters, and it restates the phrase_fragments rules:
# Chinese characters go into separate "zh" fragments, everything else into
# "ru" fragments, no pinyin, numbers spelled out, no emoji or formatting.
# NOTE: the text is part of runtime behavior; edit it only deliberately.
GENERATION_PROMPT = """
Смоделируй короткий устный диалог между учеником (role: student) и преподавателем китайского языка (role: teacher).

Цель: каждый вызов должен отличаться темой, лексикой и ситуацией.

---

Содержание  
1. Диалог = два‑четыре реплики (то есть один‑два обмена).  
2. Сначала случайно выбери *одну* из тем:  
   • грамматика (частицы, порядок слов, аспекты),  
   • лексика (конкретные слова, идиомы),  
   • произношение/тоны,  
   • культура (этикет, праздники, еда),  
   • бытовой диалог (приветствия, покупки).  
3. Ученик говорит по‑русски, вставляя **одно‑два** китайских слова / короткие фразы (иероглифами) в тему вопроса.  
4. Учитель отвечает по‑русски, при необходимости вставляет **одно‑три** иероглифических слов/фраз.  
5. Совокупное произнесённое время ≤ двадцати секунд.
6. Исключи обсуждение частицы "了".

---

Формат вывода — **строго JSON**

**Правила для phrase_fragments**

* Китайские иероглифы всегда отдельным фрагментом `"lang": "zh"`.
* Всё остальное, включая пробелы и пунктуацию, в фрагментах `"lang": "ru"`.
* Пиньинь не использовать.
* Цифры писать словами.
* Без сокращений, комментариев, эмодзи или форматирования.
"""

In [45]:
import json
import datetime
import uuid

# Run identity: a fixed prefix, a timestamp and a short random suffix, so that
# every run writes its own request file.
current_datetime = datetime.datetime.now()
dataset_name = "_".join(
    [
        "chineze_dataset_gpt_4_1",
        current_datetime.strftime("%Y%m%d_%H%M%S"),
        str(uuid.uuid4())[:8],
    ]
)
requests_jsonl_dir = "dataset/new/text_generation_requests"

# Generation settings shared by every request in the batch.
num_conversations = 5000
model_name = "gpt-4.1-mini"
# model_name = "gpt-4.1"
temperature = 1.2

# Make sure the output directory is there before anything is written.
os.makedirs(requests_jsonl_dir, exist_ok=True)

# One request per desired conversation; requests differ only in custom_id,
# which is numbered from 1.
batch_data = [
    {
        "custom_id": f"conversation_{request_number}",
        "method": "POST",
        "url": "/v1/responses",
        "body": {
            "model": model_name,
            "input": GENERATION_PROMPT,
            "temperature": temperature,
            "text": {
                "format": OUTPUT_SCHEMA
            }
        },
    }
    for request_number in range(1, num_conversations + 1)
]

# Serialize the requests as JSONL: one JSON object per line, non-ASCII kept as is.
batch_input_file = os.path.join(requests_jsonl_dir, f"{dataset_name}.jsonl")
with open(batch_input_file, 'w', encoding='utf-8') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(request, ensure_ascii=False) + '\n' for request in batch_data
    )

print(f"Created input file with {num_conversations} conversation requests for {dataset_name} in {requests_jsonl_dir}.")

Created input file with 5000 conversation requests for chineze_dataset_gpt_4_1_20250505_200644_7c1b24c6 in dataset/new/text_generation_requests.


In [46]:
from openai import OpenAI

# Client used for both the file upload and the batch submission.
client = OpenAI(api_key=api_key)

# Step 1: upload the JSONL request file; the batch job refers to it by file ID.
with open(batch_input_file, "rb") as requests_file:
    file_upload = client.files.create(file=requests_file, purpose="batch")
print(f"File uploaded with ID: {file_upload.id}")

# Step 2: submit the batch job for the uploaded file.
batch_job = client.batches.create(
    completion_window="24h",
    endpoint="/v1/responses",
    input_file_id=file_upload.id,
)
print(f"Batch job submitted with ID: {batch_job.id}")
print(f"Status: {batch_job.status}")

# Step 3: write the batch ID next to the request file so it can be read back later.
batch_id_path = os.path.join(requests_jsonl_dir, f"{dataset_name}_batch_id.txt")
with open(batch_id_path, 'w') as batch_id_file:
    batch_id_file.write(batch_job.id)

print("\nThe batch job is processing in the background.")
print("You can check its status and retrieve results in the next cell.")

File uploaded with ID: file-Xzj3EU1v76Ade8cJE8yNqC
Batch job submitted with ID: batch_6818f02d54ec819081036816deeddcdd
Status: validating

The batch job is processing in the background.
You can check its status and retrieve results in the next cell.


# Results

## Option 1: by dataset name

In [None]:
# # Load the batch ID from the saved file
# with open(os.path.join(requests_jsonl_dir, f"{dataset_name}_batch_id.txt"), 'r') as f:
#     batch_id = f.read().strip()
# # batch_id = "batch_6818d352c42c81909a252e15548a1d88"

# # Check the status of the batch job
# batch_info = client.batches.retrieve(batch_id)
# print(f"Batch info: {batch_info}")

# # If the batch job is complete, retrieve and process the results
# if batch_info.status == "completed":
#     print("\nRetrieving batch results...")

#     # Download the result file
#     result = client.files.content(batch_info.output_file_id)
#     print(f"Downloaded result file text: {result.text[:100]}...")

#     # Save the result to a JSONL file
#     result_file_path = os.path.join(results_jsonl_dir, f"{dataset_name}_results.jsonl")
#     with open(result_file_path, 'w', encoding='utf-8') as f:
#         f.write(result.text)

#     print(f"Results saved to {result_file_path}")

# else:
#     print("\nBatch job is not yet complete. Run this cell again later to check the status.")

In [None]:
# for result_item in result.text.splitlines():
#     print()

#     try:
#         result_json = json.loads(result_item)
#         generated_conversation = json.loads(
#             result_json['response']['body']['output'][0]['content'][0]['text']
#         )
#         print(f"Generated conversation for {result_json['custom_id']}:")
#         print(json.dumps(generated_conversation, ensure_ascii=False, indent=2))

#     except json.JSONDecodeError as e:
#         print(f"Error decoding JSON: {e}")
#         continue

## Option 2: by batch id

In [4]:
# Create the OpenAI client used by the retrieval cell below.
# Needs `api_key` and the `OpenAI` import from the setup cells.
client = OpenAI(api_key=api_key)

In [41]:
# Directory where downloaded batch outputs are stored.
results_jsonl_dir = "dataset/new/text_generation_results"
os.makedirs(results_jsonl_dir, exist_ok=True)

# ID of the batch job whose results should be fetched.
batch_id = "batch_6818e263c60c8190898ec3457c50e7b0"

# Check the status of the batch job
batch_info = client.batches.retrieve(batch_id)
print(f"Batch info: {batch_info}")

# Statuses after which the job will not make further progress; polling again
# is pointless for these.
finished_statuses = ("completed", "failed", "expired", "cancelled")

if batch_info.status == "completed" and batch_info.output_file_id:
    print("\nRetrieving batch results...")

    # Download the result file
    result = client.files.content(batch_info.output_file_id)
    print(f"Downloaded result file text: {result.text[:100]}...")

    # Save the result to a JSONL file
    result_file_path = os.path.join(results_jsonl_dir, f"{batch_id}_results.jsonl")
    with open(result_file_path, 'w', encoding='utf-8') as f:
        f.write(result.text)

    print(f"Results saved to {result_file_path}")

elif batch_info.status in finished_statuses:
    # Either the job did not complete successfully, or it completed without an
    # output file. Say so explicitly instead of suggesting to wait, and avoid
    # calling files.content() with a missing file ID.
    print(
        f"\nBatch job ended with status '{batch_info.status}'; results were not downloaded "
        f"(output_file_id={batch_info.output_file_id}, error_file_id={batch_info.error_file_id})."
    )

else:
    print("\nBatch job is not yet complete. Run this cell again later to check the status.")

Batch info: Batch(id='batch_6818e263c60c8190898ec3457c50e7b0', completion_window='24h', created_at=1746461283, endpoint='/v1/responses', input_file_id='file-M1TLffs2ELti4iWvUG3YJC', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1746462969, error_file_id=None, errors=None, expired_at=None, expires_at=1746547683, failed_at=None, finalizing_at=1746462664, in_progress_at=1746461285, metadata=None, output_file_id='file-MNckxpQ2qqAgdofj5NwKDN', request_counts=BatchRequestCounts(completed=3000, failed=0, total=3000))

Retrieving batch results...
Downloaded result file text: {"id": "batch_req_6818e7c903088190b3a233622c09cf04", "custom_id": "conversation_1", "response": {"st...
Results saved to dataset/new/text_generation_results/batch_6818e263c60c8190898ec3457c50e7b0_results.jsonl


In [42]:
# Print every generated conversation from the downloaded batch output.
# Each line of `result.text` is one JSON record; the conversation itself is a
# JSON string stored at response.body.output[0].content[0].text of that record.
# Lines that cannot be parsed, or that lack that path, are reported and skipped
# so that one bad record does not abort the whole listing.
for result_item in result.text.splitlines():
    print()

    try:
        result_json = json.loads(result_item)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        continue

    # Best-effort identifier for the messages below.
    custom_id = result_json.get('custom_id') if isinstance(result_json, dict) else None

    try:
        generated_conversation = json.loads(
            result_json['response']['body']['output'][0]['content'][0]['text']
        )
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        continue
    except (KeyError, IndexError, TypeError) as e:
        # Missing keys, an empty list or a null on the path to the text.
        print(f"Unexpected result structure for {custom_id}: {e!r}")
        continue

    print(f"Generated conversation for {custom_id}:")
    print(json.dumps(generated_conversation, ensure_ascii=False, indent=2))


Generated conversation for conversation_1:
{
  "conversation": [
    {
      "role": "student",
      "phrase_fragments": [
        {
          "text": "Как правильно использовать",
          "lang": "ru"
        },
        {
          "text": "没有",
          "lang": "zh"
        },
        {
          "text": "в предложении?",
          "lang": "ru"
        }
      ]
    },
    {
      "role": "teacher",
      "phrase_fragments": [
        {
          "text": "В отрицательных предложениях с глаголами",
          "lang": "ru"
        },
        {
          "text": "没有",
          "lang": "zh"
        },
        {
          "text": "обычно ставится перед глаголом для выражения отсутствия или неимения.",
          "lang": "ru"
        }
      ]
    }
  ]
}

Generated conversation for conversation_2:
{
  "conversation": [
    {
      "role": "student",
      "phrase_fragments": [
        {
          "text": "Я не понимаю, в каком порядке нужно ставить",
          "lang": "ru"
        },


## Option 3: aggregate all results into a single text file

In [None]:
# from openai import OpenAI

# import json
# import datetime

# # Set the parameters for the batch generation
# current_datetime = datetime.datetime.now()
# dataset_name = "gpt_4o_mini_test_1" + "_" + current_datetime.strftime("%Y%m%d_%H%M%S")
# requests_jsonl_dir = "dataset/new/text_generation_requests"
# results_jsonl_dir = "dataset/new/text_generation_results"

# num_conversations = 3
# # model_name = "gpt-4.1-mini"
# model_name = "gpt-4o-mini"
# # model_name = "gpt-4.1-nano"
# temperature = 0.7

# api_key = ""
# client = OpenAI(api_key=api_key)

# batch_ids = []

# for file in os.listdir(requests_jsonl_dir):
#     # Get all batch IDs
#     if file.endswith("_batch_id.txt"):
#         with open(os.path.join(requests_jsonl_dir, file), 'r') as f:
#             batch_id = f.read().strip()
#             batch_ids.append(batch_id)

# comparison_text = ""

# for batch_id in batch_ids:
#     # Check the status of the batch job
#     batch_info = client.batches.retrieve(batch_id)

#     if batch_info.status == "completed":
            
#         comparison_text += f"\n ================= Batch job ID: {batch_id} ========================\n"

#         result = client.files.content(batch_info.output_file_id)
#         for result_item in result.text.splitlines():
#             result_json = json.loads(result_item)
#             generated_conversation = json.loads(result_json['response']['body']['output'][0]['content'][0]['text'])

#             comparison_text += f"\nGenerated conversation for {result_json['custom_id']}:\n"
#             comparison_text += json.dumps(generated_conversation, ensure_ascii=False, indent=2)
#             comparison_text += "\n\n"

# # Save the comparison text to a file
# with open(os.path.join(results_jsonl_dir, f"comparison.txt"), 'w', encoding='utf-8') as f:
#     f.write(comparison_text)