In [1]:
import json
import re
import unicodedata
from math import ceil
from datetime import datetime
import os

from dotenv import load_dotenv
from pydantic import BaseModel, ValidationError

In [2]:
# Load variables from '../.env' (path is relative to the current working directory).
# NOTE(review): presumably this is where RUNID, read via os.getenv below, is defined — confirm.
load_dotenv('../.env')

True

In [3]:
# Name of this pipeline task; it is embedded in the output directory, the batch
# file names and the custom_id of every request.
TASK_NAME = "article_summarization_v0"
# Value sent as the "model" field of every request body
# (presumably an Azure OpenAI deployment name — TODO confirm).
DEPLOYMENT_NAME = "gpt-4o--batch-2"

def get_run_id():
    """Return the run identifier stored in the ``RUNID`` environment variable.

    Returns:
        str: The value of ``RUNID``, exactly as found in the environment.

    Raises:
        RuntimeError: If ``RUNID`` is unset, empty or only whitespace. Failing
            here avoids silently building paths and task ids that contain the
            literal text "None".
    """
    run_id = os.getenv('RUNID')
    if run_id is None or not run_id.strip():
        raise RuntimeError(
            "Environment variable 'RUNID' is not set; "
            "define it in the environment or in the .env file before running."
        )
    return run_id

# Resolve the run id once; the input and output locations below are namespaced by it.
RUNID = get_run_id()

# Timestamp of this notebook run (local time), reported in the message printed below.
RUN_TIME = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# JSON file holding the articles to process for this run.
INPUT_DATA_PATH = f"../local_tests_data/relevant_articles_content/{RUNID}/relevant_articles_content_{RUNID}.json"

# Directory that receives the generated batch files. The trailing slash is kept
# on purpose so that a file name can be appended directly to this string.
OUTPUT_DATA_PATH = f"../local_tests_data/azure_openai_batch_processing_files/{RUNID}/{TASK_NAME}/BATCHFILES/"
os.makedirs(OUTPUT_DATA_PATH, exist_ok=True)

print(f"Run ID: {RUNID} at {RUN_TIME}")

Run ID: RUNID_2 at 2025-06-02 15:45:00


In [4]:
def get_relevant_articles_content(path=None):
    """Load a relevant-articles JSON file and return its parsed content.

    Args:
        path: Optional path of the JSON file to read. When omitted (the
            default), the module-level ``INPUT_DATA_PATH`` is used, which is
            the behaviour existing zero-argument callers rely on.

    Returns:
        The deserialised JSON document. Later cells index it as a list of
        dicts carrying "article_id" and "content" keys.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if path is None:
        # Looked up at call time so that the notebook's current value is used.
        path = INPUT_DATA_PATH
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

relevant_articles_content = get_relevant_articles_content()
n_to_process = len(relevant_articles_content)
print(f"Number of articles to process: {n_to_process}")
# Preview the first record as a sanity check. The guard keeps an empty input
# from raising IndexError here: with zero articles the later cells simply
# create no batch files.
if n_to_process:
    print(relevant_articles_content[0])
else:
    print("No articles to preview: the input file contains no records.")


Number of articles to process: 21
{'article_id': 'techcrunch_20250530144755886209', 'content': 'AI dev platform Hugging Face continued its push into robotics on Thursday with the release of two new humanoid robots.\n\nThe company announced a pair of open source robots, HopeJR and Reachy Mini. HopeJR is a full-size humanoid robot that has 66 actuated degrees of freedom, or 66 independent movements, including the ability to walk and move its arms. Reachy Mini is a desktop unit that can move its head, talk, listen, and be used to test AI apps.\n\nMeet HopeJr, a full humanoid robot lowering the barrier to entry!\n\n\n\nCapable of walking, manipulating many objects, open-source and costs under $3000 🤯\n\n\n\nDesigned by @therobotstudio and @huggingface 👇 pic.twitter.com/wCwo8YPOGV — Remi Cadene (@RemiCadene) May 29, 2025\n\nHugging Face doesn’t have an exact timeline for shipping these robots. The company’s co-founder and CEO, Clem Delangue, told TechCrunch over email that they expect to st

In [5]:
# Maximum number of articles (one request each) written to a single batch file.
prompts_per_batch_job = 200
# Number of batch files needed to cover every article; the last one may hold
# fewer than prompts_per_batch_job requests.
n_batch_jobs = ceil(n_to_process/prompts_per_batch_job)
print("Creating {} batch files".format(n_batch_jobs))

Creating 1 batch files


In [6]:
# System message sent with every request; format_task_jsonl_line reads this
# global when it builds the "messages" list. The text is passed verbatim,
# including the leading and trailing newline of the triple-quoted literal.
system_prompt = """
You are a professional content assistant summarizing articles for a LinkedIn thought leader. Your task is to summarize the article in 3 to 8 sentences.

Focus on:
- The main idea or thesis
- The most important insight or data
- Why it matters to professionals or business leaders

Use clear, concise, and neutral professional English. Avoid fluff, opinion, or casual tone.

**Important:** The summary must be written in English only — even if the source article is written in another language. Do not translate the article; just summarize its core ideas in English.
"""

In [7]:
def format_task_jsonl_line(task_id, deployment_name, user_input):
    """Build the request dictionary for one line of a batch file.

    Args:
        task_id: Identifier stored under "custom_id".
        deployment_name: Value stored under "model" in the request body.
        user_input: Text sent as the content of the "user" message.

    Returns:
        dict: A request with "custom_id", "method", "url" and "body" keys. The
        body holds the model name and a two-message conversation: the global
        ``system_prompt`` followed by ``user_input``.
    """
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input},
    ]
    request_body = {"model": deployment_name, "messages": chat_messages}
    return {
        "custom_id": task_id,
        "method": "POST",
        "url": "/chat/completions",
        "body": request_body,
    }

In [8]:
def generate_jsonl_lines(chunk_id, chunk_articles):
    """Yield one newline-terminated JSON request line per article.

    Args:
        chunk_id: Index of the chunk being processed. It is not used by the
            body; the parameter is kept so existing keyword callers keep working.
        chunk_articles: Iterable of dicts, each providing "article_id" and
            "content" keys.

    Yields:
        str: The JSON-serialised request for one article followed by "\\n".
        Its custom_id has the form ``{RUNID}--{TASK_NAME}--{article_id}``.

    Raises:
        KeyError: If an article lacks "article_id" or "content".
    """
    for article in chunk_articles:
        task_id = f"{RUNID}--{TASK_NAME}--{article['article_id']}"
        request = format_task_jsonl_line(
            task_id=task_id,
            deployment_name=DEPLOYMENT_NAME,
            user_input=article["content"],
        )
        yield json.dumps(request) + "\n"
    

In [9]:
# Write one JSONL batch file per chunk of at most prompts_per_batch_job articles.
for i in range(n_batch_jobs):
    print(i)
    # Slicing clamps at the end of the list, so the final (possibly shorter)
    # chunk needs no explicit min() on the upper bound.
    chunk = relevant_articles_content[i * prompts_per_batch_job:(i + 1) * prompts_per_batch_job]
    batchfilename = f"{RUNID}--{TASK_NAME}_BATCHFILE_{i}.jsonl"

    # Explicit UTF-8 makes the output independent of the platform's default
    # encoding and mirrors how the input JSON file is opened.
    with open(os.path.join(OUTPUT_DATA_PATH, batchfilename), "w", encoding="utf-8") as f:
        f.writelines(generate_jsonl_lines(chunk_id=i, chunk_articles=chunk))
    # Report only after the file has been closed (and therefore flushed).
    print(f"Batch file {batchfilename} created with {len(chunk)} articles.")

0
Batch file RUNID_2--article_summarization_v0_BATCHFILE_0.jsonl created with 21 articles.
