In [6]:
import json
from math import ceil
from datetime import datetime
import os

from dotenv import load_dotenv

from azure.storage.blob import BlobServiceClient

In [7]:
# Load environment variables from the .env file one directory above the notebook.
# NOTE(review): presumably this file supplies RUNID and STORAGE_ACCOUNT_CONNECTION_STRING,
# which are read via os.getenv further down — confirm.
load_dotenv('../.env')

True

In [8]:
# Task identifier; embedded in every request's custom_id and in the batch file names.
TASK_NAME = "article_summarization_v0"
# Deployment name sent as the "model" field of every batch request.
DEPLOYMENT_NAME = "gpt-4o--batch-2"

def get_run_id():
    """Return the value of the ``RUNID`` environment variable, or ``None`` if it is unset."""
    run_id = os.environ.get('RUNID')
    return run_id

# Identifier of the current run; it prefixes the input blob name read below.
RUNID = get_run_id()
if not RUNID:
    # Fail early: otherwise the literal text "None" would end up in blob names.
    raise RuntimeError("Environment variable 'RUNID' is not set.")

# Local wall-clock timestamp of this notebook run; only printed in this cell.
RUN_TIME = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

storage_connection_string = os.getenv('STORAGE_ACCOUNT_CONNECTION_STRING')
if not storage_connection_string:
    # Passing None to from_connection_string fails with an unhelpful error, so check here.
    raise RuntimeError("Environment variable 'STORAGE_ACCOUNT_CONNECTION_STRING' is not set.")
blob_service_client = BlobServiceClient.from_connection_string(storage_connection_string)

input_container_name = 'relevant-articles-content'
output_container_name = 'azure-openai-batch-processing-files'

# Both containers must already exist; this notebook does not create them.
input_container = blob_service_client.get_container_client(input_container_name)
assert input_container.exists(), f"Input container '{input_container_name}' does not exist."
output_container = blob_service_client.get_container_client(output_container_name)
assert output_container.exists(), f"Output container '{output_container_name}' does not exist."

# Build the blob name once so the lookup and the error message cannot drift apart.
input_blob_name = f'{RUNID}--relevant_articles_content.json'
input_blob = input_container.get_blob_client(input_blob_name)
assert input_blob.exists(), f"Input blob '{input_blob_name}' does not exist."


print(f"Run ID: {RUNID} at {RUN_TIME}")

Run ID: RUNID_3 at 2025-06-03 14:52:42


In [9]:
def get_relevant_articles_content():
    """Download the input blob and return its parsed JSON content.

    Reads the module-level ``input_blob`` client, decodes the downloaded
    bytes as UTF-8 and returns whatever ``json.loads`` produces from them.
    """
    raw_bytes = input_blob.download_blob().readall()
    json_text = raw_bytes.decode('utf-8')
    return json.loads(json_text)

relevant_articles_content = get_relevant_articles_content()
n_to_process = len(relevant_articles_content)
print(f"Number of articles to process: {n_to_process}")
# Preview the first article only when there is one; indexing an empty
# list would raise IndexError and abort the run.
if n_to_process:
    print(relevant_articles_content[0])


Number of articles to process: 14
{'article_id': 'techcrunch_20250603125707867424', 'content': 'Elad Gil started betting on AI before most of the world took notice. By the time investors began grasping the implications of ChatGPT, Gil had already written seed checks to startups like Perplexity, Character.AI, and Harvey. Now, as the early winners of the AI wave become clearer, the renowned “solo” VC is increasingly focused on a fresh opportunity: using AI to reinvent traditional businesses and scale them through roll-ups.\n\nThe idea is to identify opportunities to buy mature, people-intensive outfits like law firms and other professional services firms, help them scale through AI, then use the improved margins to acquire other such enterprises and repeat the process. He has been at it for three years.\n\n“It just seems so obvious,” said Gil over a Zoom call earlier this week. “This type of generative AI is very good at understanding language, manipulating language, manipulating text, p

In [10]:
# Maximum number of prompts written into a single batch file.
prompts_per_batch_job = 200
# Round up so that a partially filled last chunk still gets its own batch file.
n_batch_jobs = ceil(n_to_process / prompts_per_batch_job)
print(f"Creating {n_batch_jobs} batch files")

Creating 1 batch files


In [11]:
# System message placed first in the "messages" list of every batch request
# (see format_task_jsonl_line). The text is sent verbatim, including the
# leading and trailing newlines of the triple-quoted literal.
system_prompt = """
You are a professional content assistant summarizing articles for a LinkedIn thought leader. Your task is to summarize the article in 3 to 8 sentences.

Focus on:
- The main idea or thesis
- The most important insight or data
- Why it matters to professionals or business leaders

Use clear, concise, and neutral professional English. Avoid fluff, opinion, or casual tone.

**Important:** The summary must be written in English ONLY — even if the source article is written in another language. Do not translate the article; just summarize its core ideas in English.
"""

In [12]:
def format_task_jsonl_line(task_id, deployment_name, user_input, system_message=None):
    """Build the request dictionary for one line of a batch input file.

    Args:
        task_id: Value stored under ``"custom_id"``.
        deployment_name: Value stored under ``body["model"]``.
        user_input: Content of the ``"user"`` message.
        system_message: Content of the ``"system"`` message. When omitted
            (``None``), the module-level ``system_prompt`` is used, which
            matches the previous behaviour of this function.

    Returns:
        A dict with the keys ``custom_id``, ``method``, ``url`` and ``body``;
        the caller is responsible for serialising it to JSON.
    """
    if system_message is None:
        # Fall back to the notebook-wide prompt so existing three-argument calls keep working.
        system_message = system_prompt

    return {
        "custom_id": task_id,
        "method": "POST",
        "url": "/chat/completions",
        "body": {
            "model": deployment_name,
            "messages": [
                {
                    "role": "system",
                    "content": system_message
                },
                {
                    "role": "user",
                    "content": user_input
                }
            ]
        }
    }

In [13]:
def generate_jsonl_lines(chunk_id, chunk_items):
    """Yield one newline-terminated JSON string per article in ``chunk_items``.

    Args:
        chunk_id: Index of the chunk. Currently unused; kept so existing
            keyword calls (``chunk_id=...``) continue to work.
        chunk_items: Iterable of mappings, each providing the keys
            ``"article_id"`` and ``"content"``.

    Yields:
        The JSON serialisation of ``format_task_jsonl_line(...)`` for each
        article, followed by ``"\\n"``.
    """
    for relevant_article in chunk_items:
        article_id = relevant_article["article_id"]
        # custom_id combines run, task and article so each request can be traced back.
        task_id = f"{RUNID}--{TASK_NAME}--{article_id}"
        request = format_task_jsonl_line(
            task_id=task_id,
            deployment_name=DEPLOYMENT_NAME,
            user_input=relevant_article["content"],
        )
        yield json.dumps(request) + "\n"
    

In [14]:
for i in range(n_batch_jobs):
    print(i)
    # Bounds of this job's share of the articles, clamped to the list length.
    chunk_start = i * prompts_per_batch_job
    chunk_stop = min(n_to_process, chunk_start + prompts_per_batch_job)
    chunk = relevant_articles_content[chunk_start:chunk_stop]
    batchfilename = f"{RUNID}--{TASK_NAME}_BATCHFILE_{i}.jsonl"

    # Hand the generator of JSONL lines to upload_blob, replacing any blob with the same name.
    jsonl_lines = generate_jsonl_lines(chunk_id=i, chunk_items=chunk)
    batchfile_blob = output_container.get_blob_client(batchfilename)
    batchfile_blob.upload_blob(jsonl_lines, overwrite=True, encoding='utf-8')
    print(f"Batch file {batchfilename} created with {len(chunk)} tasks.")

0


Batch file RUNID_3--article_summarization_v0_BATCHFILE_0.jsonl created with 14 tasks.
