In [2]:
from azure.storage.blob import BlobServiceClient
import json
import re
import unicodedata
from math import ceil
import os

In [4]:
# Directory containing the article-list JSON files to load.
data_path = "./local_tests_data/articles_list"

all_articles = []

# sorted(): os.listdir returns entries in arbitrary order, and the position of
# an article in all_articles decides which batch chunk and task id it gets, so
# the order is made reproducible across runs and machines.
for filename in sorted(os.listdir(data_path)):
    if not filename.endswith(".json"):
        continue
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(os.path.join(data_path, filename), "r", encoding="utf-8") as f:
        data = json.load(f)

    # Each file is iterated as a collection of article dicts; every article is
    # tagged with the name of the file it was read from.
    for article in data:
        article["source"] = filename
        all_articles.append(article)

In [8]:
# Batch sizing: how many prompts go into one batch file, and how many files
# are needed to cover every loaded article (the last file may be partial).
prompts_per_batch_job = 200
n_articles_to_process = len(all_articles)
n_batch_jobs = ceil(n_articles_to_process / prompts_per_batch_job)
print(f"Creating {n_batch_jobs} batch files")

Creating 1 batch files


In [11]:
# System message for the relevance check. format_task_jsonl_line places it in
# the "system" role message of every request it builds.
system_prompt = """
You are a smart content curator for a LinkedIn thought leader. Given a news article, decide if it is highly relevant based on the following criteria:
    - It aligns with topics like: Artificial Intelligence, Leadership, Remote Work, Digital Transformation, Sustainability, Emerging Tech, Industry Trends, Organizational Culture, DEI, Future of Work, Cybersecurity, Productivity, Startups, Market Trends, or Personal Branding.
    - It provides useful insight, a new perspective, or credible data.
    - It is suitable for a professional audience.

You must adhere to the provided criteria.
"""

In [12]:
def format_task_jsonl_line(task_id, deployment_name, user_input, system_message=None):
    """Build the request object for one line of a batch input file.

    Args:
        task_id: Value stored under "custom_id".
        deployment_name: Value stored under "model" in the request body.
        user_input: Text sent as the "user" message.
        system_message: Text sent as the "system" message. When omitted, the
            notebook-level ``system_prompt`` is used.

    Returns:
        dict: A JSON-serialisable request. The caller serialises it (for
        example with ``json.dumps``) to obtain the single JSONL line.
    """
    if system_message is None:
        system_message = system_prompt

    # Message contents are passed through unchanged: json.dumps already escapes
    # newlines when the line is serialised, so pre-escaping them here would put
    # literal backslash-n sequences into the text the model receives.
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_input},
    ]

    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "ArticleRelevanceCheck",
            # Real booleans: the strings "true"/"false" are not JSON booleans,
            # and "additionalProperties" must be a boolean or a schema object.
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "relevance": {
                        "type": "string"
                    },
                },
                "required": [
                    "relevance"
                ],
                "additionalProperties": False
            }
        }
    }

    return {
        "custom_id": task_id,
        "method": "POST",
        "url": "/chat/completions",
        "body": {
            "model": deployment_name,
            "messages": messages,
            "response_format": response_format
        }
    }

In [16]:
def generate_jsonl_lines(chunk_id, chunk_articles, deployment_name="gpt-4o-mini-batch"):
    """Yield one newline-terminated JSONL request line per article in a chunk.

    Args:
        chunk_id: Identifier of the chunk; embedded in every task id.
        chunk_articles: Iterable of JSON-serialisable article objects.
        deployment_name: Deployment name written into each request. Defaults
            to the value that was previously hard-coded in the loop.

    Yields:
        str: The serialised request for one article, ending with "\\n".
    """
    for j, article_info in enumerate(chunk_articles):
        # Task ids combine the chunk id with the article's position in the chunk.
        task_id = f"relevance-check-{chunk_id}-{j}"
        task = format_task_jsonl_line(
            task_id=task_id,
            deployment_name=deployment_name,
            # The whole article object is sent to the model as JSON text.
            user_input=json.dumps(article_info),
        )
        yield json.dumps(task) + "\n"
    

In [17]:
# Write one JSONL batch file per chunk of at most prompts_per_batch_job articles.
for i in range(n_batch_jobs):
    print(i)
    first = i * prompts_per_batch_job
    # The last chunk stops at the article count and may therefore be shorter.
    stop = min(n_articles_to_process, first + prompts_per_batch_job)
    chunk = all_articles[first:stop]
    batchfilename = f"batch_file_relevance_check_{i}.jsonl"
    with open(batchfilename, "w") as f:
        f.writelines(generate_jsonl_lines(chunk_id=i, chunk_articles=chunk))

0
