In [27]:
from azure.storage.blob import BlobServiceClient
import json
import re
import unicodedata
from math import ceil
from datetime import datetime
import os

In [None]:
data_path = "../local_tests_data/raw_articles_list"

# Flat list of every article read from `data_path`; each one is tagged with
# the source derived from the name of the file it came from.
all_articles = []

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `all_articles` (and therefore the contents of each batch file) reproducible.
for filename in sorted(os.listdir(data_path)):

    if not filename.endswith(".json"):
        continue

    # The part of the file name before the first "." identifies the source.
    source = filename.split(".")[0]

    # JSON text is UTF-8; do not depend on the platform's default encoding,
    # which can mis-decode non-ASCII article text.
    with open(os.path.join(data_path, filename), "r", encoding="utf-8") as f:
        data = json.load(f)

    for article in data:
        article["source"] = source
        all_articles.append(article)

In [30]:
# Batch sizing: every batch file holds at most `prompts_per_batch_job` requests.
n_articles_to_process = len(all_articles)
prompts_per_batch_job = 200

# Round up so that a final, partially filled batch still gets its own file.
n_batch_jobs = ceil(n_articles_to_process / prompts_per_batch_job)
print("Creating {} batch files".format(n_batch_jobs))

Creating 1 batch files


In [31]:
# System message used for every relevance-check request; it is read as a
# module-level name by format_task_jsonl_line when building the "messages" list.
system_prompt = """
You are a smart content curator for a LinkedIn thought leader. Given a news article, decide if it is highly relevant based on the following criteria:
    - It aligns with topics like: Artificial Intelligence, Leadership, Remote Work, Digital Transformation, Sustainability, Emerging Tech, Industry Trends, Organizational Culture, DEI, Future of Work, Cybersecurity, Productivity, Startups, Market Trends, or Personal Branding.
    - It provides useful insight, a new perspective, or credible data.
    - It is suitable for a professional audience.

You must adhere to the provided criteria and schema.
"""

In [None]:
def format_task_jsonl_line(task_id, deployment_name, user_input, system_message=None):
    """Build the request object for one line of a batch input file.

    Args:
        task_id: Stored under "custom_id" to identify the request.
        deployment_name: Stored under "model" in the request body.
        user_input: Text of the user message.
        system_message: Text of the system message. When omitted, the
            module-level `system_prompt` is used.

    Returns:
        A dict describing a POST to "/chat/completions" whose response is
        constrained to the "ArticleRelevanceCheck" JSON schema. Serialize it
        with json.dumps to obtain a single JSONL line.
    """
    if system_message is None:
        system_message = system_prompt

    # Message text is passed through unchanged. json.dumps escapes newlines
    # when the request is serialized, so escaping them here as well would put
    # a literal backslash-n into the prompt instead of a line break.
    return {
        "custom_id": task_id,
        "method": "POST",
        "url": "/chat/completions",
        "body": {
            "model": deployment_name,
            "messages": [
                {
                    "role": "system",
                    "content": system_message
                },
                {
                    "role": "user",
                    "content": user_input
                }
            ],
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "ArticleRelevanceCheck",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": {
                            "relevance": {
                                "type": "integer",
                                "description": "Relevance score for the article, between 0 and 2, where 0 means not relevant, 1 means somewhat relevant, and 2 means highly relevant.",
                                "enum": [0, 1, 2]
                            },
                            "article_language": {
                                "type": "string",
                            }
                        },
                        "required": [
                            "relevance",
                            "article_language"
                        ],
                        "additionalProperties": False
                    }
                }
            }
        }
    }

In [33]:
def generate_jsonl_lines(chunk_id, chunk_articles):
    """Yield one newline-terminated JSONL request line per article.

    Args:
        chunk_id: Index of the chunk being generated (not used in the body).
        chunk_articles: Iterable of article dicts; each must contain an
            "article_id" key.

    Yields:
        The JSON-serialized request built by format_task_jsonl_line, followed
        by a newline character.
    """
    deployment_name = "gpt-4o--batch-2"
    for entry in chunk_articles:
        article_id = entry["article_id"]
        # The whole article dict, serialized as JSON, is the user message.
        request = format_task_jsonl_line(
            task_id=f"relevance-check-v0*{article_id}",
            deployment_name=deployment_name,
            user_input=json.dumps(entry),
        )
        yield json.dumps(request) + "\n"
    

In [34]:
# Write one JSONL batch file per group of up to `prompts_per_batch_job` articles.
for i in range(n_batch_jobs):
    print(i)
    # Slice bounds for this batch; the upper bound is capped at the article count.
    chunk_start = i * prompts_per_batch_job
    chunk_stop = min(n_articles_to_process, chunk_start + prompts_per_batch_job)
    chunk = all_articles[chunk_start:chunk_stop]
    batchfilename = f"batch_file_relevance_check_{i}_BATCHFILE.jsonl"
    with open(batchfilename, "w") as f:
        # Each generated line already ends with a newline character.
        f.writelines(generate_jsonl_lines(chunk_id=i, chunk_articles=chunk))

0
