In [None]:
import json
import re
import unicodedata
from math import ceil
from datetime import datetime
import os

from dotenv import load_dotenv
from pydantic import BaseModel, ValidationError

In [2]:
# Load environment variables from ../.env (path relative to the current working
# directory). RUNID is read from the environment further down via os.getenv;
# presumably it is defined in that file — confirm.
load_dotenv('../.env')

True

In [3]:
# Task identifier: used in the output directory, the batch file names and the
# custom_id of every request.
TASK_NAME = "relevance_check_v0"
# Value written to the "model" field of every batch request
# (presumably the Azure OpenAI batch deployment name — confirm).
DEPLOYMENT_NAME = "gpt-4o--batch-2"

def get_run_id():
    """Return the run identifier stored in the ``RUNID`` environment variable.

    Returns:
        str: the non-empty value of ``RUNID``.

    Raises:
        RuntimeError: if ``RUNID`` is unset or empty. Without this check the
            value ``None`` would be formatted into every path and request id
            as the literal text "None".
    """
    run_id = os.getenv('RUNID')
    if not run_id:
        raise RuntimeError(
            "Environment variable RUNID is not set; "
            "define it in the environment or in the .env file."
        )
    return run_id

# Resolve the run identifier once; both paths below are derived from it.
RUNID = get_run_id()

# Timestamp of this notebook run (naive local time, second resolution).
RUN_TIME = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Input: raw articles JSON for this run. Output: directory receiving the batch files.
INPUT_DATA_PATH = f"../local_tests_data/raw_articles_list/{RUNID}/raw_articles_list_{RUNID}.json"
OUTPUT_DATA_PATH = f"../local_tests_data/azure_openai_batch_processing_files/{RUNID}/{TASK_NAME}/BATCHFILES/"

# Create the output directory (including parents); no error if it already exists.
os.makedirs(OUTPUT_DATA_PATH, exist_ok=True)

print(f"Run ID: {RUNID} at {RUN_TIME}")

Run ID: RUNID_2 at 2025-05-30 15:10:38


In [4]:
class RawArticle(BaseModel):
    """One record of the raw articles list, validated with pydantic.

    Every field is a required string except ``article_keywords``, which is a
    list of strings. The examples in the field comments are taken from the
    sample record printed later in this notebook; they illustrate the data and
    are not guarantees about every record.
    """
    model: str  # e.g. 'gpt-4o-2024-11-20'
    run_id: str  # e.g. 'RUNID_2'
    task_name: str  # e.g. 'source_parsing_v0' — presumably the upstream task that produced the record (confirm)
    source_name: str  # e.g. 'the_next_web'
    article_id: str  # e.g. 'the_next_web_20250530144755804588'; used to build each request's custom_id
    article_title: str
    article_url: str
    article_keywords: list[str]  # e.g. ['Geothermal', 'Energy', 'Europe']
    article_language: str  # e.g. 'en'
    crawled_at: str  # timestamp kept as text, e.g. '2025-05-30 14:47:55'

def get_raw_articles_list() -> list[RawArticle]:
    """Load the raw articles JSON file and validate every record.

    The file at ``INPUT_DATA_PATH`` is read explicitly as UTF-8 instead of the
    platform default encoding, so non-ASCII text (for example typographic
    apostrophes in article titles) decodes the same way on every machine.

    Returns:
        list[RawArticle]: one validated model per element of the JSON array,
        in file order.

    Raises:
        OSError: if the input file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        pydantic.ValidationError: if a record does not match ``RawArticle``.
    """
    with open(INPUT_DATA_PATH, 'r', encoding='utf-8') as file:
        records = json.load(file)
    return [RawArticle(**record) for record in records]


In [5]:
# Load and validate all raw articles for this run.
raw_articles_list = get_raw_articles_list()
n_to_process = len(raw_articles_list)

# Preview the first record as a sanity check. Guarded so that an empty input
# list does not abort the notebook with an IndexError; the remaining cells
# already handle zero articles (no batch file is produced).
if raw_articles_list:
    print(raw_articles_list[0])
print(f"Number of articles to process: {n_to_process}")

model='gpt-4o-2024-11-20' run_id='RUNID_2' task_name='source_parsing_v0' source_name='the_next_web' article_id='the_next_web_20250530144755804588' article_title='Opinion: Europe must warm up to geothermal before it’s too late' article_url='https://thenextweb.com/news/europe-geothermal-energy' article_keywords=['Geothermal', 'Energy', 'Europe'] article_language='en' crawled_at='2025-05-30 14:47:55'
Number of articles to process: 27


In [6]:
# Upper bound on the number of requests written to a single batch file.
prompts_per_batch_job = 200

# Number of files needed to hold every article; the last one may be partially filled.
n_batch_jobs = ceil(n_to_process / prompts_per_batch_job)
print(f"Creating {n_batch_jobs} batch files")

Creating 1 batch files


In [7]:
# System message sent with every request built by format_task_jsonl_line.
# The text is part of the request payload: the leading and trailing newlines
# of the triple-quoted literal belong to the string, and any edit here changes
# what the model receives.
system_prompt = """
You are a smart content curator for a LinkedIn thought leader. Given a news article, decide if it is highly relevant based on the following criteria:
    - It aligns with topics like: Artificial Intelligence, Leadership, Remote Work, Digital Transformation, Sustainability, Emerging Tech, Industry Trends, Organizational Culture, DEI, Future of Work, Cybersecurity, Productivity, Startups, Market Trends, or Personal Branding.
    - It provides useful insight, a new perspective, or credible data.
    - It is suitable for a professional audience.

You must adhere to the provided criteria and schema.
"""

In [8]:
def format_task_jsonl_line(task_id, deployment_name, user_input, system_message=None):
    """Build the request object for one line of a batch JSONL file.

    The returned dict is meant to be serialised with ``json.dumps``, which
    already escapes line breaks, so message contents are passed through
    verbatim. Pre-escaping them (replacing each newline with a backslash
    followed by "n") gets escaped a second time on serialisation and reaches
    the model as literal backslash-n text instead of line breaks.

    Args:
        task_id: Value stored under "custom_id".
        deployment_name: Value stored under ``body["model"]``.
        user_input: Text of the user message.
        system_message: Text of the system message. When omitted, the
            module-level ``system_prompt`` is used.

    Returns:
        dict: request with "custom_id", "method", "url" and "body". The body
        asks for a strict JSON-schema response named "ArticleRelevanceCheck"
        holding an integer "relevance" (0, 1 or 2) and a string
        "article_language".
    """
    if system_message is None:
        # Looked up at call time so the default follows the notebook's prompt.
        system_message = system_prompt

    response_schema = {
        "type": "object",
        "properties": {
            "relevance": {
                "type": "integer",
                "description": "Relevance score for the article, between 0 and 2, where 0 means not relevant, 1 means somewhat relevant, and 2 means highly relevant.",
                "enum": [0, 1, 2]
            },
            "article_language": {
                "type": "string",
            }
        },
        "required": [
            "relevance",
            "article_language"
        ],
        # Strict mode: the response may not contain keys outside "properties".
        "additionalProperties": False
    }

    return {
        "custom_id": task_id,
        "method": "POST",
        "url": "/chat/completions",
        "body": {
            "model": deployment_name,
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_input}
            ],
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "ArticleRelevanceCheck",
                    "strict": True,
                    "schema": response_schema
                }
            }
        }
    }

In [9]:
def generate_jsonl_lines(chunk_id, chunk_raw_articles):
    """Yield one newline-terminated JSON request line per article.

    Args:
        chunk_id: Index of the chunk being written. Accepted for the caller's
            convenience; it does not influence the generated lines.
        chunk_raw_articles: Iterable of article records exposing
            ``article_id``, ``article_title``, ``article_url``,
            ``article_keywords`` and ``article_language``.

    Yields:
        str: the JSON-encoded request followed by a newline.
    """
    for article in chunk_raw_articles:
        # Only these four fields are shown to the model as the user message.
        payload = {
            "article_title": article.article_title,
            "article_url": article.article_url,
            "article_keywords": article.article_keywords,
            "article_language": article.article_language,
        }
        request = format_task_jsonl_line(
            task_id=f"{RUNID}--{TASK_NAME}--{article.article_id}",
            deployment_name=DEPLOYMENT_NAME,
            user_input=json.dumps(payload),
        )
        yield json.dumps(request) + "\n"
    

In [10]:
# Write one JSONL batch file per chunk of at most prompts_per_batch_job articles.
for i in range(n_batch_jobs):
    print(i)
    start = i * prompts_per_batch_job
    stop = min(n_to_process, start + prompts_per_batch_job)
    chunk = raw_articles_list[start:stop]
    batchfilename = f"{RUNID}--{TASK_NAME}_BATCHFILE_{i}.jsonl"

    # OUTPUT_DATA_PATH ends with a separator, so joining yields the same path
    # as plain concatenation.
    with open(os.path.join(OUTPUT_DATA_PATH, batchfilename), "w") as f:
        f.writelines(generate_jsonl_lines(chunk_id=i, chunk_raw_articles=chunk))
        print(f"Batch file {batchfilename} created with {len(chunk)} articles.")

0
Batch file RUNID_2--relevance_check_v0_BATCHFILE_0.jsonl created with 27 articles.
