In [3]:
import json
from math import ceil
from datetime import datetime
import os

from dotenv import load_dotenv
from azure.storage.blob import BlobServiceClient


In [4]:
# Load variables from ../.env into the process environment; later cells read
# their settings through os.getenv.
load_dotenv(dotenv_path='../.env')

True

In [None]:
# Identifier of this processing task; it is embedded in the output directory,
# the batch file names and every request's custom id.
TASK_NAME: str = 'relevance_check_multi_persona_v0'
# Value sent as the "model" field of every batch request.
DEPLOYMENT_NAME: str = 'gpt-4o--batch-2'

def get_run_id():
    """Return the ``RUNID`` environment variable, or ``None`` when it is not set."""
    run_id = os.environ.get('RUNID')
    return run_id

# The run id is pinned to a literal here; get_run_id() is not consulted.
RUNID = "DEVRUN"

# Local path derived from the run id.
INPUT_DATA_PATH = "./local_tests_data/raw_articles_list/{0}/raw_articles_list_{0}.json".format(RUNID)
# Wall-clock time at which this cell was executed, formatted for display.
RUN_TIME = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Directory that receives the generated batch files (note the trailing slash:
# file names are appended to it directly).
OUTPUT_DATA_PATH = "./local_tests_data/azure_openai_batch_processing_files/{}/{}/BATCHFILES/".format(
    RUNID, TASK_NAME
)

# Create the output directory up front; re-running the cell is harmless.
os.makedirs(OUTPUT_DATA_PATH, exist_ok=True)

print("Run ID: {} at {}".format(RUNID, RUN_TIME))

Run ID: DEVRUN at 2025-06-23 12:31:37


In [6]:
# Shared blob-storage client used by the reader functions below; the
# connection string is taken from the environment rather than hard-coded.
blob_service_client = BlobServiceClient.from_connection_string(
    os.getenv('STORAGE_ACCOUNT_CONNECTION_STRING')
)
def read_raw_articles_ddgs(run_id=None):
    """Download and parse the raw-articles JSON blob from the DDG container.

    The blob is looked up in the ``raw-articles-list-ddgs`` container under the
    name ``<run_id>--raw_articles_list_ddgs.json``.

    Args:
        run_id: Run identifier used as the blob-name prefix. Defaults to the
            module-level ``RUNID`` when omitted.

    Returns:
        The parsed JSON content of the blob.
    """
    if run_id is None:
        run_id = RUNID

    container_name = 'raw-articles-list-ddgs'
    # Build the name outside of an f-string expression: reusing double quotes
    # inside a double-quoted f-string is a SyntaxError before Python 3.12.
    blob_suffix = container_name.replace("-", "_")
    blob_name = f"{run_id}--{blob_suffix}.json"

    blob_client = (
        blob_service_client
        .get_container_client(container_name)
        .get_blob_client(blob_name)
    )
    articles = json.loads(blob_client.download_blob().readall())
    print(f"Read {len(articles)} articles from DDG container")
    return articles

# Fetch the DDG articles once and keep the result at module level for the cells below.
raw_articles_ddgs = read_raw_articles_ddgs()

Read 445 articles from DDG container


In [7]:
def read_raw_articles_jina(run_id="RUNID_9"):
    """Download and parse the raw-articles JSON blob from the jina container.

    The blob is looked up in the ``raw-articles-list`` container under the name
    ``<run_id>--raw_articles_list.json``.

    Args:
        run_id: Run identifier used as the blob-name prefix. The default keeps
            the run this notebook was originally pinned to (``RUNID_9``), which
            is independent of the module-level ``RUNID``.

    Returns:
        The parsed JSON content of the blob.
    """
    container_name = 'raw-articles-list'
    # Build the name outside of an f-string expression: reusing double quotes
    # inside a double-quoted f-string is a SyntaxError before Python 3.12.
    blob_suffix = container_name.replace("-", "_")
    blob_name = f"{run_id}--{blob_suffix}.json"

    blob_client = (
        blob_service_client
        .get_container_client(container_name)
        .get_blob_client(blob_name)
    )
    articles = json.loads(blob_client.download_blob().readall())
    print(f"Read {len(articles)} articles from jina container")
    return articles

# Fetch the jina articles once and keep the result at module level for the cells below.
raw_articles_jina = read_raw_articles_jina()

Read 50 articles from jina container


In [8]:
# Inspect one DDG record to see which fields are available (it carries an 'article_body').
raw_articles_ddgs[0]

{'run_id': 'DEVRUN',
 'source_name': 'campeche_hoy',
 'source_url': 'http://campechehoy.mx',
 'article_id': 'campeche_hoy_17506663546176462',
 'article_date': '2025-06-17T18:02:00+00:00',
 'article_title': 'Celebran la llegada de Claudia',
 'article_url': 'http://campechehoy.mx/2025/06/17/celebran-la-llegada-de-claudia/',
 'article_body': 'CDMX - Con pancartas, banderas y gritos de apoyo, la comunidad mexicana en Canadá dio una cálida bienvenida a la presidenta Claudia Sheinbaum a su llegada a Kananaskis, donde participará en la Cumbre',
 'article_image_url': 'http://campechehoy.mx/wp-content/uploads/2025/06/WhatsApp-Image-2025-06-16-at-5.48.37-PM.jpeg',
 'article_language': 'es',
 'crawled_at': '2025-06-23T10:12:34.618966',
 'ddgs_search_query': ['liderazgo empresarial'],
 'query_original_personas': ['Jorge Villavecchia']}

In [9]:
# Inspect one jina record to see which fields are available (it carries 'article_keywords', no body).
raw_articles_jina[1]

{'model': 'gpt-4o-2024-11-20',
 'run_id': 'RUNID_9',
 'task_name': 'source_parsing_v0',
 'source_name': 'the_verge',
 'article_id': 'the_verge_20250616113906015934',
 'article_title': 'Switch, Xbox, and the portable future of games',
 'article_url': 'https://www.theverge.com/tech#switch-xbox-and-handheld-consoles',
 'article_keywords': ['Games', 'Portable', 'Consoles'],
 'article_language': 'en',
 'crawled_at': '2025-06-16 11:39:05'}

In [10]:
# Normalise both sources into one list of records with the same four keys.
# "article_info" carries whatever descriptive text each source offers: the
# body for DDG records, the keyword list for jina records.
shared_keys = ("article_id", "article_title", "article_url")
raw_articles_list = []

for article in raw_articles_ddgs:
    record = {key: article[key] for key in shared_keys}
    record["article_info"] = article['article_body']
    raw_articles_list.append(record)

for article in raw_articles_jina:
    record = {key: article[key] for key in shared_keys}
    keywords_text = ", ".join(article['article_keywords'])
    record["article_info"] = f"keywords: {keywords_text}\n"
    raw_articles_list.append(record)

In [11]:
# Total number of requests that will be spread over the batch files.
n_to_process = len(raw_articles_list)
print("Number of articles to process: {}".format(n_to_process))

Number of articles to process: 495


In [12]:
# Upper bound on the number of requests written to a single batch file.
prompts_per_batch_job = 200
# Round up so that a final, partially filled chunk still gets its own file.
n_batch_jobs = ceil(n_to_process / prompts_per_batch_job)
print(f"Creating {n_batch_jobs} batch files")

Creating 3 batch files


In [None]:
# System message placed in every batch request by format_task_jsonl_line.
# It describes the 0-2 relevance scale and the executives to score, and ends
# with a lead-in line because the article itself is sent as the user message.
# NOTE(review): the response schema in format_task_jsonl_line also requires an
# "article_language" field that this prompt never mentions — confirm that
# leaving it to the schema alone is intended.
system_prompt = """
You are an assistant trained to evaluate the relevance of news articles for a group of high-level executives at the Spanish brewery Damm. Based on each executive's professional focus, assess how relevant the article would be for them to comment on, share, or use in professional thought leadership (e.g., LinkedIn posts).

Each executive has a specific domain of interest. For the article below, assign a relevance score for each executive based on how well the topic aligns with their area of leadership and communication.

Use the following scale:
- 0 = Not relevant (the article has no clear connection to the executive’s domain)
- 1 = Somewhat relevant (it loosely touches on their area or could be of tangential interest)
- 2 = Highly relevant (it directly supports or connects to their role, values, or communication focus)

Use this executive reference:

- **LAURA GIL** (Chief Digital & Data Officer): Focused on data strategy, AI, digital transformation, and innovation. Interested in tech adoption in traditional industries.
- **FEDE SEGARRA** (Chief Communications Officer): Focused on brand communication, reputation, storytelling, and public image.
- **ELÍSABETH HERNÁNDEZ** (HR Development Director): Focused on talent development, CSR, internal communication, and cross-functional collaboration.
- **JAUME ALEMANY** (Chief Marketing Officer): Focused on marketing innovation, branding, consumer trends, and maintaining brand heritage.
- **RICARDO LECHUGA** (HR Director): Focused on organizational culture, employee engagement, leadership proximity, and digital HR transformation.
- **JORGE VILLAVECCHIA** (President): Focused on corporate strategy, sustainability, growth, innovation, and long-term vision.
- **SALVADOR MARTÍNEZ** (Chief Financial Officer): Focused on financial strategy, sustainable finance, decision-making support, and ethical value creation.
- **JOFRE RIERA** (Sponsorships Manager): Focused on sports and cultural sponsorships, community engagement, and value-based partnerships.

Return your result as a JSON object of the form:
{"LAURA GIL": 2, "FEDE SEGARRA": 0, ...}

Here is the article to evaluate:
"""

In [None]:
def format_task_jsonl_line(task_id, deployment_name, user_input, system_prompt_text=None):
    """Build the request dict for one article, ready to be serialised as a JSONL line.

    The request asks the chat-completions endpoint for a strict JSON-schema
    response holding one 0/1/2 relevance score per executive plus the article
    language.

    Args:
        task_id: Value stored as the request's ``custom_id``.
        deployment_name: Value stored as the request's ``model``.
        user_input: Text sent as the user message.
        system_prompt_text: Text sent as the system message. Defaults to the
            module-level ``system_prompt`` when omitted.

    Returns:
        dict: The request, with keys ``custom_id``, ``method``, ``url`` and ``body``.
    """
    if system_prompt_text is None:
        system_prompt_text = system_prompt

    # Order matters: it fixes the order of both "properties" and "required".
    executive_names = [
        "LAURA GIL",
        "FEDE SEGARRA",
        "ELÍSABETH HERNÁNDEZ",
        "JAUME ALEMANY",
        "RICARDO LECHUGA",
        "JORGE VILLAVECCHIA",
        "SALVADOR MARTÍNEZ",
        "JOFRE RIERA",
    ]

    # Every executive gets the same integer score property.
    properties = {
        name: {
            "type": "integer",
            "description": f"Article relevance score for executive {name}",
            "enum": [0, 1, 2],
        }
        for name in executive_names
    }
    properties["article_language"] = {
        "type": "string",
        "enum": ["es", "ca", "en"],
    }

    return {
        "custom_id": task_id,
        "method": "POST",
        "url": "/chat/completions",
        "body": {
            "model": deployment_name,
            # The message texts are passed through untouched. Newlines are
            # escaped when the caller serialises this dict with json.dumps, so
            # pre-escaping them here would make the model receive a literal
            # backslash-n instead of a line break.
            "messages": [
                {"role": "system", "content": system_prompt_text},
                {"role": "user", "content": user_input},
            ],
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "ArticleRelevanceCheck",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": properties,
                        "required": executive_names + ["article_language"],
                        "additionalProperties": False,
                    },
                },
            },
        },
    }

In [15]:
def generate_jsonl_lines(chunk_id, chunk_raw_articles):
    """Yield one newline-terminated JSONL request line per article in the chunk.

    Args:
        chunk_id: Index of the chunk. Accepted for caller compatibility; it is
            not used when building the lines.
        chunk_raw_articles: Iterable of article dicts, each with an
            ``article_id`` key.

    Yields:
        str: The JSON-serialised request for one article, followed by a newline.
    """
    for article_info in chunk_raw_articles:
        task_id = f"{RUNID}--{TASK_NAME}--{article_info['article_id']}"
        # Keep non-ASCII characters as they are in the user message: with the
        # default ensure_ascii=True the model would be shown literal \uXXXX
        # sequences instead of the accented article text.
        user_input = json.dumps(article_info, ensure_ascii=False)
        request = format_task_jsonl_line(
            task_id=task_id,
            deployment_name=DEPLOYMENT_NAME,
            user_input=user_input,
        )
        # The outer dump keeps the default ASCII escaping, so the written line
        # is plain ASCII whatever encoding the output file is opened with.
        yield json.dumps(request) + "\n"
    

In [16]:
# Write one JSONL file per batch job, each holding at most
# prompts_per_batch_job requests.
for i in range(n_batch_jobs):
    print(i)
    # Slice bounds of this batch; the last batch is clamped to the article count.
    chunk_start = i * prompts_per_batch_job
    chunk_stop = min(n_to_process, chunk_start + prompts_per_batch_job)
    chunk = raw_articles_list[chunk_start:chunk_stop]
    batchfilename = "{}--{}_BATCHFILE_{}.jsonl".format(RUNID, TASK_NAME, i)

    # OUTPUT_DATA_PATH already ends with a slash, so plain concatenation is enough.
    with open(f"{OUTPUT_DATA_PATH}{batchfilename}", "w") as f:
        f.writelines(generate_jsonl_lines(chunk_id=i, chunk_raw_articles=chunk))
        print(f"Batch file {batchfilename} created with {len(chunk)} articles.")

0
Batch file DEVRUN--relevance_check_multi_persona_v0_BATCHFILE_0.jsonl created with 200 articles.
1
Batch file DEVRUN--relevance_check_multi_persona_v0_BATCHFILE_1.jsonl created with 200 articles.
2
Batch file DEVRUN--relevance_check_multi_persona_v0_BATCHFILE_2.jsonl created with 95 articles.
