In [27]:
import json
import os
from dotenv import load_dotenv
from datetime import datetime

from azure.storage.blob import BlobServiceClient 

In [28]:
# Load environment variables from the .env file one directory above this notebook.
# The cells below read RUNID and STORAGE_ACCOUNT_CONNECTION_STRING via os.getenv —
# presumably both are supplied by this file (confirm against the deployment setup).
load_dotenv('../.env')

True

In [29]:
# Task identifier; read_outputs() combines it with RUNID to build the blob-name
# prefix of the batch output files it lists.
TASK_NAME = "relevance_check_v0"

def get_run_id():
    """Return the identifier of the current pipeline run.

    The value is read from the ``RUNID`` environment variable.

    Returns:
        str: The non-empty run identifier.

    Raises:
        RuntimeError: If ``RUNID`` is unset or empty. Without this check the
            ``None`` returned by ``os.getenv`` would be formatted into blob
            names as the literal text ``"None"``.
    """
    run_id = os.getenv('RUNID')
    if not run_id:
        raise RuntimeError("Environment variable 'RUNID' is not set; cannot determine the run ID.")
    return run_id

# Resolve the run identifier once; every blob name built in this notebook is prefixed with it.
RUNID = get_run_id()

# Local wall-clock time of this execution, reported in the log line at the end of the cell.
RUN_TIME = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# One service client is shared by every container and blob lookup below.
blob_service_client = BlobServiceClient.from_connection_string(
    os.getenv('STORAGE_ACCOUNT_CONNECTION_STRING')
)

input_container_name = 'azure-openai-batch-processing-files'
output_container_name = 'relevant-articles-list'

# Fail early if either container is missing, before any blob is read or written.
input_container = blob_service_client.get_container_client(input_container_name)
assert input_container.exists(), f"Input container '{input_container_name}' does not exist."

output_container = blob_service_client.get_container_client(output_container_name)
assert output_container.exists(), f"Output container '{output_container_name}' does not exist."


print(f"Run ID: {RUNID} at {RUN_TIME}")

Run ID: RUNID_3 at 2025-06-03 14:41:45


In [30]:
def read_outputs():
    """Collect the parsed model verdicts from this run's batch output blobs.

    Lists every blob in ``input_container`` whose name starts with
    ``f"{RUNID}--{TASK_NAME}_OUTPUT"``, treats each blob as newline-delimited
    JSON and flattens every record into a plain dict.

    Returns:
        list[dict]: One dict per record with the keys ``model``, ``run_id``,
        ``task_name``, ``article_id``, ``relevance`` and ``article_language``.

    Raises:
        json.JSONDecodeError: If a record, or the message content inside it,
            is not valid JSON.
        ValueError: If a ``custom_id`` has fewer than three ``--``-separated
            parts.

    Records that lack the expected ``response``/``body``/``choices`` structure
    are not skipped; the resulting exception propagates to the caller.
    """
    prefix = f"{RUNID}--{TASK_NAME}_OUTPUT"
    outputs = []
    for blob_info in input_container.list_blobs(name_starts_with=prefix):
        blob_client = input_container.get_blob_client(blob_info.name)
        text = blob_client.download_blob().readall().decode('utf-8')
        # Split on "\n" only: str.splitlines() also breaks on characters such as
        # U+2028 that may appear unescaped inside a JSON string, which would cut
        # a record in half.
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines, e.g. the empty tail after a final newline.
                continue
            output_dict = json.loads(line)
            body = output_dict.get("response").get("body")
            model = body.get("model")
            line_id = output_dict.get("custom_id")
            # maxsplit=2 keeps an article id intact even if it contains "--" itself.
            run_id, task_name, article_id = line_id.split("--", 2)
            # The message content is itself a JSON document and needs a second decode.
            content_json = body.get("choices")[0].get("message").get("content")
            content = json.loads(content_json)
            outputs.append({
                "model": model,
                "run_id": run_id,
                "task_name": task_name,
                "article_id": article_id,
                "relevance": content.get("relevance"),
                "article_language": content.get("article_language"),
            })
    return outputs

def get_relevant_articles(relevant_score=2):
    """Join the model verdicts with the crawled article metadata.

    Reads the verdicts via ``read_outputs()`` and the raw article list from the
    blob ``f"{RUNID}--raw_articles_list.json"`` in the ``raw-articles-list``
    container, then keeps only the verdicts whose ``relevance`` equals
    ``relevant_score``.

    Args:
        relevant_score: The ``relevance`` value an output must have to be
            kept. Defaults to ``2``, the value that was previously hard-coded.

    Returns:
        list[dict]: One merged dict per kept verdict, containing the verdict
        fields plus the raw article fields.

    Raises:
        KeyError: If a kept verdict refers to an ``article_id`` that is not in
            the raw article list.
    """
    outputs = read_outputs()

    raw_blob_client = blob_service_client.get_blob_client(
        'raw-articles-list', f"{RUNID}--raw_articles_list.json"
    )
    raw_articles_list = json.loads(raw_blob_client.download_blob().readall().decode('utf-8'))

    # In `output | raw` the right-hand side wins on key clashes, so these keys
    # are left out of the raw record to keep the values from the model output.
    # Filtering (instead of pop) neither mutates the input nor fails when a raw
    # record lacks one of the keys.
    overridden_keys = ("model", "task_name")
    raw_articles_dict = {
        raw.get("article_id"): {key: value for key, value in raw.items() if key not in overridden_keys}
        for raw in raw_articles_list
    }

    return [
        output | raw_articles_dict[output.get("article_id")]
        for output in outputs
        if output.get("relevance") == relevant_score
    ]

In [31]:
# Build the list once and keep it at module level: save_outputs() reads this name.
relevant_articles = get_relevant_articles()

In [32]:
# Display the merged records for a visual check before they are uploaded.
relevant_articles

[{'model': 'gpt-4o-2024-11-20',
  'run_id': 'RUNID_3',
  'task_name': 'relevance_check_v0',
  'article_id': 'techcrunch_20250603125707867424',
  'relevance': 2,
  'article_language': 'en',
  'source_name': 'techcrunch',
  'article_title': 'Early AI investor Elad Gil finds his next big bet: AI-powered rollups',
  'article_url': 'https://techcrunch.com/2025/06/01/early-ai-investor-elad-gil-finds-his-next-big-bet-ai-powered-rollups/',
  'article_keywords': ['AI', 'Investment', 'Technology'],
  'crawled_at': '2025-06-03 12:57:07'},
 {'model': 'gpt-4o-2024-11-20',
  'run_id': 'RUNID_3',
  'task_name': 'relevance_check_v0',
  'article_id': 'business_insider_20250603125707980679',
  'relevance': 2,
  'article_language': 'es',
  'source_name': 'business_insider',
  'article_title': "Los adolescentes deberían entrenarse para ser 'ninjas' de la IA, según el CEO de Google DeepMind",
  'article_url': 'https://www.businessinsider.es/tecnologia/adolescentes-deberian-entrenarse-ser-ninjas-ia-ceo-goog

In [None]:
def save_outputs(articles=None):
    """Upload the relevant-articles list to the output container as JSON.

    The blob is named ``f"{RUNID}--relevant_articles_list.json"`` and any
    existing blob of that name is overwritten.

    Args:
        articles: The list to serialise. When omitted (``None``) the
            notebook-level ``relevant_articles`` is used, which keeps the
            original zero-argument call working.
    """
    if articles is None:
        # Fall back to the notebook-level list produced by get_relevant_articles().
        articles = relevant_articles
    output_blob_name = f"{RUNID}--relevant_articles_list.json"
    output_blob_client = output_container.get_blob_client(output_blob_name)
    output_blob_client.upload_blob(json.dumps(articles, indent=4), overwrite=True)
    print(f"Relevant articles list saved to blob storage as {output_blob_name}")

In [34]:
# Upload the list to the output container; an existing blob with the same name is overwritten.
save_outputs()

Relevant articles list saved to blob storage as RUNID_3--relevant_articles_list.json
