In [1]:
import json
import os
from dotenv import load_dotenv
from datetime import datetime
from pathlib import Path
from time import sleep

from pydantic import BaseModel, ValidationError

In [2]:
# Load environment variables from ../.env (path is relative to the current working directory).
load_dotenv('../.env')

True

In [3]:
# Identifiers of the pipeline run and of the batch task whose outputs are read below.
RUNID = "DEVRUN"
TASK_NAME = "relevance_check_multi_persona_v0"

# Wall-clock timestamp of this notebook execution (used only for the log line).
RUN_TIME = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Directory holding the batch output files for this run/task.
INPUT_DATA_PATH = f"./local_tests_data/azure_openai_batch_processing_files/{RUNID}/{TASK_NAME}/OUTPUTS/"

# Directory where the relevant-articles list is written; created on demand.
OUTPUT_DATA_PATH = f"./local_tests_data/relevant_articles_list/{RUNID}/"
os.makedirs(OUTPUT_DATA_PATH, exist_ok=True)

print(f"Run ID: {RUNID} at {RUN_TIME}")

Run ID: DEVRUN at 2025-06-23 14:46:08


In [9]:
def read_outputs(input_path=None):
    """Parse every ``.jsonl`` batch output file found in a directory.

    Each non-blank line is expected to be a JSON object with a ``custom_id``
    of the form ``<run_id>--<task_name>--<article_id>`` and a
    ``response.body`` holding ``model`` and ``choices[0].message.content``,
    where the content is itself a JSON-encoded object.

    Args:
        input_path: Directory to scan. Defaults to ``INPUT_DATA_PATH`` when
            omitted, which preserves the original zero-argument behaviour.

    Returns:
        A list of dicts with the keys ``model``, ``run_id``, ``task_name``,
        ``article_id``, ``relevance`` (the decoded content minus
        ``article_language``) and ``article_language`` (``None`` if absent).
        Files are processed in sorted filename order.

    Raises:
        json.JSONDecodeError: If a line or its message content is not valid JSON.
        ValueError: If ``custom_id`` has fewer than three ``--``-separated parts.
    """
    if input_path is None:
        input_path = INPUT_DATA_PATH

    outputs = []
    # Sorted so the result order does not depend on the OS directory listing order.
    for filename in sorted(os.listdir(input_path)):
        if not filename.endswith('.jsonl'):
            continue
        # Explicit UTF-8: the records contain non-ASCII text (names, article ids)
        # and the platform default encoding is not UTF-8 everywhere.
        with open(os.path.join(input_path, filename), 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank or trailing empty lines instead of crashing in json.loads.
                    continue
                output_dict = json.loads(line)
                body = output_dict.get("response").get("body")
                line_id = output_dict.get("custom_id")
                # maxsplit=2 keeps any further "--" inside the article id.
                # NOTE(review): assumes run_id and task_name never contain "--" - confirm.
                run_id, task_name, article_id = line_id.split("--", 2)
                content = json.loads(body.get("choices")[0].get("message").get("content"))
                # Whatever remains after removing the language is stored as the relevance payload.
                article_language = content.pop("article_language", None)
                outputs.append({
                    "model": body.get("model"),
                    "run_id": run_id,
                    "task_name": task_name,
                    "article_id": article_id,
                    "relevance": content,
                    "article_language": article_language,
                })
    return outputs

def _is_relevant(relevance, relevant_score=2):
    """Return True if a relevance payload contains the ``relevant_score`` value."""
    if isinstance(relevance, dict):
        # Per-persona payload: relevant as soon as one persona gives the top score.
        # NOTE(review): "any persona" is an assumption about the intended rule - confirm.
        return any(score == relevant_score for score in relevance.values())
    return relevance == relevant_score


def get_relevant_articles(outputs=None, raw_articles_list=None):
    """Merge relevant model outputs with their raw article records.

    An output is kept when its ``relevance`` equals 2, or, when ``relevance``
    is a mapping of per-persona scores, when at least one score equals 2.
    (The previous ``relevance != 2`` check compared the per-persona dict to an
    int, so every article was discarded.)

    Args:
        outputs: Parsed output records (dicts with at least ``article_id`` and
            ``relevance``). Defaults to ``read_outputs()``.
        raw_articles_list: Raw article dicts keyed by ``article_id``. Defaults
            to the contents of the run's ``raw_articles_list_{RUNID}.json`` file.

    Returns:
        A list of dicts, one per relevant output, combining the output record
        with the raw article record. On shared keys the raw article value
        wins, except ``model`` and ``task_name``, which are dropped from the
        raw record so the output's values are kept.

    Raises:
        FileNotFoundError: If the raw articles file is needed but missing.
        KeyError: If a relevant output's ``article_id`` has no raw article.
    """
    if outputs is None:
        outputs = read_outputs()

    if raw_articles_list is None:
        with open(f'./local_tests_data/raw_articles_list/{RUNID}/raw_articles_list_{RUNID}.json', 'r', encoding='utf-8') as f:
            raw_articles_list = json.load(f)

    # Index raw articles by id, working on copies so caller-supplied records are not mutated.
    raw_articles_dict = {}
    for a in raw_articles_list:
        cleaned = {key: value for key, value in a.items() if key not in ("model", "task_name")}
        raw_articles_dict[cleaned.get("article_id")] = cleaned

    relevant_articles = []
    for output in outputs:
        if not _is_relevant(output.get("relevance")):
            continue

        article_id = output.get("article_id")
        a = raw_articles_dict[article_id]

        relevant_articles.append(output | a)

    return relevant_articles

In [10]:
# Preview the records parsed from the batch output files.
read_outputs()

[{'model': 'gpt-4o-2024-11-20',
  'run_id': 'DEVRUN',
  'task_name': 'relevance_check_multi_persona_v0',
  'article_id': 'religión_digital_17506663546179292',
  'relevance': {'LAURA GIL': 1,
   'FEDE SEGARRA': 1,
   'ELÍSABETH HERNÁNDEZ': 2,
   'JAUME ALEMANY': 0,
   'RICARDO LECHUGA': 2,
   'JORGE VILLAVECCHIA': 1,
   'SALVADOR MARTÍNEZ': 1,
   'JOFRE RIERA': 0},
  'article_language': 'Spanish'},
 {'model': 'gpt-4o-2024-11-20',
  'run_id': 'DEVRUN',
  'task_name': 'relevance_check_multi_persona_v0',
  'article_id': 'retema_17506663546179304',
  'relevance': {'LAURA GIL': 2,
   'FEDE SEGARRA': 0,
   'ELÍSABETH HERNÁNDEZ': 1,
   'JAUME ALEMANY': 0,
   'RICARDO LECHUGA': 0,
   'JORGE VILLAVECCHIA': 2,
   'SALVADOR MARTÍNEZ': 2,
   'JOFRE RIERA': 0},
  'article_language': 'es'},
 {'model': 'gpt-4o-2024-11-20',
  'run_id': 'DEVRUN',
  'task_name': 'relevance_check_multi_persona_v0',
  'article_id': 'región_digital_17506663546179285',
  'relevance': {'LAURA GIL': 1,
   'FEDE SEGARRA': 1,
  

In [None]:
# Join the relevant model outputs with their raw article records.
relevant_articles = get_relevant_articles()

FileNotFoundError: [Errno 2] No such file or directory: './local_tests_data/raw_articles_list/DEVRUN/raw_articles_list_DEVRUN.json'

In [None]:
# Display the merged relevant-article records.
relevant_articles

[{'model': 'gpt-4o-2024-11-20',
  'run_id': 'RUNID_2',
  'task_name': 'relevance_check_v0',
  'article_id': 'techcrunch_20250530144755886209',
  'relevance': 2,
  'article_language': 'en',
  'source_name': 'techcrunch',
  'article_title': 'Hugging Face unveils two new humanoid robots',
  'article_url': 'https://techcrunch.com/2025/05/29/hugging-face-unveils-two-new-humanoid-robots/',
  'article_keywords': ['Robotics'],
  'crawled_at': '2025-05-30 14:47:55'},
 {'model': 'gpt-4o-2024-11-20',
  'run_id': 'RUNID_2',
  'task_name': 'relevance_check_v0',
  'article_id': 'itespresso_20250530144756051666',
  'relevance': 2,
  'article_language': 'es',
  'source_name': 'itespresso',
  'article_title': 'Apple lanzó discretamente un LLM multimodal de código abierto en octubre',
  'article_url': 'https://www.itespresso.es/apple-llm-multimodal-open-source-243999.html',
  'article_keywords': ['Apple', 'LLM código abierto'],
  'crawled_at': '2025-05-30 14:47:55'},
 {'model': 'gpt-4o-2024-11-20',
  'r

In [None]:
def save_outputs(relevant_articles):
    """Write the relevant articles to the run's JSON file and report where.

    Args:
        relevant_articles: JSON-serialisable object dumped with a 4-space indent
            to ``relevant_articles_list_{RUNID}.json`` inside ``OUTPUT_DATA_PATH``.
    """
    target_name = f"relevant_articles_list_{RUNID}.json"
    destination = os.path.join(OUTPUT_DATA_PATH, target_name)
    with open(destination, 'w') as handle:
        json.dump(relevant_articles, handle, indent=4)
    # The message concatenates the directory and file name directly, without os.path.join.
    print(f"Relevant articles saved to {OUTPUT_DATA_PATH}{target_name}")

In [None]:
# Write the relevant articles to the run's JSON output file.
save_outputs(relevant_articles)

Relevant articles saved to ../local_tests_data/relevant_articles_list/RUNID_2/relevant_articles_list_RUNID_2.json
