In [60]:
import json
import os
from dotenv import load_dotenv
from datetime import datetime
from pathlib import Path
from time import sleep

from pydantic import BaseModel, ValidationError

In [61]:
# Load environment variables from ../.env (relative to the working directory).
# os.getenv('RUNID') is read further down; presumably that variable is defined
# in this file — TODO confirm.
load_dotenv('../.env')

True

In [62]:
# Name of the batch task whose outputs this notebook consumes; it is the
# directory segment placed after the run id when INPUT_DATA_PATH is built.
TASK_NAME = "source_parsing_v0"


def get_run_id():
    """Return the identifier of the current run.

    The value is read from the ``RUNID`` environment variable.

    Returns:
        str: The non-empty value of ``RUNID``.

    Raises:
        RuntimeError: If ``RUNID`` is unset or empty. Without this check the
            value ``None`` would be interpolated into the data paths, so the
            notebook would silently read from and write to a ``None`` folder.
    """
    run_id = os.getenv('RUNID')
    if not run_id:
        raise RuntimeError(
            "Environment variable RUNID is not set; define it (for example in "
            "the .env file) before running this notebook."
        )
    return run_id

# Identifier of this run; it selects both the input and the output folder.
RUNID = get_run_id()

# Timestamp taken once per execution; stored on every article as `crawled_at`.
RUN_TIME = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Folder holding the batch output files (*.jsonl) for this run and task.
INPUT_DATA_PATH = f"../local_tests_data/azure_openai_batch_processing_files/{RUNID}/{TASK_NAME}/OUTPUTS/"

# Folder that receives this run's raw article list; created on first use.
OUTPUT_DATA_PATH = f"../local_tests_data/raw_articles_list/{RUNID}/"
os.makedirs(OUTPUT_DATA_PATH, exist_ok=True)

print(f"Run ID: {RUNID} at {RUN_TIME}")

Run ID: RUNID_2 at 2025-05-30 14:47:55


In [63]:
def read_outputs(input_path=None):
    """Load the parsed model responses from every ``.jsonl`` file in a folder.

    Each non-blank line is expected to be a JSON object with a ``custom_id``
    and a ``response.body`` holding ``model`` and
    ``choices[0].message.content``, where the content is itself a JSON string.

    Lines that do not have this shape (for example a request that failed and
    has no response body, or content that is not valid JSON) are reported with
    ``print`` and skipped, so one bad line does not abort the whole load.

    Args:
        input_path: Folder to scan. Defaults to ``INPUT_DATA_PATH``.

    Returns:
        list[dict]: One dict per usable line with the keys ``"model"``,
        ``"line_id"`` (the line's ``custom_id``) and ``"content"`` (the decoded
        message content). Files are processed in sorted name order.
    """
    if input_path is None:
        input_path = INPUT_DATA_PATH

    outputs = []
    # sorted() makes the result order independent of the file system.
    for filename in sorted(os.listdir(input_path)):
        if not filename.endswith('.jsonl'):
            continue
        # Explicit UTF-8: the content contains non-ASCII text, and the default
        # encoding of open() depends on the platform locale.
        with open(os.path.join(input_path, filename), 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                if not line.strip():
                    continue  # tolerate blank or trailing empty lines
                try:
                    output_dict = json.loads(line)
                    body = output_dict["response"]["body"]
                    content_json = body["choices"][0]["message"]["content"]
                    content = json.loads(content_json)
                    record = {
                        "model": body.get("model"),
                        "line_id": output_dict.get("custom_id"),
                        "content": content,
                    }
                except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
                    # ValueError covers json.JSONDecodeError; the others cover
                    # missing or null parts of the response structure.
                    print(f"Skipping malformed line {line_number} in {filename}: {e!r}")
                    continue
                outputs.append(record)
    return outputs

import glob
import os

def get_previously_crawled_article_titles(root_dir=None, exclude_dir=None):
    """Collect the article titles stored by earlier runs.

    Every ``*.json`` file below ``root_dir`` is read; each file is expected to
    hold a list of dicts, and the ``article_title`` of every dict that has one
    is collected. Files that cannot be read or parsed are reported with
    ``print`` and skipped; files whose top-level value is not a list are
    ignored.

    Args:
        root_dir: Folder searched recursively. Defaults to the parent of
            ``OUTPUT_DATA_PATH``, i.e. the folder holding one sub-folder per
            run.
        exclude_dir: Folder whose files are ignored. When ``root_dir`` is
            omitted this defaults to ``OUTPUT_DATA_PATH``: the current run's
            own output must not count as "previously crawled", otherwise
            re-running the notebook would skip every article of this run and
            then overwrite its output file with an empty list.

    Returns:
        list: The collected titles, in sorted file-path order.
    """
    if root_dir is None:
        root_dir = Path(OUTPUT_DATA_PATH).parent
        if exclude_dir is None:
            exclude_dir = OUTPUT_DATA_PATH
    excluded = Path(exclude_dir).resolve() if exclude_dir is not None else None

    search_path = os.path.join(root_dir, '**', '*.json')

    previously_crawled_article_titles = []
    for filepath in sorted(glob.glob(search_path, recursive=True)):
        # Compare resolved paths so "../x/" and an absolute form of it match.
        if excluded is not None and excluded in Path(filepath).resolve().parents:
            continue
        try:
            with open(filepath, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and invalid UTF-8.
            print(f"Skipping unreadable file {filepath}: {e!r}")
            continue
        if not isinstance(data, list):
            continue
        for item in data:
            if isinstance(item, dict) and 'article_title' in item:
                previously_crawled_article_titles.append(item['article_title'])

    return previously_crawled_article_titles

In [64]:

class RawArticle(BaseModel):
    """Validated record describing one article link taken from a batch output.

    Instances are created in the processing loop of this notebook and turned
    into plain dicts with ``model_dump()`` before being saved.
    """
    # Model name reported in the batch response body.
    model: str
    # The next three values are the parts of the output's custom_id, which is
    # split on "--" in the processing loop.
    run_id: str
    task_name: str
    source_name: str
    # Built as "<source_name>_<timestamp>" in the processing loop.
    article_id: str
    # Values of the link's "title", "url", "keywords" and "language" keys.
    article_title: str
    article_url: str
    article_keywords: list[str]
    article_language: str
    # RUN_TIME of the notebook execution, formatted '%Y-%m-%d %H:%M:%S'.
    crawled_at: str
    crawled_at: str


# Titles stored by earlier runs. The list is kept as-is for other cells; the
# set copy below gives constant-time membership tests inside the loop.
previously_crawled_article_titles = get_previously_crawled_article_titles()
known_titles = {
    title for title in previously_crawled_article_titles if isinstance(title, str)
}

# NOTE(review): titles accepted during this run are not added to known_titles,
# so the same title appearing twice within one run is kept twice — confirm
# whether that is intended.
new_raw_articles_list = []
for output in read_outputs():
    model = output['model']
    line_id = output['line_id']
    content = output['content']

    # custom_id is expected to look like "<run_id>--<task_name>--<source_name>".
    try:
        run_id, task_name, source_name = line_id.split("--")
    except (AttributeError, ValueError):
        print(f"Skipping output with unexpected custom_id {line_id!r}.")
        continue

    # Tolerate content that is not a dict or whose list is missing or null.
    article_links_list = content.get("article_links_list") if isinstance(content, dict) else None
    for article_link in article_links_list or []:
        if not isinstance(article_link, dict):
            print(f"Skipping malformed article entry {article_link!r} from {source_name}.")
            continue

        article_title = article_link.get("title", "")
        article_url = article_link.get("url", "")
        article_keywords = article_link.get("keywords", [])
        article_language = article_link.get("language", "")

        # The isinstance check keeps an unhashable title from raising here;
        # such a title is rejected by the model validation below instead.
        if isinstance(article_title, str) and article_title in known_titles:
            print(f"Skipping {article_title} as it has already been crawled.")
            continue

        # The id embeds a microsecond timestamp; the short sleep keeps two
        # consecutive ids from colliding. Both happen only for articles that
        # were not skipped above.
        article_id = source_name + "_" + datetime.now().strftime('%Y%m%d%H%M%S%f')
        sleep(0.01)

        try:
            raw_article = RawArticle(
                model=model,
                run_id=run_id,
                task_name=task_name,
                source_name=source_name,
                article_id=article_id,
                article_title=article_title,
                article_url=article_url,
                article_keywords=article_keywords,
                article_language=article_language,
                crawled_at=RUN_TIME
            )
        except ValidationError as e:
            # Include the validation details so the rejected field is visible.
            print(f"Validation error for article '{article_title}': {e}")
            continue
        new_raw_articles_list.append(raw_article.model_dump())



Skipping Bioprinted organs ‘10–15 years away,’ says startup regenerating dog skin as it has already been crawled.
Skipping ‘Purest meat alternative’ to grow in Swedish mycoprotein factory as it has already been crawled.
Skipping Can geothermal startups drill Europe to clean energy independence? as it has already been crawled.
Skipping TNW Backstage tunes into the future of earbuds as it has already been crawled.
Skipping ENISA ya cuenta con más de medio millar de startups certificadas as it has already been crawled.
Skipping 7 metas que deberán marcar la relación de las empresas con la tecnología en 2024 as it has already been crawled.
Skipping Apple es la compañía tecnológica que más ganancias tiene por empleado as it has already been crawled.
Skipping Cómo mantener el impulso de tu startup durante la época navideña as it has already been crawled.
Skipping Google paga 5.000 millones de dólares para resolver una demanda colectiva as it has already been crawled.
Skipping Meta, la matriz

In [65]:
# Number of articles that were neither skipped as already crawled nor rejected
# by validation.
print(len(new_raw_articles_list))

27


In [66]:
def save_raw_articles_list(articles=None, output_dir=None, run_id=None):
    """Write the article list to ``raw_articles_list_<run_id>.json``.

    An existing file with the same name is overwritten. The target path is
    built once and used for both writing and the confirmation message, so the
    message always names the file that was actually written.

    Args:
        articles: JSON-serialisable list to store. Defaults to
            ``new_raw_articles_list``.
        output_dir: Existing folder to write into. Defaults to
            ``OUTPUT_DATA_PATH``.
        run_id: Identifier used in the file name. Defaults to ``RUNID``.

    Returns:
        None
    """
    if articles is None:
        articles = new_raw_articles_list
    if output_dir is None:
        output_dir = OUTPUT_DATA_PATH
    if run_id is None:
        run_id = RUNID

    output_file = os.path.join(output_dir, f"raw_articles_list_{run_id}.json")
    # json.dump keeps its default ensure_ascii=True, so the file content stays
    # pure ASCII; the explicit encoding just removes the locale dependency.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=4)
    print(f"New raw articles list saved to {output_file}")

In [67]:
# Persist this run's new articles; an existing file for this run id is overwritten.
save_raw_articles_list()

New raw articles list saved to ../local_tests_data/raw_articles_list/RUNID_2/raw_articles_list_RUNID_2.json
