In [1]:
import os
import json
from newspaper import Article
from time import sleep
from datetime import datetime

In [2]:
def get_run_id():
    return os.getenv('RUNID') 

RUNID = get_run_id()

RUN_TIME = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')

INPUT_DATA_PATH = f"./local_tests_data/relevant_articles_list/{RUNID}/"
OUTPUT_DATA_PATH = f"./local_tests_data/relevant_articles_content/{RUNID}/"

os.makedirs(OUTPUT_DATA_PATH, exist_ok=True)

print(f"Run ID: {RUNID} at {RUN_TIME}")

Run ID: RUNID_2 at 2025-06-02 15:44:07


In [3]:
def read_relevant_articles_list():
    relevant_articles_list = []
    with open(f"{INPUT_DATA_PATH}relevant_articles_list_{RUNID}.json", 'r') as file:
        relevant_articles_list = json.load(file)
    return relevant_articles_list

In [4]:
relevant_articles_list = read_relevant_articles_list()
print(relevant_articles_list)

[{'model': 'gpt-4o-2024-11-20', 'run_id': 'RUNID_2', 'task_name': 'relevance_check_v0', 'article_id': 'techcrunch_20250530144755886209', 'relevance': 2, 'article_language': 'en', 'source_name': 'techcrunch', 'article_title': 'Hugging Face unveils two new humanoid robots', 'article_url': 'https://techcrunch.com/2025/05/29/hugging-face-unveils-two-new-humanoid-robots/', 'article_keywords': ['Robotics'], 'crawled_at': '2025-05-30 14:47:55'}, {'model': 'gpt-4o-2024-11-20', 'run_id': 'RUNID_2', 'task_name': 'relevance_check_v0', 'article_id': 'itespresso_20250530144756051666', 'relevance': 2, 'article_language': 'es', 'source_name': 'itespresso', 'article_title': 'Apple lanzó discretamente un LLM multimodal de código abierto en octubre', 'article_url': 'https://www.itespresso.es/apple-llm-multimodal-open-source-243999.html', 'article_keywords': ['Apple', 'LLM código abierto'], 'crawled_at': '2025-05-30 14:47:55'}, {'model': 'gpt-4o-2024-11-20', 'run_id': 'RUNID_2', 'task_name': 'relevance_c

In [5]:

articles_content = []

for relevant_article in relevant_articles_list:
    url = relevant_article["article_url"]
    article_id = relevant_article["article_id"]
    # keywords = relevant_article["keywords"]
    title = relevant_article["article_title"]
    print(f"Processing {url}")
    sleep(1)  # Sleep to avoid overwhelming the server
    article = Article(url)
    article.download()
    article.parse()
    content = article.text
    publish_date = datetime.strftime(article.publish_date, "%Y-%m-%d") if article.publish_date else None
    if not content:
        print(f"Skipping {url} due to empty content")
        continue

    articles_content.append({'article_id' : article_id, 'content' : content, 'title': title, 'publish_date': publish_date})


Processing https://techcrunch.com/2025/05/29/hugging-face-unveils-two-new-humanoid-robots/
Processing https://www.itespresso.es/apple-llm-multimodal-open-source-243999.html
Processing https://www.itespresso.es/pymes-valencianas-ayudas-imasd-244033.html
Processing https://www.itespresso.es/guerra-israel-palestina-startups-244028.html
Processing https://techcrunch.com/2025/05/29/top-30-startups-announced-for-vivatech-2025-innovation-of-the-year-award/
Processing https://www.itespresso.es/applivery-ronda-financiacion-243993.html
Processing https://techcrunch.com/2025/05/29/black-forest-labs-kontext-ai-models-can-edit-pics-as-well-as-generate-them/
Processing https://www.itespresso.es/mujeres-problema-sector-tech-espanol-174855.html
Processing https://www.itespresso.es/navarra-startups-244020.html
Processing https://techcrunch.com/2025/05/29/grammarly-secures-1b-in-non-dilutive-funding-from-general-catalyst/
Processing https://www.itespresso.es/errores-evitar-elevator-pitch-244012.html
Pro

In [6]:
def save_articles_content():
    with open(f"{OUTPUT_DATA_PATH}relevant_articles_content_{RUNID}.json", 'w') as file:
        json.dump(articles_content, file, indent=4)
    print(f"Saved {len(articles_content)} articles content to {OUTPUT_DATA_PATH}articles_content_{RUNID}.json")

In [7]:
save_articles_content()

Saved 21 articles content to ./local_tests_data/relevant_articles_content/RUNID_2/articles_content_RUNID_2.json
