In [11]:
import os
import json
from newspaper import Article
from time import sleep
from datetime import datetime
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv

In [12]:
# Load settings from the local `.env` file; later cells read RUNID and
# STORAGE_ACCOUNT_CONNECTION_STRING through os.getenv.
load_dotenv('.env')

True

In [13]:
def get_run_id():
    """Return the identifier of the current run.

    The identifier is read from the ``RUNID`` environment variable.

    Returns:
        str: The non-empty value of ``RUNID``.

    Raises:
        RuntimeError: If ``RUNID`` is unset or empty. Without this check the
            value ``None`` would be interpolated into blob names
            (e.g. ``None--relevant_articles_list.json``).
    """
    run_id = os.getenv('RUNID')
    if not run_id:
        raise RuntimeError(
            "Environment variable 'RUNID' is not set; define it in the environment or in the .env file."
        )
    return run_id

# Identifier of this run; it prefixes the names of the input and output blobs.
RUNID = get_run_id()

# Timestamp of this run (naive local time); in the visible cells it is only logged.
RUN_TIME = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Fail early with a message that names the missing variable instead of letting
# the Azure SDK fail on a None connection string.
connection_string = os.getenv('STORAGE_ACCOUNT_CONNECTION_STRING')
if not connection_string:
    raise RuntimeError(
        "Environment variable 'STORAGE_ACCOUNT_CONNECTION_STRING' is not set; "
        "define it in the environment or in the .env file."
    )
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

input_container_name = 'relevant-articles-list'
output_container_name = 'relevant-articles-content'

# Both containers must already exist; this notebook does not create them.
input_container = blob_service_client.get_container_client(input_container_name)
assert input_container.exists(), f"Input container '{input_container_name}' does not exist."
output_container = blob_service_client.get_container_client(output_container_name)
assert output_container.exists(), f"Output container '{output_container_name}' does not exist."

# Build the input blob name once so the lookup and the error message cannot drift apart.
input_blob_name = f'{RUNID}--relevant_articles_list.json'
input_blob = input_container.get_blob_client(input_blob_name)
assert input_blob.exists(), f"Input blob '{input_blob_name}' does not exist."

print(f"Run ID: {RUNID} at {RUN_TIME}")

Run ID: RUNID_3 at 2025-06-03 14:47:43


In [14]:
def read_relevant_articles_list():
    """Download and parse the relevant-articles list for the current run.

    Reads the full content of the module-level ``input_blob``, decodes it as
    UTF-8 and parses it as JSON.

    Returns:
        The parsed JSON document stored in the blob.
    """
    raw_bytes = input_blob.download_blob().readall()
    json_text = raw_bytes.decode('utf-8')
    return json.loads(json_text)

In [15]:
# Load this run's list of relevant articles from blob storage.
relevant_articles_list = read_relevant_articles_list()
# NOTE(review): this prints the entire list; consider showing only its length
# or a short slice to keep the notebook output readable.
print(relevant_articles_list)

[{'model': 'gpt-4o-2024-11-20', 'run_id': 'RUNID_3', 'task_name': 'relevance_check_v0', 'article_id': 'techcrunch_20250603125707867424', 'relevance': 2, 'article_language': 'en', 'source_name': 'techcrunch', 'article_title': 'Early AI investor Elad Gil finds his next big bet: AI-powered rollups', 'article_url': 'https://techcrunch.com/2025/06/01/early-ai-investor-elad-gil-finds-his-next-big-bet-ai-powered-rollups/', 'article_keywords': ['AI', 'Investment', 'Technology'], 'crawled_at': '2025-06-03 12:57:07'}, {'model': 'gpt-4o-2024-11-20', 'run_id': 'RUNID_3', 'task_name': 'relevance_check_v0', 'article_id': 'business_insider_20250603125707980679', 'relevance': 2, 'article_language': 'es', 'source_name': 'business_insider', 'article_title': "Los adolescentes deberían entrenarse para ser 'ninjas' de la IA, según el CEO de Google DeepMind", 'article_url': 'https://www.businessinsider.es/tecnologia/adolescentes-deberian-entrenarse-ser-ninjas-ia-ceo-google-deepmind-1463336', 'article_keywor

In [16]:

# Scrape the full text of every relevant article.
# `articles_content` is reset here so that re-running this cell starts from a clean list.
articles_content = []
# Articles that could not be downloaded or parsed, kept for inspection after the run.
failed_articles = []

for relevant_article in relevant_articles_list:
    url = relevant_article["article_url"]
    article_id = relevant_article["article_id"]
    title = relevant_article["article_title"]
    print(f"Processing {url}")
    sleep(1)  # Sleep to avoid overwhelming the server

    article = Article(url)
    try:
        article.download()
        article.parse()
    except Exception as exc:
        # Deliberately broad: one unreachable or unparsable page must not abort
        # the run and discard the articles already collected. The failure is
        # reported and recorded rather than silently swallowed.
        print(f"Skipping {url} due to download/parse error: {exc}")
        failed_articles.append({'article_id': article_id, 'article_url': url, 'error': str(exc)})
        continue

    content = article.text
    if not content:
        print(f"Skipping {url} due to empty content")
        continue

    # Format the date only for articles that are kept; it is None when the
    # parser found no publish date.
    publish_date = article.publish_date.strftime("%Y-%m-%d") if article.publish_date else None

    articles_content.append({'article_id' : article_id, 'content' : content, 'title': title, 'publish_date': publish_date})

print(
    f"Collected {len(articles_content)} of {len(relevant_articles_list)} articles "
    f"({len(failed_articles)} failed to download/parse)"
)


Processing https://techcrunch.com/2025/06/01/early-ai-investor-elad-gil-finds-his-next-big-bet-ai-powered-rollups/
Processing https://www.businessinsider.es/tecnologia/adolescentes-deberian-entrenarse-ser-ninjas-ia-ceo-google-deepmind-1463336
Processing https://www.businessinsider.es/tecnologia/lanzamiento-robotaxi-tesla-vuelta-esquina-sabemos-1464123
Processing https://techcrunch.com/2025/06/02/major-record-labels-are-reportedly-in-licensing-talks-with-ai-firms-udio-and-suno/
Processing https://techcrunch.com/2025/06/02/3-days-until-the-doors-open-at-techcrunch-sessions-ai-in-berkeley/
Processing https://techcrunch.com/2025/06/01/for-the-love-of-god-stop-calling-your-ai-a-co-worker/
Processing https://www.businessinsider.es/tecnologia/exito-proyecto-piloto-menudos-techies-llevar-inteligencia-artificial-escolares-1464701
Processing https://www.businessinsider.es/tecnologia/bruselas-lanza-julio-app-verificar-edad-menores-usan-plataformas-digitales-1464266
Processing https://www.business

In [17]:
def save_articles_content():
    """Upload the scraped article contents for the current run to blob storage.

    Serialises the module-level ``articles_content`` as JSON with a 4-space
    indent and uploads it to ``output_container`` under the name
    ``<RUNID>--relevant_articles_content.json``, then prints a confirmation.
    """
    output_blob_name = f"{RUNID}--relevant_articles_content.json"
    target_blob = output_container.get_blob_client(output_blob_name)
    payload = json.dumps(articles_content, indent=4)
    # overwrite=True is passed so an existing blob with this name is replaced.
    target_blob.upload_blob(payload, overwrite=True)
    print(f"Relevant articles content saved to blob storage as {output_blob_name}")

In [18]:
# Upload the scraped contents for this run to the output container.
save_articles_content()

Relevant articles content saved to blob storage as RUNID_3--relevant_articles_content.json
