In [None]:
import requests
import json 
import os
from datetime import datetime

from time import sleep
from pydantic import BaseModel, ValidationError

from dotenv import load_dotenv

In [33]:
# Load the variables defined in ../.env (path is relative to the current
# working directory). get_run_id() below reads RUNID from the environment;
# presumably this file is what supplies it — verify against the .env contents.
load_dotenv('../.env')

True

In [None]:
def get_run_id():
    """Return the value of the ``RUNID`` environment variable.

    Returns:
        The variable's string value, or ``None`` when ``RUNID`` is not set.
    """
    return os.environ.get('RUNID')

# Identifier of this run; it becomes the last folder of OUTPUT_DATA_PATH.
RUNID = get_run_id()

# One timestamp taken at setup time and shared by everything saved in this run.
RUN_TIME = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

print(f"Run ID: {RUNID} at {RUN_TIME}")

# Input: JSON file listing the sources. Output: per-run folder for the raw content.
INPUT_DATA_PATH = "../local_tests_data/sources.json"
OUTPUT_DATA_PATH = f"../local_tests_data/source_raw_content/{RUNID}/"

# Left disabled: with exist_ok=False this raises FileExistsError as soon as the
# cell is re-run for a RUNID whose folder already exists.
# os.makedirs(OUTPUT_DATA_PATH, exist_ok=False)


Run ID: RUNID_2 at 2025-05-30 10:30:45


In [24]:
# Define Pydantic models
class Source(BaseModel):
    """One crawl target, built by ``get_sources`` from an entry of the sources file."""
    # Address of the page to fetch.
    url: str
    # Human-readable name; the crawl loop derives the output file name from it.
    source_name: str

class SaveContent(BaseModel):
    """Record for one fetched source, dumped to JSON by ``save_source_raw_content``."""
    # URL of the source that was fetched.
    url: str
    # File-name stem; the crawl loop passes the lower-cased source name with
    # spaces replaced by underscores.
    name: str
    # Response text returned by fetch_source_content.
    raw_content: str
    # Timestamp string; the crawl loop passes RUN_TIME ('%Y-%m-%d %H:%M:%S').
    crawl_time: str

In [None]:
def get_sources() -> list[Source]:
    """Load the crawl targets from ``INPUT_DATA_PATH`` and keep the valid ones.

    The file is read as JSON and iterated item by item; each item is validated
    against ``Source``. Items that fail validation are reported on stdout and
    skipped, so one malformed entry does not block the others.

    Returns:
        The successfully validated sources, in the order they appear in the file.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which would misread non-ASCII characters on some systems.
    with open(INPUT_DATA_PATH, "r", encoding="utf-8") as f:
        sources = json.load(f)

    # Validate and process sources
    validated_sources = []
    for source in sources:
        try:
            # model_validate reports a non-dict entry as a ValidationError,
            # whereas Source(**source) would crash with an uncaught TypeError.
            validated_sources.append(Source.model_validate(source))
        except ValidationError as e:
            print(f'Validation error for source: {source}')
            print(e)
    return validated_sources

def save_source_raw_content(save_content: SaveContent) -> None:
    """Save the raw content of a source as ``<OUTPUT_DATA_PATH><name>.json``.

    The whole ``SaveContent`` record is dumped as indented JSON, and a
    confirmation line with the resulting path is printed.

    Args:
        save_content: Record to persist; ``save_content.name`` is used as the
            file-name stem.

    Raises:
        OSError: if the directory cannot be created or the file cannot be written.
    """
    # Make sure the run folder exists before writing; exist_ok=True keeps
    # repeated calls (and notebook re-runs for the same run id) harmless.
    os.makedirs(OUTPUT_DATA_PATH, exist_ok=True)

    # Plain concatenation is kept on purpose: OUTPUT_DATA_PATH already ends
    # with a separator, and unlike os.path.join a name starting with '/'
    # cannot replace the directory prefix.
    file_path = OUTPUT_DATA_PATH + save_content.name + '.json'
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(save_content.model_dump(), f, indent=4)
    print(f'Saved content for {save_content.name} at {file_path}')

In [26]:
# Load the validated sources once; the later cells index and iterate this list.
sources = get_sources()
print(sources)

[Source(url='https://techcrunch.com/latest/', source_name='TechCrunch'), Source(url='https://www.itespresso.es/', source_name='ITEspresso'), Source(url='https://www.businessinsider.es/tecnologia', source_name='Business Insider'), Source(url='https://www.cnet.com/ai-atlas/', source_name='CNET'), Source(url='https://thenextweb.com/', source_name='The Next Web')]


In [27]:
def fetch_source_content(source : Source, timeout: float = 30.0) -> str:
    """Fetch the raw content of a source URL through ``https://r.jina.ai/``.

    The source URL is appended to the ``https://r.jina.ai/`` prefix, so the
    returned text is whatever that service responds with for the page.

    Args:
        source: The source whose ``url`` should be fetched.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The response body decoded as text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx response (``requests.HTTPError``).
    """
    source_url_jina = 'https://r.jina.ai/' + source.url
    response = requests.get(source_url_jina, timeout=timeout)
    # Fail loudly on HTTP errors instead of returning an error page as content.
    response.raise_for_status()
    return response.text

In [28]:
# Sanity check on the first source: go through the shared helper rather than
# rebuilding the request by hand, and display only the beginning of the
# response so the cell output stays readable.
PREVIEW_CHARS = 500
fetch_source_content(sources[0])[:PREVIEW_CHARS]



In [29]:
# Crawl every validated source and write one JSON file per source.
for source in sources:
    try:
        source_url_raw_content = fetch_source_content(source)
    except requests.RequestException as e:
        # One unreachable or failing source must not abort the rest of the crawl.
        print(f'Fetch error for source: {source.url}')
        print(e)
        continue

    # Use validated data to create save_content
    save_content_data = {
        'url': source.url,
        # File-name friendly form of the source name, e.g. "The Next Web" -> "the_next_web".
        'name': source.source_name.lower().replace(' ', '_'),
        'raw_content': source_url_raw_content,
        'crawl_time': RUN_TIME
    }

    try:
        save_content = SaveContent(**save_content_data)
        save_source_raw_content(save_content)
    except ValidationError as e:
        print(f'Validation error for save_content: {save_content_data}')
        print(e)

/tmp/ipykernel_1383715/4088507877.py:19: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  json.dump(save_content.dict(), f, indent=4)


Saved content for techcrunch at ../local_tests_data/source_raw_content/RUNID_2/techcrunch.json
Saved content for itespresso at ../local_tests_data/source_raw_content/RUNID_2/itespresso.json
Saved content for business_insider at ../local_tests_data/source_raw_content/RUNID_2/business_insider.json
Saved content for cnet at ../local_tests_data/source_raw_content/RUNID_2/cnet.json
Saved content for the_next_web at ../local_tests_data/source_raw_content/RUNID_2/the_next_web.json
