In [2]:
import os
from dotenv import load_dotenv

# When the kernel starts inside a directory named "sandbox", step up one level
# (presumably so relative paths resolve from the project root — confirm against
# the repository layout). Comparing the last path component instead of a
# "/sandbox" suffix keeps the check independent of the OS path separator.
if os.path.basename(os.getcwd()) == "sandbox":
    os.chdir(os.pardir)

# Load environment variables from a .env file; kept as the cell's final
# expression so its return value is shown as the cell output.
load_dotenv()

True

In [3]:
from google.cloud import storage

# Target bucket: the GCS_BUCKET environment variable wins; otherwise fall back
# to the hard-coded default name.
BUCKET_NAME = os.getenv("GCS_BUCKET", "test-caching")
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)


In [None]:
import datetime
import json
import os
import urllib.parse
from typing import Optional

from google.cloud import storage

# Define cache expiration period (30 days)
# get_cache compares each entry's age against timedelta(days=CACHE_EXPIRATION_DAYS);
# the constant must exist at module level or that lookup raises NameError on a
# fresh kernel.
CACHE_EXPIRATION_DAYS = 30

def url_to_key(url: str) -> str:
    """
    Percent-encode `url` so it can be used as one segment of a blob name.

    The `safe` set is empty, so reserved characters such as ':' and '/' are
    escaped as well and the result contains no literal '/'.
    """
    encoded = urllib.parse.quote(url, safe="")
    return encoded

def store_cache(url: str, json_output: dict) -> None:
    """
    Write one cache record for `url` to the module-level GCS `bucket`.

    The object name is <encoded_url>/<epoch_seconds>, so all records for a URL
    share a prefix and can be ordered by the numeric suffix. The stored JSON
    document has the keys "input" (the URL), "output" (`json_output`) and
    "timestamp" (ISO-8601, UTC).
    """
    stored_at = datetime.datetime.now(datetime.timezone.utc)
    # Whole seconds since the epoch form the sortable suffix of the blob name.
    blob_key = "/".join((url_to_key(url), str(int(stored_at.timestamp()))))

    record = {
        "input": url,
        "output": json_output,
        "timestamp": stored_at.isoformat(),
    }

    target = bucket.blob(blob_key)
    target.upload_from_string(json.dumps(record), content_type="application/json")
    print(f"Stored cache for URL: {url} in blob: {blob_key}")

def _blob_epoch(blob) -> int:
    """Return the integer suffix of a blob name, or 0 when the suffix is not an integer."""
    try:
        return int(blob.name.rsplit("/", 1)[-1])
    except ValueError:
        return 0


def _parse_cached_time(raw) -> Optional[datetime.datetime]:
    """Parse a stored ISO-8601 timestamp into an aware datetime; None if missing or unparsable."""
    if not raw:
        return None
    try:
        parsed = datetime.datetime.fromisoformat(raw)
    except (TypeError, ValueError):
        # TypeError covers non-string values (e.g. a number stored as the timestamp).
        return None
    if parsed.tzinfo is None:
        # A naive value cannot be subtracted from an aware "now"; treat it as UTC,
        # which is the zone store_cache writes.
        parsed = parsed.replace(tzinfo=datetime.timezone.utc)
    return parsed


def get_cache(url: str) -> Optional[dict]:
    """
    Retrieve the most recent fresh cache entry for the given URL.

    Blobs under the prefix <encoded_url>/ are listed from the module-level
    `bucket` and visited newest first, ordered by the integer suffix of their
    names. The first entry whose "timestamp" is at most CACHE_EXPIRATION_DAYS
    days old is returned.

    Args:
        url: The URL whose cached result is wanted.

    Returns:
        The stored record (a dict with "input", "output" and "timestamp") or
        None when no entry exists or every entry is expired or unusable.

    Notes:
        Unreadable blobs, payloads that are not JSON objects, and entries with
        a missing or invalid timestamp are reported via print and skipped.
        CACHE_EXPIRATION_DAYS is read from module scope.
    """
    prefix = f"{url_to_key(url)}/"
    blobs = list(bucket.list_blobs(prefix=prefix))

    if not blobs:
        print(f"No cache entries found for URL: {url}")
        return None

    max_age = datetime.timedelta(days=CACHE_EXPIRATION_DAYS)

    for blob in sorted(blobs, key=_blob_epoch, reverse=True):
        try:
            # download_as_bytes replaces the deprecated download_as_string alias.
            data = json.loads(blob.download_as_bytes().decode("utf-8"))
        except Exception as e:
            # Best effort: one corrupt or unreachable blob must not hide older valid ones.
            print(f"Error reading blob {blob.name}: {e}")
            continue

        if not isinstance(data, dict):
            print(f"Skipping blob {blob.name}: payload is not a JSON object")
            continue

        cached_time = _parse_cached_time(data.get("timestamp"))
        if cached_time is None:
            print(f"Skipping blob {blob.name}: missing or invalid timestamp")
            continue

        age = datetime.datetime.now(datetime.timezone.utc) - cached_time
        if age <= max_age:
            print(f"Cache hit for URL: {url} using blob {blob.name} (age: {age.days} days)")
            return data
        print(f"Blob {blob.name} for URL: {url} is expired (age: {age.days} days)")

    return None



In [5]:
# Smoke test: write a sample record for one URL.
store_cache(
    "https://www.valdemarsro.dk/lasagne/",
    {"key": "value"},
)



Stored cache for URL: https://www.valdemarsro.dk/lasagne/ in blob: https%3A%2F%2Fwww.valdemarsro.dk%2Flasagne%2F/1739738974


In [6]:
# Read the record back; the returned value is displayed as the cell output.
get_cache("https://www.valdemarsro.dk/lasagne/")

Cache hit for URL: https://www.valdemarsro.dk/lasagne/ using blob https%3A%2F%2Fwww.valdemarsro.dk%2Flasagne%2F/1739738974 (age: 0 days)


{'input': 'https://www.valdemarsro.dk/lasagne/',
 'output': {'key': 'value'},
 'timestamp': '2025-02-16T20:49:34.589989+00:00'}