In [None]:
# Verify the contents of the uk-property-bronze bucket
import boto3
from google.colab import userdata

AWS_KEY   = userdata.get('AWS_ACCESS_KEY_ID')
AWS_SEC   = userdata.get('AWS_SECRET_ACCESS_KEY')

src_s3 = boto3.client('s3',
                      aws_access_key_id=AWS_KEY,
                      aws_secret_access_key=AWS_SEC)

bucket_name = 'uk-property-bronze'

try:
    # A single list_objects_v2 call returns at most 1000 keys; the paginator
    # follows the continuation tokens so larger buckets are listed in full.
    paginator = src_s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, RequestPayer='requester')
    object_keys = [obj['Key'] for page in pages for obj in page.get('Contents', [])]

    if object_keys:
        print(f"Objects in bucket '{bucket_name}':")
        for object_key in object_keys:
            print(object_key)
    else:
        print(f"No objects found in bucket '{bucket_name}'.")
except Exception as e:
    # Best-effort sanity check: report the failure instead of aborting the notebook.
    print(f"An error occurred: {e}")

Objects in bucket 'uk-property-bronze':
raw/ppd/2025-09-15T074310Z_ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9.csv
raw/ppd/2025-09-15T074725Z_ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9.csv
raw/ppd/2025-09-15T074857Z_ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9.csv


In [None]:
# === 1. installs ===
!pip -q install boto3 smart-open requests tqdm

# === 2. AWS auth ===
import os
from google.colab import userdata   # Colab secret manager
os.environ["AWS_ACCESS_KEY_ID"]     = userdata.get('AWS_ACCESS_KEY_ID')
os.environ["AWS_SECRET_ACCESS_KEY"] = userdata.get('AWS_SECRET_ACCESS_KEY')
os.environ["AWS_DEFAULT_REGION"]    = "eu-west-1"   # or your bucket region

import boto3, requests, smart_open, datetime, json
from tqdm import tqdm

s3 = boto3.client('s3')
BUCKET = "uk-property-bronze"   # <‚îÄ‚îÄ change
S3_KEY = "raw/hmlr/price-paid/price-paid-latest.csv"
SUCCESS_KEY = "raw/hmlr/price-paid/_SUCCESS"

In [None]:
# Quietly install boto3, smart-open and tqdm into the Colab runtime.
# NOTE(review): the earlier setup cell already installs these three packages
# (plus requests); this cell looks redundant — confirm before removing.
!pip -q install boto3 smart-open tqdm

#HMLR txt BRONZE

In [None]:
# smart_open is called by upload_one() below but was missing from this cell's
# imports, so the cell only worked when an earlier cell had already imported it.
import os, boto3, requests, smart_open, concurrent.futures, datetime, json
from google.colab import userdata

# Expose the Colab-stored AWS credentials to boto3 via the environment.
os.environ["AWS_ACCESS_KEY_ID"]     = userdata.get('AWS_ACCESS_KEY_ID')
os.environ["AWS_SECRET_ACCESS_KEY"] = userdata.get('AWS_SECRET_ACCESS_KEY')
os.environ["AWS_DEFAULT_REGION"]    = "eu-west-1"

BUCKET = "uk-property-bronze"
BASE_S3_KEY = "bronze/hmlr/price-paid/yearly_txt/"   # prefix for the raw yearly text files

# Year label -> public download URL of that year's price-paid text file.
# NOTE(review): 2018 is served from the "prod2" host while every other year
# uses "prod" — confirm this is intentional.
urls = {
    "2018": "http://prod2.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2018.txt",
    "2019": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2019.txt",
    "2020": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2020.txt",
    "2021": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2021.txt",
    "2022": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2022.txt",
    "2023": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2023.txt",
    "2024": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2024.txt",
    "2025": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2025.txt",
}

s3 = boto3.client('s3')
results = []   # one record per uploaded file, appended by upload_one()

def upload_one(year_url):
    """Copy one yearly price-paid text file from its public URL into S3.

    Args:
        year_url: ``(year, url)`` pair, e.g. one item of the ``urls`` dict.

    Returns:
        The year label once the object has been written.

    Raises:
        requests.HTTPError: from ``raise_for_status()`` on an error response.

    Side effects:
        Writes ``BASE_S3_KEY + "pp-<year>.txt"`` into ``BUCKET`` and appends a
        ``{"year", "s3_key", "url"}`` record to the module-level ``results``.
    """
    year, url = year_url
    s3_key = f"{BASE_S3_KEY}pp-{year}.txt"
    destination = f"s3://{BUCKET}/{s3_key}"

    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with smart_open.open(destination, 'wb',
                             transport_params={'client': s3}) as sink:
            # Forward the body in 8 MiB pieces rather than buffering the whole file.
            for block in response.iter_content(chunk_size=8 * 1024 * 1024):
                if not block:
                    continue
                sink.write(block)

    results.append(dict(year=year, s3_key=s3_key, url=url))
    return year

# ===== 3. parallel pull → S3 =====
from tqdm import tqdm
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as ex:
    # list(...) drains the map iterator, so an exception raised in any worker
    # surfaces here — before the success marker below is written.
    list(tqdm(ex.map(upload_one, urls.items()), total=len(urls)))

# ===== 4. success flag =====
s3.put_object(Bucket=BUCKET, Key=BASE_S3_KEY+"_SUCCESS", Body=b'')

# ===== 5. log for Git =====
log = {
    "source": "hmlr_yearly_txt",
    # datetime.utcnow() is deprecated and returns a naive value; use an
    # explicit timezone-aware UTC timestamp instead.
    "ingest_ts": datetime.datetime.now(datetime.timezone.utc).isoformat(),
    "bucket": BUCKET,
    # Workers finish in arbitrary order; sort so the log is reproducible.
    "objects": sorted(results, key=lambda rec: rec["year"]),
}
print(json.dumps(log, indent=2))

100%|██████████| 8/8 [00:20<00:00,  2.62s/it]

{
  "source": "hmlr_yearly_txt",
  "ingest_ts": "2025-09-15T09:47:51.867004",
  "bucket": "uk-property-bronze",
  "objects": [
    {
      "year": "2025",
      "s3_key": "bronze/hmlr/price-paid/yearly_txt/pp-2025.txt",
      "url": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2025.txt"
    },
    {
      "year": "2023",
      "s3_key": "bronze/hmlr/price-paid/yearly_txt/pp-2023.txt",
      "url": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2023.txt"
    },
    {
      "year": "2024",
      "s3_key": "bronze/hmlr/price-paid/yearly_txt/pp-2024.txt",
      "url": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2024.txt"
    },
    {
      "year": "2020",
      "s3_key": "bronze/hmlr/price-paid/yearly_txt/pp-2020.txt",
      "url": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2020.txt"
    },
    {
      "year": "2019",
      "s3_key": "bronze/h


  "ingest_ts": datetime.datetime.utcnow().isoformat(),


#HMLR parquet BRONZE

In [None]:
# io and smart_open are both used by txt_to_parquet() below but were missing
# from this import line, which made the cell depend on earlier cells / later
# statements having imported them.
import os, io, boto3, requests, smart_open, concurrent.futures, datetime, json, pandas as pd, pyarrow as pa, pyarrow.parquet as pq
from google.colab import userdata

# Expose the Colab-stored AWS credentials to boto3 via the environment.
os.environ["AWS_ACCESS_KEY_ID"]     = userdata.get('AWS_ACCESS_KEY_ID')
os.environ["AWS_SECRET_ACCESS_KEY"] = userdata.get('AWS_SECRET_ACCESS_KEY')
os.environ["AWS_DEFAULT_REGION"]    = "eu-west-1"

BUCKET = "uk-property-bronze"
BASE_S3_KEY = "bronze/hmlr/price-paid/yearly_parquet/"   # prefix for the per-year parquet output

# Year label -> public download URL of that year's price-paid text file.
urls = {   # same dict you already have
    "2018": "http://prod2.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2018.txt",
    "2019": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2019.txt",
    "2020": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2020.txt",
    "2021": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2021.txt",
    "2022": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2022.txt",
    "2023": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2023.txt",
    "2024": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2024.txt",
    "2025": "http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2025.txt",
}

s3 = boto3.client('s3')
results = []   # one record per converted file, appended by txt_to_parquet()

def txt_to_parquet(year_url):
    """Download one yearly price-paid text file and store it in S3 as parquet.

    Args:
        year_url: ``(year, url)`` pair, e.g. one item of the ``urls`` dict.

    Returns:
        The year label once the parquet object has been written.

    Raises:
        requests.HTTPError: from ``raise_for_status()`` on an error response.

    Side effects:
        Writes ``<BASE_S3_KEY>year=<year>/part-0.parquet`` into ``BUCKET`` and
        appends a ``{"year", "parquet_key", "rows"}`` record to the
        module-level ``results`` list.
    """
    year, url = year_url

    # 1. fetch the whole file into memory (not streamed), then parse it as
    #    header-less CSV
    with requests.get(url, timeout=60) as response:
        response.raise_for_status()
        raw_bytes = io.BytesIO(response.content)
        df = pd.read_csv(raw_bytes, header=None, low_memory=False)

    # 2. the file carries no header row, so name the 16 columns explicitly
    df.columns = [
        "transaction_id", "price", "date", "postcode", "prop_type", "old_new",
        "duration", "paon", "saon", "street", "locality", "town", "district",
        "county", "ppd_cat", "status",
    ]
    df["year"] = int(year)          # partition column

    # 3. write a single snappy-compressed parquet file under a year=<year>/ prefix
    parquet_key = f"{BASE_S3_KEY}year={year}/part-0.parquet"
    target = f"s3://{BUCKET}/{parquet_key}"
    with smart_open.open(target, 'wb',
                         transport_params={'client': s3}) as s3_file:
        pq.write_table(pa.Table.from_pandas(df), s3_file, compression='snappy')

    results.append({"year": year, "parquet_key": parquet_key, "rows": len(df)})
    return year

# ===== 2. parallel convert → parquet =====
import io
from tqdm import tqdm
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as ex:
    # list(...) drains the map iterator, so an exception raised in any worker
    # surfaces here — before the success marker below is written.
    list(tqdm(ex.map(txt_to_parquet, urls.items()), total=len(urls)))

# ===== 3. success flag =====
s3.put_object(Bucket=BUCKET, Key=BASE_S3_KEY+"_SUCCESS", Body=b'')

# ===== 4. log =====
log = {
    "source": "hmlr_yearly_parquet",
    # datetime.utcnow() is deprecated and returns a naive value; use an
    # explicit timezone-aware UTC timestamp instead.
    "ingest_ts": datetime.datetime.now(datetime.timezone.utc).isoformat(),
    "bucket": BUCKET,
    # Workers finish in arbitrary order; sort so the log is reproducible.
    "objects": sorted(results, key=lambda rec: rec["year"]),
}
print(json.dumps(log, indent=2))

100%|██████████| 8/8 [01:10<00:00,  8.85s/it]

{
  "source": "hmlr_yearly_parquet",
  "ingest_ts": "2025-09-15T09:49:15.304940",
  "bucket": "uk-property-bronze",
  "objects": [
    {
      "year": "2025",
      "parquet_key": "bronze/hmlr/price-paid/yearly_parquet/year=2025/part-0.parquet",
      "rows": 314860
    },
    {
      "year": "2023",
      "parquet_key": "bronze/hmlr/price-paid/yearly_parquet/year=2023/part-0.parquet",
      "rows": 853480
    },
    {
      "year": "2024",
      "parquet_key": "bronze/hmlr/price-paid/yearly_parquet/year=2024/part-0.parquet",
      "rows": 841227
    },
    {
      "year": "2020",
      "parquet_key": "bronze/hmlr/price-paid/yearly_parquet/year=2020/part-0.parquet",
      "rows": 896182
    },
    {
      "year": "2019",
      "parquet_key": "bronze/hmlr/price-paid/yearly_parquet/year=2019/part-0.parquet",
      "rows": 1011611
    },
    {
      "year": "2022",
      "parquet_key": "bronze/hmlr/price-paid/yearly_parquet/year=2022/part-0.parquet",
      "rows": 1072476
    },
    {
   


  "ingest_ts": datetime.datetime.utcnow().isoformat(),


#Metadata of HMLR

In [None]:
# Look up which AWS region hosts the S3 bucket
import boto3
from google.colab import userdata

AWS_KEY   = userdata.get('AWS_ACCESS_KEY_ID')
AWS_SEC   = userdata.get('AWS_SECRET_ACCESS_KEY')
BUCKET = "uk-property-bronze"

try:
    s3_client = boto3.client(
        's3',
        aws_access_key_id=AWS_KEY,
        aws_secret_access_key=AWS_SEC,
    )
    response = s3_client.get_bucket_location(Bucket=BUCKET)
    location = response.get('LocationConstraint')
    # Buckets in us-east-1 have a None location constraint
    region = 'us-east-1' if location is None else location
    print(f"Region of bucket '{BUCKET}': {region}")
except Exception as e:
    # Best-effort lookup: report the failure instead of aborting the notebook.
    print(f"An error occurred: {e}")

Region of bucket 'uk-property-bronze': eu-north-1


In [None]:
# ===== 0. deps =====
!pip -q install boto3 smart-open duckdb pyarrow pandas tqdm

# ===== 1. auth (same as always) =====
import os, boto3, json, datetime, duckdb, pyarrow.parquet as pq
from smart_open import open as sm_open
from tqdm import tqdm
from google.colab import userdata
os.environ["AWS_ACCESS_KEY_ID"]     = userdata.get('AWS_ACCESS_KEY_ID')
os.environ["AWS_SECRET_ACCESS_KEY"] = userdata.get('AWS_SECRET_ACCESS_KEY')
os.environ["AWS_DEFAULT_REGION"]    = "eu-west-1"

BUCKET   = "uk-property-bronze"
PARQUET_PREFIX = "bronze/hmlr/price-paid/yearly_parquet/"
META_KEY       = "bronze/hmlr/price-paid/yearly_parquet_metadata/" + datetime.date.today().isoformat() + "_hmlr_metadata.json"

s3 = boto3.client('s3')

# ===== 2. list parquet objects =====
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=BUCKET, Prefix=PARQUET_PREFIX)
keys = [obj['Key'] for page in pages for obj in page.get('Contents', [])
        if obj['Key'].endswith('.parquet')]
assert keys, "No parquet files found ‚Äì check prefix"

# ===== 3. build metadata per file =====
import boto3.session, duckdb
session = boto3.session.Session()          # picks up env region
credentials = session.get_credentials()
region = session.region_name or 'eu-west-1'

meta_list = []
for key in tqdm(keys):
    # 1. basic file stats
    head = s3.head_object(Bucket=BUCKET, Key=key)
    size = head['ContentLength']

    # 2. parquet footer only (no DuckDB)
    with sm_open(f"s3://{BUCKET}/{key}", 'rb', transport_params={'client': s3}) as f:
        pf = pq.ParquetFile(f)
        meta      = pf.metadata
        rows      = meta.num_rows
        cols      = meta.num_columns
        # human-readable schema
        schema_str = str({meta.schema.column(i).name: str(meta.schema.column(i).physical_type)
                          for i in range(cols)})

    # 3. lightweight column stats (min/max) ‚Äì footer only
    stats = []
    for i in range(cols):
        col_meta = meta.row_group(0).column(i)
        stats.append({
            "name": meta.schema.column(i).name,
            "min": col_meta.statistics.min,
            "max": col_meta.statistics.max,
            "nulls": col_meta.statistics.null_count
        })

    year = key.split("year=")[1].split("/")[0]
    meta_list.append({
        "year": year,
        "s3_key": key,
        "size_bytes": size,
        "rows": rows,
        "columns": cols,
        "schema": schema_str,
        "column_stats": stats
    })
# ===== 4. consolidate & upload ===== (same as before)
meta_json = json.dumps({
    "source": "hmlr_yearly_parquet_metadata",
    "ingest_ts": datetime.datetime.utcnow().isoformat(),
    "bucket": BUCKET,
    "metadata": meta_list
}, indent=2)

with sm_open(f"s3://{BUCKET}/{META_KEY}", 'w',
             transport_params={'client': s3}) as f:
    f.write(meta_json)

print(f"Metadata written to s3://{BUCKET}/{META_KEY}")

100%|██████████| 8/8 [00:04<00:00,  1.67it/s]
  "ingest_ts": datetime.datetime.utcnow().isoformat(),


Metadata written to s3://uk-property-bronze/bronze/hmlr/price-paid/yearly_parquet_metadata/2025-09-15_hmlr_metadata.json


In [None]:
# Candidate seed pages for the property-news crawl.
# NOTE(review): the crawl cell that follows reassigns SEED_URLS to only the
# Rightmove and Zoopla entries, so the last three URLs here are not crawled
# by the code visible in this notebook.
SEED_URLS = [
    "https://www.rightmove.co.uk/news/",
    "https://www.zoopla.co.uk/discover/property-news/",
    "https://www.bbc.co.uk/news/topics/cp7r8yvllvdt",  # BBC house prices
    "https://www.theguardian.com/money/house-prices",
    "https://www.hometrack.com/uk-insight/",            # city-level reports
]

In [None]:
# ===== 0. install =====
!pip -q install firecrawl-py boto3 smart-open tqdm

# ===== 1. auth =====
import os, json, datetime, hashlib, boto3
from smart_open import open as sm_open
from firecrawl import FirecrawlApp
from google.colab import userdata
from tqdm import tqdm

FIRECRAWL_KEY = userdata.get('FIRECRAWL_API_KEY')
AWS_KEY       = userdata.get('AWS_ACCESS_KEY_ID')
AWS_SECRET    = userdata.get('AWS_SECRET_ACCESS_KEY')
REGION        = "eu-west-1"
BUCKET        = "uk-property-bronze"          # ‚Üê yours
PREFIX        = "bronze/firecrawl/uk-local-news/"

os.environ.update({
    "AWS_ACCESS_KEY_ID": AWS_KEY,
    "AWS_SECRET_ACCESS_KEY": AWS_SECRET,
    "AWS_DEFAULT_REGION": REGION
})

s3  = boto3.client('s3')
app = FirecrawlApp(api_key=FIRECRAWL_KEY)

# ===== 2. only Rightmove + Zoopla =====
SEED_URLS = [
    "https://www.rightmove.co.uk/news/",
    "https://www.zoopla.co.uk/discover/property-news/"
]

# ===== 3. crawl ‚Üí bronze =====
results = []
for url in tqdm(SEED_URLS):
    print("crawling ‚Üí", url)
    try:
        out = app.scrape(
            url,
            page_options={"includeHtml": False, "includeMarkdown": True}
        )
        md = out.get("markdown", "").strip()
        if not md:
            print("‚ö†Ô∏è  empty markdown for", url)
            continue
    except Exception as e:
        print("‚ùå skip", url, e)
        continue

    slug = hashlib.md5(url.encode()).hexdigest()[:8]
    key  = f"{PREFIX}{datetime.datetime.now(datetime.timezone.utc).isoformat()[:10]}/{slug}.json"
    payload = {
        "url": url,
        "crawl_ts": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "markdown": md,
        "metadata": out.get("metadata", {})
    }

    with sm_open(f"s3://{BUCKET}/{key}", 'w',
                 transport_params={'client': s3}) as f:
        f.write(json.dumps(payload, ensure_ascii=False))

    results.append({"url": url, "s3_key": key, "len_md": len(md)})
    print("‚úÖ written", key, f"({len(md)} chars)")

# ===== 4. success flag + log =====
s3.put_object(Bucket=BUCKET, Key=PREFIX+"_SUCCESS", Body=b'')
log = {
    "source": "firecrawl_rm_zoopla",
    "ingest_ts": datetime.datetime.now(datetime.timezone.utc).isoformat(),
    "bucket": BUCKET,
    "crawls": results
}
print(json.dumps(log, indent=2))

100%|██████████| 2/2 [00:00<00:00, 11081.38it/s]


crawling → https://www.rightmove.co.uk/news/
❌ skip https://www.rightmove.co.uk/news/ FirecrawlClient.scrape() got an unexpected keyword argument 'page_options'
crawling → https://www.zoopla.co.uk/discover/property-news/
❌ skip https://www.zoopla.co.uk/discover/property-news/ FirecrawlClient.scrape() got an unexpected keyword argument 'page_options'
{
  "source": "firecrawl_rm_zoopla",
  "ingest_ts": "2025-09-15T10:48:39.614471+00:00",
  "bucket": "uk-property-bronze",
  "crawls": []
}


#FIRECRAWLER (Zoopla, Rightmove)

In [None]:
from firecrawl import FirecrawlApp

# Debug aid: list every attribute name visible on the FirecrawlApp class itself.
class_attributes = dir(FirecrawlApp)
print(class_attributes)

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__']


In [None]:
from firecrawl import FirecrawlApp
import inspect

# Debug aid: build a throwaway client and show its public members plus the
# exact signature of scrape(), to find the keyword arguments it accepts.
app = FirecrawlApp(api_key="dummy")   # key not used for inspect
public_members = [name for name in dir(app) if not name.startswith('_')]
print(public_members)
scrape_signature = inspect.signature(app.scrape)   # or whatever method looks right
print(scrape_signature)

['active_crawls', 'api_key', 'api_url', 'batch_scrape', 'cancel_batch_scrape', 'cancel_crawl', 'crawl', 'crawl_params_preview', 'extract', 'get_active_crawls', 'get_batch_scrape_errors', 'get_batch_scrape_status', 'get_concurrency', 'get_crawl_errors', 'get_crawl_status', 'get_credit_usage', 'get_extract_status', 'get_queue_status', 'get_token_usage', 'map', 'scrape', 'search', 'start_batch_scrape', 'start_crawl', 'start_extract', 'v1', 'v2', 'watcher']
(url: str, *, formats: Optional[List[ForwardRef('FormatOption')]] = None, headers: Optional[Dict[str, str]] = None, include_tags: Optional[List[str]] = None, exclude_tags: Optional[List[str]] = None, only_main_content: Optional[bool] = None, timeout: Optional[int] = None, wait_for: Optional[int] = None, mobile: Optional[bool] = None, parsers: Union[List[str], List[Union[str, firecrawl.v2.types.PDFParser]], NoneType] = None, actions: Optional[List[Union[ForwardRef('WaitAction'), ForwardRef('ScreenshotAction'), ForwardRef('ClickAction'), 

###Firecrawl: crawling Rightmove

In [None]:
# ===== minimal firecrawl debug + write =====
!pip -q install firecrawl-py boto3 smart-open

import os, json, datetime, hashlib, boto3
from smart_open import open as sm_open
from firecrawl import FirecrawlApp
from google.colab import userdata
import datetime as dt

# auth
os.environ.update({
    "AWS_ACCESS_KEY_ID": userdata.get('AWS_ACCESS_KEY_ID'),
    "AWS_SECRET_ACCESS_KEY": userdata.get('AWS_SECRET_ACCESS_KEY'),
    "AWS_DEFAULT_REGION": "eu-west-1"
})
BUCKET = "uk-property-bronze"
PREFIX = "bronze/firecrawl/uk-local-news/"

s3  = boto3.client('s3')
app = FirecrawlApp(api_key=userdata.get('FIRECRAWL_API_KEY'))

URL = "https://www.rightmove.co.uk/news/"

print("üîç scraping ---", URL)
try:
    out = app.scrape(URL, formats=["markdown"])   # <-- exact signature
except Exception as e:
    raise RuntimeError(f"Firecrawl failed: {e}") from e

md = out.markdown or ""
print("üìÑ markdown length:", len(md))
if not md:
    raise ValueError("Empty markdown returned")

# write to bronze
key = f"{PREFIX}{dt.datetime.now(dt.timezone.utc).isoformat()[:10]}/rightmove.json"
payload = {
    "url": URL,
    "crawl_ts": dt.datetime.now(dt.timezone.utc).isoformat(),
    "markdown": md,
    "metadata": out.metadata.dict() if out.metadata else {}  # ‚Üê serialisable
}

with sm_open(f"s3://{BUCKET}/{key}", 'w', transport_params={'client': s3}) as f:
    f.write(json.dumps(payload, ensure_ascii=False, indent=2))

s3.put_object(Bucket=BUCKET, Key=PREFIX+"_SUCCESS", Body=b'')
print("‚úÖ written to", key)

üîç scraping --- https://www.rightmove.co.uk/news/
üìÑ markdown length: 9048


/tmp/ipython-input-923751762.py:41: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  "metadata": out.metadata.dict() if out.metadata else {}  # ‚Üê serialisable


‚úÖ written to bronze/firecrawl/uk-local-news/2025-09-15/rightmove.json


###Firecrawl: crawling Zoopla

In [None]:
# ===== minimal firecrawl debug + write =====
!pip -q install firecrawl-py boto3 smart-open

import os, json, datetime, hashlib, boto3
from smart_open import open as sm_open
from firecrawl import FirecrawlApp
from google.colab import userdata
import datetime as dt

# auth
os.environ.update({
    "AWS_ACCESS_KEY_ID": userdata.get('AWS_ACCESS_KEY_ID'),
    "AWS_SECRET_ACCESS_KEY": userdata.get('AWS_SECRET_ACCESS_KEY'),
    "AWS_DEFAULT_REGION": "eu-west-1"
})
BUCKET = "uk-property-bronze"
PREFIX = "bronze/firecrawl/uk-local-news/"

s3  = boto3.client('s3')
app = FirecrawlApp(api_key=userdata.get('FIRECRAWL_API_KEY'))

URL = "https://www.zoopla.co.uk/discover/property-news/"

print("üîç scraping ---", URL)
try:
    out = app.scrape(URL, formats=["markdown"])   # <-- exact signature
except Exception as e:
    raise RuntimeError(f"Firecrawl failed: {e}") from e

md = out.markdown or ""
print("üìÑ markdown length:", len(md))
if not md:
    raise ValueError("Empty markdown returned")

# write to bronze
key = f"{PREFIX}{dt.datetime.now(dt.timezone.utc).isoformat()[:10]}/rightmove.json"
payload = {
    "url": URL,
    "crawl_ts": dt.datetime.now(dt.timezone.utc).isoformat(),
    "markdown": md,
    "metadata": out.metadata.dict() if out.metadata else {}  # ‚Üê serialisable
}

with sm_open(f"s3://{BUCKET}/{key}", 'w', transport_params={'client': s3}) as f:
    f.write(json.dumps(payload, ensure_ascii=False, indent=2))

s3.put_object(Bucket=BUCKET, Key=PREFIX+"_SUCCESS", Body=b'')
print("‚úÖ written to", key)

üîç scraping --- https://www.zoopla.co.uk/discover/property-news/
üìÑ markdown length: 5252


/tmp/ipython-input-323165061.py:41: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  "metadata": out.metadata.dict() if out.metadata else {}  # ‚Üê serialisable


‚úÖ written to bronze/firecrawl/uk-local-news/2025-09-15/rightmove.json


###EPC_Data to Bronze S3

## 6GB ----> Bronze S3

In [None]:
# ===== 1. auth =====
import os, boto3, requests, datetime, json
from smart_open import open as sm_open
from google.colab import userdata
from tqdm import tqdm

os.environ.update({
    "AWS_ACCESS_KEY_ID": userdata.get('AWS_ACCESS_KEY_ID'),
    "AWS_SECRET_ACCESS_KEY": userdata.get('AWS_SECRET_ACCESS_KEY'),
    "AWS_DEFAULT_REGION": "eu-west-1"
})
BUCKET = "uk-property-bronze"               # ← yours
SUCCESS_KEY = "bronze/epc/domestic/_SUCCESS"

# ===== 2. EPC domestic direct link (always latest) =====

EPC_URL = "https://epc.opendatacommunities.org/api/v1/files/all-domestic-certificates.zip"
# EPC_TOKEN was never assigned anywhere in this cell, so a fresh kernel failed
# with NameError. Reuse a value already present in the kernel, otherwise load
# it from the Colab secret store like the other credentials.
# NOTE(review): the secret name 'EPC_TOKEN' is an assumption — confirm it
# matches the name configured in Colab.
EPC_TOKEN = globals().get('EPC_TOKEN') or userdata.get('EPC_TOKEN')
HEADERS = {
    'Authorization': f'Basic {EPC_TOKEN}'   # no Accept needed – it’s a ZIP
}
# Destination object (the earlier ".csv" S3_KEY assignment was dead: it was
# overwritten by this value before ever being used).
S3_KEY  = "bronze/epc/domestic/epc-domestic-all-certificates.zip"
CHUNK_BYTES = 8 * 1024 * 1024   # 8 MiB per streamed chunk

s3 = boto3.client('s3')   # one client shared by the upload and the success marker

# ===== 3. stream download → S3 =====
bytes_written = 0
# Context manager guarantees the HTTP connection is released even on failure.
with requests.get(EPC_URL, headers=HEADERS, stream=True, timeout=60) as resp:
    resp.raise_for_status()
    size = int(resp.headers.get('content-length', 0))   # 0 when the server sends no length

    with sm_open(f"s3://{BUCKET}/{S3_KEY}", 'wb',
                 transport_params={'client': s3}) as s3_file, \
         tqdm(total=size or None, unit='B', unit_scale=True,
              unit_divisor=1024) as progress:
        for chunk in resp.iter_content(chunk_size=CHUNK_BYTES):
            if chunk:
                s3_file.write(chunk)
                bytes_written += len(chunk)
                # Advance by real bytes so the bar shows true size and throughput
                # (it previously counted 8 MiB chunks but labelled them "MB").
                progress.update(len(chunk))

# ===== 4. success flag =====
s3.put_object(Bucket=BUCKET, Key=SUCCESS_KEY, Body=b'')

# ===== 5. log for repo =====
log = {
    "source": "epc_domestic_register",
    "ingest_ts": datetime.datetime.now(datetime.timezone.utc).isoformat(),
    "url": EPC_URL,
    "s3_bucket": BUCKET,
    "s3_key": S3_KEY,
    "size_bytes": size,               # as advertised by the Content-Length header
    "bytes_written": bytes_written    # as actually streamed into S3
}
print(json.dumps(log, indent=2))

741MB [02:15,  5.49MB/s]                         


{
  "source": "epc_domestic_register",
  "ingest_ts": "2025-09-15T11:20:48.995612+00:00",
  "url": "https://epc.opendatacommunities.org/api/v1/files/all-domestic-certificates.zip",
  "s3_bucket": "uk-property-bronze",
  "s3_key": "bronze/epc/domestic/epc-domestic-all-certificates.zip",
  "size_bytes": 6214054552
}
