In [6]:
# Step 1 — Imports + tiny helpers
import json, re, time, random, datetime, pathlib, requests
from bs4 import BeautifulSoup

def now_utc_iso():
    return datetime.datetime.utcnow().replace(microsecond=0).isoformat() + "Z"

def default_headers():
    return {
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/120.0.0.0 Safari/537.36"),
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
        "DNT": "1",
    }

def batches_root():
    return pathlib.Path("data/batches")

def latest_batch_path():
    root = batches_root()
    if not root.exists():
        return None
    dirs = [p for p in root.iterdir() if p.is_dir()]
    return max(dirs, key=lambda p: p.stat().st_mtime) if dirs else None


In [7]:
# Step 2 — Inspect latest batch; show what's inside raw/structured

latest = latest_batch_path()
print("Latest batch:", latest.name if latest else "(none)")

if latest is None:
    raise SystemExit("No batch found.\nRun from terminal: `python -m src.batch` then `python -m src.fetch` then `python -m src.extract_search`.")

raw_dir = latest / "raw"
struct_dir = latest / "structured"

print("raw_dir     :", raw_dir.resolve())
print("structured  :", struct_dir.resolve())

print("raw files   :", sorted(p.name for p in raw_dir.glob('*'))[:15])
print("structured  :", sorted(p.name for p in struct_dir.glob('*'))[:15])

lu_path = struct_dir / "listing_urls.json"
print("listing_urls.json exists? ->", lu_path.exists())


Latest batch: 2025-09-23_zips25
raw_dir     : C:\Users\VICTUS\Documents\Rose\Training\Fellowship.ai\real estate listing optimization\real-estate-listing-optimization\data\batches\2025-09-23_zips25\raw
structured  : C:\Users\VICTUS\Documents\Rose\Training\Fellowship.ai\real estate listing optimization\real-estate-listing-optimization\data\batches\2025-09-23_zips25\structured
raw files   : ['0001_meta.json', '0001_raw.html', '0001_response.json']
structured  : ['listing_urls.json', 'seed_search_pages.json']
listing_urls.json exists? -> True


In [8]:
# Step 3 — Force-create 1001_raw.html from the first detail URL in listing_urls.json

lu = struct_dir / "listing_urls.json"
if not lu.exists():
    raise SystemExit("`listing_urls.json` missing.\nRun: `python -m src.extract_search` after fetching a search page.")

payload = json.loads(lu.read_text(encoding="utf-8"))
urls = payload.get("urls", [])
if not urls:
    raise SystemExit("No URLs inside listing_urls.json.\nMake sure your search page actually had listings.")

first = urls[0]
detail_url = first["source_url"] if isinstance(first, dict) else str(first)
print("Fetching detail:", detail_url)

r = requests.get(detail_url, headers=default_headers(), timeout=30, allow_redirects=True)

(raw_dir / "1001_raw.html").write_text(r.text or "", encoding="utf-8", errors="ignore")
(raw_dir / "1001_response.json").write_text(
    json.dumps({"status": r.status_code, "final_url": r.url, "headers": dict(r.headers)}, indent=2),
    encoding="utf-8"
)
(raw_dir / "1001_meta.json").write_text(
    json.dumps({"requested_url": detail_url, "final_url": r.url, "status": r.status_code, "fetched_at": now_utc_iso()}, indent=2),
    encoding="utf-8"
)

print("✅ wrote: 1001_raw.html, 1001_meta.json, 1001_response.json")
print("status:", r.status_code, "| final_url:", r.url, "| html_len:", len(r.text))


Fetching detail: https://www.redfin.com/TX/Houston/2016-Main-St-77002/unit-1904/home/29503130
✅ wrote: 1001_raw.html, 1001_meta.json, 1001_response.json
status: 202 | final_url: https://www.redfin.com/TX/Houston/2016-Main-St-77002/unit-1904/home/29503130 | html_len: 0


In [9]:
# Step 4 — Parse 1001_raw.html into a structured JSON (core fields only)

html = (raw_dir / "1001_raw.html").read_text(encoding="utf-8", errors="ignore")
meta = json.loads((raw_dir / "1001_meta.json").read_text(encoding="utf-8"))

source_url = (meta.get("final_url") or meta.get("requested_url") or "").lower()
soup = BeautifulSoup(html, "html.parser")

def safe_float(x):
    if x is None: return None
    if isinstance(x, (int,float)): return float(x)
    s = re.sub(r"[^\d\.]", "", str(x))
    try: return float(s) if s else None
    except: return None

def to_int(x):
    v = safe_float(x)
    return int(v) if v is not None else None

# 1) schema.org JSON-LD
rec = {"price":None,"beds":None,"baths":None,"sqft":None,"year":None,
       "addr":{"street":None,"unit":None,"city":None,"state":None,"postal_code":None}, "photos":[]}

for sc in soup.find_all("script", type="application/ld+json"):
    try:
        data = json.loads(sc.string or "{}")
    except Exception:
        continue

    def walk(n):
        if isinstance(n, dict):
            t = str(n.get("@type") or n.get("type") or "").lower()
            if any(x in t for x in ["residence","singlefamily","house","apartment","offer","realestatelisting"]):
                offer = n.get("offers") or {}
                if isinstance(offer, dict):
                    rec["price"] = rec["price"] or safe_float(offer.get("price") or offer.get("lowPrice") or offer.get("highPrice"))
                addr = n.get("address") or {}
                if isinstance(addr, dict):
                    rec["addr"].update({
                        "street": addr.get("streetAddress", rec["addr"]["street"]),
                        "city": addr.get("addressLocality", rec["addr"]["city"]),
                        "state": addr.get("addressRegion", rec["addr"]["state"]),
                        "postal_code": addr.get("postalCode", rec["addr"]["postal_code"]),
                    })
                rec["beds"]  = rec["beds"]  or safe_float(n.get("numberOfRooms") or n.get("bedrooms"))
                rec["baths"] = rec["baths"] or safe_float(n.get("bathroomCount") or n.get("bathrooms"))
                area = n.get("floorSize") or {}
                if isinstance(area, dict):
                    rec["sqft"] = rec["sqft"] or to_int(area.get("value"))
                imgs = n.get("image")
                if isinstance(imgs, list):
                    rec["photos"].extend([u for u in imgs if isinstance(u, str)])
                elif isinstance(imgs, str):
                    rec["photos"].append(imgs)
            for v in n.values(): walk(v)
        elif isinstance(n, list):
            for v in n: walk(v)
    walk(data)

rec["photos"] = list(dict.fromkeys(rec["photos"]))[:50]

# 2) regex fallbacks (if missing)
if rec["sqft"] is None:
    m = re.search(r'([\d,\.]+)\s*(sq\s*ft|sqft)', html, re.I)
    if m:
        try: rec["sqft"] = int(float(m.group(1).replace(",", "")))
        except: pass

if rec["price"] is None:
    m = re.search(r'Price[:\s]*\$?\s*([\d,\,\.]+)', html, re.I)
    if m:
        try: rec["price"] = float(m.group(1).replace(",", ""))
        except: pass

# Build structured object (MVP fields)
structured = {
    "listing_id": None,
    "platform_id": ("redfin" if "redfin.com" in source_url else "zillow" if "zillow.com" in source_url else "other"),
    "source_url": source_url,
    "external_property_id": None,  # kept None in quick parser
    "batch_id": latest.name,
    "scraped_timestamp": now_utc_iso(),

    "address": {
        "street": rec["addr"]["street"],
        "unit": rec["addr"]["unit"],
        "city": rec["addr"]["city"],
        "state": rec["addr"]["state"],
        "postal_code": rec["addr"]["postal_code"],
    },
    "latitude": None,
    "longitude": None,

    "property_type": None,
    "property_subtype": None,
    "beds": rec["beds"],
    "baths": rec["baths"],
    "interior_area_sqft": rec["sqft"],
    "lot_sqft": None,
    "year_built": rec["year"],
    "condition": None,

    "listing": {
        "listing_type": "sell",
        "status": None,
        "list_date": None,
        "days_on_market": None,
        "list_price": rec["price"],
        "price_per_sqft": round(rec["price"]/rec["sqft"], 2) if rec["price"] and rec["sqft"] else None
    },

    "description": None,
    "media": [{"url": u, "type": "image", "caption": None} for u in rec["photos"]],
    "features": {},
    "market_signals": {"views": None, "saves": None, "share_count": None},
    "similar_properties": [],
    "possible_duplicate": False,
    "duplicate_candidates": []
}

out = (struct_dir / "1001.json")
out.write_text(json.dumps(structured, indent=2), encoding="utf-8")
print("✅ Wrote structured:", out)
print({k: structured[k] for k in ["beds","baths","interior_area_sqft"]}, "price:", structured["listing"]["list_price"])


✅ Wrote structured: data\batches\2025-09-23_zips25\structured\1001.json
{'beds': None, 'baths': None, 'interior_area_sqft': None} price: None


In [10]:
# Step 5 — (Optional) Fetch 4 detail pages & parse them all

# 5.a) load first 4 detail URLs
lu = json.loads((struct_dir / "listing_urls.json").read_text(encoding="utf-8"))
detail_urls = [row["source_url"] if isinstance(row, dict) else str(row) for row in lu["urls"][:4]]

# 5.b) fetch details 1001..1004
for i, u in enumerate(detail_urls, start=1001):
    r = requests.get(u, headers=default_headers(), timeout=30, allow_redirects=True)
    (raw_dir / f"{i:04d}_raw.html").write_text(r.text or "", encoding="utf-8", errors="ignore")
    (raw_dir / f"{i:04d}_meta.json").write_text(json.dumps({"requested_url": u, "final_url": r.url, "status": r.status_code, "fetched_at": now_utc_iso()}, indent=2), encoding="utf-8")
    (raw_dir / f"{i:04d}_response.json").write_text(json.dumps({"status": r.status_code, "final_url": r.url, "headers": dict(r.headers)}, indent=2), encoding="utf-8")
    print(f"fetched {i}: {r.status_code}")
    time.sleep(random.uniform(1.2, 2.8))

# 5.c) parse details (using the same quick parser as step 4)
def quick_parse(idx: int):
    html = (raw_dir / f"{idx:04d}_raw.html").read_text(encoding="utf-8", errors="ignore")
    meta = json.loads((raw_dir / f"{idx:04d}_meta.json").read_text(encoding="utf-8"))
    soup = BeautifulSoup(html, "html.parser")
    # reuse the minimal logic from Step 4:
    # (for brevity, call the same block by importing globals we already defined)
    # here we'll just write a small shortcut: copy beds/baths/price/sqft regex fallbacks only
    price = None; beds=None; baths=None; sqft=None
    m = re.search(r'\$[\s]*([\d,]+)', html); 
    price = float(m.group(1).replace(",", "")) if m else None
    m = re.search(r'(\d+(?:\.\d+)?)\s*beds?', html, re.I); beds = float(m.group(1)) if m else None
    m = re.search(r'(\d+(?:\.\d+)?)\s*baths?', html, re.I); baths = float(m.group(1)) if m else None
    m = re.search(r'([\d,\.]+)\s*(sq\s*ft|sqft)', html, re.I); sqft = int(float(m.group(1).replace(",", ""))) if m else None
    data = {
        "listing_id": None,
        "platform_id": ("redfin" if "redfin.com" in (meta.get("final_url","").lower()) else "zillow" if "zillow.com" in (meta.get("final_url","").lower()) else "other"),
        "source_url": (meta.get("final_url") or meta.get("requested_url") or "").lower(),
        "external_property_id": None,
        "batch_id": latest.name,
        "scraped_timestamp": now_utc_iso(),
        "address": {"street": None, "unit": None, "city": None, "state": None, "postal_code": None},
        "latitude": None, "longitude": None,
        "property_type": None, "property_subtype": None,
        "beds": beds, "baths": baths, "interior_area_sqft": sqft, "lot_sqft": None, "year_built": None, "condition": None,
        "listing": {"listing_type": "sell", "status": None, "list_date": None, "days_on_market": None,
                    "list_price": price, "price_per_sqft": round(price/sqft,2) if price and sqft else None},
        "description": None, "media": [], "features": {}, "market_signals": {"views": None, "saves": None, "share_count": None},
        "similar_properties": [], "possible_duplicate": False, "duplicate_candidates": []
    }
    (struct_dir / f"{idx:04d}.json").write_text(json.dumps(data, indent=2), encoding="utf-8")
    return idx

out_idxs = [quick_parse(i) for i in range(1001, 1005)]
print("✅ wrote structured:", out_idxs)


fetched 1001: 202
fetched 1002: 202
fetched 1003: 202
fetched 1004: 202
✅ wrote structured: [1001, 1002, 1003, 1004]
