## Getting Data

In [29]:
import asyncio
import json
from typing import List, Dict
from httpx import AsyncClient, Response
from parsel import Selector
import random
from bs4 import BeautifulSoup
import json 
import re

# Pool of desktop Chrome User-Agent strings that the client below picks from.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
]

# Shared async HTTP client used by fetch_with_retries().
# NOTE: random.choice() below is evaluated once, when this cell runs, so every
# request sent through this client carries the same User-Agent. Re-run the cell
# to pick a different one.
client = AsyncClient(
    headers={
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        # NOTE(review): advertising "br" presumably requires a Brotli decoder to be installed for httpx — confirm.
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
    },
    follow_redirects=True,
    http2=True,  # NOTE(review): presumably needs the optional HTTP/2 extra (httpx[http2]) — confirm
    timeout=60,  # seconds
)

async def scrape_property_for_sale(urls: List[str]) -> List["Response"]:
    """Fetch Redfin property pages sequentially and return the successful responses.

    Each URL is fetched through ``fetch_with_retries``. Only responses with
    status 200 are kept; failed URLs are reported and skipped. A random
    1-5 second pause is inserted *between* consecutive requests (not after the
    last one) to avoid hammering the site.

    Args:
        urls: Property page URLs to fetch.

    Returns:
        The raw HTTP response objects (not parsed property dicts) for every
        URL that was fetched successfully, in input order.
    """
    urls = list(urls)  # allow any iterable while still knowing which URL is last
    responses = []
    last_index = len(urls) - 1
    for index, url in enumerate(urls):
        response = await fetch_with_retries(url)
        if response is not None and response.status_code == 200:
            print(f"Successfully fetched: {url}")
            responses.append(response)
        else:
            print(f"Skipping {url} due to failed fetch.")
        if index < last_index:
            # Random delay between requests; pointless after the final URL.
            await asyncio.sleep(random.uniform(1, 5))
    return responses
    
async def fetch_with_retries(url: str, retries: int = 3, delay: int = 2) -> "Response | None":
    """Fetch a page with the shared ``client``, retrying with exponential backoff.

    A 200 response is returned immediately. Any other status (including 202)
    or any exception raised while fetching counts as a failed attempt; the
    function then sleeps ``delay`` seconds, doubles ``delay`` and tries again.

    Args:
        url: Page URL to fetch.
        retries: Maximum number of attempts.
        delay: Initial wait in seconds between attempts; doubled after each one.

    Returns:
        The successful (status 200) response, or ``None`` once all attempts
        have failed.
    """
    for _ in range(retries):
        try:
            response = await client.get(url)
        except Exception as e:
            # The original code also listed `except HTTPStatusError`, but that
            # name was never imported, so *any* exception here turned into a
            # NameError and aborted the retries. client.get() is used without
            # raise_for_status(), so a generic handler covers every real case.
            print(f"Error fetching {url}: {e}")
        else:
            if response.status_code == 200:
                return response  # Success
            if response.status_code == 202:
                print(f"Received 202 for {url}, retrying in {delay}s...")
            else:
                print(f"Unexpected status {response.status_code} for {url}, retrying in {delay}s...")

        await asyncio.sleep(delay)
        delay *= 2  # Exponential backoff

    print(f"Max retries reached for {url}, skipping.")
    return None

In [30]:
def extract_redfin_property(html_content):
    """Extract property details from the HTML of a Redfin listing page.

    Args:
        html_content: The page markup, passed straight to BeautifulSoup.

    Returns:
        A dict with these keys:
        - address, city, state, zip_code, price, beds, baths, sqft: taken from
          ``twitter:text:*`` meta tags (``None`` when a tag or its ``content``
          attribute is missing);
        - property_url: from the ``og:url`` meta tag;
        - images: ``content`` of every meta tag whose name contains
          ``twitter:image:photo``;
        - the fields returned by ``process_house_info`` when a
          ``<div id="house-info">`` is present;
        - structured_attributes: list of ``{"header": ..., "content": ...}``
          pairs found in ``<script>`` text;
        - json_data (only when ``window.__INITIAL_STATE__`` is found): the
          decoded object, or the string ``"Error parsing JSON"``.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    def meta_content(attr, value):
        """Return ``content`` of the first <meta> with ``attr == value``, else None."""
        tag = soup.find("meta", {attr: value})
        # .get() avoids a KeyError on a matching tag that has no content attribute.
        return tag.get("content") if tag else None

    # Output key -> twitter meta tag name.
    twitter_fields = {
        "address": "twitter:text:street_address",
        "city": "twitter:text:city",
        "state": "twitter:text:state_code",
        "zip_code": "twitter:text:zip",
        "price": "twitter:text:price",
        "beds": "twitter:text:beds",
        "baths": "twitter:text:baths",
        "sqft": "twitter:text:sqft",
    }
    property_data = {key: meta_content("name", name) for key, name in twitter_fields.items()}
    property_data["property_url"] = meta_content("property", "og:url")
    property_data["images"] = [
        meta["content"]
        for meta in soup.find_all("meta")
        if "twitter:image:photo" in meta.get("name", "") and meta.has_attr("content")
    ]

    # Extract description and additional house details
    house_info_div = soup.find("div", {"id": "house-info"})
    if house_info_div:
        house_info_text = house_info_div.get_text(strip=True)
        property_data.update(process_house_info(house_info_text))  # Merge structured text details

    # Single pass over the <script> tags for both kinds of embedded data.
    structured_data = []
    initial_state = None
    initial_state_found = False
    for script in soup.find_all("script"):
        script_text = script.text

        # Structured attributes, e.g. {"header":"Property Type","content":"Single-family"}
        if "header" in script_text and "content" in script_text:
            matches = re.findall(r'{"header":"(.*?)","content":"(.*?)"}', script_text)
            structured_data.extend([{"header": m[0], "content": m[1]} for m in matches])

        # Redfin's internal JSON variable; only the first occurrence is used.
        if not initial_state_found and "window.__INITIAL_STATE__" in script_text:
            marker = re.search(r"window\.__INITIAL_STATE__\s*=\s*(?=\{)", script_text)
            if marker:
                initial_state_found = True
                try:
                    # raw_decode parses exactly one JSON value starting at the
                    # brace, so nested objects containing "};" are not cut short
                    # the way a non-greedy regex would cut them.
                    initial_state, _ = json.JSONDecoder().raw_decode(script_text, marker.end())
                except json.JSONDecodeError:
                    initial_state = "Error parsing JSON"

    property_data["structured_attributes"] = structured_data
    if initial_state_found:
        property_data["json_data"] = initial_state

    return property_data
    
def process_house_info(text):
    """Pull a handful of known fields out of the flattened 'house-info' text.

    Everything before the first "Show more" is treated as the free-text
    description; the pattern searches run on what follows it (or on the whole
    text when the marker is absent). Fields that are not found are ``None``,
    except ``has_ac`` which is always "Yes" or "No".
    """
    def first_group(pattern, strip=True):
        """Return capture group 1 of the first match in ``details``, or None."""
        found = re.search(pattern, details)
        if found is None:
            return None
        captured = found.group(1)
        return captured.strip() if strip else captured

    marker = "Show more"
    if marker in text:
        head, details = text.split(marker, 1)
        description = head.strip()
    else:
        description, details = None, text

    return {
        "description": description,
        # Simple single-capture fields
        "property_type": first_group(r"(Single-family|Townhouse|Condo)"),
        "built_year": first_group(r"Built in (\d{4})"),
        "lot_size": first_group(r"(?<!Built in )(\d{1,3}(?:,\d{3})*) sq ft lot"),
        "price_per_sqft": first_group(r"\$([\d,]+) Redfin Estimate per sq ft"),
        "garage_spaces": first_group(r"(\d+) garage spaces"),
        "has_ac": "Yes" if "Has A/C" in details else "No",
        # Agent names are kept exactly as captured (no stripping)
        "listed_by": first_group(r"Listed by (.+?) •DRE", strip=False),
        "bought_by": first_group(r"Bought with (.+?) •DRE", strip=False),
        # Timestamp of Redfin's last check
        "last_checked": first_group(r"Redfin checked: (.+?)•Source"),
    }

In [32]:
tmp = await scrape_property_for_sale(["https://www.redfin.com/CA/Chino-Hills/16134-Firestone-Ln-91709/home/3941468"])

for item in tmp:
    if item.status_code == 200:
        processed_info = extract_redfin_property(item)

    else:
        print(f"the request fails with {item.status_code}")

Successfully fetched: https://www.redfin.com/CA/Chino-Hills/16134-Firestone-Ln-91709/home/3941468


## Setting Up Agents

In [73]:
from langchain.agents import initialize_agent, AgentType
from langchain.chat_models import ChatOpenAI
from langchain.tools import Tool
from langchain.schema import SystemMessage
import openai

import tiktoken


In [None]:
import os
from getpass import getpass

# Never hardcode the key in the notebook: read it from the environment and
# fall back to an interactive prompt so the cell also works on a fresh kernel.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", openai_api_key=OPENAI_API_KEY)

In [95]:

# Text Agent: asks the LLM for renovation ideas
def generate_renovation_ideas(property_json):
    """Ask the shared ``llm`` for value-adding renovation ideas for a property.

    ``property_json`` is interpolated verbatim into the prompt; the text of
    the model's reply is returned.
    """
    request_text = f"""
    Given the following property details, suggest the some renovation ideas to increase its value:
    {property_json}

    - Example rennovation options include add more bedrooms, build ADU, replace carpet
    - Suggest cost-effective improvements

    Return a structured JSON with ideas, estimated cost, and expected value increase.
    """
    return llm.invoke(request_text).content

# Wrap generate_renovation_ideas as a LangChain Tool; the wrapped function
# stays reachable as text_agent.func.
text_agent = Tool(
    name="Text Agent",
    func=generate_renovation_ideas,
    description="Generates renovation suggestions based on property details."
)

# Image Agent: reviews the renovation ideas and refines them
def review_renovations(image_urls, renovation_suggestions):
    """Ask the shared ``llm`` to review renovation ideas against property images.

    Both arguments are interpolated verbatim into the prompt; the text of the
    model's reply is returned.
    """
    request_text = f"""
    Review the following renovation suggestions based on these property images:
    Renovation Ideas: {renovation_suggestions}
    Images: {image_urls}

    - Check if the renovations are structurally feasible.
    - Identify additional improvements based on the images.
    - Flag any unrealistic suggestions.
    
    Return a refined renovation plan with updated recommendations.
    """
    return llm.invoke(request_text).content

# Wrap review_renovations as a LangChain Tool; the wrapped function stays
# reachable as image_agent.func.
# NOTE(review): review_renovations takes two positional arguments. If the agent
# invokes this tool with a single input string (as single-input tools presumably
# are), the call would fail with a missing-argument error — confirm.
image_agent = Tool(
    name="Image Agent",
    func=review_renovations,
    description="Evaluates renovation ideas based on property images and refines them."
)

# Initialize Multi-Agent System: one agent that can choose between the two tools.
# NOTE(review): in the cells visible here `agents` is never invoked; the tool
# functions are called directly through `.func` instead.
agents = initialize_agent(
    tools=[text_agent, image_agent],
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,  # Lets the LLM decide agent actions
    verbose=True
)


In [97]:
# Inspect which fields were extracted for the scraped listing.
processed_info.keys()

dict_keys(['address', 'city', 'state', 'zip_code', 'price', 'beds', 'baths', 'sqft', 'property_url', 'images', 'description', 'property_type', 'built_year', 'lot_size', 'price_per_sqft', 'garage_spaces', 'has_ac', 'listed_by', 'bought_by', 'last_checked', 'structured_attributes'])

In [None]:
# Step 1: call the text tool's function directly with the full extracted property dict.
renovation_suggestions = text_agent.func(processed_info)
print("🏗️ Renovation Ideas:\n", renovation_suggestions)

# Step 2: pass the listing's image URLs plus the step-1 suggestions to the image tool's function.
final_renovation_plan = image_agent.func(processed_info['images'], renovation_suggestions)
print("🖼️ Final Renovation Plan:\n", final_renovation_plan)

🏗️ Renovation Ideas:
 {
    "renovation_ideas": [
        {
            "idea": "Update kitchen and bathrooms",
            "estimated_cost": "$10,000 - $15,000",
            "expected_value_increase": "$20,000 - $30,000"
        },
        {
            "idea": "Add a deck or patio in the backyard",
            "estimated_cost": "$5,000 - $7,000",
            "expected_value_increase": "$10,000 - $15,000"
        },
        {
            "idea": "Upgrade landscaping and add curb appeal",
            "estimated_cost": "$3,000 - $5,000",
            "expected_value_increase": "$5,000 - $10,000"
        },
        {
            "idea": "Install energy-efficient windows and doors",
            "estimated_cost": "$8,000 - $10,000",
            "expected_value_increase": "$15,000 - $20,000"
        }
    ]
}
🖼️ Final Renovation Plan:
 Based on the images provided, here is a refined renovation plan with updated recommendations:

1. Update kitchen and bathrooms:
   - The kitchen and bathrooms

In [None]:
def count_tokens(text, model="gpt-4"):
    """Count the tokens in ``text`` using the tokenizer for ``model``.

    Args:
        text: The string to tokenize.
        model: Model name used to select the tiktoken encoding.

    Returns:
        The number of tokens in ``text``. If tiktoken does not recognise
        ``model``, the ``cl100k_base`` encoding is used instead of failing.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # encoding_for_model raises KeyError for model names it cannot map.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

# NOTE(review): process_text_data is not defined in any cell visible here; it
# presumably turns processed_info into the summary sentence shown below — confirm
# it is defined before this cell on a fresh kernel.
token_count = count_tokens(process_text_data(processed_info), model="gpt-4")


'The property is located at 16134 Firestone Ln, Chino Hills, CA 91709. It has 3 beds, 2.5 baths, 1,478 sqft, listed at $770,000. It is a single-family home built in 1989 with a lot size of 893,880 sqft. It has a price per sqft of $601, 2 garage spaces, AC, and various amenities.'