In [45]:
import requests
from bs4 import BeautifulSoup
import json
import time
import pandas as pd

In [None]:
"""
# Example URL for a for-sale page in California (adjust as needed)
url = "https://www.zillow.com/homes/for_sale/California_rb/"

# Complete headers to mimic a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Referer": "https://www.google.com/"
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Option 1: Try to find the script tag with id "__NEXT_DATA__"
    script_tag = soup.find("script", id="__NEXT_DATA__")
    if script_tag and script_tag.string:
        json_text = script_tag.string.strip()
        try:
            data = json.loads(json_text)
            print("Top-level keys in __NEXT_DATA__ JSON:")
            print(list(data.keys()))
        except json.JSONDecodeError as e:
            print("JSON decoding error in __NEXT_DATA__ tag:", e)
            print("First 1000 characters of script tag:")
            print(json_text[:1000])
    else:
        print("No script tag with id '__NEXT_DATA__' found or it is empty.")
        
    # Option 2: If the above didn't work, iterate over all script tags and look for JSON candidates:
    print("\nChecking other script tags for JSON content...\n")
    scripts = soup.find_all("script")
    candidate_found = False
    for script in scripts:
        if script.string:
            text = script.string.strip()
            # A basic check: if it starts with "{" or "[", it might be JSON.
            if text.startswith("{") or text.startswith("["):
                try:
                    candidate = json.loads(text)
                    candidate_found = True
                    print("Found a candidate JSON with keys:", list(candidate.keys()) if isinstance(candidate, dict) else "Top-level type:", type(candidate))
                    # Optionally, print a small portion for inspection:
                    print(json.dumps(candidate, indent=2)[:1000])
                    break  # Remove break if you want to see more candidates
                except json.JSONDecodeError:
                    continue
    if not candidate_found:
        print("No valid JSON candidate found in any script tag.")
else:
    print("Failed to fetch the page. Status code:", response.status_code)
"""

In [None]:
"""
# Drill down into the JSON structure to find listings
listings = data.get("props", {}) \
               .get("pageProps", {}) \
               .get("searchPageState", {}) \
               .get("cat1", {}) \
               .get("searchResults", {}) \
               .get("listResults", [])

print(f"Found {len(listings)} listings.")

if listings:
    # Inspect the keys of the first listing
    first_listing = listings[0]
    print("Keys in the first listing:")
    print(list(first_listing.keys()))
    
    # For deeper inspection, you can pretty-print a nested object
    if "hdpData" in first_listing and "homeInfo" in first_listing["hdpData"]:
        print("\nKeys in homeInfo:")
        print(list(first_listing["hdpData"]["homeInfo"].keys()))
else:
    print("No listings found in this JSON branch.")
"""

In [None]:
# Columns wanted in the scraped dataset, listed in the order extract_features()
# emits them. Bound to a name so later cells can reference it; the trailing bare
# expression keeps the cell displaying the list as before.
desired_fields = [
    'price', 'address', 'addressStreet', 'zipcode', 'city', 'state',
    'latitude', 'longitude', 'bathrooms', 'bedrooms', 'livingArea',
    'homeType', 'homeStatus', 'daysOnZillow', 'isFeatured', 'timeOnZillow',
    'isZillowOwned', 'brokerName', 'lotAreaValue', 'lotAreaUnit',
]
desired_fields

In [53]:
# -----------------------------
# 1. URL Construction for Target States
# -----------------------------
# Full state name -> lowercase two-letter code used as the Zillow URL path segment.
state_abbr = dict(
    Tennessee="tn",
    Texas="tx",
    Arkansas="ar",
    Mississippi="ms",
    Louisiana="la",
    Alabama="al",
)


def construct_state_url(state, page_number=1):
    """Build the Zillow listing URL for a state and result page.

    Args:
        state: Full state name. Names missing from ``state_abbr`` are
            lowercased and used as the path segment unchanged.
        page_number: Result page; page 1 maps to the bare state URL, any
            other value is appended as ``<page_number>_p/``.

    Returns:
        The URL as a string.
    """
    slug = state_abbr.get(state, state.lower())
    base_url = f"https://www.zillow.com/{slug}/"
    if page_number != 1:
        return f"{base_url}{page_number}_p/"
    return base_url

# -----------------------------
# 2. Set Headers
# -----------------------------
# Browser-like request headers. fetch_listings_from_url() passes this dict to
# requests.get() on every page fetch.
# NOTE(review): "Accept-Encoding" advertises "br"; as far as I know requests can
# only decode Brotli responses when a Brotli package is installed -- confirm for
# this environment, otherwise response.text may be unreadable.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Referer": "https://www.google.com/"
}

# -----------------------------
# 3. Fetch Listings JSON from a Given URL
# -----------------------------
def fetch_listings_from_url(url, timeout=30):
    """Fetch one results page and return the listing dicts embedded in it.

    The page is requested with the module-level ``headers``; the JSON inside
    the ``<script id="__NEXT_DATA__">`` tag is parsed and the list stored at
    ``props.pageProps.searchPageState.cat1.searchResults.listResults`` is
    returned.

    Args:
        url: Page URL to request.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the scrape indefinitely.

    Returns:
        A list of listing dicts. An empty list is returned (after printing a
        diagnostic) when the request fails, the status is not 200, the script
        tag is missing or empty, or the JSON cannot be decoded; it is also
        returned silently when the expected path is absent or is not a list.
    """
    print(f"Fetching URL: {url}")
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Connection errors / timeouts are treated like any other failed page
        # so the caller's "no listings -> next state" logic still applies.
        print(f"Request error on {url}: {e}")
        return []
    if response.status_code != 200:
        print(f"Failed to fetch {url} - Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    script_tag = soup.find("script", id="__NEXT_DATA__")
    if not script_tag or not script_tag.string:
        print("No __NEXT_DATA__ JSON found on", url)
        return []

    try:
        data = json.loads(script_tag.string)
    except json.JSONDecodeError as e:
        print("JSON decode error on", url, e)
        return []

    # Walk the path one key at a time: a chained .get(key, {}) still returns
    # None when the key is present with a null value, which would raise
    # AttributeError on the next .get().
    node = data
    for key in ("props", "pageProps", "searchPageState", "cat1",
                "searchResults", "listResults"):
        if not isinstance(node, dict):
            return []
        node = node.get(key)
    return node if isinstance(node, list) else []

# -----------------------------
# 4. Collect Listings Across All Target States
# -----------------------------
# States are scraped in this order, page by page, until `target_total`
# listings have been gathered or every state has run out of pages.
target_states = ["Tennessee", "Texas", "Arkansas", "Mississippi", "Louisiana", "Alabama"]
target_total = 10000
all_listings = []

for state in target_states:
    print(f"\n--- Scraping listings for state: {state} ---")
    page_number = 1
    # Keep paging through this state until the quota is met or a page is empty.
    while len(all_listings) < target_total:
        url = construct_state_url(state, page_number)
        listings = fetch_listings_from_url(url)
        if not listings:
            print(f"No listings returned for {state} on page {page_number}. Moving to next state.")
            break
        all_listings += listings
        print(f"Total listings collected so far: {len(all_listings)}")
        if len(all_listings) < target_total:
            # Only advance (and pause) when another request will follow.
            page_number += 1
            time.sleep(2)  # Delay between requests
    if len(all_listings) >= target_total:
        break

# The last page may overshoot the quota; keep only the first `target_total`.
all_listings = all_listings[:target_total]
print(f"\nCollected {len(all_listings)} listings in total.")

# -----------------------------
# 5. Extract Desired Features from Each Listing
# -----------------------------
def _first_not_none(*candidates):
    """Return the first candidate that is not None, or None if all are."""
    for value in candidates:
        if value is not None:
            return value
    return None


def extract_features(listing):
    """Flatten one raw listing dict into the columns kept for the dataset.

    Values are read from the top level of ``listing`` and from the nested
    ``hdpData.homeInfo`` dict. For latitude, longitude, bathrooms and bedrooms
    two sources are consulted and the first one that is not ``None`` wins, so
    legitimate falsy values such as ``0`` bedrooms are kept instead of being
    treated as missing.

    Args:
        listing: Dict for a single listing. ``hdpData``, ``homeInfo`` and
            ``latLong`` may be absent or ``None``.

    Returns:
        A dict with a fixed set of keys; fields that are unavailable are
        ``None``.
    """
    # `or {}` also covers keys that are present but hold None, which a plain
    # .get(key, {}) default would not.
    home_info = (listing.get("hdpData") or {}).get("homeInfo") or {}
    lat_long = listing.get("latLong") or {}
    return {
        "price": listing.get("price"),
        "address": listing.get("address"),
        "addressStreet": listing.get("addressStreet"),
        "zipcode": listing.get("addressZipcode"),
        "city": listing.get("addressCity"),
        "state": listing.get("addressState"),  # or use home_info.get("state")
        "latitude": _first_not_none(home_info.get("latitude"), lat_long.get("latitude")),
        "longitude": _first_not_none(home_info.get("longitude"), lat_long.get("longitude")),
        "bathrooms": _first_not_none(listing.get("baths"), home_info.get("bathrooms")),
        "bedrooms": _first_not_none(listing.get("beds"), home_info.get("bedrooms")),
        "livingArea": home_info.get("livingArea"),
        "homeType": home_info.get("homeType"),
        "homeStatus": home_info.get("homeStatus"),
        "daysOnZillow": home_info.get("daysOnZillow"),
        "isFeatured": home_info.get("isFeatured"),
        "timeOnZillow": home_info.get("timeOnZillow"),
        "isZillowOwned": listing.get("isZillowOwned"),
        "brokerName": listing.get("brokerName"),
        "lotAreaValue": home_info.get("lotAreaValue"),
        "lotAreaUnit": home_info.get("lotAreaUnit")
    }

# One flattened feature dict per collected listing, in collection order.
extracted_data = list(map(extract_features, all_listings))


--- Scraping listings for state: Tennessee ---
Fetching URL: https://www.zillow.com/tn/
Total listings collected so far: 41
Fetching URL: https://www.zillow.com/tn/2_p/
Total listings collected so far: 82
Fetching URL: https://www.zillow.com/tn/3_p/
Total listings collected so far: 123
Fetching URL: https://www.zillow.com/tn/4_p/
Total listings collected so far: 164
Fetching URL: https://www.zillow.com/tn/5_p/
Total listings collected so far: 205
Fetching URL: https://www.zillow.com/tn/6_p/
Total listings collected so far: 246
Fetching URL: https://www.zillow.com/tn/7_p/
Total listings collected so far: 287
Fetching URL: https://www.zillow.com/tn/8_p/
Total listings collected so far: 328
Fetching URL: https://www.zillow.com/tn/9_p/
Total listings collected so far: 369
Fetching URL: https://www.zillow.com/tn/10_p/
Total listings collected so far: 410
Fetching URL: https://www.zillow.com/tn/11_p/
Total listings collected so far: 451
Fetching URL: https://www.zillow.com/tn/12_p/
Total li

In [55]:
# -----------------------------
# 6. Save the Extracted Data to CSV
# -----------------------------
# Build the frame from the list of feature dicts, then write it out without
# the positional index column.
df = pd.DataFrame(data=extracted_data)
csv_filename = "zillow_listings.csv"
df.to_csv(path_or_buf=csv_filename, index=False)
print(f"Saved extracted data to {csv_filename}")

Saved extracted data to zillow_listings.csv


In [57]:
# Preview the first rows of the frame that was just written to CSV.
df.head()

Unnamed: 0,price,address,addressStreet,zipcode,city,state,latitude,longitude,bathrooms,bedrooms,livingArea,homeType,homeStatus,daysOnZillow,isFeatured,timeOnZillow,isZillowOwned,brokerName,lotAreaValue,lotAreaUnit
0,"$398,900","181 Lakeview Trl, McMinnville, TN 37110",181 Lakeview Trl,37110,McMinnville,TN,35.69636,-85.82354,3.0,3.0,2756.0,SINGLE_FAMILY,FOR_SALE,1,False,138767000,False,,2.0,acres
1,"$650,000","151 John D St, La Vergne, TN 37086",151 John D St,37086,La Vergne,TN,35.99727,-86.56773,4.0,4.0,3999.0,SINGLE_FAMILY,FOR_SALE,1,False,89395000,False,,0.46,acres
2,"$419,995","3336 Quail Run Ct, Nashville, TN 37214",3336 Quail Run Ct,37214,Nashville,TN,36.135098,-86.633286,4.0,3.0,2329.0,SINGLE_FAMILY,FOR_SALE,0,False,52367000,False,,0.31,acres
3,"$4,100,000","1772 Lockertsville Rd, Ashland City, TN 37015",1772 Lockertsville Rd,37015,Ashland City,TN,36.33061,-87.095604,5.0,5.0,7901.0,SINGLE_FAMILY,FOR_SALE,4,False,397967000,False,,46.01,acres
4,"$999,999","2123 Jc Taber Ln, Murfreesboro, TN 37130",2123 Jc Taber Ln,37130,Murfreesboro,TN,35.87594,-86.345085,4.0,5.0,3454.0,SINGLE_FAMILY,FOR_SALE,6,False,570767000,False,,5.17,acres
